file_factory.py 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. import mimetypes
  2. from collections.abc import Callable, Mapping, Sequence
  3. from typing import Any, cast
  4. import httpx
  5. from sqlalchemy import select
  6. from constants import AUDIO_EXTENSIONS, DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS, VIDEO_EXTENSIONS
  7. from core.file import File, FileBelongsTo, FileTransferMethod, FileType, FileUploadConfig
  8. from core.helper import ssrf_proxy
  9. from extensions.ext_database import db
  10. from models import MessageFile, ToolFile, UploadFile
  11. def build_from_message_files(
  12. *,
  13. message_files: Sequence["MessageFile"],
  14. tenant_id: str,
  15. config: FileUploadConfig,
  16. ) -> Sequence[File]:
  17. results = [
  18. build_from_message_file(message_file=file, tenant_id=tenant_id, config=config)
  19. for file in message_files
  20. if file.belongs_to != FileBelongsTo.ASSISTANT
  21. ]
  22. return results
  23. def build_from_message_file(
  24. *,
  25. message_file: "MessageFile",
  26. tenant_id: str,
  27. config: FileUploadConfig,
  28. ):
  29. mapping = {
  30. "transfer_method": message_file.transfer_method,
  31. "url": message_file.url,
  32. "id": message_file.id,
  33. "type": message_file.type,
  34. "upload_file_id": message_file.upload_file_id,
  35. }
  36. return build_from_mapping(
  37. mapping=mapping,
  38. tenant_id=tenant_id,
  39. config=config,
  40. )
  41. def build_from_mapping(
  42. *,
  43. mapping: Mapping[str, Any],
  44. tenant_id: str,
  45. config: FileUploadConfig | None = None,
  46. ) -> File:
  47. config = config or FileUploadConfig()
  48. transfer_method = FileTransferMethod.value_of(mapping.get("transfer_method"))
  49. build_functions: dict[FileTransferMethod, Callable] = {
  50. FileTransferMethod.LOCAL_FILE: _build_from_local_file,
  51. FileTransferMethod.REMOTE_URL: _build_from_remote_url,
  52. FileTransferMethod.TOOL_FILE: _build_from_tool_file,
  53. }
  54. build_func = build_functions.get(transfer_method)
  55. if not build_func:
  56. raise ValueError(f"Invalid file transfer method: {transfer_method}")
  57. file = build_func(
  58. mapping=mapping,
  59. tenant_id=tenant_id,
  60. transfer_method=transfer_method,
  61. )
  62. if not _is_file_valid_with_config(
  63. input_file_type=mapping.get("type", FileType.CUSTOM),
  64. file_extension=file.extension,
  65. file_transfer_method=file.transfer_method,
  66. config=config,
  67. ):
  68. raise ValueError(f"File validation failed for file: {file.filename}")
  69. return file
  70. def build_from_mappings(
  71. *,
  72. mappings: Sequence[Mapping[str, Any]],
  73. config: FileUploadConfig | None = None,
  74. tenant_id: str,
  75. ) -> Sequence[File]:
  76. files = [
  77. build_from_mapping(
  78. mapping=mapping,
  79. tenant_id=tenant_id,
  80. config=config,
  81. )
  82. for mapping in mappings
  83. ]
  84. if (
  85. config
  86. # If image config is set.
  87. and config.image_config
  88. # And the number of image files exceeds the maximum limit
  89. and sum(1 for _ in (filter(lambda x: x.type == FileType.IMAGE, files))) > config.image_config.number_limits
  90. ):
  91. raise ValueError(f"Number of image files exceeds the maximum limit {config.image_config.number_limits}")
  92. if config and config.number_limits and len(files) > config.number_limits:
  93. raise ValueError(f"Number of files exceeds the maximum limit {config.number_limits}")
  94. return files
  95. def _build_from_local_file(
  96. *,
  97. mapping: Mapping[str, Any],
  98. tenant_id: str,
  99. transfer_method: FileTransferMethod,
  100. ) -> File:
  101. stmt = select(UploadFile).where(
  102. UploadFile.id == mapping.get("upload_file_id"),
  103. UploadFile.tenant_id == tenant_id,
  104. )
  105. row = db.session.scalar(stmt)
  106. if row is None:
  107. raise ValueError("Invalid upload file")
  108. file_type = FileType(mapping.get("type"))
  109. file_type = _standardize_file_type(file_type, extension="." + row.extension, mime_type=row.mime_type)
  110. return File(
  111. id=mapping.get("id"),
  112. filename=row.name,
  113. extension="." + row.extension,
  114. mime_type=row.mime_type,
  115. tenant_id=tenant_id,
  116. type=file_type,
  117. transfer_method=transfer_method,
  118. remote_url=row.source_url,
  119. related_id=mapping.get("upload_file_id"),
  120. size=row.size,
  121. )
  122. def _build_from_remote_url(
  123. *,
  124. mapping: Mapping[str, Any],
  125. tenant_id: str,
  126. transfer_method: FileTransferMethod,
  127. ) -> File:
  128. url = mapping.get("url")
  129. if not url:
  130. raise ValueError("Invalid file url")
  131. mime_type, filename, file_size = _get_remote_file_info(url)
  132. extension = mimetypes.guess_extension(mime_type) or "." + filename.split(".")[-1] if "." in filename else ".bin"
  133. file_type = FileType(mapping.get("type"))
  134. file_type = _standardize_file_type(file_type, extension=extension, mime_type=mime_type)
  135. return File(
  136. id=mapping.get("id"),
  137. filename=filename,
  138. tenant_id=tenant_id,
  139. type=file_type,
  140. transfer_method=transfer_method,
  141. remote_url=url,
  142. mime_type=mime_type,
  143. extension=extension,
  144. size=file_size,
  145. )
  146. def _get_remote_file_info(url: str):
  147. file_size = -1
  148. filename = url.split("/")[-1].split("?")[0] or "unknown_file"
  149. mime_type = mimetypes.guess_type(filename)[0] or ""
  150. resp = ssrf_proxy.head(url, follow_redirects=True)
  151. resp = cast(httpx.Response, resp)
  152. if resp.status_code == httpx.codes.OK:
  153. if content_disposition := resp.headers.get("Content-Disposition"):
  154. filename = str(content_disposition.split("filename=")[-1].strip('"'))
  155. file_size = int(resp.headers.get("Content-Length", file_size))
  156. mime_type = mime_type or str(resp.headers.get("Content-Type", ""))
  157. return mime_type, filename, file_size
  158. def _build_from_tool_file(
  159. *,
  160. mapping: Mapping[str, Any],
  161. tenant_id: str,
  162. transfer_method: FileTransferMethod,
  163. ) -> File:
  164. tool_file = (
  165. db.session.query(ToolFile)
  166. .filter(
  167. ToolFile.id == mapping.get("tool_file_id"),
  168. ToolFile.tenant_id == tenant_id,
  169. )
  170. .first()
  171. )
  172. if tool_file is None:
  173. raise ValueError(f"ToolFile {mapping.get('tool_file_id')} not found")
  174. extension = "." + tool_file.file_key.split(".")[-1] if "." in tool_file.file_key else ".bin"
  175. file_type = FileType(mapping.get("type"))
  176. file_type = _standardize_file_type(file_type, extension=extension, mime_type=tool_file.mimetype)
  177. return File(
  178. id=mapping.get("id"),
  179. tenant_id=tenant_id,
  180. filename=tool_file.name,
  181. type=file_type,
  182. transfer_method=transfer_method,
  183. remote_url=tool_file.original_url,
  184. related_id=tool_file.id,
  185. extension=extension,
  186. mime_type=tool_file.mimetype,
  187. size=tool_file.size,
  188. )
  189. def _is_file_valid_with_config(
  190. *,
  191. input_file_type: str,
  192. file_extension: str,
  193. file_transfer_method: FileTransferMethod,
  194. config: FileUploadConfig,
  195. ) -> bool:
  196. if (
  197. config.allowed_file_types
  198. and input_file_type not in config.allowed_file_types
  199. and input_file_type != FileType.CUSTOM
  200. ):
  201. return False
  202. if (
  203. input_file_type == FileType.CUSTOM
  204. and config.allowed_file_extensions is not None
  205. and file_extension not in config.allowed_file_extensions
  206. ):
  207. return False
  208. if config.allowed_file_upload_methods and file_transfer_method not in config.allowed_file_upload_methods:
  209. return False
  210. if input_file_type == FileType.IMAGE and config.image_config:
  211. if config.image_config.transfer_methods and file_transfer_method not in config.image_config.transfer_methods:
  212. return False
  213. return True
  214. def _standardize_file_type(file_type: FileType, /, *, extension: str = "", mime_type: str = "") -> FileType:
  215. """
  216. If custom type, try to guess the file type by extension and mime_type.
  217. """
  218. if file_type != FileType.CUSTOM:
  219. return FileType(file_type)
  220. guessed_type = None
  221. if extension:
  222. guessed_type = _get_file_type_by_extension(extension)
  223. if guessed_type is None and mime_type:
  224. guessed_type = _get_file_type_by_mimetype(mime_type)
  225. return guessed_type or FileType.CUSTOM
  226. def _get_file_type_by_extension(extension: str) -> FileType | None:
  227. extension = extension.lstrip(".")
  228. if extension in IMAGE_EXTENSIONS:
  229. return FileType.IMAGE
  230. elif extension in VIDEO_EXTENSIONS:
  231. return FileType.VIDEO
  232. elif extension in AUDIO_EXTENSIONS:
  233. return FileType.AUDIO
  234. elif extension in DOCUMENT_EXTENSIONS:
  235. return FileType.DOCUMENT
  236. def _get_file_type_by_mimetype(mime_type: str) -> FileType | None:
  237. if "image" in mime_type:
  238. file_type = FileType.IMAGE
  239. elif "video" in mime_type:
  240. file_type = FileType.VIDEO
  241. elif "audio" in mime_type:
  242. file_type = FileType.AUDIO
  243. elif "text" in mime_type or "pdf" in mime_type:
  244. file_type = FileType.DOCUMENT
  245. else:
  246. file_type = FileType.CUSTOM
  247. return file_type