file_extractor.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344
  1. import tempfile
  2. from pathlib import Path
  3. from typing import List, Union
  4. from langchain.document_loaders import TextLoader, Docx2txtLoader
  5. from langchain.schema import Document
  6. from core.data_loader.loader.csv import CSVLoader
  7. from core.data_loader.loader.excel import ExcelLoader
  8. from core.data_loader.loader.html import HTMLLoader
  9. from core.data_loader.loader.markdown import MarkdownLoader
  10. from core.data_loader.loader.pdf import PdfLoader
  11. from extensions.ext_storage import storage
  12. from models.model import UploadFile
  13. class FileExtractor:
  14. @classmethod
  15. def load(cls, upload_file: UploadFile, return_text: bool = False) -> Union[List[Document] | str]:
  16. with tempfile.TemporaryDirectory() as temp_dir:
  17. suffix = Path(upload_file.key).suffix
  18. file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"
  19. storage.download(upload_file.key, file_path)
  20. input_file = Path(file_path)
  21. delimiter = '\n'
  22. if input_file.suffix == '.xlsx':
  23. loader = ExcelLoader(file_path)
  24. elif input_file.suffix == '.pdf':
  25. loader = PdfLoader(file_path, upload_file=upload_file)
  26. elif input_file.suffix in ['.md', '.markdown']:
  27. loader = MarkdownLoader(file_path, autodetect_encoding=True)
  28. elif input_file.suffix in ['.htm', '.html']:
  29. loader = HTMLLoader(file_path)
  30. elif input_file.suffix == '.docx':
  31. loader = Docx2txtLoader(file_path)
  32. elif input_file.suffix == '.csv':
  33. loader = CSVLoader(file_path, autodetect_encoding=True)
  34. else:
  35. # txt
  36. loader = TextLoader(file_path, autodetect_encoding=True)
  37. return delimiter.join([document.page_content for document in loader.load()]) if return_text else loader.load()