import logging
from typing import List, Optional

from extensions.ext_storage import storage
from langchain.document_loaders import PyPDFium2Loader
from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document
from models.model import UploadFile

logger = logging.getLogger(__name__)


class PdfLoader(BaseLoader):
    """Load pdf files, caching the extracted plain text in storage.

    Args:
        file_path: Path to the file to load.
        upload_file: Optional upload record. When it is set and has a
            truthy ``hash``, its ``tenant_id`` and ``hash`` are used to build
            the storage key under which the extracted text is cached.
    """

    def __init__(
        self,
        file_path: str,
        upload_file: Optional[UploadFile] = None
    ):
        """Initialize with file path."""
        self._file_path = file_path
        self._upload_file = upload_file

    def _plaintext_cache_key(self) -> str:
        """Return the storage key of the cached plain text, or '' if none applies."""
        upload_file = self._upload_file
        if not upload_file or not upload_file.hash:
            return ''
        return 'upload_files/' + upload_file.tenant_id + '/' \
               + upload_file.hash + '.0625.plaintext'

    def load(self) -> List[Document]:
        """Load the PDF, preferring the cached plain text when it exists.

        Returns:
            On a cache hit, a single ``Document`` whose ``page_content`` is the
            cached text. Otherwise the documents returned by
            ``PyPDFium2Loader`` for the file, unchanged.
        """
        cache_key = self._plaintext_cache_key()

        if cache_key:
            try:
                cached_text = storage.load(cache_key).decode('utf-8')
            except FileNotFoundError:
                # Cache miss: fall through and parse the PDF itself.
                pass
            else:
                return [Document(page_content=cached_text)]

        documents = PyPDFium2Loader(file_path=self._file_path).load()

        # save plaintext file for caching
        if cache_key:
            # Reaching this point means the cache lookup missed, so no
            # separate "already exists" flag is needed.
            text = "\n\n".join(document.page_content for document in documents)
            storage.save(cache_key, text.encode('utf-8'))

        return documents