pdf_parser.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. from pathlib import Path
  2. from typing import Dict
  3. from flask import current_app
  4. from llama_index.readers.file.base_parser import BaseParser
  5. from pypdf import PdfReader
  6. from extensions.ext_storage import storage
  7. from models.model import UploadFile
  8. class PDFParser(BaseParser):
  9. """PDF parser."""
  10. def _init_parser(self) -> Dict:
  11. """Init parser."""
  12. return {}
  13. def parse_file(self, file: Path, errors: str = "ignore") -> str:
  14. """Parse file."""
  15. if not current_app.config.get('PDF_PREVIEW', True):
  16. return ''
  17. plaintext_file_key = ''
  18. plaintext_file_exists = False
  19. if self._parser_config and 'upload_file' in self._parser_config and self._parser_config['upload_file']:
  20. upload_file: UploadFile = self._parser_config['upload_file']
  21. if upload_file.hash:
  22. plaintext_file_key = 'upload_files/' + upload_file.tenant_id + '/' + upload_file.hash + '.plaintext'
  23. try:
  24. text = storage.load(plaintext_file_key).decode('utf-8')
  25. plaintext_file_exists = True
  26. return text
  27. except FileNotFoundError:
  28. pass
  29. text_list = []
  30. with open(file, "rb") as fp:
  31. # Create a PDF object
  32. pdf = PdfReader(fp)
  33. # Get the number of pages in the PDF document
  34. num_pages = len(pdf.pages)
  35. # Iterate over every page
  36. for page in range(num_pages):
  37. # Extract the text from the page
  38. page_text = pdf.pages[page].extract_text()
  39. text_list.append(page_text)
  40. text = "\n".join(text_list)
  41. # save plaintext file for caching
  42. if not plaintext_file_exists and plaintext_file_key:
  43. storage.save(plaintext_file_key, text.encode('utf-8'))
  44. return text