Bladeren bron

Parse base64 eml file (#1796)

Co-authored-by: luowei <[redacted: credential-like string in place of email address]>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Charlie.Wei 1 jaar geleden
bovenliggende
commit
64642fabc4
2 gewijzigde bestanden met toevoegingen van 21 en 10 verwijderingen
  1. 18 9
      api/core/data_loader/loader/unstructured/unstructured_eml.py
  2. 3 1
      api/requirements.txt

+ 18 - 9
api/core/data_loader/loader/unstructured/unstructured_eml.py

@@ -1,9 +1,8 @@
 import logging
 import logging
-import re
-from typing import Optional, List, Tuple, cast
-
+import base64
+from typing import List
+from bs4 import BeautifulSoup
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.base import BaseLoader
-from langchain.document_loaders.helpers import detect_file_encodings
 from langchain.schema import Document
 from langchain.schema import Document
 
 
 logger = logging.getLogger(__name__)
 logger = logging.getLogger(__name__)
@@ -11,8 +10,6 @@ logger = logging.getLogger(__name__)
 
 
 class UnstructuredEmailLoader(BaseLoader):
 class UnstructuredEmailLoader(BaseLoader):
     """Load msg files.
     """Load msg files.
-
-
     Args:
     Args:
         file_path: Path to the file to load.
         file_path: Path to the file to load.
     """
     """
@@ -26,16 +23,28 @@ class UnstructuredEmailLoader(BaseLoader):
         self._file_path = file_path
         self._file_path = file_path
         self._api_url = api_url
         self._api_url = api_url
 
 
-
     def load(self) -> List[Document]:
     def load(self) -> List[Document]:
         from unstructured.partition.email import partition_email
         from unstructured.partition.email import partition_email
-
         elements = partition_email(filename=self._file_path, api_url=self._api_url)
         elements = partition_email(filename=self._file_path, api_url=self._api_url)
+
+        # noinspection PyBroadException
+        try:
+            for element in elements:
+                element_text = element.text.strip()
+
+                padding_needed = 4 - len(element_text) % 4
+                element_text += '=' * padding_needed
+
+                element_decode = base64.b64decode(element_text)
+                soup = BeautifulSoup(element_decode.decode('utf-8'), 'html.parser')
+                element.text = soup.get_text()
+        except Exception:
+            pass
+
         from unstructured.chunking.title import chunk_by_title
         from unstructured.chunking.title import chunk_by_title
         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
         documents = []
         documents = []
         for chunk in chunks:
         for chunk in chunks:
             text = chunk.text.strip()
             text = chunk.text.strip()
             documents.append(Document(page_content=text))
             documents.append(Document(page_content=text))
-
         return documents
         return documents

+ 3 - 1
api/requirements.txt

@@ -55,4 +55,6 @@ pymilvus==2.3.0
 qdrant-client==1.6.4
 qdrant-client==1.6.4
 cohere~=4.32
 cohere~=4.32
 unstructured~=0.10.27
 unstructured~=0.10.27
-unstructured[docx,pptx]~=0.10.27
+unstructured[docx,pptx]~=0.10.27
+bs4~=0.0.1
+markdown~=3.5.1