|
@@ -1,9 +1,8 @@
|
|
import logging
|
|
import logging
|
|
-import re
|
|
|
|
-from typing import Optional, List, Tuple, cast
|
|
|
|
-
|
|
|
|
|
|
+import base64
|
|
|
|
+from typing import List
|
|
|
|
+from bs4 import BeautifulSoup
|
|
from langchain.document_loaders.base import BaseLoader
|
|
from langchain.document_loaders.base import BaseLoader
|
|
-from langchain.document_loaders.helpers import detect_file_encodings
|
|
|
|
from langchain.schema import Document
|
|
from langchain.schema import Document
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
logger = logging.getLogger(__name__)
|
|
@@ -11,8 +10,6 @@ logger = logging.getLogger(__name__)
|
|
|
|
|
|
class UnstructuredEmailLoader(BaseLoader):
|
|
class UnstructuredEmailLoader(BaseLoader):
|
|
"""Load msg files.
|
|
"""Load msg files.
|
|
-
|
|
|
|
-
|
|
|
|
Args:
|
|
Args:
|
|
file_path: Path to the file to load.
|
|
file_path: Path to the file to load.
|
|
"""
|
|
"""
|
|
@@ -26,16 +23,28 @@ class UnstructuredEmailLoader(BaseLoader):
|
|
self._file_path = file_path
|
|
self._file_path = file_path
|
|
self._api_url = api_url
|
|
self._api_url = api_url
|
|
|
|
|
|
-
|
|
|
|
def load(self) -> List[Document]:
    """Partition the email file and return its content as chunked documents.

    The file at ``self._file_path`` is partitioned with
    ``unstructured.partition.email.partition_email``. Each resulting
    element is then probed for a base64-encoded UTF-8 payload: when the
    element text decodes cleanly, it is replaced by the plain text that
    BeautifulSoup extracts from the decoded markup. Elements that do not
    decode are left unchanged. Finally the elements are grouped with
    ``chunk_by_title`` and each chunk becomes one ``Document``.

    Returns:
        One ``Document`` per chunk, whose ``page_content`` is the chunk
        text with surrounding whitespace stripped.
    """
    from unstructured.chunking.title import chunk_by_title
    from unstructured.partition.email import partition_email

    elements = partition_email(filename=self._file_path, api_url=self._api_url)

    for element in elements:
        element_text = (element.text or "").strip()
        if not element_text:
            continue

        # Pad up to the next multiple of four; ``-n % 4`` is 0 when the
        # text is already aligned, so no superfluous '=' is appended.
        element_text += "=" * (-len(element_text) % 4)

        # The attempt is made per element so that one plain-text element
        # does not prevent later base64-encoded elements from being decoded.
        try:
            decoded = base64.b64decode(element_text).decode("utf-8")
        except ValueError:
            # binascii.Error and UnicodeDecodeError both derive from
            # ValueError: the text is not base64-encoded UTF-8, keep it.
            continue

        # NOTE(review): plain text that happens to be decodable as base64
        # UTF-8 is also rewritten here -- confirm this is acceptable.
        element.text = BeautifulSoup(decoded, "html.parser").get_text()

    chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
    return [Document(page_content=chunk.text.strip()) for chunk in chunks]
|