소스 검색

feat: add YAML type in document extractor node (#9997)

zhuhao 5 달 전
부모
커밋
0095896051
1개의 파일이 변경되었으며, 14개의 추가와 0개의 삭제가 있습니다
  1. 14 0
      api/core/workflow/nodes/document_extractor/node.py

+ 14 - 0
api/core/workflow/nodes/document_extractor/node.py

@@ -5,6 +5,7 @@ import json
 import docx
 import docx
 import pandas as pd
 import pandas as pd
 import pypdfium2
 import pypdfium2
+import yaml
 from unstructured.partition.email import partition_email
 from unstructured.partition.email import partition_email
 from unstructured.partition.epub import partition_epub
 from unstructured.partition.epub import partition_epub
 from unstructured.partition.msg import partition_msg
 from unstructured.partition.msg import partition_msg
@@ -101,6 +102,8 @@ def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str:
             return _extract_text_from_msg(file_content)
             return _extract_text_from_msg(file_content)
         case "application/json":
         case "application/json":
             return _extract_text_from_json(file_content)
             return _extract_text_from_json(file_content)
+        case "application/x-yaml" | "text/yaml":
+            return _extract_text_from_yaml(file_content)
         case _:
         case _:
             raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
             raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
 
 
@@ -112,6 +115,8 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
             return _extract_text_from_plain_text(file_content)
             return _extract_text_from_plain_text(file_content)
         case ".json":
         case ".json":
             return _extract_text_from_json(file_content)
             return _extract_text_from_json(file_content)
+        case ".yaml" | ".yml":
+            return _extract_text_from_yaml(file_content)
         case ".pdf":
         case ".pdf":
             return _extract_text_from_pdf(file_content)
             return _extract_text_from_pdf(file_content)
         case ".doc" | ".docx":
         case ".doc" | ".docx":
@@ -149,6 +154,15 @@ def _extract_text_from_json(file_content: bytes) -> str:
         raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
         raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
 
 
 
 
+def _extract_text_from_yaml(file_content: bytes) -> str:
+    """Extract the content from yaml file"""
+    try:
+        yaml_data = yaml.safe_load_all(file_content.decode("utf-8"))
+        return yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False)
+    except (UnicodeDecodeError, yaml.YAMLError) as e:
+        raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
+
+
 def _extract_text_from_pdf(file_content: bytes) -> str:
 def _extract_text_from_pdf(file_content: bytes) -> str:
     try:
     try:
         pdf_file = io.BytesIO(file_content)
         pdf_file = io.BytesIO(file_content)