Browse Source

improve: unify parsing of Excel files in both xls and xlsx formats using Pandas (#4965)

Bowen Liang 10 months ago
parent
commit
39c14ec7c1
1 changed file with 8 additions and 51 deletions
  1. 8 51
      api/core/rag/extractor/excel_extractor.py

+ 8 - 51
api/core/rag/extractor/excel_extractor.py

@@ -2,7 +2,6 @@
 from typing import Optional
 
 import pandas as pd
-import xlrd
 
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@@ -28,61 +27,19 @@ class ExcelExtractor(BaseExtractor):
         self._autodetect_encoding = autodetect_encoding
 
     def extract(self) -> list[Document]:
-        """ parse excel file"""
-        if self._file_path.endswith('.xls'):
-            return self._extract4xls()
-        elif self._file_path.endswith('.xlsx'):
-            return self._extract4xlsx()
-
-    def _extract4xls(self) -> list[Document]:
-        wb = xlrd.open_workbook(filename=self._file_path)
+        """ Load from Excel file in xls or xlsx format using Pandas."""
         documents = []
-        # loop over all sheets
-        for sheet in wb.sheets():
-            row_header = None
-            for row_index, row in enumerate(sheet.get_rows(), start=1):                
-                if self.is_blank_row(row):
-                    continue
-                if row_header is None:
-                    row_header = row
-                    continue
-                item_arr = []
-                for index, cell in enumerate(row):
-                    txt_value = str(cell.value)
-                    item_arr.append(f'"{row_header[index].value}":"{txt_value}"')
-                item_str = ",".join(item_arr)
-                document = Document(page_content=item_str, metadata={'source': self._file_path})
-                documents.append(document)
-        return documents
-
-    def _extract4xlsx(self) -> list[Document]:
-        """Load from file path using Pandas."""
-        data = []
         # Read each worksheet of an Excel file using Pandas
-        xls = pd.ExcelFile(self._file_path)
-        for sheet_name in xls.sheet_names:
-            df = pd.read_excel(xls, sheet_name=sheet_name)
+        excel_file = pd.ExcelFile(self._file_path)
+        for sheet_name in excel_file.sheet_names:
+            df: pd.DataFrame = excel_file.parse(sheet_name=sheet_name)
 
             # filter out rows with all NaN values
             df.dropna(how='all', inplace=True)
 
             # transform each row into a Document
-            for _, row in df.iterrows():
-                item = ';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v))
-                document = Document(page_content=item, metadata={'source': self._file_path})
-                data.append(document)
-        return data
+            documents += [Document(page_content=';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v)),
+                                   metadata={'source': self._file_path},
+                                   ) for _, row in df.iterrows()]
 
-    @staticmethod
-    def is_blank_row(row):
-        """
-
-        Determine whether the specified line is a blank line.
-        :param row: row object。
-        :return: Returns True if the row is blank, False otherwise.
-        """
-        # Iterates through the cells and returns False if a non-empty cell is found
-        for cell in row:
-            if cell.value is not None and cell.value != '':
-                return False
-        return True
+        return documents