import csv
import logging
from typing import Optional, Dict, List

from langchain.document_loaders import CSVLoader as LCCSVLoader
from langchain.document_loaders.helpers import detect_file_encodings

from models.dataset import Document

logger = logging.getLogger(__name__)


class CSVLoader(LCCSVLoader):
    """Load a CSV file into one ``Document`` per data row.

    Compared with the base loader, ``load`` can retry the read with
    encodings reported by ``detect_file_encodings`` when decoding with the
    configured encoding fails.
    """

    def __init__(
            self,
            file_path: str,
            source_column: Optional[str] = None,
            csv_args: Optional[Dict] = None,
            encoding: Optional[str] = None,
            autodetect_encoding: bool = True,
    ):
        """Store the loader configuration.

        Args:
            file_path: Path of the CSV file to read.
            source_column: Column whose value becomes ``metadata["source"]``;
                when ``None`` the source is an empty string.
            csv_args: Extra keyword arguments forwarded to ``csv.DictReader``.
            encoding: Encoding passed to ``open()`` for the first read attempt.
            autodetect_encoding: Whether ``load`` should retry with detected
                encodings after a ``UnicodeDecodeError``.
        """
        # NOTE(review): LCCSVLoader.__init__ is not called; the attributes are
        # set directly here. Confirm the base class needs no further set-up.
        self.file_path = file_path
        self.source_column = source_column
        self.encoding = encoding
        self.csv_args = csv_args or {}
        self.autodetect_encoding = autodetect_encoding

    def load(self) -> List[Document]:
        """Load data into document objects.

        Returns:
            One ``Document`` per CSV row.

        Raises:
            RuntimeError: If the file cannot be decoded, either because
                auto-detection is disabled or because every detected
                encoding failed as well.
            ValueError: If ``source_column`` is set but missing from the file.
        """
        try:
            with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
                return self._read_from_file(csvfile)
        except UnicodeDecodeError as e:
            if not self.autodetect_encoding:
                raise RuntimeError(f"Error loading {self.file_path}") from e

            detected_encodings = detect_file_encodings(self.file_path)
            for encoding in detected_encodings:
                logger.debug("Trying encoding: %s", encoding.encoding)
                try:
                    with open(self.file_path, newline="", encoding=encoding.encoding) as csvfile:
                        return self._read_from_file(csvfile)
                except UnicodeDecodeError:
                    continue

            # Every candidate failed: report it explicitly instead of falling
            # through to an unbound result variable.
            raise RuntimeError(f"Error loading {self.file_path}") from e

    @staticmethod
    def _cell_text(value) -> str:
        """Return a stripped text form of a ``csv.DictReader`` key or value.

        ``DictReader`` yields ``None`` for cells missing from a short row and
        a list (under its ``restkey``, ``None`` by default) for surplus cells
        of a long row; reader options passed via ``csv_args`` may also produce
        non-string values. ``None`` becomes an empty string and sequences are
        joined with ``", "`` so such rows do not crash on ``.strip()``.
        """
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            return ", ".join(str(item).strip() for item in value)
        return str(value).strip()

    def _read_from_file(self, csvfile) -> List[Document]:
        """Build one ``Document`` per row read from an open CSV text file.

        Args:
            csvfile: Open text-mode file object handed to ``csv.DictReader``.

        Returns:
            Documents whose ``page_content`` is the row rendered as
            ``"column: value"`` lines and whose metadata holds the ``source``
            value and the zero-based ``row`` index.

        Raises:
            ValueError: If ``source_column`` is set but not present in a row.
        """
        docs = []
        csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
        for i, row in enumerate(csv_reader):
            content = "\n".join(
                f"{self._cell_text(k)}: {self._cell_text(v)}" for k, v in row.items()
            )
            try:
                source = (
                    row[self.source_column]
                    if self.source_column is not None
                    else ''
                )
            except KeyError:
                raise ValueError(
                    f"Source column '{self.source_column}' not found in CSV file."
                )
            metadata = {"source": source, "row": i}
            doc = Document(page_content=content, metadata=metadata)
            docs.append(doc)

        return docs