from abc import ABC, abstractmethod
from collections.abc import Sequence
from typing import Any, Optional

from pydantic import BaseModel


class ChildDocument(BaseModel):
    """A piece of text and its associated metadata, nested under a ``Document``.

    Instances of this model are what ``Document.children`` holds.
    """

    # The text content itself.
    page_content: str

    # Optional list of floats attached to the content; ``None`` when absent.
    # NOTE(review): presumably the embedding of ``page_content`` - confirm
    # against the code that populates it.
    vector: Optional[list[float]] = None

    # Arbitrary metadata about the page content (e.g., source, relationships
    # to other documents, etc.).
    # NOTE(review): pydantic is documented to copy field defaults per
    # instance, so this ``{}`` literal should not be shared between models.
    metadata: dict = {}


class Document(BaseModel):
    """A piece of text and its associated metadata.

    Carries the same fields as ``ChildDocument`` plus a ``provider`` label and
    an optional list of ``ChildDocument`` children.
    """

    # The text content itself.
    page_content: str

    # Optional list of floats attached to the content; ``None`` when absent.
    # NOTE(review): presumably the embedding of ``page_content`` - confirm
    # against the code that populates it.
    vector: Optional[list[float]] = None

    # Arbitrary metadata about the page content (e.g., source, relationships
    # to other documents, etc.).
    # NOTE(review): pydantic is documented to copy field defaults per
    # instance, so this ``{}`` literal should not be shared between models.
    metadata: dict = {}

    # Provider label for this document; defaults to "dify".
    provider: Optional[str] = "dify"

    # Child documents nested under this one; ``None`` when there are none.
    children: Optional[list[ChildDocument]] = None


class BaseDocumentTransformer(ABC):
    """Abstract base class for document transformation systems.

    A document transformation system takes a sequence of Documents and returns a
    sequence of transformed Documents. Subclasses must implement both the
    synchronous and the asynchronous entry point.

    Example:
        .. code-block:: python

            class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
                embeddings: Embeddings
                similarity_fn: Callable = cosine_similarity
                similarity_threshold: float = 0.95

                class Config:
                    arbitrary_types_allowed = True

                def transform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    stateful_documents = get_stateful_documents(documents)
                    embedded_documents = _get_embeddings_from_stateful_docs(
                        self.embeddings, stateful_documents
                    )
                    included_idxs = _filter_similar_embeddings(
                        embedded_documents, self.similarity_fn, self.similarity_threshold
                    )
                    return [stateful_documents[i] for i in sorted(included_idxs)]

                async def atransform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    raise NotImplementedError

    """

    @abstractmethod
    def transform_documents(self, documents: Sequence[Document], **kwargs: Any) -> Sequence[Document]:
        """Transform a list of documents.

        Args:
            documents: A sequence of Documents to be transformed.
            **kwargs: Implementation-specific options.

        Returns:
            A list of transformed Documents.
        """

    @abstractmethod
    async def atransform_documents(self, documents: Sequence[Document], **kwargs: Any) -> Sequence[Document]:
        """Asynchronously transform a list of documents.

        Args:
            documents: A sequence of Documents to be transformed.
            **kwargs: Implementation-specific options.

        Returns:
            A list of transformed Documents.
        """