document.py 2.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. from abc import ABC, abstractmethod
  2. from collections.abc import Sequence
  3. from typing import Any, Optional
  4. from pydantic import BaseModel, Field
  5. class Document(BaseModel):
  6. """Class for storing a piece of text and associated metadata."""
  7. page_content: str
  8. vector: Optional[list[float]] = None
  9. """Arbitrary metadata about the page content (e.g., source, relationships to other
  10. documents, etc.).
  11. """
  12. metadata: Optional[dict] = Field(default_factory=dict)
  13. provider: Optional[str] = "dify"
  14. class BaseDocumentTransformer(ABC):
  15. """Abstract base class for document transformation systems.
  16. A document transformation system takes a sequence of Documents and returns a
  17. sequence of transformed Documents.
  18. Example:
  19. .. code-block:: python
  20. class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
  21. embeddings: Embeddings
  22. similarity_fn: Callable = cosine_similarity
  23. similarity_threshold: float = 0.95
  24. class Config:
  25. arbitrary_types_allowed = True
  26. def transform_documents(
  27. self, documents: Sequence[Document], **kwargs: Any
  28. ) -> Sequence[Document]:
  29. stateful_documents = get_stateful_documents(documents)
  30. embedded_documents = _get_embeddings_from_stateful_docs(
  31. self.embeddings, stateful_documents
  32. )
  33. included_idxs = _filter_similar_embeddings(
  34. embedded_documents, self.similarity_fn, self.similarity_threshold
  35. )
  36. return [stateful_documents[i] for i in sorted(included_idxs)]
  37. async def atransform_documents(
  38. self, documents: Sequence[Document], **kwargs: Any
  39. ) -> Sequence[Document]:
  40. raise NotImplementedError
  41. """
  42. @abstractmethod
  43. def transform_documents(self, documents: Sequence[Document], **kwargs: Any) -> Sequence[Document]:
  44. """Transform a list of documents.
  45. Args:
  46. documents: A sequence of Documents to be transformed.
  47. Returns:
  48. A list of transformed Documents.
  49. """
  50. @abstractmethod
  51. async def atransform_documents(self, documents: Sequence[Document], **kwargs: Any) -> Sequence[Document]:
  52. """Asynchronously transform a list of documents.
  53. Args:
  54. documents: A sequence of Documents to be transformed.
  55. Returns:
  56. A list of transformed Documents.
  57. """