document.py 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. from abc import ABC, abstractmethod
  2. from collections.abc import Sequence
  3. from typing import Any, Optional
  4. from pydantic import BaseModel, Field
  5. class Document(BaseModel):
  6. """Class for storing a piece of text and associated metadata."""
  7. page_content: str
  8. vector: Optional[list[float]] = None
  9. """Arbitrary metadata about the page content (e.g., source, relationships to other
  10. documents, etc.).
  11. """
  12. metadata: Optional[dict] = Field(default_factory=dict)
  13. class BaseDocumentTransformer(ABC):
  14. """Abstract base class for document transformation systems.
  15. A document transformation system takes a sequence of Documents and returns a
  16. sequence of transformed Documents.
  17. Example:
  18. .. code-block:: python
  19. class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
  20. embeddings: Embeddings
  21. similarity_fn: Callable = cosine_similarity
  22. similarity_threshold: float = 0.95
  23. class Config:
  24. arbitrary_types_allowed = True
  25. def transform_documents(
  26. self, documents: Sequence[Document], **kwargs: Any
  27. ) -> Sequence[Document]:
  28. stateful_documents = get_stateful_documents(documents)
  29. embedded_documents = _get_embeddings_from_stateful_docs(
  30. self.embeddings, stateful_documents
  31. )
  32. included_idxs = _filter_similar_embeddings(
  33. embedded_documents, self.similarity_fn, self.similarity_threshold
  34. )
  35. return [stateful_documents[i] for i in sorted(included_idxs)]
  36. async def atransform_documents(
  37. self, documents: Sequence[Document], **kwargs: Any
  38. ) -> Sequence[Document]:
  39. raise NotImplementedError
  40. """
  41. @abstractmethod
  42. def transform_documents(
  43. self, documents: Sequence[Document], **kwargs: Any
  44. ) -> Sequence[Document]:
  45. """Transform a list of documents.
  46. Args:
  47. documents: A sequence of Documents to be transformed.
  48. Returns:
  49. A list of transformed Documents.
  50. """
  51. @abstractmethod
  52. async def atransform_documents(
  53. self, documents: Sequence[Document], **kwargs: Any
  54. ) -> Sequence[Document]:
  55. """Asynchronously transform a list of documents.
  56. Args:
  57. documents: A sequence of Documents to be transformed.
  58. Returns:
  59. A list of transformed Documents.
  60. """