base.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. from __future__ import annotations
  2. from abc import abstractmethod, ABC
  3. from typing import List, Any
  4. from langchain.schema import Document, BaseRetriever
  5. from models.dataset import Dataset
  6. class BaseIndex(ABC):
  7. def __init__(self, dataset: Dataset):
  8. self.dataset = dataset
  9. @abstractmethod
  10. def create(self, texts: list[Document], **kwargs) -> BaseIndex:
  11. raise NotImplementedError
  12. @abstractmethod
  13. def create_with_collection_name(self, texts: list[Document], collection_name: str, **kwargs) -> BaseIndex:
  14. raise NotImplementedError
  15. @abstractmethod
  16. def add_texts(self, texts: list[Document], **kwargs):
  17. raise NotImplementedError
  18. @abstractmethod
  19. def text_exists(self, id: str) -> bool:
  20. raise NotImplementedError
  21. @abstractmethod
  22. def delete_by_ids(self, ids: list[str]) -> None:
  23. raise NotImplementedError
  24. @abstractmethod
  25. def delete_by_group_id(self, group_id: str) -> None:
  26. raise NotImplementedError
  27. @abstractmethod
  28. def delete_by_document_id(self, document_id: str):
  29. raise NotImplementedError
  30. @abstractmethod
  31. def get_retriever(self, **kwargs: Any) -> BaseRetriever:
  32. raise NotImplementedError
  33. @abstractmethod
  34. def search(
  35. self, query: str,
  36. **kwargs: Any
  37. ) -> List[Document]:
  38. raise NotImplementedError
  39. def delete(self) -> None:
  40. raise NotImplementedError
  41. def _filter_duplicate_texts(self, texts: list[Document]) -> list[Document]:
  42. for text in texts:
  43. doc_id = text.metadata['doc_id']
  44. exists_duplicate_node = self.text_exists(doc_id)
  45. if exists_duplicate_node:
  46. texts.remove(text)
  47. return texts
  48. def _get_uuids(self, texts: list[Document]) -> list[str]:
  49. return [text.metadata['doc_id'] for text in texts]