vector_service.py 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. from typing import Optional
  2. from core.rag.datasource.keyword.keyword_factory import Keyword
  3. from core.rag.datasource.vdb.vector_factory import Vector
  4. from core.rag.models.document import Document
  5. from models.dataset import Dataset, DocumentSegment
  6. class VectorService:
  7. @classmethod
  8. def create_segments_vector(
  9. cls, keywords_list: Optional[list[list[str]]], segments: list[DocumentSegment], dataset: Dataset
  10. ):
  11. documents = []
  12. for segment in segments:
  13. document = Document(
  14. page_content=segment.content,
  15. metadata={
  16. "doc_id": segment.index_node_id,
  17. "doc_hash": segment.index_node_hash,
  18. "document_id": segment.document_id,
  19. "dataset_id": segment.dataset_id,
  20. },
  21. )
  22. documents.append(document)
  23. if dataset.indexing_technique == "high_quality":
  24. # save vector index
  25. vector = Vector(dataset=dataset)
  26. vector.add_texts(documents, duplicate_check=True)
  27. # save keyword index
  28. keyword = Keyword(dataset)
  29. if keywords_list and len(keywords_list) > 0:
  30. keyword.add_texts(documents, keywords_list=keywords_list)
  31. else:
  32. keyword.add_texts(documents)
  33. @classmethod
  34. def update_segment_vector(cls, keywords: Optional[list[str]], segment: DocumentSegment, dataset: Dataset):
  35. # update segment index task
  36. # format new index
  37. document = Document(
  38. page_content=segment.content,
  39. metadata={
  40. "doc_id": segment.index_node_id,
  41. "doc_hash": segment.index_node_hash,
  42. "document_id": segment.document_id,
  43. "dataset_id": segment.dataset_id,
  44. },
  45. )
  46. if dataset.indexing_technique == "high_quality":
  47. # update vector index
  48. vector = Vector(dataset=dataset)
  49. vector.delete_by_ids([segment.index_node_id])
  50. vector.add_texts([document], duplicate_check=True)
  51. # update keyword index
  52. keyword = Keyword(dataset)
  53. keyword.delete_by_ids([segment.index_node_id])
  54. # save keyword index
  55. if keywords and len(keywords) > 0:
  56. keyword.add_texts([document], keywords_list=[keywords])
  57. else:
  58. keyword.add_texts([document])