# dataset_index_tool.py
  1. from flask import current_app
  2. from langchain.embeddings import OpenAIEmbeddings
  3. from langchain.tools import BaseTool
  4. from core.callback_handler.index_tool_callback_handler import DatasetIndexToolCallbackHandler
  5. from core.embedding.cached_embedding import CacheEmbedding
  6. from core.index.keyword_table_index.keyword_table_index import KeywordTableIndex, KeywordTableConfig
  7. from core.index.vector_index.vector_index import VectorIndex
  8. from core.llm.llm_builder import LLMBuilder
  9. from models.dataset import Dataset, DocumentSegment
  10. class DatasetTool(BaseTool):
  11. """Tool for querying a Dataset."""
  12. dataset: Dataset
  13. k: int = 2
  14. def _run(self, tool_input: str) -> str:
  15. if self.dataset.indexing_technique == "economy":
  16. # use keyword table query
  17. kw_table_index = KeywordTableIndex(
  18. dataset=self.dataset,
  19. config=KeywordTableConfig(
  20. max_keywords_per_chunk=5
  21. )
  22. )
  23. documents = kw_table_index.search(tool_input, search_kwargs={'k': self.k})
  24. return str("\n".join([document.page_content for document in documents]))
  25. else:
  26. model_credentials = LLMBuilder.get_model_credentials(
  27. tenant_id=self.dataset.tenant_id,
  28. model_provider=LLMBuilder.get_default_provider(self.dataset.tenant_id, 'text-embedding-ada-002'),
  29. model_name='text-embedding-ada-002'
  30. )
  31. embeddings = CacheEmbedding(OpenAIEmbeddings(
  32. **model_credentials
  33. ))
  34. vector_index = VectorIndex(
  35. dataset=self.dataset,
  36. config=current_app.config,
  37. embeddings=embeddings
  38. )
  39. documents = vector_index.search(
  40. tool_input,
  41. search_type='similarity',
  42. search_kwargs={
  43. 'k': self.k
  44. }
  45. )
  46. hit_callback = DatasetIndexToolCallbackHandler(self.dataset.id)
  47. hit_callback.on_tool_end(documents)
  48. document_context_list = []
  49. index_node_ids = [document.metadata['doc_id'] for document in documents]
  50. segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
  51. DocumentSegment.status == 'completed',
  52. DocumentSegment.enabled == True,
  53. DocumentSegment.index_node_id.in_(index_node_ids)
  54. ).all()
  55. if segments:
  56. for segment in segments:
  57. if segment.answer:
  58. document_context_list.append(segment.answer)
  59. else:
  60. document_context_list.append(segment.content)
  61. return str("\n".join(document_context_list))
  62. async def _arun(self, tool_input: str) -> str:
  63. model_credentials = LLMBuilder.get_model_credentials(
  64. tenant_id=self.dataset.tenant_id,
  65. model_provider=LLMBuilder.get_default_provider(self.dataset.tenant_id, 'text-embedding-ada-002'),
  66. model_name='text-embedding-ada-002'
  67. )
  68. embeddings = CacheEmbedding(OpenAIEmbeddings(
  69. **model_credentials
  70. ))
  71. vector_index = VectorIndex(
  72. dataset=self.dataset,
  73. config=current_app.config,
  74. embeddings=embeddings
  75. )
  76. documents = await vector_index.asearch(
  77. tool_input,
  78. search_type='similarity',
  79. search_kwargs={
  80. 'k': 10
  81. }
  82. )
  83. hit_callback = DatasetIndexToolCallbackHandler(self.dataset.id)
  84. hit_callback.on_tool_end(documents)
  85. return str("\n".join([document.page_content for document in documents]))