Browse Source

chore: extract retrieval method literal values into enum (#5060)

Bowen Liang 10 months ago
parent
commit
c923684edd

+ 9 - 4
api/controllers/console/datasets/datasets.py

@@ -17,6 +17,7 @@ from core.model_runtime.entities.model_entities import ModelType
 from core.provider_manager import ProviderManager
 from core.provider_manager import ProviderManager
 from core.rag.datasource.vdb.vector_type import VectorType
 from core.rag.datasource.vdb.vector_type import VectorType
 from core.rag.extractor.entity.extract_setting import ExtractSetting
 from core.rag.extractor.entity.extract_setting import ExtractSetting
+from core.rag.retrieval.retrival_methods import RetrievalMethod
 from extensions.ext_database import db
 from extensions.ext_database import db
 from fields.app_fields import related_app_list
 from fields.app_fields import related_app_list
 from fields.dataset_fields import dataset_detail_fields, dataset_query_detail_fields
 from fields.dataset_fields import dataset_detail_fields, dataset_query_detail_fields
@@ -500,13 +501,15 @@ class DatasetRetrievalSettingApi(Resource):
             case VectorType.MILVUS | VectorType.RELYT | VectorType.PGVECTOR | VectorType.TIDB_VECTOR | VectorType.CHROMA | VectorType.TENCENT:
             case VectorType.MILVUS | VectorType.RELYT | VectorType.PGVECTOR | VectorType.TIDB_VECTOR | VectorType.CHROMA | VectorType.TENCENT:
                 return {
                 return {
                     'retrieval_method': [
                     'retrieval_method': [
-                        'semantic_search'
+                        RetrievalMethod.SEMANTIC_SEARCH
                     ]
                     ]
                 }
                 }
             case VectorType.QDRANT | VectorType.WEAVIATE | VectorType.OPENSEARCH:
             case VectorType.QDRANT | VectorType.WEAVIATE | VectorType.OPENSEARCH:
                 return {
                 return {
                     'retrieval_method': [
                     'retrieval_method': [
-                        'semantic_search', 'full_text_search', 'hybrid_search'
+                        RetrievalMethod.SEMANTIC_SEARCH,
+                        RetrievalMethod.FULL_TEXT_SEARCH,
+                        RetrievalMethod.HYBRID_SEARCH,
                     ]
                     ]
                 }
                 }
             case _:
             case _:
@@ -522,13 +525,15 @@ class DatasetRetrievalSettingMockApi(Resource):
             case VectorType.MILVUS | VectorType.RELYT | VectorType.PGVECTOR | VectorType.TIDB_VECTOR | VectorType.CHROMA | VectorType.TENCEN:
             case VectorType.MILVUS | VectorType.RELYT | VectorType.PGVECTOR | VectorType.TIDB_VECTOR | VectorType.CHROMA | VectorType.TENCEN:
                 return {
                 return {
                     'retrieval_method': [
                     'retrieval_method': [
-                        'semantic_search'
+                        RetrievalMethod.SEMANTIC_SEARCH
                     ]
                     ]
                 }
                 }
             case VectorType.QDRANT | VectorType.WEAVIATE | VectorType.OPENSEARCH:
             case VectorType.QDRANT | VectorType.WEAVIATE | VectorType.OPENSEARCH:
                 return {
                 return {
                     'retrieval_method': [
                     'retrieval_method': [
-                        'semantic_search', 'full_text_search', 'hybrid_search'
+                        RetrievalMethod.SEMANTIC_SEARCH,
+                        RetrievalMethod.FULL_TEXT_SEARCH,
+                        RetrievalMethod.HYBRID_SEARCH,
                     ]
                     ]
                 }
                 }
             case _:
             case _:

+ 7 - 6
api/core/rag/datasource/retrieval_service.py

@@ -6,11 +6,12 @@ from flask import Flask, current_app
 from core.rag.data_post_processor.data_post_processor import DataPostProcessor
 from core.rag.data_post_processor.data_post_processor import DataPostProcessor
 from core.rag.datasource.keyword.keyword_factory import Keyword
 from core.rag.datasource.keyword.keyword_factory import Keyword
 from core.rag.datasource.vdb.vector_factory import Vector
 from core.rag.datasource.vdb.vector_factory import Vector
+from core.rag.retrieval.retrival_methods import RetrievalMethod
 from extensions.ext_database import db
 from extensions.ext_database import db
 from models.dataset import Dataset
 from models.dataset import Dataset
 
 
 default_retrieval_model = {
 default_retrieval_model = {
-    'search_method': 'semantic_search',
+    'search_method': RetrievalMethod.SEMANTIC_SEARCH,
     'reranking_enable': False,
     'reranking_enable': False,
     'reranking_model': {
     'reranking_model': {
         'reranking_provider_name': '',
         'reranking_provider_name': '',
@@ -47,7 +48,7 @@ class RetrievalService:
             threads.append(keyword_thread)
             threads.append(keyword_thread)
             keyword_thread.start()
             keyword_thread.start()
         # retrieval_model source with semantic
         # retrieval_model source with semantic
-        if retrival_method == 'semantic_search' or retrival_method == 'hybrid_search':
+        if RetrievalMethod.is_support_semantic_search(retrival_method):
             embedding_thread = threading.Thread(target=RetrievalService.embedding_search, kwargs={
             embedding_thread = threading.Thread(target=RetrievalService.embedding_search, kwargs={
                 'flask_app': current_app._get_current_object(),
                 'flask_app': current_app._get_current_object(),
                 'dataset_id': dataset_id,
                 'dataset_id': dataset_id,
@@ -63,7 +64,7 @@ class RetrievalService:
             embedding_thread.start()
             embedding_thread.start()
 
 
         # retrieval source with full text
         # retrieval source with full text
-        if retrival_method == 'full_text_search' or retrival_method == 'hybrid_search':
+        if RetrievalMethod.is_support_fulltext_search(retrival_method):
             full_text_index_thread = threading.Thread(target=RetrievalService.full_text_index_search, kwargs={
             full_text_index_thread = threading.Thread(target=RetrievalService.full_text_index_search, kwargs={
                 'flask_app': current_app._get_current_object(),
                 'flask_app': current_app._get_current_object(),
                 'dataset_id': dataset_id,
                 'dataset_id': dataset_id,
@@ -85,7 +86,7 @@ class RetrievalService:
             exception_message = ';\n'.join(exceptions)
             exception_message = ';\n'.join(exceptions)
             raise Exception(exception_message)
             raise Exception(exception_message)
 
 
-        if retrival_method == 'hybrid_search':
+        if retrival_method == RetrievalMethod.HYBRID_SEARCH:
             data_post_processor = DataPostProcessor(str(dataset.tenant_id), reranking_model, False)
             data_post_processor = DataPostProcessor(str(dataset.tenant_id), reranking_model, False)
             all_documents = data_post_processor.invoke(
             all_documents = data_post_processor.invoke(
                 query=query,
                 query=query,
@@ -141,7 +142,7 @@ class RetrievalService:
                 )
                 )
 
 
                 if documents:
                 if documents:
-                    if reranking_model and retrival_method == 'semantic_search':
+                    if reranking_model and retrival_method == RetrievalMethod.SEMANTIC_SEARCH:
                         data_post_processor = DataPostProcessor(str(dataset.tenant_id), reranking_model, False)
                         data_post_processor = DataPostProcessor(str(dataset.tenant_id), reranking_model, False)
                         all_documents.extend(data_post_processor.invoke(
                         all_documents.extend(data_post_processor.invoke(
                             query=query,
                             query=query,
@@ -173,7 +174,7 @@ class RetrievalService:
                     top_k=top_k
                     top_k=top_k
                 )
                 )
                 if documents:
                 if documents:
-                    if reranking_model and retrival_method == 'full_text_search':
+                    if reranking_model and retrival_method == RetrievalMethod.FULL_TEXT_SEARCH:
                         data_post_processor = DataPostProcessor(str(dataset.tenant_id), reranking_model, False)
                         data_post_processor = DataPostProcessor(str(dataset.tenant_id), reranking_model, False)
                         all_documents.extend(data_post_processor.invoke(
                         all_documents.extend(data_post_processor.invoke(
                             query=query,
                             query=query,

+ 3 - 2
api/core/rag/retrieval/dataset_retrieval.py

@@ -15,6 +15,7 @@ from core.model_runtime.model_providers.__base.large_language_model import Large
 from core.rag.datasource.retrieval_service import RetrievalService
 from core.rag.datasource.retrieval_service import RetrievalService
 from core.rag.models.document import Document
 from core.rag.models.document import Document
 from core.rag.rerank.rerank import RerankRunner
 from core.rag.rerank.rerank import RerankRunner
+from core.rag.retrieval.retrival_methods import RetrievalMethod
 from core.rag.retrieval.router.multi_dataset_function_call_router import FunctionCallMultiDatasetRouter
 from core.rag.retrieval.router.multi_dataset_function_call_router import FunctionCallMultiDatasetRouter
 from core.rag.retrieval.router.multi_dataset_react_route import ReactMultiDatasetRouter
 from core.rag.retrieval.router.multi_dataset_react_route import ReactMultiDatasetRouter
 from core.tools.tool.dataset_retriever.dataset_multi_retriever_tool import DatasetMultiRetrieverTool
 from core.tools.tool.dataset_retriever.dataset_multi_retriever_tool import DatasetMultiRetrieverTool
@@ -25,7 +26,7 @@ from models.dataset import Dataset, DatasetQuery, DocumentSegment
 from models.dataset import Document as DatasetDocument
 from models.dataset import Document as DatasetDocument
 
 
 default_retrieval_model = {
 default_retrieval_model = {
-    'search_method': 'semantic_search',
+    'search_method': RetrievalMethod.SEMANTIC_SEARCH,
     'reranking_enable': False,
     'reranking_enable': False,
     'reranking_model': {
     'reranking_model': {
         'reranking_provider_name': '',
         'reranking_provider_name': '',
@@ -419,7 +420,7 @@ class DatasetRetrieval:
         if retrieve_config.retrieve_strategy == DatasetRetrieveConfigEntity.RetrieveStrategy.SINGLE:
         if retrieve_config.retrieve_strategy == DatasetRetrieveConfigEntity.RetrieveStrategy.SINGLE:
             # get retrieval model config
             # get retrieval model config
             default_retrieval_model = {
             default_retrieval_model = {
-                'search_method': 'semantic_search',
+                'search_method': RetrievalMethod.SEMANTIC_SEARCH,
                 'reranking_enable': False,
                 'reranking_enable': False,
                 'reranking_model': {
                 'reranking_model': {
                     'reranking_provider_name': '',
                     'reranking_provider_name': '',

+ 15 - 0
api/core/rag/retrieval/retrival_methods.py

@@ -0,0 +1,15 @@
+from enum import Enum
+
+
+class RetrievalMethod(str, Enum):
+    SEMANTIC_SEARCH = 'semantic_search'
+    FULL_TEXT_SEARCH = 'full_text_search'
+    HYBRID_SEARCH = 'hybrid_search'
+
+    @staticmethod
+    def is_support_semantic_search(retrieval_method: str) -> bool:
+        return retrieval_method in {RetrievalMethod.SEMANTIC_SEARCH, RetrievalMethod.HYBRID_SEARCH}
+
+    @staticmethod
+    def is_support_fulltext_search(retrieval_method: str) -> bool:
+        return retrieval_method in {RetrievalMethod.FULL_TEXT_SEARCH, RetrievalMethod.HYBRID_SEARCH}

+ 2 - 1
api/core/tools/tool/dataset_retriever/dataset_multi_retriever_tool.py

@@ -8,12 +8,13 @@ from core.model_manager import ModelManager
 from core.model_runtime.entities.model_entities import ModelType
 from core.model_runtime.entities.model_entities import ModelType
 from core.rag.datasource.retrieval_service import RetrievalService
 from core.rag.datasource.retrieval_service import RetrievalService
 from core.rag.rerank.rerank import RerankRunner
 from core.rag.rerank.rerank import RerankRunner
+from core.rag.retrieval.retrival_methods import RetrievalMethod
 from core.tools.tool.dataset_retriever.dataset_retriever_base_tool import DatasetRetrieverBaseTool
 from core.tools.tool.dataset_retriever.dataset_retriever_base_tool import DatasetRetrieverBaseTool
 from extensions.ext_database import db
 from extensions.ext_database import db
 from models.dataset import Dataset, Document, DocumentSegment
 from models.dataset import Dataset, Document, DocumentSegment
 
 
 default_retrieval_model = {
 default_retrieval_model = {
-    'search_method': 'semantic_search',
+    'search_method': RetrievalMethod.SEMANTIC_SEARCH,
     'reranking_enable': False,
     'reranking_enable': False,
     'reranking_model': {
     'reranking_model': {
         'reranking_provider_name': '',
         'reranking_provider_name': '',

+ 2 - 1
api/core/tools/tool/dataset_retriever/dataset_retriever_tool.py

@@ -2,12 +2,13 @@
 from pydantic import BaseModel, Field
 from pydantic import BaseModel, Field
 
 
 from core.rag.datasource.retrieval_service import RetrievalService
 from core.rag.datasource.retrieval_service import RetrievalService
+from core.rag.retrieval.retrival_methods import RetrievalMethod
 from core.tools.tool.dataset_retriever.dataset_retriever_base_tool import DatasetRetrieverBaseTool
 from core.tools.tool.dataset_retriever.dataset_retriever_base_tool import DatasetRetrieverBaseTool
 from extensions.ext_database import db
 from extensions.ext_database import db
 from models.dataset import Dataset, Document, DocumentSegment
 from models.dataset import Dataset, Document, DocumentSegment
 
 
 default_retrieval_model = {
 default_retrieval_model = {
-    'search_method': 'semantic_search',
+    'search_method': RetrievalMethod.SEMANTIC_SEARCH,
     'reranking_enable': False,
     'reranking_enable': False,
     'reranking_model': {
     'reranking_model': {
         'reranking_provider_name': '',
         'reranking_provider_name': '',

+ 2 - 1
api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py

@@ -11,6 +11,7 @@ from core.model_manager import ModelInstance, ModelManager
 from core.model_runtime.entities.model_entities import ModelFeature, ModelType
 from core.model_runtime.entities.model_entities import ModelFeature, ModelType
 from core.model_runtime.model_providers.__base.large_language_model import LargeLanguageModel
 from core.model_runtime.model_providers.__base.large_language_model import LargeLanguageModel
 from core.rag.retrieval.dataset_retrieval import DatasetRetrieval
 from core.rag.retrieval.dataset_retrieval import DatasetRetrieval
+from core.rag.retrieval.retrival_methods import RetrievalMethod
 from core.workflow.entities.base_node_data_entities import BaseNodeData
 from core.workflow.entities.base_node_data_entities import BaseNodeData
 from core.workflow.entities.node_entities import NodeRunResult, NodeType
 from core.workflow.entities.node_entities import NodeRunResult, NodeType
 from core.workflow.entities.variable_pool import VariablePool
 from core.workflow.entities.variable_pool import VariablePool
@@ -21,7 +22,7 @@ from models.dataset import Dataset, Document, DocumentSegment
 from models.workflow import WorkflowNodeExecutionStatus
 from models.workflow import WorkflowNodeExecutionStatus
 
 
 default_retrieval_model = {
 default_retrieval_model = {
-    'search_method': 'semantic_search',
+    'search_method': RetrievalMethod.SEMANTIC_SEARCH,
     'reranking_enable': False,
     'reranking_enable': False,
     'reranking_model': {
     'reranking_model': {
         'reranking_provider_name': '',
         'reranking_provider_name': '',

+ 2 - 1
api/models/dataset.py

@@ -13,6 +13,7 @@ from flask import current_app
 from sqlalchemy import func
 from sqlalchemy import func
 from sqlalchemy.dialects.postgresql import JSONB
 from sqlalchemy.dialects.postgresql import JSONB
 
 
+from core.rag.retrieval.retrival_methods import RetrievalMethod
 from extensions.ext_database import db
 from extensions.ext_database import db
 from extensions.ext_storage import storage
 from extensions.ext_storage import storage
 from models import StringUUID
 from models import StringUUID
@@ -116,7 +117,7 @@ class Dataset(db.Model):
     @property
     @property
     def retrieval_model_dict(self):
     def retrieval_model_dict(self):
         default_retrieval_model = {
         default_retrieval_model = {
-            'search_method': 'semantic_search',
+            'search_method': RetrievalMethod.SEMANTIC_SEARCH,
             'reranking_enable': False,
             'reranking_enable': False,
             'reranking_model': {
             'reranking_model': {
                 'reranking_provider_name': '',
                 'reranking_provider_name': '',

+ 3 - 2
api/services/dataset_service.py

@@ -15,6 +15,7 @@ from core.model_manager import ModelManager
 from core.model_runtime.entities.model_entities import ModelType
 from core.model_runtime.entities.model_entities import ModelType
 from core.rag.datasource.keyword.keyword_factory import Keyword
 from core.rag.datasource.keyword.keyword_factory import Keyword
 from core.rag.models.document import Document as RAGDocument
 from core.rag.models.document import Document as RAGDocument
+from core.rag.retrieval.retrival_methods import RetrievalMethod
 from events.dataset_event import dataset_was_deleted
 from events.dataset_event import dataset_was_deleted
 from events.document_event import document_was_deleted
 from events.document_event import document_was_deleted
 from extensions.ext_database import db
 from extensions.ext_database import db
@@ -602,7 +603,7 @@ class DocumentService:
                 dataset.collection_binding_id = dataset_collection_binding.id
                 dataset.collection_binding_id = dataset_collection_binding.id
                 if not dataset.retrieval_model:
                 if not dataset.retrieval_model:
                     default_retrieval_model = {
                     default_retrieval_model = {
-                        'search_method': 'semantic_search',
+                        'search_method': RetrievalMethod.SEMANTIC_SEARCH,
                         'reranking_enable': False,
                         'reranking_enable': False,
                         'reranking_model': {
                         'reranking_model': {
                             'reranking_provider_name': '',
                             'reranking_provider_name': '',
@@ -959,7 +960,7 @@ class DocumentService:
                 retrieval_model = document_data['retrieval_model']
                 retrieval_model = document_data['retrieval_model']
             else:
             else:
                 default_retrieval_model = {
                 default_retrieval_model = {
-                    'search_method': 'semantic_search',
+                    'search_method': RetrievalMethod.SEMANTIC_SEARCH,
                     'reranking_enable': False,
                     'reranking_enable': False,
                     'reranking_model': {
                     'reranking_model': {
                         'reranking_provider_name': '',
                         'reranking_provider_name': '',

+ 2 - 1
api/services/hit_testing_service.py

@@ -10,12 +10,13 @@ from core.model_runtime.entities.model_entities import ModelType
 from core.rag.datasource.entity.embedding import Embeddings
 from core.rag.datasource.entity.embedding import Embeddings
 from core.rag.datasource.retrieval_service import RetrievalService
 from core.rag.datasource.retrieval_service import RetrievalService
 from core.rag.models.document import Document
 from core.rag.models.document import Document
+from core.rag.retrieval.retrival_methods import RetrievalMethod
 from extensions.ext_database import db
 from extensions.ext_database import db
 from models.account import Account
 from models.account import Account
 from models.dataset import Dataset, DatasetQuery, DocumentSegment
 from models.dataset import Dataset, DatasetQuery, DocumentSegment
 
 
 default_retrieval_model = {
 default_retrieval_model = {
-    'search_method': 'semantic_search',
+    'search_method': RetrievalMethod.SEMANTIC_SEARCH,
     'reranking_enable': False,
     'reranking_enable': False,
     'reranking_model': {
     'reranking_model': {
         'reranking_provider_name': '',
         'reranking_provider_name': '',