Parcourir la source

feat: Jina Search & Jina Reader CSS selectors (#4523)

rennokki il y a 11 mois
Parent
commit
6b5685ef0c

+ 3 - 3
api/core/tools/provider/builtin/jina/jina.yaml

@@ -6,8 +6,8 @@ identity:
     zh_Hans: JinaReader
     pt_BR: JinaReader
   description:
-    en_US: Convert any URL to an LLM-friendly input. Experience improved output for your agent and RAG systems at no cost.
-    zh_Hans: 将任何 URL 转换为 LLM 友好的输入。无需付费即可体验为您的 Agent 和 RAG 系统提供的改进输出
-    pt_BR: Converta qualquer URL em uma entrada amigável ao LLM. Experimente uma saída aprimorada para seus sistemas de agente e RAG sem custo.
+    en_US: Convert any URL to an LLM-friendly input or perform searches on the web for grounding information. Experience improved output for your agent and RAG systems at no cost.
+    zh_Hans: 将任何 URL 转换为 LLM 易读的输入，或在网络上进行搜索以获取可作为依据的信息。无需付费即可体验为您的 Agent 和 RAG 系统提供的改进输出。
+    pt_BR: Converta qualquer URL em uma entrada amigável ao LLM ou realize pesquisas na web para obter informações de embasamento. Experimente uma saída aprimorada para seus sistemas de agente e RAG sem custo.
   icon: icon.svg
 credentials_for_provider:

+ 8 - 0
api/core/tools/provider/builtin/jina/tools/jina_reader.py

@@ -23,6 +23,14 @@ class JinaReaderTool(BuiltinTool):
             'Accept': 'application/json'
         }
 
+        target_selector = tool_parameters.get('target_selector', None)
+        if target_selector is not None:
+            headers['X-Target-Selector'] = target_selector
+
+        wait_for_selector = tool_parameters.get('wait_for_selector', None)
+        if wait_for_selector is not None:
+            headers['X-Wait-For-Selector'] = wait_for_selector
+
         response = ssrf_proxy.get(
             str(URL(self._jina_reader_endpoint + url)), 
             headers=headers,

+ 26 - 0
api/core/tools/provider/builtin/jina/tools/jina_reader.yaml

@@ -25,6 +25,32 @@ parameters:
       pt_BR: used for linking to webpages
     llm_description: url for scraping
     form: llm
+  - name: target_selector
+    type: string
+    required: false
+    label:
+      en_US: Target selector
+      zh_Hans: 目标选择器
+      pt_BR: Seletor de destino
+    human_description:
+      en_US: css selector for scraping specific elements
+      zh_Hans: css 选择器用于抓取特定元素
+      pt_BR: seletor css para extrair elementos específicos
+    llm_description: css selector of the target element to scrape
+    form: form
+  - name: wait_for_selector
+    type: string
+    required: false
+    label:
+      en_US: Wait for selector
+      zh_Hans: 等待选择器
+      pt_BR: Aguardar por seletor
+    human_description:
+      en_US: css selector for waiting for specific elements
+      zh_Hans: css 选择器用于等待特定元素
+      pt_BR: seletor css para aguardar elementos específicos
+    llm_description: css selector of the target element to wait for
+    form: form
   - name: summary
     type: boolean
     required: false

+ 30 - 0
api/core/tools/provider/builtin/jina/tools/jina_search.py

@@ -0,0 +1,30 @@
+from typing import Any, Union
+
+from yarl import URL
+
+from core.helper import ssrf_proxy
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
+class JinaSearchTool(BuiltinTool):
+    """Built-in tool that sends a search query to the Jina Search endpoint."""
+
+    # Base URL of the search endpoint; the query is appended to it as the URL path.
+    _jina_search_endpoint = 'https://s.jina.ai/'
+
+    def _invoke(
+        self,
+        user_id: str,
+        tool_parameters: dict[str, Any],
+    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+        """
+        Query the search endpoint and return the response body as a text message.
+
+        :param user_id: id of the invoking user (not used by this method)
+        :param tool_parameters: must contain 'query', the text to search for
+        :return: a text message wrapping the raw response text
+        :raises KeyError: if 'query' is missing from tool_parameters
+        """
+        # Function-scope import so this block does not depend on a file-level import change.
+        from urllib.parse import quote
+
+        query = tool_parameters['query']
+
+        headers = {
+            'Accept': 'application/json'
+        }
+
+        # The query travels as a path segment. Without percent-encoding, '#' starts a
+        # fragment, '?' starts a query string and '/' splits the path, so the text
+        # after those characters would never reach the search endpoint.
+        encoded_query = quote(query, safe='')
+
+        response = ssrf_proxy.get(
+            # encoded=True tells yarl the path is already quoted and must not be re-quoted.
+            str(URL(self._jina_search_endpoint + encoded_query, encoded=True)),
+            headers=headers,
+            timeout=(10, 60)
+        )
+
+        return self.create_text_message(response.text)

+ 21 - 0
api/core/tools/provider/builtin/jina/tools/jina_search.yaml

@@ -0,0 +1,21 @@
+identity:
+  name: jina_search
+  author: Dify
+  label:
+    en_US: JinaSearch
+    zh_Hans: JinaSearch
+    pt_BR: JinaSearch
+description:
+  human:
+    en_US: Search on the web and get the top 5 results. Useful for grounding using information from the web.
+  llm: A tool for searching results on the web for grounding. Input should be a simple question.
+parameters:
+  - name: query
+    type: string
+    required: true
+    label:
+      en_US: Question (Query)
+    human_description:
+      en_US: used to find information on the web
+    llm_description: simple question to ask on the web
+    form: llm