Browse Source

feat: add new features to enhance image and link handling in Jina tool (#5517)

Xiao Ley 10 months ago
parent
commit
dcec9d7bb7

+ 14 - 5
api/core/tools/provider/builtin/jina/tools/jina_reader.py

@@ -10,10 +10,10 @@ from core.tools.tool.builtin_tool import BuiltinTool
 class JinaReaderTool(BuiltinTool):
     _jina_reader_endpoint = 'https://r.jina.ai/'
 
-    def _invoke(self, 
+    def _invoke(self,
                 user_id: str,
-               tool_parameters: dict[str, Any], 
-        ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+                tool_parameters: dict[str, Any],
+                ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
         """
             invoke tools
         """
@@ -34,6 +34,15 @@ class JinaReaderTool(BuiltinTool):
         if wait_for_selector is not None and wait_for_selector != '':
             headers['X-Wait-For-Selector'] = wait_for_selector
 
+        if tool_parameters.get('image_caption', False):
+            headers['X-With-Generated-Alt'] = 'true'
+
+        if tool_parameters.get('gather_all_links_at_the_end', False):
+            headers['X-With-Links-Summary'] = 'true'
+
+        if tool_parameters.get('gather_all_images_at_the_end', False):
+            headers['X-With-Images-Summary'] = 'true'
+
         proxy_server = tool_parameters.get('proxy_server', None)
         if proxy_server is not None and proxy_server != '':
             headers['X-Proxy-Url'] = proxy_server
@@ -42,12 +51,12 @@ class JinaReaderTool(BuiltinTool):
             headers['X-No-Cache'] = 'true'
 
         response = ssrf_proxy.get(
-            str(URL(self._jina_reader_endpoint + url)), 
+            str(URL(self._jina_reader_endpoint + url)),
             headers=headers,
             timeout=(10, 60)
         )
 
         if tool_parameters.get('summary', False):
             return self.create_text_message(self.summary(user_id, response.text))
-        
+
         return self.create_text_message(response.text)

+ 42 - 0
api/core/tools/provider/builtin/jina/tools/jina_reader.yaml

@@ -51,6 +51,48 @@ parameters:
       pt_BR: css selector for waiting for specific elements
     llm_description: css selector of the target element to wait for
     form: form
+  - name: image_caption
+    type: boolean
+    required: false
+    default: false
+    label:
+      en_US: Image caption
+      zh_Hans: 图片说明
+      pt_BR: Legenda da imagem
+    human_description:
+      en_US: "Captions all images at the specified URL, adding 'Image [idx]: [caption]' as an alt tag for those without one. This allows downstream LLMs to interact with the images in activities such as reasoning and summarizing."
+      zh_Hans: "为指定 URL 上的所有图像添加标题,为没有标题的图像添加“Image [idx]: [caption]”作为 alt 标签。这允许下游 LLM 在推理和总结等活动中与图像进行交互。"
+      pt_BR: "Captions all images at the specified URL, adding 'Image [idx]: [caption]' as an alt tag for those without one. This allows downstream LLMs to interact with the images in activities such as reasoning and summarizing."
+    llm_description: Captions all images at the specified URL
+    form: form
+  - name: gather_all_links_at_the_end
+    type: boolean
+    required: false
+    default: false
+    label:
+      en_US: Gather all links at the end
+      zh_Hans: 将所有链接集中到最后
+      pt_BR: Coletar todos os links ao final
+    human_description:
+      en_US: A "Buttons & Links" section will be created at the end. This helps downstream LLMs or web agents navigate the page or take further actions.
+      zh_Hans: 最后会创建一个“按钮和链接”部分。这可以帮助下游 LLM 或 Web 代理浏览页面或采取进一步的行动。
+      pt_BR: A "Buttons & Links" section will be created at the end. This helps downstream LLMs or web agents navigate the page or take further actions.
+    llm_description: Gather all links at the end
+    form: form
+  - name: gather_all_images_at_the_end
+    type: boolean
+    required: false
+    default: false
+    label:
+      en_US: Gather all images at the end
+      zh_Hans: 将所有图片集中到最后
+      pt_BR: Coletar todas as imagens ao final
+    human_description:
+      en_US: An "Images" section will be created at the end. This gives the downstream LLMs an overview of all visuals on the page, which may improve reasoning.
+      zh_Hans: 最后会创建一个“图像”部分。这可以让下游的 LLM 概览页面上的所有视觉效果,从而提高推理能力。
+      pt_BR: An "Images" section will be created at the end. This gives the downstream LLMs an overview of all visuals on the page, which may improve reasoning.
+    llm_description: Gather all images at the end
+    form: form
   - name: proxy_server
     type: string
     required: false

+ 9 - 0
api/core/tools/provider/builtin/jina/tools/jina_search.py

@@ -24,6 +24,15 @@ class JinaSearchTool(BuiltinTool):
         if 'api_key' in self.runtime.credentials and self.runtime.credentials.get('api_key'):
             headers['Authorization'] = "Bearer " + self.runtime.credentials.get('api_key')
 
+        if tool_parameters.get('image_caption', False):
+            headers['X-With-Generated-Alt'] = 'true'
+
+        if tool_parameters.get('gather_all_links_at_the_end', False):
+            headers['X-With-Links-Summary'] = 'true'
+
+        if tool_parameters.get('gather_all_images_at_the_end', False):
+            headers['X-With-Images-Summary'] = 'true'
+
         proxy_server = tool_parameters.get('proxy_server', None)
         if proxy_server is not None and proxy_server != '':
             headers['X-Proxy-Url'] = proxy_server

+ 42 - 0
api/core/tools/provider/builtin/jina/tools/jina_search.yaml

@@ -22,6 +22,48 @@ parameters:
       zh_Hans: 在网络上搜索信息
     llm_description: simple question to ask on the web
     form: llm
+  - name: image_caption
+    type: boolean
+    required: false
+    default: false
+    label:
+      en_US: Image caption
+      zh_Hans: 图片说明
+      pt_BR: Legenda da imagem
+    human_description:
+      en_US: "Captions all images at the specified URL, adding 'Image [idx]: [caption]' as an alt tag for those without one. This allows downstream LLMs to interact with the images in activities such as reasoning and summarizing."
+      zh_Hans: "为指定 URL 上的所有图像添加标题,为没有标题的图像添加“Image [idx]: [caption]”作为 alt 标签。这允许下游 LLM 在推理和总结等活动中与图像进行交互。"
+      pt_BR: "Captions all images at the specified URL, adding 'Image [idx]: [caption]' as an alt tag for those without one. This allows downstream LLMs to interact with the images in activities such as reasoning and summarizing."
+    llm_description: Captions all images at the specified URL
+    form: form
+  - name: gather_all_links_at_the_end
+    type: boolean
+    required: false
+    default: false
+    label:
+      en_US: Gather all links at the end
+      zh_Hans: 将所有链接集中到最后
+      pt_BR: Coletar todos os links ao final
+    human_description:
+      en_US: A "Buttons & Links" section will be created at the end. This helps downstream LLMs or web agents navigate the page or take further actions.
+      zh_Hans: 最后会创建一个“按钮和链接”部分。这可以帮助下游 LLM 或 Web 代理浏览页面或采取进一步的行动。
+      pt_BR: A "Buttons & Links" section will be created at the end. This helps downstream LLMs or web agents navigate the page or take further actions.
+    llm_description: Gather all links at the end
+    form: form
+  - name: gather_all_images_at_the_end
+    type: boolean
+    required: false
+    default: false
+    label:
+      en_US: Gather all images at the end
+      zh_Hans: 将所有图片集中到最后
+      pt_BR: Coletar todas as imagens ao final
+    human_description:
+      en_US: An "Images" section will be created at the end. This gives the downstream LLMs an overview of all visuals on the page, which may improve reasoning.
+      zh_Hans: 最后会创建一个“图像”部分。这可以让下游的 LLM 概览页面上的所有视觉效果,从而提高推理能力。
+      pt_BR: An "Images" section will be created at the end. This gives the downstream LLMs an overview of all visuals on the page, which may improve reasoning.
+    llm_description: Gather all images at the end
+    form: form
   - name: proxy_server
     type: string
     required: false