feat: Spider web scraper & crawler tool (#5725)

William Espegren 1 year ago
parent
commit
588615b20e

File diff suppressed because it is too large
+ 1 - 0
api/core/tools/provider/builtin/spider/_assets/icon.svg


+ 14 - 0
api/core/tools/provider/builtin/spider/spider.py

@@ -0,0 +1,14 @@
+from typing import Any
+
+from core.tools.errors import ToolProviderCredentialValidationError
+from core.tools.provider.builtin.spider.spiderApp import Spider
+from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
+
+
+class SpiderProvider(BuiltinToolProviderController):
+    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
+        try:
+            app = Spider(api_key=credentials["spider_api_key"])
+            app.scrape_url(url="https://spider.cloud")
+        except Exception as e:
+            raise ToolProviderCredentialValidationError(str(e))

+ 27 - 0
api/core/tools/provider/builtin/spider/spider.yaml

@@ -0,0 +1,27 @@
+identity:
+  author: William Espegren
+  name: spider
+  label:
+    en_US: Spider
+    zh_CN: Spider
+  description:
+    en_US: Spider API integration, returning LLM-ready data by scraping & crawling websites.
+    zh_CN: Spider API 集成,通过爬取和抓取网站返回 LLM-ready 数据。
+  icon: icon.svg
+  tags:
+    - search
+    - utilities
+credentials_for_provider:
+  spider_api_key:
+    type: secret-input
+    required: true
+    label:
+      en_US: Spider API Key
+      zh_CN: Spider API 密钥
+    placeholder:
+      en_US: Please input your Spider API key
+      zh_CN: 请输入您的 Spider API 密钥
+    help:
+      en_US: Get your Spider API key from your Spider dashboard
+      zh_CN: 从您的 Spider 仪表板中获取 Spider API 密钥。
+    url: https://spider.cloud/
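
For context, the spider_api_key declared above is the key Dify hands to SpiderProvider._validate_credentials and later exposes as self.runtime.credentials in the tool. A minimal sketch of that hand-off, assuming a placeholder key, mirrors what the provider does: a one-page test scrape of spider.cloud, where any exception marks the key as invalid.

from core.tools.provider.builtin.spider.spiderApp import Spider

# Hedged sketch: the credentials dict mirrors the keys declared under
# credentials_for_provider in spider.yaml; "sk-placeholder" is not a real key.
credentials = {"spider_api_key": "sk-placeholder"}
Spider(api_key=credentials["spider_api_key"]).scrape_url(url="https://spider.cloud")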

+ 237 - 0
api/core/tools/provider/builtin/spider/spiderApp.py

@@ -0,0 +1,237 @@
+import os
+from typing import Literal, Optional, TypedDict
+
+import requests
+
+
+class RequestParamsDict(TypedDict, total=False):
+    url: Optional[str]
+    request: Optional[Literal["http", "chrome", "smart"]]
+    limit: Optional[int]
+    return_format: Optional[Literal["raw", "markdown", "html2text", "text", "bytes"]]
+    tld: Optional[bool]
+    depth: Optional[int]
+    cache: Optional[bool]
+    budget: Optional[dict[str, int]]
+    locale: Optional[str]
+    cookies: Optional[str]
+    stealth: Optional[bool]
+    headers: Optional[dict[str, str]]
+    anti_bot: Optional[bool]
+    metadata: Optional[bool]
+    viewport: Optional[dict[str, int]]
+    encoding: Optional[str]
+    subdomains: Optional[bool]
+    user_agent: Optional[str]
+    store_data: Optional[bool]
+    gpt_config: Optional[list[str]]
+    fingerprint: Optional[bool]
+    storageless: Optional[bool]
+    readability: Optional[bool]
+    proxy_enabled: Optional[bool]
+    respect_robots: Optional[bool]
+    query_selector: Optional[str]
+    full_resources: Optional[bool]
+    request_timeout: Optional[int]
+    run_in_background: Optional[bool]
+    skip_config_checks: Optional[bool]
+
+
+class Spider:
+    def __init__(self, api_key: Optional[str] = None):
+        """
+        Initialize the Spider with an API key.
+
+        :param api_key: A string of the API key for Spider. Defaults to the SPIDER_API_KEY environment variable.
+        :raises ValueError: If no API key is provided.
+        """
+        self.api_key = api_key or os.getenv("SPIDER_API_KEY")
+        if self.api_key is None:
+            raise ValueError("No API key provided")
+
+    def api_post(
+        self,
+        endpoint: str,
+        data: dict,
+        stream: bool,
+        content_type: str = "application/json",
+    ):
+        """
+        Send a POST request to the specified API endpoint.
+
+        :param endpoint: The API endpoint to which the POST request is sent.
+        :param data: The data (dictionary) to be sent in the POST request.
+        :param stream: Boolean indicating if the response should be streamed.
+        :return: The JSON response or the raw response stream if stream is True.
+        """
+        headers = self._prepare_headers(content_type)
+        response = self._post_request(
+            f"https://api.spider.cloud/v1/{endpoint}", data, headers, stream
+        )
+
+        if stream:
+            return response
+        elif response.status_code == 200:
+            return response.json()
+        else:
+            self._handle_error(response, f"post to {endpoint}")
+
+    def api_get(
+        self, endpoint: str, stream: bool, content_type: str = "application/json"
+    ):
+        """
+        Send a GET request to the specified endpoint.
+
+        :param endpoint: The API endpoint from which to retrieve data.
+        :return: The JSON decoded response.
+        """
+        headers = self._prepare_headers(content_type)
+        response = self._get_request(
+            f"https://api.spider.cloud/v1/{endpoint}", headers, stream
+        )
+        if response.status_code == 200:
+            return response.json()
+        else:
+            self._handle_error(response, f"get from {endpoint}")
+
+    def get_credits(self):
+        """
+        Retrieve the account's remaining credits.
+
+        :return: JSON response containing the number of credits left.
+        """
+        return self.api_get("credits", stream=False)
+
+    def scrape_url(
+        self,
+        url: str,
+        params: Optional[RequestParamsDict] = None,
+        stream: bool = False,
+        content_type: str = "application/json",
+    ):
+        """
+        Scrape data from the specified URL.
+
+        :param url: The URL from which to scrape data.
+        :param params: Optional dictionary of additional parameters for the scrape request.
+        :return: JSON response containing the scraping results.
+        """
+
+        # Guard against params being None before mutating it
+        params = params or {}
+
+        # Add { "return_format": "markdown" } to the params if not already present
+        if "return_format" not in params:
+            params["return_format"] = "markdown"
+
+        # A scrape is a single-page crawl, so force the limit to 1
+        params["limit"] = 1
+
+        return self.api_post(
+            "crawl", {"url": url, **(params or {})}, stream, content_type
+        )
+
+    def crawl_url(
+        self,
+        url: str,
+        params: Optional[RequestParamsDict] = None,
+        stream: bool = False,
+        content_type: str = "application/json",
+    ):
+        """
+        Start crawling at the specified URL.
+
+        :param url: The URL to begin crawling.
+        :param params: Optional dictionary with additional parameters to customize the crawl.
+        :param stream: Boolean indicating if the response should be streamed. Defaults to False.
+        :return: JSON response or the raw response stream if streaming enabled.
+        """
+
+        # Guard against params being None before mutating it
+        params = params or {}
+
+        # Add { "return_format": "markdown" } to the params if not already present
+        if "return_format" not in params:
+            params["return_format"] = "markdown"
+
+        return self.api_post(
+            "crawl", {"url": url, **(params or {})}, stream, content_type
+        )
+
+    def links(
+        self,
+        url: str,
+        params: Optional[RequestParamsDict] = None,
+        stream: bool = False,
+        content_type: str = "application/json",
+    ):
+        """
+        Retrieve links from the specified URL.
+
+        :param url: The URL from which to extract links.
+        :param params: Optional parameters for the link retrieval request.
+        :return: JSON response containing the links.
+        """
+        return self.api_post(
+            "links", {"url": url, **(params or {})}, stream, content_type
+        )
+
+    def extract_contacts(
+        self,
+        url: str,
+        params: Optional[RequestParamsDict] = None,
+        stream: bool = False,
+        content_type: str = "application/json",
+    ):
+        """
+        Extract contact information from the specified URL.
+
+        :param url: The URL from which to extract contact information.
+        :param params: Optional parameters for the contact extraction.
+        :return: JSON response containing extracted contact details.
+        """
+        return self.api_post(
+            "pipeline/extract-contacts",
+            {"url": url, **(params or {})},
+            stream,
+            content_type,
+        )
+
+    def label(
+        self,
+        url: str,
+        params: Optional[RequestParamsDict] = None,
+        stream: bool = False,
+        content_type: str = "application/json",
+    ):
+        """
+        Apply labeling to data extracted from the specified URL.
+
+        :param url: The URL to label data from.
+        :param params: Optional parameters to guide the labeling process.
+        :return: JSON response with labeled data.
+        """
+        return self.api_post(
+            "pipeline/label", {"url": url, **(params or {})}, stream, content_type
+        )
+
+    def _prepare_headers(self, content_type: str = "application/json"):
+        return {
+            "Content-Type": content_type,
+            "Authorization": f"Bearer {self.api_key}",
+            "User-Agent": "Spider-Client/0.0.27",
+        }
+
+    def _post_request(self, url: str, data, headers, stream=False):
+        return requests.post(url, headers=headers, json=data, stream=stream)
+
+    def _get_request(self, url: str, headers, stream=False):
+        return requests.get(url, headers=headers, stream=stream)
+
+    def _delete_request(self, url: str, headers, stream=False):
+        return requests.delete(url, headers=headers, stream=stream)
+
+    def _handle_error(self, response, action):
+        if response.status_code in [402, 409, 500]:
+            error_message = response.json().get("error", "Unknown error occurred")
+            raise Exception(
+                f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}"
+            )
+        else:
+            raise Exception(
+                f"Unexpected error occurred while trying to {action}. Status code: {response.status_code}"
+            )
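
For orientation, here is a minimal usage sketch of the client defined above, not part of the diff. The API key and URLs are placeholders, and the parameter values simply exercise fields from RequestParamsDict.

from core.tools.provider.builtin.spider.spiderApp import Spider

spider = Spider(api_key="sk-placeholder")      # or rely on the SPIDER_API_KEY env var

credits = spider.get_credits()                 # GET https://api.spider.cloud/v1/credits

# Single page: scrape_url forces limit=1 and defaults return_format to markdown
page = spider.scrape_url("https://example.com", params={"readability": True})

# Whole site: crawl_url follows subpages, bounded by the caller-supplied limit
site = spider.crawl_url("https://example.com", params={"limit": 5, "depth": 2})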

+ 47 - 0
api/core/tools/provider/builtin/spider/tools/scraper_crawler.py

@@ -0,0 +1,47 @@
+from typing import Any, Union
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.provider.builtin.spider.spiderApp import Spider
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
+class ScrapeTool(BuiltinTool):
+    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+        # initialize the app object with the api key
+        app = Spider(api_key=self.runtime.credentials['spider_api_key'])
+
+        url = tool_parameters['url']
+        mode = tool_parameters['mode']
+        
+        options = {
+            'limit': tool_parameters.get('limit', 0),
+            'depth': tool_parameters.get('depth', 0),
+            'blacklist': [p.strip() for p in tool_parameters.get('blacklist', '').split(',')] if tool_parameters.get('blacklist') else [],
+            'whitelist': [p.strip() for p in tool_parameters.get('whitelist', '').split(',')] if tool_parameters.get('whitelist') else [],
+            'readability': tool_parameters.get('readability', False),
+        }
+
+        result = ""
+
+        try:
+            if mode == 'scrape':
+                scrape_result = app.scrape_url(
+                    url=url, 
+                    params=options,
+                )
+
+                for i in scrape_result:
+                    result += "URL: " + i.get('url', '') + "\n"
+                    result += "CONTENT: " + i.get('content', '') + "\n\n"
+            elif mode == 'crawl':
+                crawl_result = app.crawl_url(
+                    url=tool_parameters['url'], 
+                    params=options,
+                )
+                for i in crawl_result:
+                    result += "URL: " + i.get('url', '') + "\n"
+                    result += "CONTENT: " + i.get('content', '') + "\n\n"
+        except Exception as e:
+            return self.create_text_message("An error occurred: " + str(e))
+
+        return self.create_text_message(result)
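
A hedged sketch of how this tool is driven: the tool_parameters dict passed to _invoke mirrors the fields declared in scraper_crawler.yaml below, and the result comes back as a single text message of URL/CONTENT pairs. The values here are illustrative only.

# Illustrative parameters matching the yaml schema that follows; all values are placeholders.
tool_parameters = {
    "url": "https://example.com",
    "mode": "crawl",                 # "scrape" for a single page, "crawl" to follow subpages
    "limit": 5,
    "depth": 2,
    "blacklist": "/blog/*, /about",  # comma-separated patterns, split and stripped by the tool
    "whitelist": "",
    "readability": True,
}

# message = ScrapeTool(...)._invoke(user_id="user-123", tool_parameters=tool_parameters)
# The returned text message would contain blocks of "URL: ...\nCONTENT: ...\n\n" per page.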

+ 100 - 0
api/core/tools/provider/builtin/spider/tools/scraper_crawler.yaml

@@ -0,0 +1,100 @@
+identity:
+  name: scraper_crawler
+  author: William Espegren
+  label:
+    en_US: Web Scraper & Crawler
+    zh_Hans: 网页抓取与爬虫
+description:
+  human:
+    en_US: A tool for scraping & crawling webpages. Input should be a URL.
+    zh_Hans: 用于抓取和爬取网页的工具。输入应该是一个网址。
+  llm: A tool for scraping & crawling webpages. Input should be a URL.
+parameters:
+  - name: url
+    type: string
+    required: true
+    label:
+      en_US: URL
+      zh_Hans: 网址
+    human_description:
+      en_US: URL to be scraped or crawled
+      zh_Hans: 要抓取或爬取的网址
+    llm_description: URL to be either scraped or crawled
+    form: llm
+  - name: mode
+    type: select
+    required: true
+    options:
+      - value: scrape
+        label:
+          en_US: scrape
+          zh_Hans: 抓取
+      - value: crawl
+        label:
+          en_US: crawl
+          zh_Hans: 爬取
+    default: crawl
+    label:
+      en_US: Mode
+      zh_Hans: 模式
+    human_description:
+      en_US: Select whether to scrape a single page or crawl the entire website, following subpages
+      zh_Hans: 用于选择抓取网站或爬取整个网站及其子页面
+    form: form
+  - name: limit
+    type: number
+    required: false
+    label:
+      en_US: Maximum number of pages to crawl
+      zh_Hans: 最大爬取页面数
+    human_description:
+      en_US: Specify the maximum number of pages to crawl per website. The crawler will stop after reaching this limit.
+      zh_Hans: 指定每个网站要爬取的最大页面数。爬虫将在达到此限制后停止。
+    form: form
+    min: 0
+    default: 0
+  - name: depth
+    type: number
+    required: false
+    label:
+      en_US: Maximum depth of pages to crawl
+      zh_Hans: 最大爬取深度
+    human_description:
+      en_US: The maximum link depth the crawler will follow from the starting URL.
+      zh_Hans: 最大爬取深度的限制。
+    form: form
+    min: 0
+    default: 0
+  - name: blacklist
+    type: string
+    required: false
+    label:
+      en_US: URL patterns to exclude
+      zh_Hans: 要排除的URL模式
+    human_description:
+      en_US: Blacklist a set of paths that you do not want to crawl. You can use regex patterns to help with the list.
+      zh_Hans: 指定一组不想爬取的路径。您可以使用正则表达式模式来帮助定义列表。
+    placeholder: /blog/*, /about
+    form: form
+  - name: whitelist
+    type: string
+    required: false
+    label:
+      en_US: URL patterns to include
+      zh_Hans: 要包含的URL模式
+    human_description:
+      en_US: Whitelist a set of paths that you want to crawl, ignoring all other routes that do not match the patterns. You can use regex patterns to help with the list.
+      zh_Hans: 指定一组要爬取的路径,忽略所有不匹配模式的其他路由。您可以使用正则表达式模式来帮助定义列表。
+    placeholder: /blog/*, /about
+    form: form
+  - name: readability
+    type: boolean
+    required: false
+    label:
+      en_US: Pre-process the content for LLM usage
+      zh_Hans: 仅返回页面的主要内容
+    human_description:
+      en_US: Use Mozilla's readability to pre-process the content for reading. This may drastically improve the content for LLM usage.
+      zh_Hans: 如果启用,爬虫将仅返回页面的主要内容,不包括标题、导航、页脚等。
+    form: form
+    default: false