feat: Spider web scraper & crawler tool (#5725)

William Espegren 1 year ago
parent
commit
588615b20e

File diff suppressed because it is too large
+ 1 - 0
api/core/tools/provider/builtin/spider/_assets/icon.svg


+ 14 - 0
api/core/tools/provider/builtin/spider/spider.py

@@ -0,0 +1,14 @@
+from typing import Any
+
+from core.tools.errors import ToolProviderCredentialValidationError
+from core.tools.provider.builtin.spider.spiderApp import Spider
+from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
+
+
+class SpiderProvider(BuiltinToolProviderController):
+    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
+        try:
+            app = Spider(api_key=credentials["spider_api_key"])
+            app.scrape_url(url="https://spider.cloud")
+        except Exception as e:
+            raise ToolProviderCredentialValidationError(str(e))

+ 27 - 0
api/core/tools/provider/builtin/spider/spider.yaml

@@ -0,0 +1,27 @@
+identity:
+  author: William Espegren
+  name: spider
+  label:
+    en_US: Spider
+    zh_CN: Spider
+  description:
+    en_US: Spider API integration, returning LLM-ready data by scraping & crawling websites.
+    zh_CN: Spider API 集成,通过爬取和抓取网站返回 LLM-ready 数据。
+  icon: icon.svg
+  tags:
+    - search
+    - utilities
+credentials_for_provider:
+  spider_api_key:
+    type: secret-input
+    required: true
+    label:
+      en_US: Spider API Key
+      zh_CN: Spider API 密钥
+    placeholder:
+      en_US: Please input your Spider API key
+      zh_CN: 请输入您的 Spider API 密钥
+    help:
+      en_US: Get your Spider API key from your Spider dashboard
+      zh_CN: 从您的 Spider 仪表板中获取 Spider API 密钥。
+    url: https://spider.cloud/
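
For context, the spider_api_key declared above is the key Dify hands to SpiderProvider._validate_credentials and later exposes as self.runtime.credentials in the tool. A minimal sketch of that hand-off, assuming a placeholder key, mirrors what the provider does: a one-page test scrape of spider.cloud, where any exception marks the key as invalid.

from core.tools.provider.builtin.spider.spiderApp import Spider

# Hedged sketch: the credentials dict mirrors the keys declared under
# credentials_for_provider in spider.yaml; "sk-placeholder" is not a real key.
credentials = {"spider_api_key": "sk-placeholder"}
Spider(api_key=credentials["spider_api_key"]).scrape_url(url="https://spider.cloud")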

+ 237 - 0
api/core/tools/provider/builtin/spider/spiderApp.py

@@ -0,0 +1,237 @@
+import os
+from typing import Literal, Optional, TypedDict
+
+import requests
+
+
+class RequestParamsDict(TypedDict, total=False):
+    url: Optional[str]
+    request: Optional[Literal["http", "chrome", "smart"]]
+    limit: Optional[int]
+    return_format: Optional[Literal["raw", "markdown", "html2text", "text", "bytes"]]
+    tld: Optional[bool]
+    depth: Optional[int]
+    cache: Optional[bool]
+    budget: Optional[dict[str, int]]
+    locale: Optional[str]
+    cookies: Optional[str]
+    stealth: Optional[bool]
+    headers: Optional[dict[str, str]]
+    anti_bot: Optional[bool]
+    metadata: Optional[bool]
+    viewport: Optional[dict[str, int]]
+    encoding: Optional[str]
+    subdomains: Optional[bool]
+    user_agent: Optional[str]
+    store_data: Optional[bool]
+    gpt_config: Optional[list[str]]
+    fingerprint: Optional[bool]
+    storageless: Optional[bool]
+    readability: Optional[bool]
+    proxy_enabled: Optional[bool]
+    respect_robots: Optional[bool]
+    query_selector: Optional[str]
+    full_resources: Optional[bool]
+    request_timeout: Optional[int]
+    run_in_background: Optional[bool]
+    skip_config_checks: Optional[bool]
+
+
+class Spider:
+    def __init__(self, api_key: Optional[str] = None):
+        """
+        Initialize the Spider with an API key.
+
+        :param api_key: A string of the API key for Spider. Defaults to the SPIDER_API_KEY environment variable.
+        :raises ValueError: If no API key is provided.
+        """
+        self.api_key = api_key or os.getenv("SPIDER_API_KEY")
+        if self.api_key is None:
+            raise ValueError("No API key provided")
+
+    def api_post(
+        self,
+        endpoint: str,
+        data: dict,
+        stream: bool,
+        content_type: str = "application/json",
+    ):
+        """
+        Send a POST request to the specified API endpoint.
+
+        :param endpoint: The API endpoint to which the POST request is sent.
+        :param data: The data (dictionary) to be sent in the POST request.
+        :param stream: Boolean indicating if the response should be streamed.
+        :return: The JSON response or the raw response stream if stream is True.
+        """
+        headers = self._prepare_headers(content_type)
+        response = self._post_request(
+            f"https://api.spider.cloud/v1/{endpoint}", data, headers, stream
+        )
+
+        if stream:
+            return response
+        elif response.status_code == 200:
+            return response.json()
+        else:
+            self._handle_error(response, f"post to {endpoint}")
+
+    def api_get(
+        self, endpoint: str, stream: bool, content_type: str = "application/json"
+    ):
+        """
+        Send a GET request to the specified endpoint.
+
+        :param endpoint: The API endpoint from which to retrieve data.
+        :return: The JSON decoded response.
+        """
+        headers = self._prepare_headers(content_type)
+        response = self._get_request(
+            f"https://api.spider.cloud/v1/{endpoint}", headers, stream
+        )
+        if response.status_code == 200:
+            return response.json()
+        else:
+            self._handle_error(response, f"get from {endpoint}")
+
+    def get_credits(self):
+        """
+        Retrieve the account's remaining credits.
+
+        :return: JSON response containing the number of credits left.
+        """
+        return self.api_get("credits", stream=False)
+
+    def scrape_url(
+        self,
+        url: str,
+        params: Optional[RequestParamsDict] = None,
+        stream: bool = False,
+        content_type: str = "application/json",
+    ):
+        """
+        Scrape data from the specified URL.
+
+        :param url: The URL from which to scrape data.
+        :param params: Optional dictionary of additional parameters for the scrape request.
+        :return: JSON response containing the scraping results.
+        """
+
+        # Guard against params being None before mutating it
+        params = params or {}
+
+        # Add { "return_format": "markdown" } to the params if not already present
+        if "return_format" not in params:
+            params["return_format"] = "markdown"
+
+        # A scrape is a single-page crawl, so force the limit to 1
+        params["limit"] = 1
+
+        return self.api_post(
+            "crawl", {"url": url, **(params or {})}, stream, content_type
+        )
+
+    def crawl_url(
+        self,
+        url: str,
+        params: Optional[RequestParamsDict] = None,
+        stream: bool = False,
+        content_type: str = "application/json",
+    ):
+        """
+        Start crawling at the specified URL.
+
+        :param url: The URL to begin crawling.
+        :param params: Optional dictionary with additional parameters to customize the crawl.
+        :param stream: Boolean indicating if the response should be streamed. Defaults to False.
+        :return: JSON response or the raw response stream if streaming enabled.
+        """
+
+        # Guard against params being None before mutating it
+        params = params or {}
+
+        # Add { "return_format": "markdown" } to the params if not already present
+        if "return_format" not in params:
+            params["return_format"] = "markdown"
+
+        return self.api_post(
+            "crawl", {"url": url, **(params or {})}, stream, content_type
+        )
+
+    def links(
+        self,
+        url: str,
+        params: Optional[RequestParamsDict] = None,
+        stream: bool = False,
+        content_type: str = "application/json",
+    ):
+        """
+        Retrieve links from the specified URL.
+
+        :param url: The URL from which to extract links.
+        :param params: Optional parameters for the link retrieval request.
+        :return: JSON response containing the links.
+        """
+        return self.api_post(
+            "links", {"url": url, **(params or {})}, stream, content_type
+        )
+
+    def extract_contacts(
+        self,
+        url: str,
+        params: Optional[RequestParamsDict] = None,
+        stream: bool = False,
+        content_type: str = "application/json",
+    ):
+        """
+        Extract contact information from the specified URL.
+
+        :param url: The URL from which to extract contact information.
+        :param params: Optional parameters for the contact extraction.
+        :return: JSON response containing extracted contact details.
+        """
+        return self.api_post(
+            "pipeline/extract-contacts",
+            {"url": url, **(params or {})},
+            stream,
+            content_type,
+        )
+
+    def label(
+        self,
+        url: str,
+        params: Optional[RequestParamsDict] = None,
+        stream: bool = False,
+        content_type: str = "application/json",
+    ):
+        """
+        Apply labeling to data extracted from the specified URL.
+
+        :param url: The URL to label data from.
+        :param params: Optional parameters to guide the labeling process.
+        :return: JSON response with labeled data.
+        """
+        return self.api_post(
+            "pipeline/label", {"url": url, **(params or {})}, stream, content_type
+        )
+
+    def _prepare_headers(self, content_type: str = "application/json"):
+        return {
+            "Content-Type": content_type,
+            "Authorization": f"Bearer {self.api_key}",
+            "User-Agent": "Spider-Client/0.0.27",
+        }
+
+    def _post_request(self, url: str, data, headers, stream=False):
+        return requests.post(url, headers=headers, json=data, stream=stream)
+
+    def _get_request(self, url: str, headers, stream=False):
+        return requests.get(url, headers=headers, stream=stream)
+
+    def _delete_request(self, url: str, headers, stream=False):
+        return requests.delete(url, headers=headers, stream=stream)
+
+    def _handle_error(self, response, action):
+        if response.status_code in [402, 409, 500]:
+            error_message = response.json().get("error", "Unknown error occurred")
+            raise Exception(
+                f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}"
+            )
+        else:
+            raise Exception(
+                f"Unexpected error occurred while trying to {action}. Status code: {response.status_code}"
+            )
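
For orientation, here is a minimal usage sketch of the client defined above, not part of the diff. The API key and URLs are placeholders, and the parameter values simply exercise fields from RequestParamsDict.

from core.tools.provider.builtin.spider.spiderApp import Spider

spider = Spider(api_key="sk-placeholder")      # or rely on the SPIDER_API_KEY env var

credits = spider.get_credits()                 # GET https://api.spider.cloud/v1/credits

# Single page: scrape_url forces limit=1 and defaults return_format to markdown
page = spider.scrape_url("https://example.com", params={"readability": True})

# Whole site: crawl_url follows subpages, bounded by the caller-supplied limit
site = spider.crawl_url("https://example.com", params={"limit": 5, "depth": 2})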

+ 47 - 0
api/core/tools/provider/builtin/spider/tools/scraper_crawler.py

@@ -0,0 +1,47 @@
+from typing import Any, Union
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.provider.builtin.spider.spiderApp import Spider
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
+class ScrapeTool(BuiltinTool):
+    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+        # initialize the app object with the api key
+        app = Spider(api_key=self.runtime.credentials['spider_api_key'])
+
+        url = tool_parameters['url']
+        mode = tool_parameters['mode']
+        
+        options = {
+            'limit': tool_parameters.get('limit', 0),
+            'depth': tool_parameters.get('depth', 0),
+            'blacklist': [p.strip() for p in tool_parameters.get('blacklist', '').split(',')] if tool_parameters.get('blacklist') else [],
+            'whitelist': [p.strip() for p in tool_parameters.get('whitelist', '').split(',')] if tool_parameters.get('whitelist') else [],
+            'readability': tool_parameters.get('readability', False),
+        }
+
+        result = ""
+
+        try:
+            if mode == 'scrape':
+                scrape_result = app.scrape_url(
+                    url=url, 
+                    params=options,
+                )
+
+                for i in scrape_result:
+                    result += "URL: " + i.get('url', '') + "\n"
+                    result += "CONTENT: " + i.get('content', '') + "\n\n"
+            elif mode == 'crawl':
+                crawl_result = app.crawl_url(
+                    url=tool_parameters['url'], 
+                    params=options,
+                )
+                for i in crawl_result:
+                    result += "URL: " + i.get('url', '') + "\n"
+                    result += "CONTENT: " + i.get('content', '') + "\n\n"
+        except Exception as e:
+            return self.create_text_message("An error occurred: " + str(e))
+
+        return self.create_text_message(result)
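
A hedged sketch of how this tool is driven: the tool_parameters dict passed to _invoke mirrors the fields declared in scraper_crawler.yaml below, and the result comes back as a single text message of URL/CONTENT pairs. The values here are illustrative only.

# Illustrative parameters matching the yaml schema that follows; all values are placeholders.
tool_parameters = {
    "url": "https://example.com",
    "mode": "crawl",                 # "scrape" for a single page, "crawl" to follow subpages
    "limit": 5,
    "depth": 2,
    "blacklist": "/blog/*, /about",  # comma-separated patterns, split and stripped by the tool
    "whitelist": "",
    "readability": True,
}

# message = ScrapeTool(...)._invoke(user_id="user-123", tool_parameters=tool_parameters)
# The returned text message would contain blocks of "URL: ...\nCONTENT: ...\n\n" per page.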

+ 100 - 0
api/core/tools/provider/builtin/spider/tools/scraper_crawler.yaml

@@ -0,0 +1,100 @@
+identity:
+  name: scraper_crawler
+  author: William Espegren
+  label:
+    en_US: Web Scraper & Crawler
+    zh_Hans: 网页抓取与爬虫
+description:
+  human:
+    en_US: A tool for scraping & crawling webpages. Input should be a URL.
+    zh_Hans: 用于抓取和爬取网页的工具。输入应该是一个网址。
+  llm: A tool for scraping & crawling webpages. Input should be a URL.
+parameters:
+  - name: url
+    type: string
+    required: true
+    label:
+      en_US: URL
+      zh_Hans: 网址
+    human_description:
+      en_US: URL to be scraped or crawled
+      zh_Hans: 要抓取或爬取的网址
+    llm_description: URL to be either scraped or crawled
+    form: llm
+  - name: mode
+    type: select
+    required: true
+    options:
+      - value: scrape
+        label:
+          en_US: scrape
+          zh_Hans: 抓取
+      - value: crawl
+        label:
+          en_US: crawl
+          zh_Hans: 爬取
+    default: crawl
+    label:
+      en_US: Mode
+      zh_Hans: 模式
+    human_description:
+      en_US: Select whether to scrape a single page or crawl the entire website, following subpages
+      zh_Hans: 用于选择抓取网站或爬取整个网站及其子页面
+    form: form
+  - name: limit
+    type: number
+    required: false
+    label:
+      en_US: Maximum number of pages to crawl
+      zh_Hans: 最大爬取页面数
+    human_description:
+      en_US: Specify the maximum number of pages to crawl per website. The crawler will stop after reaching this limit.
+      zh_Hans: 指定每个网站要爬取的最大页面数。爬虫将在达到此限制后停止。
+    form: form
+    min: 0
+    default: 0
+  - name: depth
+    type: number
+    required: false
+    label:
+      en_US: Maximum depth of pages to crawl
+      zh_Hans: 最大爬取深度
+    human_description:
+      en_US: The maximum link depth the crawler will follow from the starting URL.
+      zh_Hans: 最大爬取深度的限制。
+    form: form
+    min: 0
+    default: 0
+  - name: blacklist
+    type: string
+    required: false
+    label:
+      en_US: URL patterns to exclude
+      zh_Hans: 要排除的URL模式
+    human_description:
+      en_US: Blacklist a set of paths that you do not want to crawl. You can use regex patterns to help with the list.
+      zh_Hans: 指定一组不想爬取的路径。您可以使用正则表达式模式来帮助定义列表。
+    placeholder: /blog/*, /about
+    form: form
+  - name: whitelist
+    type: string
+    required: false
+    label:
+      en_US: URL patterns to include
+      zh_Hans: 要包含的URL模式
+    human_description:
+      en_US: Whitelist a set of paths that you want to crawl, ignoring all other routes that do not match the patterns. You can use regex patterns to help with the list.
+      zh_Hans: 指定一组要爬取的路径,忽略所有不匹配模式的其他路由。您可以使用正则表达式模式来帮助定义列表。
+    placeholder: /blog/*, /about
+    form: form
+  - name: readability
+    type: boolean
+    required: false
+    label:
+      en_US: Pre-process the content for LLM usage
+      zh_Hans: 仅返回页面的主要内容
+    human_description:
+      en_US: Use Mozilla's readability to pre-process the content for reading. This may drastically improve the content for LLM usage.
+      zh_Hans: 如果启用,爬虫将仅返回页面的主要内容,不包括标题、导航、页脚等。
+    form: form
+    default: false