| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222 | 
							- import os
 
- from typing import Literal, Optional, TypedDict
 
- import requests
 
class RequestParamsDict(TypedDict, total=False):
    """Optional request parameters accepted by the ``Spider`` client methods.

    The dictionary is declared with ``total=False``, so every key may be
    omitted, and every value is additionally allowed to be ``None``. The
    ``Spider`` methods merge this mapping into the JSON payload next to the
    target URL, so the keys are sent under exactly the names declared here.
    """

    # Target URL. Because the params are merged after the method's own ``url``
    # argument, a value given here replaces that argument in the payload.
    url: Optional[str]
    # One of "http", "chrome" or "smart".
    request: Optional[Literal["http", "chrome", "smart"]]
    # ``Spider.scrape_url`` always overwrites this with 1.
    limit: Optional[int]
    # ``Spider.scrape_url`` / ``Spider.crawl_url`` send "markdown" when this
    # key is absent.
    return_format: Optional[Literal["raw", "markdown", "html2text", "text", "bytes"]]
    tld: Optional[bool]
    depth: Optional[int]
    cache: Optional[bool]
    # NOTE(review): presumably maps a path pattern to a page budget -- confirm
    # against the API documentation.
    budget: Optional[dict[str, int]]
    locale: Optional[str]
    cookies: Optional[str]
    stealth: Optional[bool]
    headers: Optional[dict[str, str]]
    anti_bot: Optional[bool]
    metadata: Optional[bool]
    # NOTE(review): presumably {"width": ..., "height": ...} -- confirm.
    viewport: Optional[dict[str, int]]
    encoding: Optional[str]
    subdomains: Optional[bool]
    user_agent: Optional[str]
    store_data: Optional[bool]
    gpt_config: Optional[list[str]]
    fingerprint: Optional[bool]
    storageless: Optional[bool]
    readability: Optional[bool]
    proxy_enabled: Optional[bool]
    respect_robots: Optional[bool]
    query_selector: Optional[str]
    full_resources: Optional[bool]
    # NOTE(review): unit (seconds vs. milliseconds) is not visible here.
    request_timeout: Optional[int]
    run_in_background: Optional[bool]
    skip_config_checks: Optional[bool]
 
class Spider:
    """Thin client for the Spider cloud REST API.

    Every request is sent with the ``requests`` library to
    ``<BASE_URL>/<endpoint>`` and carries the API key as a bearer token.
    """

    # Root of every endpoint URL built by ``api_post`` and ``api_get``.
    BASE_URL = "https://api.spider.cloud/v1"

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the Spider with an API key.

        :param api_key: A string of the API key for Spider. Defaults to the SPIDER_API_KEY environment variable.
        :raises ValueError: If no API key is provided (missing or empty).
        """
        self.api_key = api_key or os.getenv("SPIDER_API_KEY")
        # An empty string (argument or environment variable) is as unusable as
        # a missing key, so reject every falsy value rather than only None.
        if not self.api_key:
            raise ValueError("No API key provided")

    def api_post(
        self,
        endpoint: str,
        data: dict,
        stream: bool,
        content_type: str = "application/json",
    ):
        """
        Send a POST request to the specified API endpoint.

        :param endpoint: The API endpoint to which the POST request is sent.
        :param data: The data (dictionary) to be sent as the JSON body.
        :param stream: Boolean indicating if the response should be streamed.
        :param content_type: Value of the Content-Type request header.
        :return: The decoded JSON response, or the raw response object if stream is True
            (in that case the status code is not inspected).
        :raises Exception: If the response is not streamed and its status code is not 200.
        """
        headers = self._prepare_headers(content_type)
        response = self._post_request(f"{self.BASE_URL}/{endpoint}", data, headers, stream)

        if stream:
            return response
        if response.status_code == 200:
            return response.json()
        self._handle_error(response, f"post to {endpoint}")

    def api_get(self, endpoint: str, stream: bool, content_type: str = "application/json"):
        """
        Send a GET request to the specified endpoint.

        :param endpoint: The API endpoint from which to retrieve data.
        :param stream: Forwarded to the underlying request; the body is JSON-decoded either way.
        :param content_type: Value of the Content-Type request header.
        :return: The JSON decoded response.
        :raises Exception: If the status code is not 200.
        """
        headers = self._prepare_headers(content_type)
        response = self._get_request(f"{self.BASE_URL}/{endpoint}", headers, stream)

        if response.status_code == 200:
            return response.json()
        self._handle_error(response, f"get from {endpoint}")

    def get_credits(self):
        """
        Retrieve the account's remaining credits.

        :return: JSON response of the "credits" endpoint.
        """
        return self.api_get("credits", stream=False)

    def scrape_url(
        self,
        url: str,
        params: Optional["RequestParamsDict"] = None,
        stream: bool = False,
        content_type: str = "application/json",
    ):
        """
        Scrape data from the specified URL.

        Posts to the "crawl" endpoint with "limit" forced to 1.

        :param url: The URL from which to scrape data.
        :param params: Optional dictionary of additional parameters for the scrape request.
            It is copied, never modified. "return_format" defaults to "markdown" when absent.
        :param stream: Boolean indicating if the response should be streamed. Defaults to False.
        :param content_type: Value of the Content-Type request header.
        :return: JSON response, or the raw response object if stream is True.
        """
        # Build a fresh payload so the caller's dict is left untouched.
        # Later keys win: params may override the default return_format (and
        # the url), while limit is always pinned to a single page.
        payload = {"url": url, "return_format": "markdown", **(params or {}), "limit": 1}
        return self.api_post("crawl", payload, stream, content_type)

    def crawl_url(
        self,
        url: str,
        params: Optional["RequestParamsDict"] = None,
        stream: bool = False,
        content_type: str = "application/json",
    ):
        """
        Start crawling at the specified URL.

        :param url: The URL to begin crawling.
        :param params: Optional dictionary with additional parameters to customize the crawl.
            It is copied, never modified. "return_format" defaults to "markdown" when absent.
        :param stream: Boolean indicating if the response should be streamed. Defaults to False.
        :param content_type: Value of the Content-Type request header.
        :return: JSON response or the raw response stream if streaming enabled.
        """
        # Fresh dict: the default is applied without mutating the caller's params.
        payload = {"url": url, "return_format": "markdown", **(params or {})}
        return self.api_post("crawl", payload, stream, content_type)

    def links(
        self,
        url: str,
        params: Optional["RequestParamsDict"] = None,
        stream: bool = False,
        content_type: str = "application/json",
    ):
        """
        Retrieve links from the specified URL.

        :param url: The URL from which to extract links.
        :param params: Optional parameters for the link retrieval request.
        :param stream: Boolean indicating if the response should be streamed. Defaults to False.
        :param content_type: Value of the Content-Type request header.
        :return: JSON response, or the raw response object if stream is True.
        """
        return self.api_post("links", {"url": url, **(params or {})}, stream, content_type)

    def extract_contacts(
        self,
        url: str,
        params: Optional["RequestParamsDict"] = None,
        stream: bool = False,
        content_type: str = "application/json",
    ):
        """
        Extract contact information from the specified URL.

        :param url: The URL from which to extract contact information.
        :param params: Optional parameters for the contact extraction.
        :param stream: Boolean indicating if the response should be streamed. Defaults to False.
        :param content_type: Value of the Content-Type request header.
        :return: JSON response, or the raw response object if stream is True.
        """
        return self.api_post(
            "pipeline/extract-contacts",
            {"url": url, **(params or {})},
            stream,
            content_type,
        )

    def label(
        self,
        url: str,
        params: Optional["RequestParamsDict"] = None,
        stream: bool = False,
        content_type: str = "application/json",
    ):
        """
        Apply labeling to data extracted from the specified URL.

        :param url: The URL to label data from.
        :param params: Optional parameters to guide the labeling process.
        :param stream: Boolean indicating if the response should be streamed. Defaults to False.
        :param content_type: Value of the Content-Type request header.
        :return: JSON response, or the raw response object if stream is True.
        """
        return self.api_post("pipeline/label", {"url": url, **(params or {})}, stream, content_type)

    def _prepare_headers(self, content_type: str = "application/json"):
        """Return the request headers: content type, bearer token and client user agent."""
        return {
            "Content-Type": content_type,
            "Authorization": f"Bearer {self.api_key}",
            "User-Agent": "Spider-Client/0.0.27",
        }

    def _post_request(self, url: str, data, headers, stream=False):
        """POST ``data`` as a JSON body to ``url`` and return the ``requests`` response."""
        return requests.post(url, headers=headers, json=data, stream=stream)

    def _get_request(self, url: str, headers, stream=False):
        """GET ``url`` and return the ``requests`` response."""
        return requests.get(url, headers=headers, stream=stream)

    def _delete_request(self, url: str, headers, stream=False):
        """Send DELETE to ``url`` and return the ``requests`` response."""
        return requests.delete(url, headers=headers, stream=stream)

    def _handle_error(self, response, action):
        """
        Raise an exception describing a failed request.

        :param response: The response object of the failed request.
        :param action: Short description of the attempted action, used in the message.
        :raises Exception: Always. For status 402, 409 and 500 the message includes the
            "error" field of the JSON body when one is available.
        """
        status = response.status_code
        if status not in {402, 409, 500}:
            raise Exception(f"Unexpected error occurred while trying to {action}. Status code: {status}")

        error_message = "Unknown error occurred"
        try:
            body = response.json()
        except ValueError:
            # The body is not valid JSON (JSON decode errors subclass
            # ValueError); keep the generic message instead of masking the
            # HTTP failure with a decoding error.
            body = None
        if isinstance(body, dict):
            error_message = body.get("error", error_message)
        raise Exception(f"Failed to {action}. Status code: {status}. Error: {error_message}")
 
 
  |