website_service.py

import datetime
import json
from typing import Any

import requests
from flask_login import current_user  # type: ignore

from core.helper import encrypter
from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from services.auth.api_key_auth_service import ApiKeyAuthService


class WebsiteService:
    """Crawl and scrape external websites through configured providers (Firecrawl or Jina Reader)."""

    @classmethod
    def document_create_args_validate(cls, args: dict):
        if "url" not in args or not args["url"]:
            raise ValueError("url is required")
        if "options" not in args or not args["options"]:
            raise ValueError("options is required")
        if "limit" not in args["options"] or not args["options"]["limit"]:
            raise ValueError("limit is required")
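
    # A minimal sketch of the args shape the validator above accepts; the URL and
    # option values are placeholders for illustration, not defaults used by the service.
    #
    #     {
    #         "provider": "firecrawl",
    #         "url": "https://example.com",
    #         "options": {"limit": 1, "crawl_sub_pages": False, "only_main_content": True},
    #     }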

    @classmethod
    def crawl_url(cls, args: dict) -> dict:
        provider = args.get("provider", "")
        url = args.get("url")
        options = args.get("options") or {}
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            only_main_content = options.get("only_main_content", False)
            if not crawl_sub_pages:
                # single-page crawl: no include/exclude filters, hard limit of 1
                params = {
                    "includePaths": [],
                    "excludePaths": [],
                    "limit": 1,
                    "scrapeOptions": {"onlyMainContent": only_main_content},
                }
            else:
                includes = options.get("includes").split(",") if options.get("includes") else []
                excludes = options.get("excludes").split(",") if options.get("excludes") else []
                params = {
                    "includePaths": includes,
                    "excludePaths": excludes,
                    "limit": options.get("limit", 1),
                    "scrapeOptions": {"onlyMainContent": only_main_content},
                }
                if options.get("max_depth"):
                    params["maxDepth"] = options.get("max_depth")
            job_id = firecrawl_app.crawl_url(url, params)
            # remember when the job started so get_crawl_status can report time_consuming
            website_crawl_time_cache_key = f"website_crawl_{job_id}"
            time = str(datetime.datetime.now().timestamp())
            redis_client.setex(website_crawl_time_cache_key, 3600, time)
            return {"status": "active", "job_id": job_id}
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            if not crawl_sub_pages:
                # single page: fetch the content synchronously through the reader endpoint
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "data": response.json().get("data")}
            else:
                # sub-pages: start an asynchronous crawl task and return its id
                response = requests.post(
                    "https://adaptivecrawl-kir3wx7b3a-uc.a.run.app",
                    json={
                        "url": url,
                        "maxPages": options.get("limit", 1),
                        "useSitemap": options.get("use_sitemap", True),
                    },
                    headers={
                        "Content-Type": "application/json",
                        "Authorization": f"Bearer {api_key}",
                    },
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")}
        else:
            raise ValueError("Invalid provider")
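
    # For reference, crawl_url returns one of the following shapes (taken from the
    # branches above); the literal values are illustrative:
    #   firecrawl:               {"status": "active", "job_id": "<firecrawl job id>"}
    #   jinareader, single page: {"status": "active", "data": {...scraped page...}}
    #   jinareader, sub-pages:   {"status": "active", "job_id": "<taskId>"}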

    @classmethod
    def get_crawl_status(cls, job_id: str, provider: str) -> dict:
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            result = firecrawl_app.check_crawl_status(job_id)
            crawl_status_data = {
                "status": result.get("status", "active"),
                "job_id": job_id,
                "total": result.get("total", 0),
                "current": result.get("current", 0),
                "data": result.get("data", []),
            }
            if crawl_status_data["status"] == "completed":
                website_crawl_time_cache_key = f"website_crawl_{job_id}"
                start_time = redis_client.get(website_crawl_time_cache_key)
                if start_time:
                    end_time = datetime.datetime.now().timestamp()
                    time_consuming = abs(end_time - float(start_time))
                    crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
                    redis_client.delete(website_crawl_time_cache_key)
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            response = requests.post(
                "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                json={"taskId": job_id},
            )
            data = response.json().get("data", {})
            crawl_status_data = {
                "status": data.get("status", "active"),
                "job_id": job_id,
                "total": len(data.get("urls", [])),
                "current": len(data.get("processed", [])) + len(data.get("failed", [])),
                "data": [],
                "time_consuming": data.get("duration", 0) / 1000,
            }
            if crawl_status_data["status"] == "completed":
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                formatted_data = [
                    {
                        "title": item.get("data", {}).get("title"),
                        "source_url": item.get("data", {}).get("url"),
                        "description": item.get("data", {}).get("description"),
                        "markdown": item.get("data", {}).get("content"),
                    }
                    for item in data.get("processed", {}).values()
                ]
                crawl_status_data["data"] = formatted_data
        else:
            raise ValueError("Invalid provider")
        return crawl_status_data
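
    # Illustrative shape of the dict returned above: both providers fill "status",
    # "job_id", "total", "current" and "data". "time_consuming" comes from the Redis
    # timer (firecrawl, once completed) or the reported duration in seconds (jinareader);
    # the item keys shown in "data" are the jinareader formatting, while firecrawl items
    # are returned as the provider sends them (they at least carry "source_url").
    #
    #     {
    #         "status": "completed",
    #         "job_id": "abc123",
    #         "total": 10,
    #         "current": 10,
    #         "data": [{"title": ..., "source_url": ..., "description": ..., "markdown": ...}],
    #         "time_consuming": "12.34",
    #     }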

    @classmethod
    def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str) -> dict[Any, Any] | None:
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        # decrypt api_key
        api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
        # FIXME data is redefined too many times here, use Any to ease the type checking, fix it later
        data: Any
        if provider == "firecrawl":
            file_key = "website_files/" + job_id + ".txt"
            if storage.exists(file_key):
                d = storage.load_once(file_key)
                if d:
                    data = json.loads(d.decode("utf-8"))
            else:
                firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
                result = firecrawl_app.check_crawl_status(job_id)
                if result.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")
                data = result.get("data")
            if data:
                for item in data:
                    if item.get("source_url") == url:
                        return dict(item)
            return None
        elif provider == "jinareader":
            if not job_id:
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return dict(response.json().get("data", {}))
            else:
                api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id},
                )
                data = response.json().get("data", {})
                if data.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                for item in data.get("processed", {}).values():
                    if item.get("data", {}).get("url") == url:
                        return dict(item.get("data", {}))
            return None
        else:
            raise ValueError("Invalid provider")
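
    # Note: for firecrawl the method above matches crawled items on their "source_url"
    # and returns the provider's raw item dict; for jinareader it matches on the nested
    # data["url"] and returns that nested "data" dict (title/url/description/content).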

    @classmethod
    def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_content: bool) -> dict:
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            params = {"onlyMainContent": only_main_content}
            result = firecrawl_app.scrape_url(url, params)
            return result
        else:
            raise ValueError("Invalid provider")
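

# A usage sketch, not part of the service: assumes a Flask request context with an
# authenticated `current_user` and stored "firecrawl" credentials for the tenant;
# the URL and option values are placeholders.
#
#     args = {
#         "provider": "firecrawl",
#         "url": "https://example.com",
#         "options": {"limit": 5, "crawl_sub_pages": True, "only_main_content": True},
#     }
#     WebsiteService.document_create_args_validate(args)
#     job = WebsiteService.crawl_url(args)  # {"status": "active", "job_id": ...}
#     status = WebsiteService.get_crawl_status(job["job_id"], "firecrawl")
#     page = WebsiteService.get_crawl_url_data(
#         job["job_id"], "firecrawl", args["url"], current_user.current_tenant_id
#     )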