# website_service.py

import datetime
import json
from typing import Any

import requests
from flask_login import current_user  # type: ignore

from core.helper import encrypter
from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from services.auth.api_key_auth_service import ApiKeyAuthService


class WebsiteService:
    @classmethod
    def document_create_args_validate(cls, args: dict):
        """Validate the arguments used to create a website-based document."""
        if "url" not in args or not args["url"]:
            raise ValueError("url is required")
        if "options" not in args or not args["options"]:
            raise ValueError("options is required")
        if "limit" not in args["options"] or not args["options"]["limit"]:
            raise ValueError("limit is required")

    @classmethod
    def crawl_url(cls, args: dict) -> dict:
        """Start a crawl job for the given URL with the selected provider (Firecrawl or Jina Reader)."""
        provider = args.get("provider", "")
        url = args.get("url")
        options = args.get("options") or {}
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            only_main_content = options.get("only_main_content", False)
            if not crawl_sub_pages:
                # single-page crawl: only the given URL is fetched
                params = {
                    "crawlerOptions": {
                        "includes": [],
                        "excludes": [],
                        "generateImgAltText": True,
                        "limit": 1,
                        "returnOnlyUrls": False,
                        "pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False},
                    }
                }
            else:
                includes = options.get("includes").split(",") if options.get("includes") else []
                excludes = options.get("excludes").split(",") if options.get("excludes") else []
                params = {
                    "crawlerOptions": {
                        "includes": includes,
                        "excludes": excludes,
                        "generateImgAltText": True,
                        "limit": options.get("limit", 1),
                        "returnOnlyUrls": False,
                        "pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False},
                    }
                }
                if options.get("max_depth"):
                    params["crawlerOptions"]["maxDepth"] = options.get("max_depth")
            job_id = firecrawl_app.crawl_url(url, params)
            # cache the start time so get_crawl_status can report the elapsed time later
            website_crawl_time_cache_key = f"website_crawl_{job_id}"
            time = str(datetime.datetime.now().timestamp())
            redis_client.setex(website_crawl_time_cache_key, 3600, time)
            return {"status": "active", "job_id": job_id}
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            if not crawl_sub_pages:
                # single-page read via the Jina Reader endpoint; the content is returned directly
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "data": response.json().get("data")}
            else:
                # multi-page crawl: submit an asynchronous task and return its id
                response = requests.post(
                    "https://adaptivecrawl-kir3wx7b3a-uc.a.run.app",
                    json={
                        "url": url,
                        "maxPages": options.get("limit", 1),
                        "useSitemap": options.get("use_sitemap", True),
                    },
                    headers={
                        "Content-Type": "application/json",
                        "Authorization": f"Bearer {api_key}",
                    },
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")}
        else:
            raise ValueError("Invalid provider")
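
    # Illustrative shape of the `args` dict accepted by crawl_url (the values below
    # are made-up examples; only the keys are actually read by the method):
    #   {
    #       "provider": "firecrawl",
    #       "url": "https://example.com",
    #       "options": {
    #           "crawl_sub_pages": True,
    #           "only_main_content": True,
    #           "includes": "blog/*,docs/*",
    #           "excludes": "admin/*",
    #           "limit": 10,
    #           "max_depth": 2,
    #       },
    #   }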

    @classmethod
    def get_crawl_status(cls, job_id: str, provider: str) -> dict:
        """Return the current status of a crawl job, including the crawled pages once it completes."""
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            result = firecrawl_app.check_crawl_status(job_id)
            crawl_status_data = {
                "status": result.get("status", "active"),
                "job_id": job_id,
                "total": result.get("total", 0),
                "current": result.get("current", 0),
                "data": result.get("data", []),
            }
            if crawl_status_data["status"] == "completed":
                # compute how long the crawl took from the start time cached by crawl_url
                website_crawl_time_cache_key = f"website_crawl_{job_id}"
                start_time = redis_client.get(website_crawl_time_cache_key)
                if start_time:
                    end_time = datetime.datetime.now().timestamp()
                    time_consuming = abs(end_time - float(start_time))
                    crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
                    redis_client.delete(website_crawl_time_cache_key)
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            response = requests.post(
                "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                json={"taskId": job_id},
            )
            data = response.json().get("data", {})
            crawl_status_data = {
                "status": data.get("status", "active"),
                "job_id": job_id,
                "total": len(data.get("urls", [])),
                "current": len(data.get("processed", [])) + len(data.get("failed", [])),
                "data": [],
                "time_consuming": data.get("duration", 0) / 1000,
            }
            if crawl_status_data["status"] == "completed":
                # fetch the processed pages and normalize them to the same shape as Firecrawl results
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                formatted_data = [
                    {
                        "title": item.get("data", {}).get("title"),
                        "source_url": item.get("data", {}).get("url"),
                        "description": item.get("data", {}).get("description"),
                        "markdown": item.get("data", {}).get("content"),
                    }
                    for item in data.get("processed", {}).values()
                ]
                crawl_status_data["data"] = formatted_data
        else:
            raise ValueError("Invalid provider")
        return crawl_status_data
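
    # Illustrative shape of the dict returned by get_crawl_status once a job has
    # completed (the values are made-up examples; per-page entries follow the
    # title/source_url/description/markdown shape built above):
    #   {
    #       "status": "completed",
    #       "job_id": "abc123",
    #       "total": 10,
    #       "current": 10,
    #       "data": [{"title": "...", "source_url": "...", "description": "...", "markdown": "..."}],
    #       "time_consuming": "42.17",
    #   }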

    @classmethod
    def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str) -> dict[Any, Any] | None:
        """Return the crawled content for a single URL from a finished crawl job, or None if it was not crawled."""
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        # decrypt api_key
        api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
        # FIXME: data is redefined too many times here, use Any to ease the type checking, fix it later
        data: Any = None
        if provider == "firecrawl":
            # prefer the crawl result already persisted to storage; otherwise ask Firecrawl directly
            file_key = "website_files/" + job_id + ".txt"
            if storage.exists(file_key):
                d = storage.load_once(file_key)
                if d:
                    data = json.loads(d.decode("utf-8"))
            else:
                firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
                result = firecrawl_app.check_crawl_status(job_id)
                if result.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")
                data = result.get("data")
            if data:
                for item in data:
                    if item.get("source_url") == url:
                        return dict(item)
            return None
        elif provider == "jinareader":
            if not job_id:
                # no job id: read the single URL directly via the Jina Reader endpoint
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return dict(response.json().get("data", {}))
            else:
                # look the URL up among the finished crawl task's processed pages
                api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id},
                )
                data = response.json().get("data", {})
                if data.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                for item in data.get("processed", {}).values():
                    if item.get("data", {}).get("url") == url:
                        return dict(item.get("data", {}))
                return None
        else:
            raise ValueError("Invalid provider")

    @classmethod
    def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_content: bool) -> dict:
        """Scrape a single URL and return its content (Firecrawl only)."""
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            params = {"pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False}}
            result = firecrawl_app.scrape_url(url, params)
            return result
        else:
            raise ValueError("Invalid provider")
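
# Usage sketch (illustrative only): the tenant id and URL below are placeholders,
# and website credentials for the tenant must already be configured via
# ApiKeyAuthService before calling the service.
#
#   content = WebsiteService.get_scrape_url_data(
#       provider="firecrawl",
#       url="https://example.com",
#       tenant_id="<tenant-id>",
#       only_main_content=True,
#   )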