# website_service.py
  1. import datetime
  2. import json
  3. from flask_login import current_user
  4. from core.helper import encrypter
  5. from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
  6. from extensions.ext_redis import redis_client
  7. from extensions.ext_storage import storage
  8. from services.auth.api_key_auth_service import ApiKeyAuthService
  9. class WebsiteService:
  10. @classmethod
  11. def document_create_args_validate(cls, args: dict):
  12. if "url" not in args or not args["url"]:
  13. raise ValueError("url is required")
  14. if "options" not in args or not args["options"]:
  15. raise ValueError("options is required")
  16. if "limit" not in args["options"] or not args["options"]["limit"]:
  17. raise ValueError("limit is required")
  18. @classmethod
  19. def crawl_url(cls, args: dict) -> dict:
  20. provider = args.get("provider")
  21. url = args.get("url")
  22. options = args.get("options")
  23. credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
  24. if provider == "firecrawl":
  25. # decrypt api_key
  26. api_key = encrypter.decrypt_token(
  27. tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
  28. )
  29. firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
  30. crawl_sub_pages = options.get("crawl_sub_pages", False)
  31. only_main_content = options.get("only_main_content", False)
  32. if not crawl_sub_pages:
  33. params = {
  34. "crawlerOptions": {
  35. "includes": [],
  36. "excludes": [],
  37. "generateImgAltText": True,
  38. "limit": 1,
  39. "returnOnlyUrls": False,
  40. "pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False},
  41. }
  42. }
  43. else:
  44. includes = options.get("includes").split(",") if options.get("includes") else []
  45. excludes = options.get("excludes").split(",") if options.get("excludes") else []
  46. params = {
  47. "crawlerOptions": {
  48. "includes": includes or [],
  49. "excludes": excludes or [],
  50. "generateImgAltText": True,
  51. "limit": options.get("limit", 1),
  52. "returnOnlyUrls": False,
  53. "pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False},
  54. }
  55. }
  56. if options.get("max_depth"):
  57. params["crawlerOptions"]["maxDepth"] = options.get("max_depth")
  58. job_id = firecrawl_app.crawl_url(url, params)
  59. website_crawl_time_cache_key = f"website_crawl_{job_id}"
  60. time = str(datetime.datetime.now().timestamp())
  61. redis_client.setex(website_crawl_time_cache_key, 3600, time)
  62. return {"status": "active", "job_id": job_id}
  63. else:
  64. raise ValueError("Invalid provider")
  65. @classmethod
  66. def get_crawl_status(cls, job_id: str, provider: str) -> dict:
  67. credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
  68. if provider == "firecrawl":
  69. # decrypt api_key
  70. api_key = encrypter.decrypt_token(
  71. tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
  72. )
  73. firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
  74. result = firecrawl_app.check_crawl_status(job_id)
  75. crawl_status_data = {
  76. "status": result.get("status", "active"),
  77. "job_id": job_id,
  78. "total": result.get("total", 0),
  79. "current": result.get("current", 0),
  80. "data": result.get("data", []),
  81. }
  82. if crawl_status_data["status"] == "completed":
  83. website_crawl_time_cache_key = f"website_crawl_{job_id}"
  84. start_time = redis_client.get(website_crawl_time_cache_key)
  85. if start_time:
  86. end_time = datetime.datetime.now().timestamp()
  87. time_consuming = abs(end_time - float(start_time))
  88. crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
  89. redis_client.delete(website_crawl_time_cache_key)
  90. else:
  91. raise ValueError("Invalid provider")
  92. return crawl_status_data
  93. @classmethod
  94. def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str) -> dict | None:
  95. credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
  96. if provider == "firecrawl":
  97. file_key = "website_files/" + job_id + ".txt"
  98. if storage.exists(file_key):
  99. data = storage.load_once(file_key)
  100. if data:
  101. data = json.loads(data.decode("utf-8"))
  102. else:
  103. # decrypt api_key
  104. api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
  105. firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
  106. result = firecrawl_app.check_crawl_status(job_id)
  107. if result.get("status") != "completed":
  108. raise ValueError("Crawl job is not completed")
  109. data = result.get("data")
  110. if data:
  111. for item in data:
  112. if item.get("source_url") == url:
  113. return item
  114. return None
  115. else:
  116. raise ValueError("Invalid provider")
  117. @classmethod
  118. def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_content: bool) -> dict | None:
  119. credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
  120. if provider == "firecrawl":
  121. # decrypt api_key
  122. api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
  123. firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
  124. params = {"pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False}}
  125. result = firecrawl_app.scrape_url(url, params)
  126. return result
  127. else:
  128. raise ValueError("Invalid provider")