# datasets_document.py
import logging
from argparse import ArgumentTypeError
from datetime import UTC, datetime
from typing import cast

from flask import request
from flask_login import current_user  # type: ignore
from flask_restful import Resource, fields, marshal, marshal_with, reqparse  # type: ignore
from sqlalchemy import asc, desc
from werkzeug.exceptions import Forbidden, NotFound

import services
from controllers.console import api
from controllers.console.app.error import (
    ProviderModelCurrentlyNotSupportError,
    ProviderNotInitializeError,
    ProviderQuotaExceededError,
)
from controllers.console.datasets.error import (
    ArchivedDocumentImmutableError,
    DocumentAlreadyFinishedError,
    DocumentIndexingError,
    IndexingEstimateError,
    InvalidActionError,
    InvalidMetadataError,
)
from controllers.console.wraps import (
    account_initialization_required,
    cloud_edition_billing_rate_limit_check,
    cloud_edition_billing_resource_check,
    setup_required,
)
from core.errors.error import (
    LLMBadRequestError,
    ModelCurrentlyNotSupportError,
    ProviderTokenNotInitError,
    QuotaExceededError,
)
from core.indexing_runner import IndexingRunner
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.model_runtime.errors.invoke import InvokeAuthorizationError
from core.plugin.manager.exc import PluginDaemonClientSideError
from core.rag.extractor.entity.extract_setting import ExtractSetting
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from fields.document_fields import (
    dataset_and_document_fields,
    document_fields,
    document_status_fields,
    document_with_segments_fields,
)
from libs.login import login_required
from models import Dataset, DatasetProcessRule, Document, DocumentSegment, UploadFile
from models.account import TenantAccountRole
from services.dataset_service import DatasetService, DocumentService
from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig
from tasks.add_document_to_index_task import add_document_to_index_task
from tasks.remove_document_from_index_task import remove_document_from_index_task


class DocumentResource(Resource):
    def get_document(self, dataset_id: str, document_id: str) -> Document:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        document = DocumentService.get_document(dataset_id, document_id)
        if not document:
            raise NotFound("Document not found.")

        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden("No permission.")

        return document

    def get_batch_documents(self, dataset_id: str, batch: str) -> list[Document]:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        documents = DocumentService.get_batch_documents(dataset_id, batch)
        if not documents:
            raise NotFound("Documents not found.")

        return documents


class GetProcessRuleApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        req_data = request.args
        document_id = req_data.get("document_id")

        # get default rules
        mode = DocumentService.DEFAULT_RULES["mode"]
        rules = DocumentService.DEFAULT_RULES["rules"]
        limits = DocumentService.DEFAULT_RULES["limits"]
        if document_id:
            document = Document.query.get_or_404(document_id)
            dataset = DatasetService.get_dataset(document.dataset_id)
            if not dataset:
                raise NotFound("Dataset not found.")

            try:
                DatasetService.check_dataset_permission(dataset, current_user)
            except services.errors.account.NoPermissionError as e:
                raise Forbidden(str(e))

            # get the latest process rule of the document's dataset
            dataset_process_rule = (
                db.session.query(DatasetProcessRule)
                .filter(DatasetProcessRule.dataset_id == document.dataset_id)
                .order_by(DatasetProcessRule.created_at.desc())
                .limit(1)
                .one_or_none()
            )
            if dataset_process_rule:
                mode = dataset_process_rule.mode
                rules = dataset_process_rule.rules_dict

        return {"mode": mode, "rules": rules, "limits": limits}
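
# Shape of the response returned above; the values come either from
# DocumentService.DEFAULT_RULES or from the dataset's most recent DatasetProcessRule:
#
#   {"mode": <str>, "rules": <dict>, "limits": <...>}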


class DatasetDocumentListApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id):
        dataset_id = str(dataset_id)
        page = request.args.get("page", default=1, type=int)
        limit = request.args.get("limit", default=20, type=int)
        search = request.args.get("keyword", default=None, type=str)
        sort = request.args.get("sort", default="-created_at", type=str)
        # "yes", "true", "t", "y", "1" convert to True, while others convert to False.
        try:
            fetch_val = request.args.get("fetch", default="false")
            if isinstance(fetch_val, bool):
                fetch = fetch_val
            else:
                if fetch_val.lower() in ("yes", "true", "t", "y", "1"):
                    fetch = True
                elif fetch_val.lower() in ("no", "false", "f", "n", "0"):
                    fetch = False
                else:
                    raise ArgumentTypeError(
                        f"Truthy value expected: got {fetch_val} but expected one of yes/no, true/false, t/f, y/n, 1/0 "
                        f"(case insensitive)."
                    )
        except (ArgumentTypeError, ValueError, Exception):
            fetch = False

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        query = Document.query.filter_by(dataset_id=str(dataset_id), tenant_id=current_user.current_tenant_id)

        if search:
            search = f"%{search}%"
            query = query.filter(Document.name.like(search))

        if sort.startswith("-"):
            sort_logic = desc
            sort = sort[1:]
        else:
            sort_logic = asc

        if sort == "hit_count":
            sub_query = (
                db.select(DocumentSegment.document_id, db.func.sum(DocumentSegment.hit_count).label("total_hit_count"))
                .group_by(DocumentSegment.document_id)
                .subquery()
            )

            query = query.outerjoin(sub_query, sub_query.c.document_id == Document.id).order_by(
                sort_logic(db.func.coalesce(sub_query.c.total_hit_count, 0)),
                sort_logic(Document.position),
            )
        elif sort == "created_at":
            query = query.order_by(
                sort_logic(Document.created_at),
                sort_logic(Document.position),
            )
        else:
            query = query.order_by(
                desc(Document.created_at),
                desc(Document.position),
            )

        paginated_documents = query.paginate(page=page, per_page=limit, max_per_page=100, error_out=False)
        documents = paginated_documents.items
        if fetch:
            for document in documents:
                completed_segments = DocumentSegment.query.filter(
                    DocumentSegment.completed_at.isnot(None),
                    DocumentSegment.document_id == str(document.id),
                    DocumentSegment.status != "re_segment",
                ).count()
                total_segments = DocumentSegment.query.filter(
                    DocumentSegment.document_id == str(document.id), DocumentSegment.status != "re_segment"
                ).count()
                document.completed_segments = completed_segments
                document.total_segments = total_segments
            data = marshal(documents, document_with_segments_fields)
        else:
            data = marshal(documents, document_fields)

        response = {
            "data": data,
            "has_more": len(documents) == limit,
            "limit": limit,
            "total": paginated_documents.total,
            "page": page,
        }
        return response

    documents_and_batch_fields = {"documents": fields.List(fields.Nested(document_fields)), "batch": fields.String}

    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(documents_and_batch_fields)
    @cloud_edition_billing_resource_check("vector_space")
    @cloud_edition_billing_rate_limit_check("knowledge")
    def post(self, dataset_id):
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)

        if not dataset:
            raise NotFound("Dataset not found.")

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        parser = reqparse.RequestParser()
        parser.add_argument(
            "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
        )
        parser.add_argument("data_source", type=dict, required=False, location="json")
        parser.add_argument("process_rule", type=dict, required=False, location="json")
        parser.add_argument("duplicate", type=bool, default=True, nullable=False, location="json")
        parser.add_argument("original_document_id", type=str, required=False, location="json")
        parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
        parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
        parser.add_argument("embedding_model", type=str, required=False, nullable=True, location="json")
        parser.add_argument("embedding_model_provider", type=str, required=False, nullable=True, location="json")
        parser.add_argument(
            "doc_language", type=str, default="English", required=False, nullable=False, location="json"
        )
        args = parser.parse_args()
        knowledge_config = KnowledgeConfig(**args)
        if not dataset.indexing_technique and not knowledge_config.indexing_technique:
            raise ValueError("indexing_technique is required.")

        # validate args
        DocumentService.document_create_args_validate(knowledge_config)

        try:
            documents, batch = DocumentService.save_document_with_dataset_id(dataset, knowledge_config, current_user)
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        return {"documents": documents, "batch": batch}

    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def delete(self, dataset_id):
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        try:
            document_ids = request.args.getlist("document_id")
            DocumentService.delete_documents(dataset, document_ids)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot delete document during indexing.")

        return {"result": "success"}, 204
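
# Hypothetical request against this resource (the /console/api prefix is an assumption
# about how the console `api` object is mounted):
#
#   GET /console/api/datasets/<dataset_id>/documents?page=1&limit=20&keyword=faq&sort=-hit_count&fetch=true
#
# With fetch=true, each document also carries completed_segments/total_segments and is
# marshalled with document_with_segments_fields instead of document_fields.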


class DatasetInitApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(dataset_and_document_fields)
    @cloud_edition_billing_resource_check("vector_space")
    @cloud_edition_billing_rate_limit_check("knowledge")
    def post(self):
        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument(
            "indexing_technique",
            type=str,
            choices=Dataset.INDEXING_TECHNIQUE_LIST,
            required=True,
            nullable=False,
            location="json",
        )
        parser.add_argument("data_source", type=dict, required=True, nullable=True, location="json")
        parser.add_argument("process_rule", type=dict, required=True, nullable=True, location="json")
        parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
        parser.add_argument(
            "doc_language", type=str, default="English", required=False, nullable=False, location="json"
        )
        parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
        parser.add_argument("embedding_model", type=str, required=False, nullable=True, location="json")
        parser.add_argument("embedding_model_provider", type=str, required=False, nullable=True, location="json")
        args = parser.parse_args()

        knowledge_config = KnowledgeConfig(**args)
        if knowledge_config.indexing_technique == "high_quality":
            if knowledge_config.embedding_model is None or knowledge_config.embedding_model_provider is None:
                raise ValueError("embedding model and embedding model provider are required for high quality indexing.")
            try:
                model_manager = ModelManager()
                model_manager.get_model_instance(
                    tenant_id=current_user.current_tenant_id,
                    provider=args["embedding_model_provider"],
                    model_type=ModelType.TEXT_EMBEDDING,
                    model=args["embedding_model"],
                )
            except InvokeAuthorizationError:
                raise ProviderNotInitializeError(
                    "No Embedding Model available. Please configure a valid provider in the Settings -> Model Provider."
                )
            except ProviderTokenNotInitError as ex:
                raise ProviderNotInitializeError(ex.description)

        # validate args
        DocumentService.document_create_args_validate(knowledge_config)

        try:
            dataset, documents, batch = DocumentService.save_document_without_dataset_id(
                tenant_id=current_user.current_tenant_id, knowledge_config=knowledge_config, account=current_user
            )
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        response = {"dataset": dataset, "documents": documents, "batch": batch}
        return response
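
# A minimal sketch of a POST /datasets/init body, using only the arguments declared by
# the parser above (all concrete values are illustrative):
#
#   {
#       "indexing_technique": "high_quality",
#       "data_source": {...},
#       "process_rule": {...},
#       "doc_form": "text_model",
#       "doc_language": "English",
#       "embedding_model": "<model-name>",        # required when high_quality
#       "embedding_model_provider": "<provider>"  # required when high_quality
#   }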


class DocumentIndexingEstimateApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        if document.indexing_status in {"completed", "error"}:
            raise DocumentAlreadyFinishedError()

        data_process_rule = document.dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()

        response = {"tokens": 0, "total_price": 0, "currency": "USD", "total_segments": 0, "preview": []}

        if document.data_source_type == "upload_file":
            data_source_info = document.data_source_info_dict
            if data_source_info and "upload_file_id" in data_source_info:
                file_id = data_source_info["upload_file_id"]

                file = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.tenant_id == document.tenant_id, UploadFile.id == file_id)
                    .first()
                )

                # raise error if file not found
                if not file:
                    raise NotFound("File not found.")

                extract_setting = ExtractSetting(
                    datasource_type="upload_file", upload_file=file, document_model=document.doc_form
                )

                indexing_runner = IndexingRunner()

                try:
                    estimate_response = indexing_runner.indexing_estimate(
                        current_user.current_tenant_id,
                        [extract_setting],
                        data_process_rule_dict,
                        document.doc_form,
                        "English",
                        dataset_id,
                    )
                    return estimate_response.model_dump(), 200
                except LLMBadRequestError:
                    raise ProviderNotInitializeError(
                        "No Embedding Model available. Please configure a valid provider "
                        "in the Settings -> Model Provider."
                    )
                except ProviderTokenNotInitError as ex:
                    raise ProviderNotInitializeError(ex.description)
                except PluginDaemonClientSideError as ex:
                    raise ProviderNotInitializeError(ex.description)
                except Exception as e:
                    raise IndexingEstimateError(str(e))

        return response, 200


class DocumentBatchIndexingEstimateApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        if not documents:
            return {"tokens": 0, "total_price": 0, "currency": "USD", "total_segments": 0, "preview": []}, 200

        data_process_rule = documents[0].dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()
        info_list = []
        extract_settings = []
        for document in documents:
            if document.indexing_status in {"completed", "error"}:
                raise DocumentAlreadyFinishedError()

            data_source_info = document.data_source_info_dict
            # format document files info
            if data_source_info and "upload_file_id" in data_source_info:
                file_id = data_source_info["upload_file_id"]
                info_list.append(file_id)
            # format document notion info
            elif (
                data_source_info and "notion_workspace_id" in data_source_info and "notion_page_id" in data_source_info
            ):
                pages = []
                page = {"page_id": data_source_info["notion_page_id"], "type": data_source_info["type"]}
                pages.append(page)
                notion_info = {"workspace_id": data_source_info["notion_workspace_id"], "pages": pages}
                info_list.append(notion_info)

            if document.data_source_type == "upload_file":
                file_id = data_source_info["upload_file_id"]
                file_detail = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.tenant_id == current_user.current_tenant_id, UploadFile.id == file_id)
                    .first()
                )

                if file_detail is None:
                    raise NotFound("File not found.")

                extract_setting = ExtractSetting(
                    datasource_type="upload_file", upload_file=file_detail, document_model=document.doc_form
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == "notion_import":
                extract_setting = ExtractSetting(
                    datasource_type="notion_import",
                    notion_info={
                        "notion_workspace_id": data_source_info["notion_workspace_id"],
                        "notion_obj_id": data_source_info["notion_page_id"],
                        "notion_page_type": data_source_info["type"],
                        "tenant_id": current_user.current_tenant_id,
                    },
                    document_model=document.doc_form,
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == "website_crawl":
                extract_setting = ExtractSetting(
                    datasource_type="website_crawl",
                    website_info={
                        "provider": data_source_info["provider"],
                        "job_id": data_source_info["job_id"],
                        "url": data_source_info["url"],
                        "tenant_id": current_user.current_tenant_id,
                        "mode": data_source_info["mode"],
                        "only_main_content": data_source_info["only_main_content"],
                    },
                    document_model=document.doc_form,
                )
                extract_settings.append(extract_setting)
            else:
                raise ValueError("Data source type is not supported.")

        indexing_runner = IndexingRunner()
        try:
            response = indexing_runner.indexing_estimate(
                current_user.current_tenant_id,
                extract_settings,
                data_process_rule_dict,
                document.doc_form,
                "English",
                dataset_id,
            )
            return response.model_dump(), 200
        except LLMBadRequestError:
            raise ProviderNotInitializeError(
                "No Embedding Model available. Please configure a valid provider in the Settings -> Model Provider."
            )
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except PluginDaemonClientSideError as ex:
            raise ProviderNotInitializeError(ex.description)
        except Exception as e:
            raise IndexingEstimateError(str(e))


class DocumentBatchIndexingStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        documents_status = []
        for document in documents:
            completed_segments = DocumentSegment.query.filter(
                DocumentSegment.completed_at.isnot(None),
                DocumentSegment.document_id == str(document.id),
                DocumentSegment.status != "re_segment",
            ).count()
            total_segments = DocumentSegment.query.filter(
                DocumentSegment.document_id == str(document.id), DocumentSegment.status != "re_segment"
            ).count()
            document.completed_segments = completed_segments
            document.total_segments = total_segments
            if document.is_paused:
                document.indexing_status = "paused"
            documents_status.append(marshal(document, document_status_fields))
        data = {"data": documents_status}
        return data


class DocumentIndexingStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        completed_segments = DocumentSegment.query.filter(
            DocumentSegment.completed_at.isnot(None),
            DocumentSegment.document_id == str(document_id),
            DocumentSegment.status != "re_segment",
        ).count()
        total_segments = DocumentSegment.query.filter(
            DocumentSegment.document_id == str(document_id), DocumentSegment.status != "re_segment"
        ).count()

        document.completed_segments = completed_segments
        document.total_segments = total_segments
        if document.is_paused:
            document.indexing_status = "paused"
        return marshal(document, document_status_fields)


class DocumentDetailApi(DocumentResource):
    METADATA_CHOICES = {"all", "only", "without"}

    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        metadata = request.args.get("metadata", "all")
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f"Invalid metadata value: {metadata}")

        if metadata == "only":
            response = {"id": document.id, "doc_type": document.doc_type, "doc_metadata": document.doc_metadata_details}
        elif metadata == "without":
            dataset_process_rules = DatasetService.get_process_rules(dataset_id)
            document_process_rules = document.dataset_process_rule.to_dict()
            data_source_info = document.data_source_detail_dict
            response = {
                "id": document.id,
                "position": document.position,
                "data_source_type": document.data_source_type,
                "data_source_info": data_source_info,
                "dataset_process_rule_id": document.dataset_process_rule_id,
                "dataset_process_rule": dataset_process_rules,
                "document_process_rule": document_process_rules,
                "name": document.name,
                "created_from": document.created_from,
                "created_by": document.created_by,
                "created_at": document.created_at.timestamp(),
                "tokens": document.tokens,
                "indexing_status": document.indexing_status,
                "completed_at": int(document.completed_at.timestamp()) if document.completed_at else None,
                "updated_at": int(document.updated_at.timestamp()) if document.updated_at else None,
                "indexing_latency": document.indexing_latency,
                "error": document.error,
                "enabled": document.enabled,
                "disabled_at": int(document.disabled_at.timestamp()) if document.disabled_at else None,
                "disabled_by": document.disabled_by,
                "archived": document.archived,
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }
        else:
            dataset_process_rules = DatasetService.get_process_rules(dataset_id)
            document_process_rules = document.dataset_process_rule.to_dict()
            data_source_info = document.data_source_detail_dict
            response = {
                "id": document.id,
                "position": document.position,
                "data_source_type": document.data_source_type,
                "data_source_info": data_source_info,
                "dataset_process_rule_id": document.dataset_process_rule_id,
                "dataset_process_rule": dataset_process_rules,
                "document_process_rule": document_process_rules,
                "name": document.name,
                "created_from": document.created_from,
                "created_by": document.created_by,
                "created_at": document.created_at.timestamp(),
                "tokens": document.tokens,
                "indexing_status": document.indexing_status,
                "completed_at": int(document.completed_at.timestamp()) if document.completed_at else None,
                "updated_at": int(document.updated_at.timestamp()) if document.updated_at else None,
                "indexing_latency": document.indexing_latency,
                "error": document.error,
                "enabled": document.enabled,
                "disabled_at": int(document.disabled_at.timestamp()) if document.disabled_at else None,
                "disabled_by": document.disabled_by,
                "archived": document.archived,
                "doc_type": document.doc_type,
                "doc_metadata": document.doc_metadata_details,
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }

        return response, 200
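
# The `metadata` query parameter selects one of three response shapes:
#   ?metadata=only    -> id, doc_type and doc_metadata only
#   ?metadata=without -> full document detail without doc_type/doc_metadata
#   ?metadata=all     -> full document detail plus doc_type/doc_metadata (default)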


class DocumentProcessingApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def patch(self, dataset_id, document_id, action):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        # The role of the current user in the ta table must be admin, owner, dataset_operator, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()

        if action == "pause":
            if document.indexing_status != "indexing":
                raise InvalidActionError("Document not in indexing state.")

            document.paused_by = current_user.id
            document.paused_at = datetime.now(UTC).replace(tzinfo=None)
            document.is_paused = True
            db.session.commit()
        elif action == "resume":
            if document.indexing_status not in {"paused", "error"}:
                raise InvalidActionError("Document not in paused or error state.")

            document.paused_by = None
            document.paused_at = None
            document.is_paused = False
            db.session.commit()
        else:
            raise InvalidActionError()
        return {"result": "success"}, 200


class DocumentDeleteApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def delete(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        document = self.get_document(dataset_id, document_id)

        try:
            DocumentService.delete_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot delete document during indexing.")

        return {"result": "success"}, 204


class DocumentMetadataApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def put(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        req_data = request.get_json()

        doc_type = req_data.get("doc_type")
        doc_metadata = req_data.get("doc_metadata")

        # The role of the current user in the ta table must be admin, owner, dataset_operator, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()

        if doc_type is None or doc_metadata is None:
            raise ValueError("Both doc_type and doc_metadata must be provided.")

        if doc_type not in DocumentService.DOCUMENT_METADATA_SCHEMA:
            raise ValueError("Invalid doc_type.")

        if not isinstance(doc_metadata, dict):
            raise ValueError("doc_metadata must be a dictionary.")

        metadata_schema: dict = cast(dict, DocumentService.DOCUMENT_METADATA_SCHEMA[doc_type])

        document.doc_metadata = {}
        if doc_type == "others":
            document.doc_metadata = doc_metadata
        else:
            for key, value_type in metadata_schema.items():
                value = doc_metadata.get(key)
                if value is not None and isinstance(value, value_type):
                    document.doc_metadata[key] = value

        document.doc_type = doc_type
        document.updated_at = datetime.now(UTC).replace(tzinfo=None)
        db.session.commit()

        return {"result": "success", "message": "Document metadata updated."}, 200
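
# Example PUT body; doc_type must be a key of DocumentService.DOCUMENT_METADATA_SCHEMA
# (the "book" type and its fields are assumptions for illustration only):
#
#   {"doc_type": "book", "doc_metadata": {"title": "...", "author": "..."}}
#
# For doc_type "others" the dict is stored as-is; for any other type, only keys whose
# values match the schema's declared types are kept.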


class DocumentStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_resource_check("vector_space")
    @cloud_edition_billing_rate_limit_check("knowledge")
    def patch(self, dataset_id, action):
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()

        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        # check user's permission
        DatasetService.check_dataset_permission(dataset, current_user)

        document_ids = request.args.getlist("document_id")
        for document_id in document_ids:
            document = self.get_document(dataset_id, document_id)

            indexing_cache_key = f"document_{document.id}_indexing"
            cache_result = redis_client.get(indexing_cache_key)
            if cache_result is not None:
                raise InvalidActionError(f"Document:{document.name} is being indexed, please try again later")

            if action == "enable":
                if document.enabled:
                    continue
                # Non-admins can only request enabling; the change takes effect after admin review.
                # Note: this returns on the first such document, skipping any remaining ids in the batch.
                if current_user.current_role != TenantAccountRole.ADMIN:
                    document.enable_applicant = current_user.id
                    document.check_status = 1
                    db.session.commit()
                    return {
                        "result": "This action takes effect only after administrator review. Please confirm submission."
                    }, 200
                document.enabled = True
                document.disabled_at = None
                document.disabled_by = None
                document.check_by = current_user.id
                document.check_at = datetime.now(UTC).replace(tzinfo=None)
                document.updated_at = datetime.now(UTC).replace(tzinfo=None)
                db.session.commit()

                # Set cache to prevent indexing the same document multiple times
                redis_client.setex(indexing_cache_key, 600, 1)

                add_document_to_index_task.delay(document_id)
            # review rejected
            elif action == "check_fail":
                document.enabled = False
                document.check_by = current_user.id
                document.updated_at = datetime.now(UTC).replace(tzinfo=None)
                document.check_status = 4
                db.session.commit()
            elif action == "disable":
                if not document.completed_at or document.indexing_status != "completed":
                    raise InvalidActionError(f"Document: {document.name} is not completed.")
                if not document.enabled:
                    continue

                # Non-admins cannot disable directly; the change takes effect after admin review.
                if current_user.current_role != TenantAccountRole.ADMIN:
                    document.disable_applicant = current_user.id
                    db.session.commit()
                    return {
                        "result": "This action takes effect only after administrator review. Please confirm submission."
                    }, 200

                document.enabled = False
                document.disabled_at = datetime.now(UTC).replace(tzinfo=None)
                document.disabled_by = current_user.id
                document.updated_at = datetime.now(UTC).replace(tzinfo=None)
                document.check_status = 3
                db.session.commit()

                # Set cache to prevent indexing the same document multiple times
                redis_client.setex(indexing_cache_key, 600, 1)

                remove_document_from_index_task.delay(document_id)
            elif action == "archive":
                if document.archived:
                    continue

                document.archived = True
                document.archived_at = datetime.now(UTC).replace(tzinfo=None)
                document.archived_by = current_user.id
                document.updated_at = datetime.now(UTC).replace(tzinfo=None)
                db.session.commit()

                if document.enabled:
                    # Set cache to prevent indexing the same document multiple times
                    redis_client.setex(indexing_cache_key, 600, 1)

                    remove_document_from_index_task.delay(document_id)
            elif action == "un_archive":
                if not document.archived:
                    continue
                document.archived = False
                document.archived_at = None
                document.archived_by = None
                document.updated_at = datetime.now(UTC).replace(tzinfo=None)
                db.session.commit()

                # Set cache to prevent indexing the same document multiple times
                redis_client.setex(indexing_cache_key, 600, 1)

                add_document_to_index_task.delay(document_id)
            else:
                raise InvalidActionError()
        return {"result": "success"}, 200
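
# Inferred meaning of the custom check_status codes used above (assumptions from
# context; only the assignments themselves appear in this file):
#   1 -> enable requested by a non-admin, awaiting admin review
#   3 -> disabled by an admin
#   4 -> review rejected via the check_fail action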


class DocumentPauseApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def patch(self, dataset_id, document_id):
        """pause document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document Not Exists.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # pause document
            DocumentService.pause_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot pause completed document.")

        return {"result": "success"}, 204


class DocumentRecoverApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def patch(self, dataset_id, document_id):
        """recover document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")
        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document Not Exists.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()
        try:
            # recover (resume) the paused document
            DocumentService.recover_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Document is not in paused status.")

        return {"result": "success"}, 204


class DocumentRetryApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def post(self, dataset_id):
        """retry document."""
        parser = reqparse.RequestParser()
        parser.add_argument("document_ids", type=list, required=True, nullable=False, location="json")
        args = parser.parse_args()
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        retry_documents = []
        if not dataset:
            raise NotFound("Dataset not found.")
        for document_id in args["document_ids"]:
            try:
                document_id = str(document_id)

                document = DocumentService.get_document(dataset.id, document_id)

                # 404 if document not found
                if document is None:
                    raise NotFound("Document Not Exists.")

                # 403 if document is archived
                if DocumentService.check_archived(document):
                    raise ArchivedDocumentImmutableError()

                # 400 if document is completed
                if document.indexing_status == "completed":
                    raise DocumentAlreadyFinishedError()
                retry_documents.append(document)
            except Exception:
                logging.exception(f"Failed to retry document, document id: {document_id}")
                continue
        # retry document
        DocumentService.retry_document(dataset_id, retry_documents)

        return {"result": "success"}, 204


class DocumentRenameApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(document_fields)
    def post(self, dataset_id, document_id):
        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()
        dataset = DatasetService.get_dataset(dataset_id)
        DatasetService.check_dataset_operator_permission(current_user, dataset)
        parser = reqparse.RequestParser()
        parser.add_argument("name", type=str, required=True, nullable=False, location="json")
        args = parser.parse_args()

        try:
            document = DocumentService.rename_document(dataset_id, document_id, args["name"])
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot rename document during indexing.")

        return document


class WebsiteDocumentSyncApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        """sync website document."""
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")
        document_id = str(document_id)
        document = DocumentService.get_document(dataset.id, document_id)
        if not document:
            raise NotFound("Document not found.")
        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden("No permission.")
        if document.data_source_type != "website_crawl":
            raise ValueError("Document is not a website document.")
        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()
        # sync document
        DocumentService.sync_website_document(dataset_id, document)

        return {"result": "success"}, 200


api.add_resource(GetProcessRuleApi, "/datasets/process-rule")
api.add_resource(DatasetDocumentListApi, "/datasets/<uuid:dataset_id>/documents")
api.add_resource(DatasetInitApi, "/datasets/init")
api.add_resource(
    DocumentIndexingEstimateApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-estimate"
)
api.add_resource(DocumentBatchIndexingEstimateApi, "/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-estimate")
api.add_resource(DocumentBatchIndexingStatusApi, "/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-status")
api.add_resource(DocumentIndexingStatusApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-status")
api.add_resource(DocumentDetailApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
api.add_resource(
    DocumentProcessingApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/<string:action>"
)
api.add_resource(DocumentDeleteApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
api.add_resource(DocumentMetadataApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/metadata")
api.add_resource(DocumentStatusApi, "/datasets/<uuid:dataset_id>/documents/status/<string:action>/batch")
api.add_resource(DocumentPauseApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/pause")
api.add_resource(DocumentRecoverApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/resume")
api.add_resource(DocumentRetryApi, "/datasets/<uuid:dataset_id>/retry")
api.add_resource(DocumentRenameApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/rename")
api.add_resource(WebsiteDocumentSyncApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/website-sync")
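
# Hypothetical client call for the batch status endpoint (host, /console/api prefix, and
# auth scheme are assumptions about the deployment); document_id repeats because the
# handler reads it with request.args.getlist("document_id"):
#
#   curl -X PATCH \
#     "https://<host>/console/api/datasets/<dataset_id>/documents/status/enable/batch?document_id=<id1>&document_id=<id2>" \
#     -H "Authorization: Bearer <console-token>"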