datasets_document.py

import logging
from argparse import ArgumentTypeError
from datetime import datetime, timezone

from flask import request
from flask_login import current_user
from flask_restful import Resource, fields, marshal, marshal_with, reqparse
from sqlalchemy import asc, desc
from werkzeug.exceptions import Forbidden, NotFound

import services
from controllers.console import api
from controllers.console.app.error import (
    ProviderModelCurrentlyNotSupportError,
    ProviderNotInitializeError,
    ProviderQuotaExceededError,
)
from controllers.console.datasets.error import (
    ArchivedDocumentImmutableError,
    DocumentAlreadyFinishedError,
    DocumentIndexingError,
    IndexingEstimateError,
    InvalidActionError,
    InvalidMetadataError,
)
from controllers.console.setup import setup_required
from controllers.console.wraps import account_initialization_required, cloud_edition_billing_resource_check
from core.errors.error import (
    LLMBadRequestError,
    ModelCurrentlyNotSupportError,
    ProviderTokenNotInitError,
    QuotaExceededError,
)
from core.indexing_runner import IndexingRunner
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.model_runtime.errors.invoke import InvokeAuthorizationError
from core.rag.extractor.entity.extract_setting import ExtractSetting
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from fields.document_fields import (
    dataset_and_document_fields,
    document_fields,
    document_status_fields,
    document_with_segments_fields,
)
from libs.login import login_required
from models import Dataset, DatasetProcessRule, Document, DocumentSegment, UploadFile
from services.dataset_service import DatasetService, DocumentService
from tasks.add_document_to_index_task import add_document_to_index_task
from tasks.remove_document_from_index_task import remove_document_from_index_task

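# Shared base class for the endpoints below. It centralizes the dataset and
# document lookups plus the tenant/permission checks, so each endpoint returns
# consistent 403/404 responses instead of re-implementing the guards.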
class DocumentResource(Resource):
    def get_document(self, dataset_id: str, document_id: str) -> Document:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        document = DocumentService.get_document(dataset_id, document_id)
        if not document:
            raise NotFound("Document not found.")

        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden("No permission.")

        return document

    def get_batch_documents(self, dataset_id: str, batch: str) -> list[Document]:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        documents = DocumentService.get_batch_documents(dataset_id, batch)
        if not documents:
            raise NotFound("Documents not found.")

        return documents

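# Returns the processing rules to pre-fill in the UI: the service-level
# defaults, or the most recent DatasetProcessRule when a document_id is given.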
class GetProcessRuleApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        req_data = request.args

        document_id = req_data.get("document_id")

        # get default rules
        mode = DocumentService.DEFAULT_RULES["mode"]
        rules = DocumentService.DEFAULT_RULES["rules"]
        if document_id:
            document = Document.query.get_or_404(document_id)

            dataset = DatasetService.get_dataset(document.dataset_id)
            if not dataset:
                raise NotFound("Dataset not found.")

            try:
                DatasetService.check_dataset_permission(dataset, current_user)
            except services.errors.account.NoPermissionError as e:
                raise Forbidden(str(e))

            # get the latest process rule
            dataset_process_rule = (
                db.session.query(DatasetProcessRule)
                .filter(DatasetProcessRule.dataset_id == document.dataset_id)
                .order_by(DatasetProcessRule.created_at.desc())
                .limit(1)
                .one_or_none()
            )
            if dataset_process_rule:
                mode = dataset_process_rule.mode
                rules = dataset_process_rule.rules_dict

        return {"mode": mode, "rules": rules}

class DatasetDocumentListApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id):
        dataset_id = str(dataset_id)
        page = request.args.get("page", default=1, type=int)
        limit = request.args.get("limit", default=20, type=int)
        search = request.args.get("keyword", default=None, type=str)
        sort = request.args.get("sort", default="-created_at", type=str)
        # "yes", "true", "t", "y", "1" convert to True, while others convert to False.
        try:
            fetch_val = request.args.get("fetch", default="false")
            if isinstance(fetch_val, bool):
                fetch = fetch_val
            elif fetch_val.lower() in ("yes", "true", "t", "y", "1"):
                fetch = True
            elif fetch_val.lower() in ("no", "false", "f", "n", "0"):
                fetch = False
            else:
                raise ArgumentTypeError(
                    f"Truthy value expected: got {fetch_val} but expected one of yes/no, true/false, t/f, y/n, 1/0 "
                    f"(case insensitive)."
                )
        except ArgumentTypeError:
            # invalid values fall back to not fetching segment progress
            fetch = False
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        query = Document.query.filter_by(dataset_id=str(dataset_id), tenant_id=current_user.current_tenant_id)

        if search:
            search = f"%{search}%"
            query = query.filter(Document.name.like(search))

        if sort.startswith("-"):
            sort_logic = desc
            sort = sort[1:]
        else:
            sort_logic = asc

        if sort == "hit_count":
            sub_query = (
                db.select(DocumentSegment.document_id, db.func.sum(DocumentSegment.hit_count).label("total_hit_count"))
                .group_by(DocumentSegment.document_id)
                .subquery()
            )

            query = query.outerjoin(sub_query, sub_query.c.document_id == Document.id).order_by(
                sort_logic(db.func.coalesce(sub_query.c.total_hit_count, 0)),
                sort_logic(Document.position),
            )
        elif sort == "created_at":
            query = query.order_by(
                sort_logic(Document.created_at),
                sort_logic(Document.position),
            )
        else:
            query = query.order_by(
                desc(Document.created_at),
                desc(Document.position),
            )

        paginated_documents = query.paginate(page=page, per_page=limit, max_per_page=100, error_out=False)
        documents = paginated_documents.items
        if fetch:
            for document in documents:
                completed_segments = DocumentSegment.query.filter(
                    DocumentSegment.completed_at.isnot(None),
                    DocumentSegment.document_id == str(document.id),
                    DocumentSegment.status != "re_segment",
                ).count()
                total_segments = DocumentSegment.query.filter(
                    DocumentSegment.document_id == str(document.id), DocumentSegment.status != "re_segment"
                ).count()
                document.completed_segments = completed_segments
                document.total_segments = total_segments
            data = marshal(documents, document_with_segments_fields)
        else:
            data = marshal(documents, document_fields)
        response = {
            "data": data,
            "has_more": len(documents) == limit,
            "limit": limit,
            "total": paginated_documents.total,
            "page": page,
        }

        return response

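    # Response shape for POST: the created documents plus the batch id used by
    # the batch indexing-estimate/status endpoints below.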
    documents_and_batch_fields = {"documents": fields.List(fields.Nested(document_fields)), "batch": fields.String}

    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(documents_and_batch_fields)
    @cloud_edition_billing_resource_check("vector_space")
    def post(self, dataset_id):
        dataset_id = str(dataset_id)

        dataset = DatasetService.get_dataset(dataset_id)

        if not dataset:
            raise NotFound("Dataset not found.")

        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        parser = reqparse.RequestParser()
        parser.add_argument(
            "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
        )
        parser.add_argument("data_source", type=dict, required=False, location="json")
        parser.add_argument("process_rule", type=dict, required=False, location="json")
        parser.add_argument("duplicate", type=bool, default=True, nullable=False, location="json")
        parser.add_argument("original_document_id", type=str, required=False, location="json")
        parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
        parser.add_argument(
            "doc_language", type=str, default="English", required=False, nullable=False, location="json"
        )
        parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
        args = parser.parse_args()

        if not dataset.indexing_technique and not args["indexing_technique"]:
            raise ValueError("indexing_technique is required.")

        # validate args
        DocumentService.document_create_args_validate(args)

        try:
            documents, batch = DocumentService.save_document_with_dataset_id(dataset, args, current_user)
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        return {"documents": documents, "batch": batch}

class DatasetInitApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(dataset_and_document_fields)
    @cloud_edition_billing_resource_check("vector_space")
    def post(self):
        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument(
            "indexing_technique",
            type=str,
            choices=Dataset.INDEXING_TECHNIQUE_LIST,
            required=True,
            nullable=False,
            location="json",
        )
        parser.add_argument("data_source", type=dict, required=True, nullable=True, location="json")
        parser.add_argument("process_rule", type=dict, required=True, nullable=True, location="json")
        parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
        parser.add_argument(
            "doc_language", type=str, default="English", required=False, nullable=False, location="json"
        )
        parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
        parser.add_argument("embedding_model", type=str, required=False, nullable=True, location="json")
        parser.add_argument("embedding_model_provider", type=str, required=False, nullable=True, location="json")
        args = parser.parse_args()

        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()

        if args["indexing_technique"] == "high_quality":
            if args["embedding_model"] is None or args["embedding_model_provider"] is None:
                raise ValueError("embedding model and embedding model provider are required for high quality indexing.")
            try:
                model_manager = ModelManager()
                model_manager.get_default_model_instance(
                    tenant_id=current_user.current_tenant_id, model_type=ModelType.TEXT_EMBEDDING
                )
            except InvokeAuthorizationError:
                raise ProviderNotInitializeError(
                    "No Embedding Model available. Please configure a valid provider "
                    "in the Settings -> Model Provider."
                )
            except ProviderTokenNotInitError as ex:
                raise ProviderNotInitializeError(ex.description)

        # validate args
        DocumentService.document_create_args_validate(args)

        try:
            dataset, documents, batch = DocumentService.save_document_without_dataset_id(
                tenant_id=current_user.current_tenant_id, document_data=args, account=current_user
            )
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        response = {"dataset": dataset, "documents": documents, "batch": batch}

        return response

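# Dry-run cost estimate for a single uploaded file: returns expected tokens,
# price, and segment previews without writing anything to the index.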
class DocumentIndexingEstimateApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        if document.indexing_status in {"completed", "error"}:
            raise DocumentAlreadyFinishedError()

        data_process_rule = document.dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()

        response = {"tokens": 0, "total_price": 0, "currency": "USD", "total_segments": 0, "preview": []}

        if document.data_source_type == "upload_file":
            data_source_info = document.data_source_info_dict
            if data_source_info and "upload_file_id" in data_source_info:
                file_id = data_source_info["upload_file_id"]

                file = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.tenant_id == document.tenant_id, UploadFile.id == file_id)
                    .first()
                )

                # raise error if file not found
                if not file:
                    raise NotFound("File not found.")

                extract_setting = ExtractSetting(
                    datasource_type="upload_file", upload_file=file, document_model=document.doc_form
                )

                indexing_runner = IndexingRunner()

                try:
                    response = indexing_runner.indexing_estimate(
                        current_user.current_tenant_id,
                        [extract_setting],
                        data_process_rule_dict,
                        document.doc_form,
                        "English",
                        dataset_id,
                    )
                except LLMBadRequestError:
                    raise ProviderNotInitializeError(
                        "No Embedding Model available. Please configure a valid provider "
                        "in the Settings -> Model Provider."
                    )
                except ProviderTokenNotInitError as ex:
                    raise ProviderNotInitializeError(ex.description)
                except Exception as e:
                    raise IndexingEstimateError(str(e))

        return response

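# Batch variant of the estimate above: builds one ExtractSetting per document
# (upload_file, notion_import, or website_crawl) and estimates them together.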
class DocumentBatchIndexingEstimateApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        response = {"tokens": 0, "total_price": 0, "currency": "USD", "total_segments": 0, "preview": []}
        if not documents:
            return response
        data_process_rule = documents[0].dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()
        info_list = []
        extract_settings = []
        for document in documents:
            if document.indexing_status in {"completed", "error"}:
                raise DocumentAlreadyFinishedError()
            data_source_info = document.data_source_info_dict
            # format document files info
            if data_source_info and "upload_file_id" in data_source_info:
                file_id = data_source_info["upload_file_id"]
                info_list.append(file_id)
            # format document notion info
            elif (
                data_source_info and "notion_workspace_id" in data_source_info and "notion_page_id" in data_source_info
            ):
                pages = []
                page = {"page_id": data_source_info["notion_page_id"], "type": data_source_info["type"]}
                pages.append(page)
                notion_info = {"workspace_id": data_source_info["notion_workspace_id"], "pages": pages}
                info_list.append(notion_info)

            if document.data_source_type == "upload_file":
                file_id = data_source_info["upload_file_id"]
                file_detail = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.tenant_id == current_user.current_tenant_id, UploadFile.id == file_id)
                    .first()
                )

                if file_detail is None:
                    raise NotFound("File not found.")

                extract_setting = ExtractSetting(
                    datasource_type="upload_file", upload_file=file_detail, document_model=document.doc_form
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == "notion_import":
                extract_setting = ExtractSetting(
                    datasource_type="notion_import",
                    notion_info={
                        "notion_workspace_id": data_source_info["notion_workspace_id"],
                        "notion_obj_id": data_source_info["notion_page_id"],
                        "notion_page_type": data_source_info["type"],
                        "tenant_id": current_user.current_tenant_id,
                    },
                    document_model=document.doc_form,
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == "website_crawl":
                extract_setting = ExtractSetting(
                    datasource_type="website_crawl",
                    website_info={
                        "provider": data_source_info["provider"],
                        "job_id": data_source_info["job_id"],
                        "url": data_source_info["url"],
                        "tenant_id": current_user.current_tenant_id,
                        "mode": data_source_info["mode"],
                        "only_main_content": data_source_info["only_main_content"],
                    },
                    document_model=document.doc_form,
                )
                extract_settings.append(extract_setting)
            else:
                raise ValueError("Data source type not supported.")

        indexing_runner = IndexingRunner()
        try:
            response = indexing_runner.indexing_estimate(
                current_user.current_tenant_id,
                extract_settings,
                data_process_rule_dict,
                document.doc_form,
                "English",
                dataset_id,
            )
        except LLMBadRequestError:
            raise ProviderNotInitializeError(
                "No Embedding Model available. Please configure a valid provider "
                "in the Settings -> Model Provider."
            )
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except Exception as e:
            raise IndexingEstimateError(str(e))
        return response

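# Indexing progress for every document created in one batch, expressed as
# completed vs. total segments (segments being re-segmented are excluded).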
class DocumentBatchIndexingStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        documents_status = []
        for document in documents:
            completed_segments = DocumentSegment.query.filter(
                DocumentSegment.completed_at.isnot(None),
                DocumentSegment.document_id == str(document.id),
                DocumentSegment.status != "re_segment",
            ).count()
            total_segments = DocumentSegment.query.filter(
                DocumentSegment.document_id == str(document.id), DocumentSegment.status != "re_segment"
            ).count()
            document.completed_segments = completed_segments
            document.total_segments = total_segments
            if document.is_paused:
                document.indexing_status = "paused"
            documents_status.append(marshal(document, document_status_fields))
        data = {"data": documents_status}
        return data

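# Same progress calculation as the batch endpoint above, for a single document.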
class DocumentIndexingStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        completed_segments = DocumentSegment.query.filter(
            DocumentSegment.completed_at.isnot(None),
            DocumentSegment.document_id == str(document_id),
            DocumentSegment.status != "re_segment",
        ).count()
        total_segments = DocumentSegment.query.filter(
            DocumentSegment.document_id == str(document_id), DocumentSegment.status != "re_segment"
        ).count()

        document.completed_segments = completed_segments
        document.total_segments = total_segments
        if document.is_paused:
            document.indexing_status = "paused"
        return marshal(document, document_status_fields)

class DocumentDetailApi(DocumentResource):
    METADATA_CHOICES = {"all", "only", "without"}

    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        metadata = request.args.get("metadata", "all")
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f"Invalid metadata value: {metadata}")

        if metadata == "only":
            response = {"id": document.id, "doc_type": document.doc_type, "doc_metadata": document.doc_metadata}
        elif metadata == "without":
            process_rules = DatasetService.get_process_rules(dataset_id)
            data_source_info = document.data_source_detail_dict
            response = {
                "id": document.id,
                "position": document.position,
                "data_source_type": document.data_source_type,
                "data_source_info": data_source_info,
                "dataset_process_rule_id": document.dataset_process_rule_id,
                "dataset_process_rule": process_rules,
                "name": document.name,
                "created_from": document.created_from,
                "created_by": document.created_by,
                "created_at": document.created_at.timestamp(),
                "tokens": document.tokens,
                "indexing_status": document.indexing_status,
                "completed_at": int(document.completed_at.timestamp()) if document.completed_at else None,
                "updated_at": int(document.updated_at.timestamp()) if document.updated_at else None,
                "indexing_latency": document.indexing_latency,
                "error": document.error,
                "enabled": document.enabled,
                "disabled_at": int(document.disabled_at.timestamp()) if document.disabled_at else None,
                "disabled_by": document.disabled_by,
                "archived": document.archived,
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }
        else:
            process_rules = DatasetService.get_process_rules(dataset_id)
            data_source_info = document.data_source_detail_dict
            response = {
                "id": document.id,
                "position": document.position,
                "data_source_type": document.data_source_type,
                "data_source_info": data_source_info,
                "dataset_process_rule_id": document.dataset_process_rule_id,
                "dataset_process_rule": process_rules,
                "name": document.name,
                "created_from": document.created_from,
                "created_by": document.created_by,
                "created_at": document.created_at.timestamp(),
                "tokens": document.tokens,
                "indexing_status": document.indexing_status,
                "completed_at": int(document.completed_at.timestamp()) if document.completed_at else None,
                "updated_at": int(document.updated_at.timestamp()) if document.updated_at else None,
                "indexing_latency": document.indexing_latency,
                "error": document.error,
                "enabled": document.enabled,
                "disabled_at": int(document.disabled_at.timestamp()) if document.disabled_at else None,
                "disabled_by": document.disabled_by,
                "archived": document.archived,
                "doc_type": document.doc_type,
                "doc_metadata": document.doc_metadata,
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }

        return response, 200

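# Pauses or resumes the indexing run itself; compare DocumentStatusApi below,
# which toggles a document's availability after indexing has completed.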
class DocumentProcessingApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id, action):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        if action == "pause":
            if document.indexing_status != "indexing":
                raise InvalidActionError("Document not in indexing state.")

            document.paused_by = current_user.id
            document.paused_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.is_paused = True
            db.session.commit()

        elif action == "resume":
            if document.indexing_status not in {"paused", "error"}:
                raise InvalidActionError("Document not in paused or error state.")

            document.paused_by = None
            document.paused_at = None
            document.is_paused = False
            db.session.commit()
        else:
            raise InvalidActionError()

        return {"result": "success"}, 200

class DocumentDeleteApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def delete(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        document = self.get_document(dataset_id, document_id)

        try:
            DocumentService.delete_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot delete document during indexing.")

        return {"result": "success"}, 204

class DocumentMetadataApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def put(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        req_data = request.get_json()

        doc_type = req_data.get("doc_type")
        doc_metadata = req_data.get("doc_metadata")

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        if doc_type is None or doc_metadata is None:
            raise ValueError("Both doc_type and doc_metadata must be provided.")

        if doc_type not in DocumentService.DOCUMENT_METADATA_SCHEMA:
            raise ValueError("Invalid doc_type.")

        if not isinstance(doc_metadata, dict):
            raise ValueError("doc_metadata must be a dictionary.")

        metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[doc_type]

        document.doc_metadata = {}
        if doc_type == "others":
            document.doc_metadata = doc_metadata
        else:
            for key, value_type in metadata_schema.items():
                value = doc_metadata.get(key)
                if value is not None and isinstance(value, value_type):
                    document.doc_metadata[key] = value

        document.doc_type = doc_type
        document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
        db.session.commit()

        return {"result": "success", "message": "Document metadata updated."}, 200

class DocumentStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_resource_check("vector_space")
    def patch(self, dataset_id, document_id, action):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()

        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        # check user's permission
        DatasetService.check_dataset_permission(dataset, current_user)

        document = self.get_document(dataset_id, document_id)

        indexing_cache_key = "document_{}_indexing".format(document.id)
        cache_result = redis_client.get(indexing_cache_key)
        if cache_result is not None:
            raise InvalidActionError("Document is being indexed, please try again later.")

        if action == "enable":
            if document.enabled:
                raise InvalidActionError("Document already enabled.")

            document.enabled = True
            document.disabled_at = None
            document.disabled_by = None
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            add_document_to_index_task.delay(document_id)

            return {"result": "success"}, 200

        elif action == "disable":
            if not document.completed_at or document.indexing_status != "completed":
                raise InvalidActionError("Document is not completed.")
            if not document.enabled:
                raise InvalidActionError("Document already disabled.")

            document.enabled = False
            document.disabled_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.disabled_by = current_user.id
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            remove_document_from_index_task.delay(document_id)

            return {"result": "success"}, 200

        elif action == "archive":
            if document.archived:
                raise InvalidActionError("Document already archived.")

            document.archived = True
            document.archived_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.archived_by = current_user.id
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            if document.enabled:
                # Set cache to prevent indexing the same document multiple times
                redis_client.setex(indexing_cache_key, 600, 1)

                remove_document_from_index_task.delay(document_id)

            return {"result": "success"}, 200

        elif action == "un_archive":
            if not document.archived:
                raise InvalidActionError("Document is not archived.")

            document.archived = False
            document.archived_at = None
            document.archived_by = None
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            add_document_to_index_task.delay(document_id)

            return {"result": "success"}, 200
        else:
            raise InvalidActionError()

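# DocumentPauseApi and DocumentRecoverApi are the REST counterparts of the
# "pause"/"resume" actions in DocumentProcessingApi, exposed as dedicated
# routes (see the registrations at the bottom of this file).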
class DocumentPauseApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id):
        """pause document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document does not exist.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # pause document
            DocumentService.pause_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot pause completed document.")

        return {"result": "success"}, 204

class DocumentRecoverApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id):
        """recover document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")
        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document does not exist.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # recover document
            DocumentService.recover_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Document is not in paused status.")

        return {"result": "success"}, 204

class DocumentRetryApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def post(self, dataset_id):
        """retry document."""
        parser = reqparse.RequestParser()
        parser.add_argument("document_ids", type=list, required=True, nullable=False, location="json")
        args = parser.parse_args()
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        retry_documents = []
        if not dataset:
            raise NotFound("Dataset not found.")
        for document_id in args["document_ids"]:
            try:
                document_id = str(document_id)

                document = DocumentService.get_document(dataset.id, document_id)

                # 404 if document not found
                if document is None:
                    raise NotFound("Document does not exist.")

                # 403 if document is archived
                if DocumentService.check_archived(document):
                    raise ArchivedDocumentImmutableError()

                # 400 if document is completed
                if document.indexing_status == "completed":
                    raise DocumentAlreadyFinishedError()
                retry_documents.append(document)
            except Exception as e:
                logging.error(f"Document {document_id} retry failed: {str(e)}")
                continue
        # retry document
        DocumentService.retry_document(dataset_id, retry_documents)

        return {"result": "success"}, 204

class DocumentRenameApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(document_fields)
    def post(self, dataset_id, document_id):
        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()
        dataset = DatasetService.get_dataset(dataset_id)
        DatasetService.check_dataset_operator_permission(current_user, dataset)
        parser = reqparse.RequestParser()
        parser.add_argument("name", type=str, required=True, nullable=False, location="json")
        args = parser.parse_args()

        try:
            document = DocumentService.rename_document(dataset_id, document_id, args["name"])
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot rename document during indexing.")

        return document

class WebsiteDocumentSyncApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        """sync website document."""
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")
        document_id = str(document_id)
        document = DocumentService.get_document(dataset.id, document_id)
        if not document:
            raise NotFound("Document not found.")
        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden("No permission.")
        if document.data_source_type != "website_crawl":
            raise ValueError("Document is not a website document.")
        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()
        # sync document
        DocumentService.sync_website_document(dataset_id, document)

        return {"result": "success"}, 200

api.add_resource(GetProcessRuleApi, "/datasets/process-rule")
api.add_resource(DatasetDocumentListApi, "/datasets/<uuid:dataset_id>/documents")
api.add_resource(DatasetInitApi, "/datasets/init")
api.add_resource(
    DocumentIndexingEstimateApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-estimate"
)
api.add_resource(DocumentBatchIndexingEstimateApi, "/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-estimate")
api.add_resource(DocumentBatchIndexingStatusApi, "/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-status")
api.add_resource(DocumentIndexingStatusApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-status")
api.add_resource(DocumentDetailApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
api.add_resource(
    DocumentProcessingApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/<string:action>"
)
api.add_resource(DocumentDeleteApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
api.add_resource(DocumentMetadataApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/metadata")
api.add_resource(DocumentStatusApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/status/<string:action>")
api.add_resource(DocumentPauseApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/pause")
api.add_resource(DocumentRecoverApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/resume")
api.add_resource(DocumentRetryApi, "/datasets/<uuid:dataset_id>/retry")
api.add_resource(DocumentRenameApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/rename")
api.add_resource(WebsiteDocumentSyncApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/website-sync")