# datasets_document.py
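"""Console API controllers for dataset documents: listing, creation,
indexing estimates and status, metadata, lifecycle actions (enable/disable,
archive, pause/resume, retry), renaming, and website re-sync."""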

import logging
from argparse import ArgumentTypeError
from datetime import datetime, timezone

from flask import request
from flask_login import current_user
from flask_restful import Resource, fields, marshal, marshal_with, reqparse
from sqlalchemy import asc, desc, select
from sqlalchemy.orm import Session
from werkzeug.exceptions import Forbidden, NotFound

import services
from controllers.console import api
from controllers.console.app.error import (
    ProviderModelCurrentlyNotSupportError,
    ProviderNotInitializeError,
    ProviderQuotaExceededError,
)
from controllers.console.datasets.error import (
    ArchivedDocumentImmutableError,
    DocumentAlreadyFinishedError,
    DocumentIndexingError,
    IndexingEstimateError,
    InvalidActionError,
    InvalidMetadataError,
)
from controllers.console.setup import setup_required
from controllers.console.wraps import account_initialization_required, cloud_edition_billing_resource_check
from core.errors.error import (
    LLMBadRequestError,
    ModelCurrentlyNotSupportError,
    ProviderTokenNotInitError,
    QuotaExceededError,
)
from core.indexing_runner import IndexingRunner
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.model_runtime.errors.invoke import InvokeAuthorizationError
from core.rag.extractor.entity.extract_setting import ExtractSetting
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from fields.document_fields import (
    dataset_and_document_fields,
    document_fields,
    document_status_fields,
    document_with_segments_fields,
)
from libs.login import login_required
from models import Dataset, DatasetProcessRule, Document, DocumentSegment, UploadFile
from services.dataset_service import DatasetService, DocumentService
from tasks.add_document_to_index_task import add_document_to_index_task
from tasks.remove_document_from_index_task import remove_document_from_index_task


class DocumentResource(Resource):
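    """Shared helpers for document endpoints: load a document (or a batch of
    documents) from a dataset while enforcing dataset permissions and tenant
    isolation."""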
    def get_document(self, dataset_id: str, document_id: str) -> Document:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        document = DocumentService.get_document(dataset_id, document_id)
        if not document:
            raise NotFound("Document not found.")

        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden("No permission.")

        return document

    def get_batch_documents(self, dataset_id: str, batch: str) -> list[Document]:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        documents = DocumentService.get_batch_documents(dataset_id, batch)
        if not documents:
            raise NotFound("Documents not found.")

        return documents


class GetProcessRuleApi(Resource):
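    """Returns the process rule for a document: the latest rule of the
    document's dataset when document_id is given, otherwise the defaults."""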
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        req_data = request.args

        document_id = req_data.get("document_id")

        # get default rules
        mode = DocumentService.DEFAULT_RULES["mode"]
        rules = DocumentService.DEFAULT_RULES["rules"]
        if document_id:
            # look up the document, then the latest process rule of its dataset
            with Session(db.engine) as session:
                document = session.execute(select(Document).filter_by(id=document_id)).scalar_one_or_none()
            if not document:
                raise NotFound("Document not found.")

            dataset = DatasetService.get_dataset(document.dataset_id)
            if not dataset:
                raise NotFound("Dataset not found.")

            try:
                DatasetService.check_dataset_permission(dataset, current_user)
            except services.errors.account.NoPermissionError as e:
                raise Forbidden(str(e))

            dataset_process_rule = (
                db.session.query(DatasetProcessRule)
                .filter(DatasetProcessRule.dataset_id == document.dataset_id)
                .order_by(DatasetProcessRule.created_at.desc())
                .limit(1)
                .one_or_none()
            )
            if dataset_process_rule:
                mode = dataset_process_rule.mode
                rules = dataset_process_rule.rules_dict

        return {"mode": mode, "rules": rules}


class DatasetDocumentListApi(Resource):
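    """Lists documents in a dataset (GET, with keyword search, sorting,
    pagination, and optional per-document segment progress) and creates new
    documents in it (POST)."""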
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id):
        dataset_id = str(dataset_id)
        page = request.args.get("page", default=1, type=int)
        limit = request.args.get("limit", default=20, type=int)
        search = request.args.get("keyword", default=None, type=str)
        sort = request.args.get("sort", default="-created_at", type=str)
        # "yes", "true", "t", "y" and "1" convert to True; recognized falsy
        # values convert to False; anything else falls back to False as well.
        try:
            fetch_val = request.args.get("fetch", default="false")
            if isinstance(fetch_val, bool):
                fetch = fetch_val
            elif fetch_val.lower() in ("yes", "true", "t", "y", "1"):
                fetch = True
            elif fetch_val.lower() in ("no", "false", "f", "n", "0"):
                fetch = False
            else:
                raise ArgumentTypeError(
                    f"Truthy value expected: got {fetch_val} but expected one of "
                    f"yes/no, true/false, t/f, y/n, 1/0 (case insensitive)."
                )
        except (ArgumentTypeError, ValueError):
            fetch = False

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        # Build the query lazily; executing it here would return plain rows and
        # break the .filter()/.order_by()/.paginate() chaining below.
        query = Document.query.filter_by(dataset_id=str(dataset_id), tenant_id=current_user.current_tenant_id)

        if search:
            search = f"%{search}%"
            query = query.filter(Document.name.like(search))

        if sort.startswith("-"):
            sort_logic = desc
            sort = sort[1:]
        else:
            sort_logic = asc

        if sort == "hit_count":
            sub_query = (
                db.select(DocumentSegment.document_id, db.func.sum(DocumentSegment.hit_count).label("total_hit_count"))
                .group_by(DocumentSegment.document_id)
                .subquery()
            )

            query = query.outerjoin(sub_query, sub_query.c.document_id == Document.id).order_by(
                sort_logic(db.func.coalesce(sub_query.c.total_hit_count, 0)),
                sort_logic(Document.position),
            )
        elif sort == "created_at":
            query = query.order_by(
                sort_logic(Document.created_at),
                sort_logic(Document.position),
            )
        else:
            query = query.order_by(
                desc(Document.created_at),
                desc(Document.position),
            )

        paginated_documents = query.paginate(page=page, per_page=limit, max_per_page=100, error_out=False)
        documents = paginated_documents.items
        if fetch:
            with Session(db.engine) as session:
                for document in documents:
                    completed_segments = (
                        session.query(DocumentSegment)
                        .filter(
                            DocumentSegment.completed_at.isnot(None),
                            DocumentSegment.document_id == str(document.id),
                            DocumentSegment.status != "re_segment",
                        )
                        .count()
                    )
                    total_segments = (
                        session.query(DocumentSegment)
                        .filter(DocumentSegment.document_id == str(document.id), DocumentSegment.status != "re_segment")
                        .count()
                    )
                    document.completed_segments = completed_segments
                    document.total_segments = total_segments
            data = marshal(documents, document_with_segments_fields)
        else:
            data = marshal(documents, document_fields)
        response = {
            "data": data,
            "has_more": len(documents) == limit,
            "limit": limit,
            "total": paginated_documents.total,
            "page": page,
        }
        return response
    documents_and_batch_fields = {"documents": fields.List(fields.Nested(document_fields)), "batch": fields.String}

    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(documents_and_batch_fields)
    @cloud_edition_billing_resource_check("vector_space")
    def post(self, dataset_id):
        dataset_id = str(dataset_id)

        dataset = DatasetService.get_dataset(dataset_id)

        if not dataset:
            raise NotFound("Dataset not found.")

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        parser = reqparse.RequestParser()
        parser.add_argument(
            "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
        )
        parser.add_argument("data_source", type=dict, required=False, location="json")
        parser.add_argument("process_rule", type=dict, required=False, location="json")
        parser.add_argument("duplicate", type=bool, default=True, nullable=False, location="json")
        parser.add_argument("original_document_id", type=str, required=False, location="json")
        parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
        parser.add_argument(
            "doc_language", type=str, default="English", required=False, nullable=False, location="json"
        )
        parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
        args = parser.parse_args()

        if not dataset.indexing_technique and not args["indexing_technique"]:
            raise ValueError("indexing_technique is required.")

        # validate args
        DocumentService.document_create_args_validate(args)

        try:
            documents, batch = DocumentService.save_document_with_dataset_id(dataset, args, current_user)
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        return {"documents": documents, "batch": batch}


class DatasetInitApi(Resource):
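    """Creates a dataset together with its first documents in one call;
    validates that a text-embedding model is configured when high_quality
    indexing is requested."""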
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(dataset_and_document_fields)
    @cloud_edition_billing_resource_check("vector_space")
    def post(self):
        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument(
            "indexing_technique",
            type=str,
            choices=Dataset.INDEXING_TECHNIQUE_LIST,
            required=True,
            nullable=False,
            location="json",
        )
        parser.add_argument("data_source", type=dict, required=True, nullable=True, location="json")
        parser.add_argument("process_rule", type=dict, required=True, nullable=True, location="json")
        parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
        parser.add_argument(
            "doc_language", type=str, default="English", required=False, nullable=False, location="json"
        )
        parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
        parser.add_argument("embedding_model", type=str, required=False, nullable=True, location="json")
        parser.add_argument("embedding_model_provider", type=str, required=False, nullable=True, location="json")
        args = parser.parse_args()

        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()

        if args["indexing_technique"] == "high_quality":
            if args["embedding_model"] is None or args["embedding_model_provider"] is None:
                raise ValueError("embedding model and embedding model provider are required for high quality indexing.")
            try:
                model_manager = ModelManager()
                model_manager.get_default_model_instance(
                    tenant_id=current_user.current_tenant_id, model_type=ModelType.TEXT_EMBEDDING
                )
            except InvokeAuthorizationError:
                raise ProviderNotInitializeError(
                    "No Embedding Model available. Please configure a valid provider "
                    "in the Settings -> Model Provider."
                )
            except ProviderTokenNotInitError as ex:
                raise ProviderNotInitializeError(ex.description)

        # validate args
        DocumentService.document_create_args_validate(args)

        try:
            dataset, documents, batch = DocumentService.save_document_without_dataset_id(
                tenant_id=current_user.current_tenant_id, document_data=args, account=current_user
            )
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        response = {"dataset": dataset, "documents": documents, "batch": batch}

        return response


class DocumentIndexingEstimateApi(DocumentResource):
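    """Estimates indexing cost (tokens, price, segment count, preview) for a
    single uploaded document that has not yet finished indexing."""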
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        if document.indexing_status in {"completed", "error"}:
            raise DocumentAlreadyFinishedError()

        data_process_rule = document.dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()

        response = {"tokens": 0, "total_price": 0, "currency": "USD", "total_segments": 0, "preview": []}

        if document.data_source_type == "upload_file":
            data_source_info = document.data_source_info_dict
            if data_source_info and "upload_file_id" in data_source_info:
                file_id = data_source_info["upload_file_id"]

                file = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.tenant_id == document.tenant_id, UploadFile.id == file_id)
                    .first()
                )

                # raise error if file not found
                if not file:
                    raise NotFound("File not found.")

                extract_setting = ExtractSetting(
                    datasource_type="upload_file", upload_file=file, document_model=document.doc_form
                )

                indexing_runner = IndexingRunner()

                try:
                    response = indexing_runner.indexing_estimate(
                        current_user.current_tenant_id,
                        [extract_setting],
                        data_process_rule_dict,
                        document.doc_form,
                        "English",
                        dataset_id,
                    )
                except LLMBadRequestError:
                    raise ProviderNotInitializeError(
                        "No Embedding Model available. Please configure a valid provider "
                        "in the Settings -> Model Provider."
                    )
                except ProviderTokenNotInitError as ex:
                    raise ProviderNotInitializeError(ex.description)
                except Exception as e:
                    raise IndexingEstimateError(str(e))

        return response


class DocumentBatchIndexingEstimateApi(DocumentResource):
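    """Estimates indexing cost for every document in an upload batch, across
    the upload_file, notion_import, and website_crawl data sources."""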
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        response = {"tokens": 0, "total_price": 0, "currency": "USD", "total_segments": 0, "preview": []}
        if not documents:
            return response
        data_process_rule = documents[0].dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()
        info_list = []
        extract_settings = []
        for document in documents:
            if document.indexing_status in {"completed", "error"}:
                raise DocumentAlreadyFinishedError()
            data_source_info = document.data_source_info_dict
            # format document files info
            if data_source_info and "upload_file_id" in data_source_info:
                file_id = data_source_info["upload_file_id"]
                info_list.append(file_id)
            # format document notion info
            elif (
                data_source_info and "notion_workspace_id" in data_source_info and "notion_page_id" in data_source_info
            ):
                pages = []
                page = {"page_id": data_source_info["notion_page_id"], "type": data_source_info["type"]}
                pages.append(page)
                notion_info = {"workspace_id": data_source_info["notion_workspace_id"], "pages": pages}
                info_list.append(notion_info)

            if document.data_source_type == "upload_file":
                file_id = data_source_info["upload_file_id"]
                file_detail = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.tenant_id == current_user.current_tenant_id, UploadFile.id == file_id)
                    .first()
                )

                if file_detail is None:
                    raise NotFound("File not found.")

                extract_setting = ExtractSetting(
                    datasource_type="upload_file", upload_file=file_detail, document_model=document.doc_form
                )
                extract_settings.append(extract_setting)

            elif document.data_source_type == "notion_import":
                extract_setting = ExtractSetting(
                    datasource_type="notion_import",
                    notion_info={
                        "notion_workspace_id": data_source_info["notion_workspace_id"],
                        "notion_obj_id": data_source_info["notion_page_id"],
                        "notion_page_type": data_source_info["type"],
                        "tenant_id": current_user.current_tenant_id,
                    },
                    document_model=document.doc_form,
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == "website_crawl":
                extract_setting = ExtractSetting(
                    datasource_type="website_crawl",
                    website_info={
                        "provider": data_source_info["provider"],
                        "job_id": data_source_info["job_id"],
                        "url": data_source_info["url"],
                        "tenant_id": current_user.current_tenant_id,
                        "mode": data_source_info["mode"],
                        "only_main_content": data_source_info["only_main_content"],
                    },
                    document_model=document.doc_form,
                )
                extract_settings.append(extract_setting)
            else:
                raise ValueError("Data source type is not supported.")

        indexing_runner = IndexingRunner()
        try:
            response = indexing_runner.indexing_estimate(
                current_user.current_tenant_id,
                extract_settings,
                data_process_rule_dict,
                document.doc_form,
                "English",
                dataset_id,
            )
        except LLMBadRequestError:
            raise ProviderNotInitializeError(
                "No Embedding Model available. Please configure a valid provider "
                "in the Settings -> Model Provider."
            )
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except Exception as e:
            raise IndexingEstimateError(str(e))
        return response


class DocumentBatchIndexingStatusApi(DocumentResource):
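    """Reports indexing progress (completed vs. total segments) for every
    document in a batch."""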
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        documents_status = []
        for document in documents:
            completed_segments = DocumentSegment.query.filter(
                DocumentSegment.completed_at.isnot(None),
                DocumentSegment.document_id == str(document.id),
                DocumentSegment.status != "re_segment",
            ).count()
            total_segments = DocumentSegment.query.filter(
                DocumentSegment.document_id == str(document.id), DocumentSegment.status != "re_segment"
            ).count()
            document.completed_segments = completed_segments
            document.total_segments = total_segments
            if document.is_paused:
                document.indexing_status = "paused"
            documents_status.append(marshal(document, document_status_fields))
        data = {"data": documents_status}
        return data


class DocumentIndexingStatusApi(DocumentResource):
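    """Reports indexing progress (completed vs. total segments) for a single
    document."""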
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        completed_segments = DocumentSegment.query.filter(
            DocumentSegment.completed_at.isnot(None),
            DocumentSegment.document_id == str(document_id),
            DocumentSegment.status != "re_segment",
        ).count()
        total_segments = DocumentSegment.query.filter(
            DocumentSegment.document_id == str(document_id), DocumentSegment.status != "re_segment"
        ).count()

        document.completed_segments = completed_segments
        document.total_segments = total_segments
        if document.is_paused:
            document.indexing_status = "paused"
        return marshal(document, document_status_fields)


class DocumentDetailApi(DocumentResource):
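    """Returns document details; the `metadata` query argument selects all
    fields ("all"), metadata only ("only"), or everything except metadata
    ("without")."""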
    METADATA_CHOICES = {"all", "only", "without"}

    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        metadata = request.args.get("metadata", "all")
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f"Invalid metadata value: {metadata}")

        if metadata == "only":
            response = {"id": document.id, "doc_type": document.doc_type, "doc_metadata": document.doc_metadata}
        elif metadata == "without":
            process_rules = DatasetService.get_process_rules(dataset_id)
            data_source_info = document.data_source_detail_dict
            response = {
                "id": document.id,
                "position": document.position,
                "data_source_type": document.data_source_type,
                "data_source_info": data_source_info,
                "dataset_process_rule_id": document.dataset_process_rule_id,
                "dataset_process_rule": process_rules,
                "name": document.name,
                "created_from": document.created_from,
                "created_by": document.created_by,
                "created_at": document.created_at.timestamp(),
                "tokens": document.tokens,
                "indexing_status": document.indexing_status,
                "completed_at": int(document.completed_at.timestamp()) if document.completed_at else None,
                "updated_at": int(document.updated_at.timestamp()) if document.updated_at else None,
                "indexing_latency": document.indexing_latency,
                "error": document.error,
                "enabled": document.enabled,
                "disabled_at": int(document.disabled_at.timestamp()) if document.disabled_at else None,
                "disabled_by": document.disabled_by,
                "archived": document.archived,
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }
        else:
            process_rules = DatasetService.get_process_rules(dataset_id)
            data_source_info = document.data_source_detail_dict
            response = {
                "id": document.id,
                "position": document.position,
                "data_source_type": document.data_source_type,
                "data_source_info": data_source_info,
                "dataset_process_rule_id": document.dataset_process_rule_id,
                "dataset_process_rule": process_rules,
                "name": document.name,
                "created_from": document.created_from,
                "created_by": document.created_by,
                "created_at": document.created_at.timestamp(),
                "tokens": document.tokens,
                "indexing_status": document.indexing_status,
                "completed_at": int(document.completed_at.timestamp()) if document.completed_at else None,
                "updated_at": int(document.updated_at.timestamp()) if document.updated_at else None,
                "indexing_latency": document.indexing_latency,
                "error": document.error,
                "enabled": document.enabled,
                "disabled_at": int(document.disabled_at.timestamp()) if document.disabled_at else None,
                "disabled_by": document.disabled_by,
                "archived": document.archived,
                "doc_type": document.doc_type,
                "doc_metadata": document.doc_metadata,
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }

        return response, 200


class DocumentProcessingApi(DocumentResource):
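    """Pauses or resumes a document's indexing via the <action> path
    segment."""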
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id, action):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        if action == "pause":
            if document.indexing_status != "indexing":
                raise InvalidActionError("Document not in indexing state.")

            document.paused_by = current_user.id
            document.paused_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.is_paused = True
            db.session.commit()

        elif action == "resume":
            if document.indexing_status not in {"paused", "error"}:
                raise InvalidActionError("Document not in paused or error state.")

            document.paused_by = None
            document.paused_at = None
            document.is_paused = False
            db.session.commit()
        else:
            raise InvalidActionError()

        return {"result": "success"}, 200


class DocumentDeleteApi(DocumentResource):
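    """Deletes a document unless it is currently being indexed."""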
    @setup_required
    @login_required
    @account_initialization_required
    def delete(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        document = self.get_document(dataset_id, document_id)

        try:
            DocumentService.delete_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot delete document during indexing.")

        return {"result": "success"}, 204


class DocumentMetadataApi(DocumentResource):
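    """Replaces a document's doc_type and doc_metadata, keeping only keys
    allowed by the metadata schema for that type (except doc_type "others",
    which accepts arbitrary keys)."""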
    @setup_required
    @login_required
    @account_initialization_required
    def put(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        req_data = request.get_json()

        doc_type = req_data.get("doc_type")
        doc_metadata = req_data.get("doc_metadata")

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        if doc_type is None or doc_metadata is None:
            raise ValueError("Both doc_type and doc_metadata must be provided.")

        if doc_type not in DocumentService.DOCUMENT_METADATA_SCHEMA:
            raise ValueError("Invalid doc_type.")

        if not isinstance(doc_metadata, dict):
            raise ValueError("doc_metadata must be a dictionary.")

        metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[doc_type]

        document.doc_metadata = {}
        if doc_type == "others":
            document.doc_metadata = doc_metadata
        else:
            for key, value_type in metadata_schema.items():
                value = doc_metadata.get(key)
                if value is not None and isinstance(value, value_type):
                    document.doc_metadata[key] = value

        document.doc_type = doc_type
        document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
        db.session.commit()

        return {"result": "success", "message": "Document metadata updated."}, 200


class DocumentStatusApi(DocumentResource):
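    """Enables, disables, archives, or un-archives a document and schedules
    the matching add/remove index task, guarded by a short-lived Redis key so
    the same document is not indexed concurrently."""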
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_resource_check("vector_space")
    def patch(self, dataset_id, document_id, action):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()

        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        # check user's permission
        DatasetService.check_dataset_permission(dataset, current_user)

        document = self.get_document(dataset_id, document_id)

        indexing_cache_key = "document_{}_indexing".format(document.id)
        cache_result = redis_client.get(indexing_cache_key)
        if cache_result is not None:
            raise InvalidActionError("Document is being indexed, please try again later")

        if action == "enable":
            if document.enabled:
                raise InvalidActionError("Document already enabled.")

            document.enabled = True
            document.disabled_at = None
            document.disabled_by = None
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            add_document_to_index_task.delay(document_id)

            return {"result": "success"}, 200

        elif action == "disable":
            if not document.completed_at or document.indexing_status != "completed":
                raise InvalidActionError("Document is not completed.")
            if not document.enabled:
                raise InvalidActionError("Document already disabled.")

            document.enabled = False
            document.disabled_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.disabled_by = current_user.id
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            remove_document_from_index_task.delay(document_id)

            return {"result": "success"}, 200

        elif action == "archive":
            if document.archived:
                raise InvalidActionError("Document already archived.")

            document.archived = True
            document.archived_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.archived_by = current_user.id
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            if document.enabled:
                # Set cache to prevent indexing the same document multiple times
                redis_client.setex(indexing_cache_key, 600, 1)

                remove_document_from_index_task.delay(document_id)

            return {"result": "success"}, 200

        elif action == "un_archive":
            if not document.archived:
                raise InvalidActionError("Document is not archived.")

            document.archived = False
            document.archived_at = None
            document.archived_by = None
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            add_document_to_index_task.delay(document_id)

            return {"result": "success"}, 200
        else:
            raise InvalidActionError()


class DocumentPauseApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id):
        """pause document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document does not exist.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # pause document
            DocumentService.pause_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot pause completed document.")

        return {"result": "success"}, 204


class DocumentRecoverApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id):
        """recover document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document does not exist.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # recover document
            DocumentService.recover_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Document is not in paused status.")

        return {"result": "success"}, 204


class DocumentRetryApi(DocumentResource):
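    """Re-queues indexing for a list of documents, skipping (and logging) any
    that are missing, archived, or already completed."""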
    @setup_required
    @login_required
    @account_initialization_required
    def post(self, dataset_id):
        """retry document."""
        parser = reqparse.RequestParser()
        parser.add_argument("document_ids", type=list, required=True, nullable=False, location="json")
        args = parser.parse_args()
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        retry_documents = []
        if not dataset:
            raise NotFound("Dataset not found.")
        for document_id in args["document_ids"]:
            try:
                document_id = str(document_id)

                document = DocumentService.get_document(dataset.id, document_id)

                # 404 if document not found
                if document is None:
                    raise NotFound("Document does not exist.")

                # 403 if document is archived
                if DocumentService.check_archived(document):
                    raise ArchivedDocumentImmutableError()

                # 400 if document is completed
                if document.indexing_status == "completed":
                    raise DocumentAlreadyFinishedError()
                retry_documents.append(document)
            except Exception as e:
                logging.error(f"Document {document_id} retry failed: {str(e)}")
                continue
        # retry document
        DocumentService.retry_document(dataset_id, retry_documents)
        return {"result": "success"}, 204


class DocumentRenameApi(DocumentResource):
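    """Renames a document; restricted to dataset editors and operators."""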
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(document_fields)
    def post(self, dataset_id, document_id):
        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()
        dataset = DatasetService.get_dataset(dataset_id)
        DatasetService.check_dataset_operator_permission(current_user, dataset)
        parser = reqparse.RequestParser()
        parser.add_argument("name", type=str, required=True, nullable=False, location="json")
        args = parser.parse_args()

        try:
            document = DocumentService.rename_document(dataset_id, document_id, args["name"])
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot rename document during indexing.")

        return document


class WebsiteDocumentSyncApi(DocumentResource):
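    """Re-syncs a website-crawl document from its source site."""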
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        """sync website document."""
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")
        document_id = str(document_id)
        document = DocumentService.get_document(dataset.id, document_id)
        if not document:
            raise NotFound("Document not found.")
        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden("No permission.")
        if document.data_source_type != "website_crawl":
            raise ValueError("Document is not a website document.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()
        # sync document
        DocumentService.sync_website_document(dataset_id, document)

        return {"result": "success"}, 200


api.add_resource(GetProcessRuleApi, "/datasets/process-rule")
api.add_resource(DatasetDocumentListApi, "/datasets/<uuid:dataset_id>/documents")
api.add_resource(DatasetInitApi, "/datasets/init")
api.add_resource(
    DocumentIndexingEstimateApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-estimate"
)
api.add_resource(DocumentBatchIndexingEstimateApi, "/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-estimate")
api.add_resource(DocumentBatchIndexingStatusApi, "/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-status")
api.add_resource(DocumentIndexingStatusApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-status")
api.add_resource(DocumentDetailApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
api.add_resource(
    DocumentProcessingApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/<string:action>"
)
api.add_resource(DocumentDeleteApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
api.add_resource(DocumentMetadataApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/metadata")
api.add_resource(DocumentStatusApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/status/<string:action>")
api.add_resource(DocumentPauseApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/pause")
api.add_resource(DocumentRecoverApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/resume")
api.add_resource(DocumentRetryApi, "/datasets/<uuid:dataset_id>/retry")
api.add_resource(DocumentRenameApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/rename")
api.add_resource(WebsiteDocumentSyncApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/website-sync")
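
# Example request against the document list endpoint above (a sketch only:
# the host, the console API prefix, and the auth scheme depend on your
# deployment — "localhost:5001", "/console/api", and a bearer token are
# assumptions here, not guaranteed by this file):
#
#   curl -s -H "Authorization: Bearer <token>" \
#       "http://localhost:5001/console/api/datasets/<dataset_id>/documents?page=1&limit=20&fetch=true"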