datasets_document.py

import logging
from argparse import ArgumentTypeError
from datetime import datetime, timezone

from flask import request
from flask_login import current_user
from flask_restful import Resource, fields, marshal, marshal_with, reqparse
from sqlalchemy import asc, desc, select
from sqlalchemy.orm import Session
from werkzeug.exceptions import Forbidden, NotFound

import services
from controllers.console import api
from controllers.console.app.error import (
    ProviderModelCurrentlyNotSupportError,
    ProviderNotInitializeError,
    ProviderQuotaExceededError,
)
from controllers.console.datasets.error import (
    ArchivedDocumentImmutableError,
    DocumentAlreadyFinishedError,
    DocumentIndexingError,
    IndexingEstimateError,
    InvalidActionError,
    InvalidMetadataError,
)
from controllers.console.setup import setup_required
from controllers.console.wraps import account_initialization_required, cloud_edition_billing_resource_check
from core.errors.error import (
    LLMBadRequestError,
    ModelCurrentlyNotSupportError,
    ProviderTokenNotInitError,
    QuotaExceededError,
)
from core.indexing_runner import IndexingRunner
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.model_runtime.errors.invoke import InvokeAuthorizationError
from core.rag.extractor.entity.extract_setting import ExtractSetting
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from fields.document_fields import (
    dataset_and_document_fields,
    document_fields,
    document_status_fields,
    document_with_segments_fields,
)
from libs.login import login_required
from models import Dataset, DatasetProcessRule, Document, DocumentSegment, UploadFile
from services.dataset_service import DatasetService, DocumentService
from tasks.add_document_to_index_task import add_document_to_index_task
from tasks.remove_document_from_index_task import remove_document_from_index_task
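

# Shared lookup helpers for the document endpoints below: both methods 404 on
# missing records and 403 when the current user lacks dataset permission or
# belongs to a different tenant.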
class DocumentResource(Resource):
    def get_document(self, dataset_id: str, document_id: str) -> Document:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        document = DocumentService.get_document(dataset_id, document_id)
        if not document:
            raise NotFound("Document not found.")

        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden("No permission.")

        return document

    def get_batch_documents(self, dataset_id: str, batch: str) -> list[Document]:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        documents = DocumentService.get_batch_documents(dataset_id, batch)
        if not documents:
            raise NotFound("Documents not found.")

        return documents
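

# GET /datasets/process-rule: returns the default processing rule, or the latest
# rule of the dataset a given document belongs to when document_id is supplied.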
class GetProcessRuleApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        req_data = request.args

        document_id = req_data.get("document_id")

        # get default rules
        mode = DocumentService.DEFAULT_RULES["mode"]
        rules = DocumentService.DEFAULT_RULES["rules"]
        if document_id:
            # fetch the document, 404 if it does not exist
            with Session(db.engine) as session:
                document = session.execute(
                    select(Document).where(Document.id == document_id)
                ).scalar_one_or_none()
            if not document:
                raise NotFound("Document not found.")

            dataset = DatasetService.get_dataset(document.dataset_id)
            if not dataset:
                raise NotFound("Dataset not found.")

            try:
                DatasetService.check_dataset_permission(dataset, current_user)
            except services.errors.account.NoPermissionError as e:
                raise Forbidden(str(e))

            # get the latest process rule
            dataset_process_rule = (
                db.session.query(DatasetProcessRule)
                .filter(DatasetProcessRule.dataset_id == document.dataset_id)
                .order_by(DatasetProcessRule.created_at.desc())
                .limit(1)
                .one_or_none()
            )
            if dataset_process_rule:
                mode = dataset_process_rule.mode
                rules = dataset_process_rule.rules_dict

        return {"mode": mode, "rules": rules}
class DatasetDocumentListApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id):
        dataset_id = str(dataset_id)
        page = request.args.get("page", default=1, type=int)
        limit = request.args.get("limit", default=20, type=int)
        search = request.args.get("keyword", default=None, type=str)
        sort = request.args.get("sort", default="-created_at", type=str)
        # "yes", "true", "t", "y", "1" convert to True, while others convert to False.
        try:
            fetch_val = request.args.get("fetch", default="false")
            if isinstance(fetch_val, bool):
                fetch = fetch_val
            else:
                if fetch_val.lower() in ("yes", "true", "t", "y", "1"):
                    fetch = True
                elif fetch_val.lower() in ("no", "false", "f", "n", "0"):
                    fetch = False
                else:
                    raise ArgumentTypeError(
                        f"Truthy value expected: got {fetch_val} but expected one of yes/no, true/false, t/f, y/n, 1/0 "
                        f"(case insensitive)."
                    )
        except (ArgumentTypeError, ValueError):
            # a malformed "fetch" value falls back to not fetching segment counts
            fetch = False

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        with Session(db.engine) as session:
            # build a 2.0-style select so it can be paginated via db.paginate() below
            query = select(Document).filter_by(
                dataset_id=str(dataset_id), tenant_id=current_user.current_tenant_id
            )
            if search:
                search = f"%{search}%"
                query = query.filter(Document.name.like(search))

            if sort.startswith("-"):
                sort_logic = desc
                sort = sort[1:]
            else:
                sort_logic = asc

            if sort == "hit_count":
                sub_query = (
                    db.select(
                        DocumentSegment.document_id, db.func.sum(DocumentSegment.hit_count).label("total_hit_count")
                    )
                    .group_by(DocumentSegment.document_id)
                    .subquery()
                )
                query = query.outerjoin(sub_query, sub_query.c.document_id == Document.id).order_by(
                    sort_logic(db.func.coalesce(sub_query.c.total_hit_count, 0)),
                    sort_logic(Document.position),
                )
            elif sort == "created_at":
                query = query.order_by(
                    sort_logic(Document.created_at),
                    sort_logic(Document.position),
                )
            else:
                query = query.order_by(
                    desc(Document.created_at),
                    desc(Document.position),
                )

            paginated_documents = db.paginate(query, page=page, per_page=limit, max_per_page=100, error_out=False)
            documents = paginated_documents.items
            if fetch:
                for document in documents:
                    completed_segments = (
                        session.query(DocumentSegment)
                        .filter(
                            DocumentSegment.completed_at.isnot(None),
                            DocumentSegment.document_id == str(document.id),
                            DocumentSegment.status != "re_segment",
                        )
                        .count()
                    )
                    total_segments = (
                        session.query(DocumentSegment)
                        .filter(DocumentSegment.document_id == str(document.id), DocumentSegment.status != "re_segment")
                        .count()
                    )
                    document.completed_segments = completed_segments
                    document.total_segments = total_segments
                data = marshal(documents, document_with_segments_fields)
            else:
                data = marshal(documents, document_fields)

        response = {
            "data": data,
            "has_more": len(documents) == limit,
            "limit": limit,
            "total": paginated_documents.total,
            "page": page,
        }
        return response

    documents_and_batch_fields = {"documents": fields.List(fields.Nested(document_fields)), "batch": fields.String}

    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(documents_and_batch_fields)
    @cloud_edition_billing_resource_check("vector_space")
    def post(self, dataset_id):
        dataset_id = str(dataset_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        parser = reqparse.RequestParser()
        parser.add_argument(
            "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
        )
        parser.add_argument("data_source", type=dict, required=False, location="json")
        parser.add_argument("process_rule", type=dict, required=False, location="json")
        parser.add_argument("duplicate", type=bool, default=True, nullable=False, location="json")
        parser.add_argument("original_document_id", type=str, required=False, location="json")
        parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
        parser.add_argument(
            "doc_language", type=str, default="English", required=False, nullable=False, location="json"
        )
        parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
        args = parser.parse_args()

        if not dataset.indexing_technique and not args["indexing_technique"]:
            raise ValueError("indexing_technique is required.")

        # validate args
        DocumentService.document_create_args_validate(args)

        try:
            documents, batch = DocumentService.save_document_with_dataset_id(dataset, args, current_user)
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        return {"documents": documents, "batch": batch}
class DatasetInitApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(dataset_and_document_fields)
    @cloud_edition_billing_resource_check("vector_space")
    def post(self):
        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument(
            "indexing_technique",
            type=str,
            choices=Dataset.INDEXING_TECHNIQUE_LIST,
            required=True,
            nullable=False,
            location="json",
        )
        parser.add_argument("data_source", type=dict, required=True, nullable=True, location="json")
        parser.add_argument("process_rule", type=dict, required=True, nullable=True, location="json")
        parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
        parser.add_argument(
            "doc_language", type=str, default="English", required=False, nullable=False, location="json"
        )
        parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
        parser.add_argument("embedding_model", type=str, required=False, nullable=True, location="json")
        parser.add_argument("embedding_model_provider", type=str, required=False, nullable=True, location="json")
        args = parser.parse_args()

        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()

        if args["indexing_technique"] == "high_quality":
            if args["embedding_model"] is None or args["embedding_model_provider"] is None:
                raise ValueError("embedding model and embedding model provider are required for high quality indexing.")

            try:
                model_manager = ModelManager()
                model_manager.get_default_model_instance(
                    tenant_id=current_user.current_tenant_id, model_type=ModelType.TEXT_EMBEDDING
                )
            except InvokeAuthorizationError:
                raise ProviderNotInitializeError(
                    "No Embedding Model available. Please configure a valid provider "
                    "in the Settings -> Model Provider."
                )
            except ProviderTokenNotInitError as ex:
                raise ProviderNotInitializeError(ex.description)

        # validate args
        DocumentService.document_create_args_validate(args)

        try:
            dataset, documents, batch = DocumentService.save_document_without_dataset_id(
                tenant_id=current_user.current_tenant_id, document_data=args, account=current_user
            )
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        response = {"dataset": dataset, "documents": documents, "batch": batch}
        return response
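

# Estimates indexing cost (tokens, price, segment preview) for a single uploaded document.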
class DocumentIndexingEstimateApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        if document.indexing_status in {"completed", "error"}:
            raise DocumentAlreadyFinishedError()

        data_process_rule = document.dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()

        response = {"tokens": 0, "total_price": 0, "currency": "USD", "total_segments": 0, "preview": []}

        if document.data_source_type == "upload_file":
            data_source_info = document.data_source_info_dict
            if data_source_info and "upload_file_id" in data_source_info:
                file_id = data_source_info["upload_file_id"]

                file = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.tenant_id == document.tenant_id, UploadFile.id == file_id)
                    .first()
                )

                # raise error if file not found
                if not file:
                    raise NotFound("File not found.")

                extract_setting = ExtractSetting(
                    datasource_type="upload_file", upload_file=file, document_model=document.doc_form
                )

                indexing_runner = IndexingRunner()

                try:
                    response = indexing_runner.indexing_estimate(
                        current_user.current_tenant_id,
                        [extract_setting],
                        data_process_rule_dict,
                        document.doc_form,
                        "English",
                        dataset_id,
                    )
                except LLMBadRequestError:
                    raise ProviderNotInitializeError(
                        "No Embedding Model available. Please configure a valid provider "
                        "in the Settings -> Model Provider."
                    )
                except ProviderTokenNotInitError as ex:
                    raise ProviderNotInitializeError(ex.description)
                except Exception as e:
                    raise IndexingEstimateError(str(e))

        return response
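

# Estimates indexing cost for every document in an upload batch, across
# upload_file, notion_import, and website_crawl data sources.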
class DocumentBatchIndexingEstimateApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        response = {"tokens": 0, "total_price": 0, "currency": "USD", "total_segments": 0, "preview": []}
        if not documents:
            return response

        data_process_rule = documents[0].dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()
        info_list = []
        extract_settings = []
        for document in documents:
            if document.indexing_status in {"completed", "error"}:
                raise DocumentAlreadyFinishedError()
            data_source_info = document.data_source_info_dict
            # format document files info
            if data_source_info and "upload_file_id" in data_source_info:
                file_id = data_source_info["upload_file_id"]
                info_list.append(file_id)
            # format document notion info
            elif (
                data_source_info and "notion_workspace_id" in data_source_info and "notion_page_id" in data_source_info
            ):
                pages = []
                page = {"page_id": data_source_info["notion_page_id"], "type": data_source_info["type"]}
                pages.append(page)
                notion_info = {"workspace_id": data_source_info["notion_workspace_id"], "pages": pages}
                info_list.append(notion_info)

            if document.data_source_type == "upload_file":
                file_id = data_source_info["upload_file_id"]
                file_detail = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.tenant_id == current_user.current_tenant_id, UploadFile.id == file_id)
                    .first()
                )

                if file_detail is None:
                    raise NotFound("File not found.")

                extract_setting = ExtractSetting(
                    datasource_type="upload_file", upload_file=file_detail, document_model=document.doc_form
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == "notion_import":
                extract_setting = ExtractSetting(
                    datasource_type="notion_import",
                    notion_info={
                        "notion_workspace_id": data_source_info["notion_workspace_id"],
                        "notion_obj_id": data_source_info["notion_page_id"],
                        "notion_page_type": data_source_info["type"],
                        "tenant_id": current_user.current_tenant_id,
                    },
                    document_model=document.doc_form,
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == "website_crawl":
                extract_setting = ExtractSetting(
                    datasource_type="website_crawl",
                    website_info={
                        "provider": data_source_info["provider"],
                        "job_id": data_source_info["job_id"],
                        "url": data_source_info["url"],
                        "tenant_id": current_user.current_tenant_id,
                        "mode": data_source_info["mode"],
                        "only_main_content": data_source_info["only_main_content"],
                    },
                    document_model=document.doc_form,
                )
                extract_settings.append(extract_setting)
            else:
                raise ValueError("Data source type not supported.")

        indexing_runner = IndexingRunner()
        try:
            response = indexing_runner.indexing_estimate(
                current_user.current_tenant_id,
                extract_settings,
                data_process_rule_dict,
                document.doc_form,
                "English",
                dataset_id,
            )
        except LLMBadRequestError:
            raise ProviderNotInitializeError(
                "No Embedding Model available. Please configure a valid provider "
                "in the Settings -> Model Provider."
            )
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except Exception as e:
            raise IndexingEstimateError(str(e))

        return response
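

# Reports per-document segment completion counts for an upload batch.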
class DocumentBatchIndexingStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        documents_status = []
        for document in documents:
            completed_segments = DocumentSegment.query.filter(
                DocumentSegment.completed_at.isnot(None),
                DocumentSegment.document_id == str(document.id),
                DocumentSegment.status != "re_segment",
            ).count()
            total_segments = DocumentSegment.query.filter(
                DocumentSegment.document_id == str(document.id), DocumentSegment.status != "re_segment"
            ).count()
            document.completed_segments = completed_segments
            document.total_segments = total_segments
            if document.is_paused:
                document.indexing_status = "paused"
            documents_status.append(marshal(document, document_status_fields))
        data = {"data": documents_status}
        return data
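

# Reports segment completion counts for a single document.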
class DocumentIndexingStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        completed_segments = DocumentSegment.query.filter(
            DocumentSegment.completed_at.isnot(None),
            DocumentSegment.document_id == str(document_id),
            DocumentSegment.status != "re_segment",
        ).count()
        total_segments = DocumentSegment.query.filter(
            DocumentSegment.document_id == str(document_id), DocumentSegment.status != "re_segment"
        ).count()

        document.completed_segments = completed_segments
        document.total_segments = total_segments
        if document.is_paused:
            document.indexing_status = "paused"
        return marshal(document, document_status_fields)
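

# Returns document details; the "metadata" query arg selects all fields,
# metadata only, or everything except metadata.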
class DocumentDetailApi(DocumentResource):
    METADATA_CHOICES = {"all", "only", "without"}

    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        metadata = request.args.get("metadata", "all")
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f"Invalid metadata value: {metadata}")

        if metadata == "only":
            response = {"id": document.id, "doc_type": document.doc_type, "doc_metadata": document.doc_metadata}
        elif metadata == "without":
            process_rules = DatasetService.get_process_rules(dataset_id)
            data_source_info = document.data_source_detail_dict
            response = {
                "id": document.id,
                "position": document.position,
                "data_source_type": document.data_source_type,
                "data_source_info": data_source_info,
                "dataset_process_rule_id": document.dataset_process_rule_id,
                "dataset_process_rule": process_rules,
                "name": document.name,
                "created_from": document.created_from,
                "created_by": document.created_by,
                "created_at": document.created_at.timestamp(),
                "tokens": document.tokens,
                "indexing_status": document.indexing_status,
                "completed_at": int(document.completed_at.timestamp()) if document.completed_at else None,
                "updated_at": int(document.updated_at.timestamp()) if document.updated_at else None,
                "indexing_latency": document.indexing_latency,
                "error": document.error,
                "enabled": document.enabled,
                "disabled_at": int(document.disabled_at.timestamp()) if document.disabled_at else None,
                "disabled_by": document.disabled_by,
                "archived": document.archived,
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }
        else:
            process_rules = DatasetService.get_process_rules(dataset_id)
            data_source_info = document.data_source_detail_dict
            response = {
                "id": document.id,
                "position": document.position,
                "data_source_type": document.data_source_type,
                "data_source_info": data_source_info,
                "dataset_process_rule_id": document.dataset_process_rule_id,
                "dataset_process_rule": process_rules,
                "name": document.name,
                "created_from": document.created_from,
                "created_by": document.created_by,
                "created_at": document.created_at.timestamp(),
                "tokens": document.tokens,
                "indexing_status": document.indexing_status,
                "completed_at": int(document.completed_at.timestamp()) if document.completed_at else None,
                "updated_at": int(document.updated_at.timestamp()) if document.updated_at else None,
                "indexing_latency": document.indexing_latency,
                "error": document.error,
                "enabled": document.enabled,
                "disabled_at": int(document.disabled_at.timestamp()) if document.disabled_at else None,
                "disabled_by": document.disabled_by,
                "archived": document.archived,
                "doc_type": document.doc_type,
                "doc_metadata": document.doc_metadata,
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }

        return response, 200
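

# Pauses or resumes indexing of a document via the <action> path segment.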
class DocumentProcessingApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id, action):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        if action == "pause":
            if document.indexing_status != "indexing":
                raise InvalidActionError("Document not in indexing state.")

            document.paused_by = current_user.id
            document.paused_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.is_paused = True
            db.session.commit()
        elif action == "resume":
            if document.indexing_status not in {"paused", "error"}:
                raise InvalidActionError("Document not in paused or error state.")

            document.paused_by = None
            document.paused_at = None
            document.is_paused = False
            db.session.commit()
        else:
            raise InvalidActionError()

        return {"result": "success"}, 200
class DocumentDeleteApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def delete(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        document = self.get_document(dataset_id, document_id)

        try:
            DocumentService.delete_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot delete document during indexing.")

        return {"result": "success"}, 204
class DocumentMetadataApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def put(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        req_data = request.get_json()

        doc_type = req_data.get("doc_type")
        doc_metadata = req_data.get("doc_metadata")

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        if doc_type is None or doc_metadata is None:
            raise ValueError("Both doc_type and doc_metadata must be provided.")

        if doc_type not in DocumentService.DOCUMENT_METADATA_SCHEMA:
            raise ValueError("Invalid doc_type.")

        if not isinstance(doc_metadata, dict):
            raise ValueError("doc_metadata must be a dictionary.")

        metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[doc_type]

        document.doc_metadata = {}
        if doc_type == "others":
            document.doc_metadata = doc_metadata
        else:
            for key, value_type in metadata_schema.items():
                value = doc_metadata.get(key)
                if value is not None and isinstance(value, value_type):
                    document.doc_metadata[key] = value

        document.doc_type = doc_type
        document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
        db.session.commit()

        return {"result": "success", "message": "Document metadata updated."}, 200
class DocumentStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_resource_check("vector_space")
    def patch(self, dataset_id, document_id, action):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()

        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        # check user's permission
        DatasetService.check_dataset_permission(dataset, current_user)

        document = self.get_document(dataset_id, document_id)

        indexing_cache_key = "document_{}_indexing".format(document.id)
        cache_result = redis_client.get(indexing_cache_key)
        if cache_result is not None:
            raise InvalidActionError("Document is being indexed, please try again later")

        if action == "enable":
            if document.enabled:
                raise InvalidActionError("Document already enabled.")

            document.enabled = True
            document.disabled_at = None
            document.disabled_by = None
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            add_document_to_index_task.delay(document_id)

            return {"result": "success"}, 200
        elif action == "disable":
            if not document.completed_at or document.indexing_status != "completed":
                raise InvalidActionError("Document is not completed.")
            if not document.enabled:
                raise InvalidActionError("Document already disabled.")

            document.enabled = False
            document.disabled_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.disabled_by = current_user.id
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            remove_document_from_index_task.delay(document_id)

            return {"result": "success"}, 200
        elif action == "archive":
            if document.archived:
                raise InvalidActionError("Document already archived.")

            document.archived = True
            document.archived_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.archived_by = current_user.id
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            if document.enabled:
                # Set cache to prevent indexing the same document multiple times
                redis_client.setex(indexing_cache_key, 600, 1)

                remove_document_from_index_task.delay(document_id)

            return {"result": "success"}, 200
        elif action == "un_archive":
            if not document.archived:
                raise InvalidActionError("Document is not archived.")

            document.archived = False
            document.archived_at = None
            document.archived_by = None
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            add_document_to_index_task.delay(document_id)

            return {"result": "success"}, 200
        else:
            raise InvalidActionError()
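

# Pauses indexing of a single document.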
class DocumentPauseApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id):
        """pause document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document does not exist.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # pause document
            DocumentService.pause_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot pause completed document.")

        return {"result": "success"}, 204
class DocumentRecoverApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id):
        """recover document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document does not exist.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # recover document
            DocumentService.recover_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Document is not in paused status.")

        return {"result": "success"}, 204
class DocumentRetryApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def post(self, dataset_id):
        """retry document."""
        parser = reqparse.RequestParser()
        parser.add_argument("document_ids", type=list, required=True, nullable=False, location="json")
        args = parser.parse_args()
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        retry_documents = []
        if not dataset:
            raise NotFound("Dataset not found.")
        for document_id in args["document_ids"]:
            try:
                document_id = str(document_id)

                document = DocumentService.get_document(dataset.id, document_id)

                # 404 if document not found
                if document is None:
                    raise NotFound("Document does not exist.")

                # 403 if document is archived
                if DocumentService.check_archived(document):
                    raise ArchivedDocumentImmutableError()

                # 400 if document is completed
                if document.indexing_status == "completed":
                    raise DocumentAlreadyFinishedError()
                retry_documents.append(document)
            except Exception as e:
                logging.error(f"Document {document_id} retry failed: {str(e)}")
                continue
        # retry document
        DocumentService.retry_document(dataset_id, retry_documents)
        return {"result": "success"}, 204
class DocumentRenameApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(document_fields)
    def post(self, dataset_id, document_id):
        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()
        dataset = DatasetService.get_dataset(dataset_id)
        DatasetService.check_dataset_operator_permission(current_user, dataset)
        parser = reqparse.RequestParser()
        parser.add_argument("name", type=str, required=True, nullable=False, location="json")
        args = parser.parse_args()

        try:
            document = DocumentService.rename_document(dataset_id, document_id, args["name"])
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot rename document during indexing.")

        return document
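

# Re-crawls and re-indexes a website-sourced document.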
class WebsiteDocumentSyncApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        """sync website document."""
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")
        document_id = str(document_id)
        document = DocumentService.get_document(dataset.id, document_id)
        if not document:
            raise NotFound("Document not found.")
        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden("No permission.")
        if document.data_source_type != "website_crawl":
            raise ValueError("Document is not a website document.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        # sync document
        DocumentService.sync_website_document(dataset_id, document)

        return {"result": "success"}, 200
api.add_resource(GetProcessRuleApi, "/datasets/process-rule")
api.add_resource(DatasetDocumentListApi, "/datasets/<uuid:dataset_id>/documents")
api.add_resource(DatasetInitApi, "/datasets/init")
api.add_resource(
    DocumentIndexingEstimateApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-estimate"
)
api.add_resource(DocumentBatchIndexingEstimateApi, "/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-estimate")
api.add_resource(DocumentBatchIndexingStatusApi, "/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-status")
api.add_resource(DocumentIndexingStatusApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-status")
api.add_resource(DocumentDetailApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
api.add_resource(
    DocumentProcessingApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/<string:action>"
)
api.add_resource(DocumentDeleteApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
api.add_resource(DocumentMetadataApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/metadata")
api.add_resource(DocumentStatusApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/status/<string:action>")
api.add_resource(DocumentPauseApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/pause")
api.add_resource(DocumentRecoverApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/resume")
api.add_resource(DocumentRetryApi, "/datasets/<uuid:dataset_id>/retry")
api.add_resource(DocumentRenameApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/rename")
api.add_resource(WebsiteDocumentSyncApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/website-sync")