datasets_document.py

# -*- coding:utf-8 -*-
from datetime import datetime
from typing import List

import services
from controllers.console import api
from controllers.console.app.error import (ProviderModelCurrentlyNotSupportError, ProviderNotInitializeError,
                                           ProviderQuotaExceededError)
from controllers.console.datasets.error import (ArchivedDocumentImmutableError, DocumentAlreadyFinishedError,
                                                DocumentIndexingError, InvalidActionError, InvalidMetadataError)
from controllers.console.setup import setup_required
from controllers.console.wraps import account_initialization_required, cloud_edition_billing_resource_check
from core.errors.error import (LLMBadRequestError, ModelCurrentlyNotSupportError, ProviderTokenNotInitError,
                               QuotaExceededError)
from core.indexing_runner import IndexingRunner
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.model_runtime.errors.invoke import InvokeAuthorizationError
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from fields.document_fields import (dataset_and_document_fields, document_fields, document_status_fields,
                                    document_with_segments_fields)
from flask import request
from flask_login import current_user
from flask_restful import Resource, fields, marshal, marshal_with, reqparse
from libs.login import login_required
from models.dataset import Dataset, DatasetProcessRule, Document, DocumentSegment
from models.model import UploadFile
from services.dataset_service import DatasetService, DocumentService
from sqlalchemy import asc, desc
from tasks.add_document_to_index_task import add_document_to_index_task
from tasks.remove_document_from_index_task import remove_document_from_index_task
from werkzeug.exceptions import Forbidden, NotFound

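# Shared helpers for the document endpoints below: resolve the dataset, enforce
# the current user's dataset permission, then load a single document or a whole
# upload batch, raising 404/403 as appropriate.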
class DocumentResource(Resource):
    def get_document(self, dataset_id: str, document_id: str) -> Document:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        document = DocumentService.get_document(dataset_id, document_id)
        if not document:
            raise NotFound('Document not found.')

        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden('No permission.')

        return document

    def get_batch_documents(self, dataset_id: str, batch: str) -> List[Document]:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        documents = DocumentService.get_batch_documents(dataset_id, batch)
        if not documents:
            raise NotFound('Documents not found.')

        return documents

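# GET /datasets/process-rule: returns the default processing rule, or the most
# recent rule of the dataset that owns the given document_id.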
class GetProcessRuleApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        req_data = request.args

        document_id = req_data.get('document_id')

        # get default rules
        mode = DocumentService.DEFAULT_RULES['mode']
        rules = DocumentService.DEFAULT_RULES['rules']
        if document_id:
            # get document
            document = Document.query.get_or_404(document_id)

            dataset = DatasetService.get_dataset(document.dataset_id)
            if not dataset:
                raise NotFound('Dataset not found.')

            try:
                DatasetService.check_dataset_permission(dataset, current_user)
            except services.errors.account.NoPermissionError as e:
                raise Forbidden(str(e))

            # get the latest process rule
            dataset_process_rule = db.session.query(DatasetProcessRule). \
                filter(DatasetProcessRule.dataset_id == document.dataset_id). \
                order_by(DatasetProcessRule.created_at.desc()). \
                limit(1). \
                one_or_none()
            if dataset_process_rule:
                mode = dataset_process_rule.mode
                rules = dataset_process_rule.rules_dict

        return {
            'mode': mode,
            'rules': rules
        }

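# /datasets/<uuid:dataset_id>/documents: GET lists documents with pagination,
# keyword search, and sorting; POST (below) creates new documents in the dataset.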
class DatasetDocumentListApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id):
        dataset_id = str(dataset_id)
        page = request.args.get('page', default=1, type=int)
        limit = request.args.get('limit', default=20, type=int)
        search = request.args.get('keyword', default=None, type=str)
        sort = request.args.get('sort', default='-created_at', type=str)
        fetch = request.args.get('fetch', default=False, type=bool)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        query = Document.query.filter_by(
            dataset_id=str(dataset_id), tenant_id=current_user.current_tenant_id)

        if search:
            search = f'%{search}%'
            query = query.filter(Document.name.like(search))

        if sort.startswith('-'):
            sort_logic = desc
            sort = sort[1:]
        else:
            sort_logic = asc

        if sort == 'hit_count':
            sub_query = db.select(DocumentSegment.document_id,
                                  db.func.sum(DocumentSegment.hit_count).label("total_hit_count")) \
                .group_by(DocumentSegment.document_id) \
                .subquery()

            query = query.outerjoin(sub_query, sub_query.c.document_id == Document.id) \
                .order_by(sort_logic(db.func.coalesce(sub_query.c.total_hit_count, 0)))
        elif sort == 'created_at':
            query = query.order_by(sort_logic(Document.created_at))
        else:
            query = query.order_by(desc(Document.created_at))

        paginated_documents = query.paginate(
            page=page, per_page=limit, max_per_page=100, error_out=False)
        documents = paginated_documents.items
        if fetch:
            for document in documents:
                completed_segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
                                                                  DocumentSegment.document_id == str(document.id),
                                                                  DocumentSegment.status != 're_segment').count()
                total_segments = DocumentSegment.query.filter(DocumentSegment.document_id == str(document.id),
                                                              DocumentSegment.status != 're_segment').count()
                document.completed_segments = completed_segments
                document.total_segments = total_segments
            data = marshal(documents, document_with_segments_fields)
        else:
            data = marshal(documents, document_fields)
        response = {
            'data': data,
            'has_more': len(documents) == limit,
            'limit': limit,
            'total': paginated_documents.total,
            'page': page
        }

        return response

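    # POST /datasets/<uuid:dataset_id>/documents: create one or more documents in
    # an existing dataset (admin/owner only), guarded by the vector_space billing check.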
    documents_and_batch_fields = {
        'documents': fields.List(fields.Nested(document_fields)),
        'batch': fields.String
    }

    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(documents_and_batch_fields)
    @cloud_edition_billing_resource_check('vector_space')
    def post(self, dataset_id):
        dataset_id = str(dataset_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        # The role of the current user in the ta table must be admin or owner
        if not current_user.is_admin_or_owner:
            raise Forbidden()

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        parser = reqparse.RequestParser()
        parser.add_argument('indexing_technique', type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False,
                            location='json')
        parser.add_argument('data_source', type=dict, required=False, location='json')
        parser.add_argument('process_rule', type=dict, required=False, location='json')
        parser.add_argument('duplicate', type=bool, nullable=False, location='json')
        parser.add_argument('original_document_id', type=str, required=False, location='json')
        parser.add_argument('doc_form', type=str, default='text_model', required=False, nullable=False,
                            location='json')
        parser.add_argument('doc_language', type=str, default='English', required=False, nullable=False,
                            location='json')
        parser.add_argument('retrieval_model', type=dict, required=False, nullable=False,
                            location='json')
        args = parser.parse_args()

        if not dataset.indexing_technique and not args['indexing_technique']:
            raise ValueError('indexing_technique is required.')

        # validate args
        DocumentService.document_create_args_validate(args)

        try:
            documents, batch = DocumentService.save_document_with_dataset_id(dataset, args, current_user)
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        return {
            'documents': documents,
            'batch': batch
        }

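# POST /datasets/init: create a dataset and its first documents in a single call;
# when indexing_technique is 'high_quality', first verifies a default text-embedding
# model is configured for the tenant.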
class DatasetInitApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(dataset_and_document_fields)
    @cloud_edition_billing_resource_check('vector_space')
    def post(self):
        # The role of the current user in the ta table must be admin or owner
        if not current_user.is_admin_or_owner:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument('indexing_technique', type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, required=True,
                            nullable=False, location='json')
        parser.add_argument('data_source', type=dict, required=True, nullable=True, location='json')
        parser.add_argument('process_rule', type=dict, required=True, nullable=True, location='json')
        parser.add_argument('doc_form', type=str, default='text_model', required=False, nullable=False,
                            location='json')
        parser.add_argument('doc_language', type=str, default='English', required=False, nullable=False,
                            location='json')
        parser.add_argument('retrieval_model', type=dict, required=False, nullable=False,
                            location='json')
        args = parser.parse_args()

        if args['indexing_technique'] == 'high_quality':
            try:
                model_manager = ModelManager()
                model_manager.get_default_model_instance(
                    tenant_id=current_user.current_tenant_id,
                    model_type=ModelType.TEXT_EMBEDDING
                )
            except InvokeAuthorizationError:
                raise ProviderNotInitializeError(
                    "No Embedding Model available. Please configure a valid provider "
                    "in the Settings -> Model Provider.")
            except ProviderTokenNotInitError as ex:
                raise ProviderNotInitializeError(ex.description)

        # validate args
        DocumentService.document_create_args_validate(args)

        try:
            dataset, documents, batch = DocumentService.save_document_without_dataset_id(
                tenant_id=current_user.current_tenant_id,
                document_data=args,
                account=current_user
            )
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        response = {
            'dataset': dataset,
            'documents': documents,
            'batch': batch
        }

        return response

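# GET .../documents/<uuid:document_id>/indexing-estimate: token/price estimate for
# indexing a single uploaded file with the document's current process rule.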
class DocumentIndexingEstimateApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        if document.indexing_status in ['completed', 'error']:
            raise DocumentAlreadyFinishedError()

        data_process_rule = document.dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()

        response = {
            "tokens": 0,
            "total_price": 0,
            "currency": "USD",
            "total_segments": 0,
            "preview": []
        }

        if document.data_source_type == 'upload_file':
            data_source_info = document.data_source_info_dict
            if data_source_info and 'upload_file_id' in data_source_info:
                file_id = data_source_info['upload_file_id']

                file = db.session.query(UploadFile).filter(
                    UploadFile.tenant_id == document.tenant_id,
                    UploadFile.id == file_id
                ).first()

                # raise error if file not found
                if not file:
                    raise NotFound('File not found.')

                indexing_runner = IndexingRunner()

                try:
                    response = indexing_runner.file_indexing_estimate(current_user.current_tenant_id, [file],
                                                                      data_process_rule_dict, None,
                                                                      'English', dataset_id)
                except LLMBadRequestError:
                    raise ProviderNotInitializeError(
                        "No Embedding Model available. Please configure a valid provider "
                        "in the Settings -> Model Provider.")
                except ProviderTokenNotInitError as ex:
                    raise ProviderNotInitializeError(ex.description)

        return response

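# GET .../batch/<string:batch>/indexing-estimate: aggregate estimate for every
# document in a batch, covering both upload_file and notion_import data sources.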
class DocumentBatchIndexingEstimateApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)

        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        documents = self.get_batch_documents(dataset_id, batch)
        response = {
            "tokens": 0,
            "total_price": 0,
            "currency": "USD",
            "total_segments": 0,
            "preview": []
        }
        if not documents:
            return response

        data_process_rule = documents[0].dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()
        info_list = []
        for document in documents:
            if document.indexing_status in ['completed', 'error']:
                raise DocumentAlreadyFinishedError()

            data_source_info = document.data_source_info_dict
            # format document files info
            if data_source_info and 'upload_file_id' in data_source_info:
                file_id = data_source_info['upload_file_id']
                info_list.append(file_id)
            # format document notion info
            elif data_source_info and 'notion_workspace_id' in data_source_info \
                    and 'notion_page_id' in data_source_info:
                pages = []
                page = {
                    'page_id': data_source_info['notion_page_id'],
                    'type': data_source_info['type']
                }
                pages.append(page)
                notion_info = {
                    'workspace_id': data_source_info['notion_workspace_id'],
                    'pages': pages
                }
                info_list.append(notion_info)

        if dataset.data_source_type == 'upload_file':
            file_details = db.session.query(UploadFile).filter(
                UploadFile.tenant_id == current_user.current_tenant_id,
                UploadFile.id.in_(info_list)
            ).all()

            if file_details is None:
                raise NotFound("File not found.")

            indexing_runner = IndexingRunner()
            try:
                response = indexing_runner.file_indexing_estimate(current_user.current_tenant_id, file_details,
                                                                  data_process_rule_dict, None,
                                                                  'English', dataset_id)
            except LLMBadRequestError:
                raise ProviderNotInitializeError(
                    "No Embedding Model available. Please configure a valid provider "
                    "in the Settings -> Model Provider.")
            except ProviderTokenNotInitError as ex:
                raise ProviderNotInitializeError(ex.description)
        elif dataset.data_source_type == 'notion_import':
            indexing_runner = IndexingRunner()
            try:
                response = indexing_runner.notion_indexing_estimate(current_user.current_tenant_id,
                                                                    info_list,
                                                                    data_process_rule_dict,
                                                                    None, 'English', dataset_id)
            except LLMBadRequestError:
                raise ProviderNotInitializeError(
                    "No Embedding Model available. Please configure a valid provider "
                    "in the Settings -> Model Provider.")
            except ProviderTokenNotInitError as ex:
                raise ProviderNotInitializeError(ex.description)
        else:
            raise ValueError('Data source type not supported.')

        return response

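# GET .../batch/<string:batch>/indexing-status: per-document segment progress for
# all documents created in one upload batch.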
class DocumentBatchIndexingStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        documents_status = []
        for document in documents:
            completed_segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
                                                              DocumentSegment.document_id == str(document.id),
                                                              DocumentSegment.status != 're_segment').count()
            total_segments = DocumentSegment.query.filter(DocumentSegment.document_id == str(document.id),
                                                          DocumentSegment.status != 're_segment').count()
            document.completed_segments = completed_segments
            document.total_segments = total_segments
            if document.is_paused:
                document.indexing_status = 'paused'
            documents_status.append(marshal(document, document_status_fields))
        data = {
            'data': documents_status
        }
        return data

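# GET .../documents/<uuid:document_id>/indexing-status: segment progress for a
# single document.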
class DocumentIndexingStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        completed_segments = DocumentSegment.query \
            .filter(DocumentSegment.completed_at.isnot(None),
                    DocumentSegment.document_id == str(document_id),
                    DocumentSegment.status != 're_segment') \
            .count()
        total_segments = DocumentSegment.query \
            .filter(DocumentSegment.document_id == str(document_id),
                    DocumentSegment.status != 're_segment') \
            .count()

        document.completed_segments = completed_segments
        document.total_segments = total_segments
        if document.is_paused:
            document.indexing_status = 'paused'
        return marshal(document, document_status_fields)

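# GET .../documents/<uuid:document_id>: document detail; the 'metadata' query arg
# selects the full record ('all'), metadata only ('only'), or everything except
# metadata ('without').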
class DocumentDetailApi(DocumentResource):
    METADATA_CHOICES = {'all', 'only', 'without'}

    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        metadata = request.args.get('metadata', 'all')
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f'Invalid metadata value: {metadata}')

        if metadata == 'only':
            response = {
                'id': document.id,
                'doc_type': document.doc_type,
                'doc_metadata': document.doc_metadata
            }
        elif metadata == 'without':
            process_rules = DatasetService.get_process_rules(dataset_id)
            data_source_info = document.data_source_detail_dict
            response = {
                'id': document.id,
                'position': document.position,
                'data_source_type': document.data_source_type,
                'data_source_info': data_source_info,
                'dataset_process_rule_id': document.dataset_process_rule_id,
                'dataset_process_rule': process_rules,
                'name': document.name,
                'created_from': document.created_from,
                'created_by': document.created_by,
                'created_at': document.created_at.timestamp(),
                'tokens': document.tokens,
                'indexing_status': document.indexing_status,
                'completed_at': int(document.completed_at.timestamp()) if document.completed_at else None,
                'updated_at': int(document.updated_at.timestamp()) if document.updated_at else None,
                'indexing_latency': document.indexing_latency,
                'error': document.error,
                'enabled': document.enabled,
                'disabled_at': int(document.disabled_at.timestamp()) if document.disabled_at else None,
                'disabled_by': document.disabled_by,
                'archived': document.archived,
                'segment_count': document.segment_count,
                'average_segment_length': document.average_segment_length,
                'hit_count': document.hit_count,
                'display_status': document.display_status,
                'doc_form': document.doc_form
            }
        else:
            process_rules = DatasetService.get_process_rules(dataset_id)
            data_source_info = document.data_source_detail_dict
            response = {
                'id': document.id,
                'position': document.position,
                'data_source_type': document.data_source_type,
                'data_source_info': data_source_info,
                'dataset_process_rule_id': document.dataset_process_rule_id,
                'dataset_process_rule': process_rules,
                'name': document.name,
                'created_from': document.created_from,
                'created_by': document.created_by,
                'created_at': document.created_at.timestamp(),
                'tokens': document.tokens,
                'indexing_status': document.indexing_status,
                'completed_at': int(document.completed_at.timestamp()) if document.completed_at else None,
                'updated_at': int(document.updated_at.timestamp()) if document.updated_at else None,
                'indexing_latency': document.indexing_latency,
                'error': document.error,
                'enabled': document.enabled,
                'disabled_at': int(document.disabled_at.timestamp()) if document.disabled_at else None,
                'disabled_by': document.disabled_by,
                'archived': document.archived,
                'doc_type': document.doc_type,
                'doc_metadata': document.doc_metadata,
                'segment_count': document.segment_count,
                'average_segment_length': document.average_segment_length,
                'hit_count': document.hit_count,
                'display_status': document.display_status,
                'doc_form': document.doc_form
            }

        return response, 200

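# PATCH .../processing/<string:action>: pause or resume indexing of a document
# (admin/owner only).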
class DocumentProcessingApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id, action):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        # The role of the current user in the ta table must be admin or owner
        if not current_user.is_admin_or_owner:
            raise Forbidden()

        if action == "pause":
            if document.indexing_status != "indexing":
                raise InvalidActionError('Document not in indexing state.')

            document.paused_by = current_user.id
            document.paused_at = datetime.utcnow()
            document.is_paused = True
            db.session.commit()
        elif action == "resume":
            if document.indexing_status not in ["paused", "error"]:
                raise InvalidActionError('Document not in paused or error state.')

            document.paused_by = None
            document.paused_at = None
            document.is_paused = False
            db.session.commit()
        else:
            raise InvalidActionError()

        return {'result': 'success'}, 200

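# DELETE .../documents/<uuid:document_id>: remove a document unless it is still
# being indexed.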
class DocumentDeleteApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def delete(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        document = self.get_document(dataset_id, document_id)

        try:
            DocumentService.delete_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError('Cannot delete document during indexing.')

        return {'result': 'success'}, 204

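# PUT .../documents/<uuid:document_id>/metadata: replace doc_type and doc_metadata,
# validated against DocumentService.DOCUMENT_METADATA_SCHEMA (admin/owner only).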
class DocumentMetadataApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def put(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        req_data = request.get_json()

        doc_type = req_data.get('doc_type')
        doc_metadata = req_data.get('doc_metadata')

        # The role of the current user in the ta table must be admin or owner
        if not current_user.is_admin_or_owner:
            raise Forbidden()

        if doc_type is None or doc_metadata is None:
            raise ValueError('Both doc_type and doc_metadata must be provided.')

        if doc_type not in DocumentService.DOCUMENT_METADATA_SCHEMA:
            raise ValueError('Invalid doc_type.')

        if not isinstance(doc_metadata, dict):
            raise ValueError('doc_metadata must be a dictionary.')

        metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[doc_type]

        document.doc_metadata = {}
        if doc_type == 'others':
            document.doc_metadata = doc_metadata
        else:
            for key, value_type in metadata_schema.items():
                value = doc_metadata.get(key)
                if value is not None and isinstance(value, value_type):
                    document.doc_metadata[key] = value

        document.doc_type = doc_type
        document.updated_at = datetime.utcnow()
        db.session.commit()

        return {'result': 'success', 'message': 'Document metadata updated.'}, 200

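# PATCH .../status/<string:action>: enable, disable, archive, or un_archive a
# document and schedule the matching index add/remove task; a short-lived Redis
# key guards against re-indexing the same document concurrently.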
class DocumentStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_resource_check('vector_space')
    def patch(self, dataset_id, document_id, action):
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        document = self.get_document(dataset_id, document_id)

        # The role of the current user in the ta table must be admin or owner
        if not current_user.is_admin_or_owner:
            raise Forbidden()

        indexing_cache_key = 'document_{}_indexing'.format(document.id)
        cache_result = redis_client.get(indexing_cache_key)
        if cache_result is not None:
            raise InvalidActionError("Document is being indexed, please try again later")

        if action == "enable":
            if document.enabled:
                raise InvalidActionError('Document already enabled.')

            document.enabled = True
            document.disabled_at = None
            document.disabled_by = None
            document.updated_at = datetime.utcnow()
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            add_document_to_index_task.delay(document_id)

            return {'result': 'success'}, 200
        elif action == "disable":
            if not document.completed_at or document.indexing_status != 'completed':
                raise InvalidActionError('Document is not completed.')
            if not document.enabled:
                raise InvalidActionError('Document already disabled.')

            document.enabled = False
            document.disabled_at = datetime.utcnow()
            document.disabled_by = current_user.id
            document.updated_at = datetime.utcnow()
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            remove_document_from_index_task.delay(document_id)

            return {'result': 'success'}, 200
        elif action == "archive":
            if document.archived:
                raise InvalidActionError('Document already archived.')

            document.archived = True
            document.archived_at = datetime.utcnow()
            document.archived_by = current_user.id
            document.updated_at = datetime.utcnow()
            db.session.commit()

            if document.enabled:
                # Set cache to prevent indexing the same document multiple times
                redis_client.setex(indexing_cache_key, 600, 1)

                remove_document_from_index_task.delay(document_id)

            return {'result': 'success'}, 200
        elif action == "un_archive":
            if not document.archived:
                raise InvalidActionError('Document is not archived.')

            document.archived = False
            document.archived_at = None
            document.archived_by = None
            document.updated_at = datetime.utcnow()
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            add_document_to_index_task.delay(document_id)

            return {'result': 'success'}, 200
        else:
            raise InvalidActionError()

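# PATCH .../processing/pause: pause indexing for a document that is not archived.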
class DocumentPauseApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id):
        """pause document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document not found.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # pause document
            DocumentService.pause_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError('Cannot pause completed document.')

        return {'result': 'success'}, 204

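# PATCH .../processing/resume: resume indexing for a paused, non-archived document.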
class DocumentRecoverApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id):
        """recover document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document not found.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # recover document
            DocumentService.recover_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError('Document is not in paused status.')

        return {'result': 'success'}, 204

api.add_resource(GetProcessRuleApi, '/datasets/process-rule')
api.add_resource(DatasetDocumentListApi,
                 '/datasets/<uuid:dataset_id>/documents')
api.add_resource(DatasetInitApi,
                 '/datasets/init')
api.add_resource(DocumentIndexingEstimateApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-estimate')
api.add_resource(DocumentBatchIndexingEstimateApi,
                 '/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-estimate')
api.add_resource(DocumentBatchIndexingStatusApi,
                 '/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-status')
api.add_resource(DocumentIndexingStatusApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-status')
api.add_resource(DocumentDetailApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>')
api.add_resource(DocumentProcessingApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/<string:action>')
api.add_resource(DocumentDeleteApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>')
api.add_resource(DocumentMetadataApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/metadata')
api.add_resource(DocumentStatusApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/status/<string:action>')
api.add_resource(DocumentPauseApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/pause')
api.add_resource(DocumentRecoverApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/resume')