datasets_document.py

import logging
from argparse import ArgumentTypeError
from datetime import datetime, timezone

from flask import request
from flask_login import current_user
from flask_restful import Resource, fields, marshal, marshal_with, reqparse
from sqlalchemy import asc, desc
from transformers.hf_argparser import string_to_bool
from werkzeug.exceptions import Forbidden, NotFound

import services
from controllers.console import api
from controllers.console.app.error import (
    ProviderModelCurrentlyNotSupportError,
    ProviderNotInitializeError,
    ProviderQuotaExceededError,
)
from controllers.console.datasets.error import (
    ArchivedDocumentImmutableError,
    DocumentAlreadyFinishedError,
    DocumentIndexingError,
    InvalidActionError,
    InvalidMetadataError,
)
from controllers.console.setup import setup_required
from controllers.console.wraps import account_initialization_required, cloud_edition_billing_resource_check
from core.errors.error import (
    LLMBadRequestError,
    ModelCurrentlyNotSupportError,
    ProviderTokenNotInitError,
    QuotaExceededError,
)
from core.indexing_runner import IndexingRunner
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.model_runtime.errors.invoke import InvokeAuthorizationError
from core.rag.extractor.entity.extract_setting import ExtractSetting
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from fields.document_fields import (
    dataset_and_document_fields,
    document_fields,
    document_status_fields,
    document_with_segments_fields,
)
from libs.login import login_required
from models.dataset import Dataset, DatasetProcessRule, Document, DocumentSegment
from models.model import UploadFile
from services.dataset_service import DatasetService, DocumentService
from tasks.add_document_to_index_task import add_document_to_index_task
from tasks.remove_document_from_index_task import remove_document_from_index_task
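

# Shared base for the document endpoints below: resolves the target dataset
# and document(s), enforces the current user's dataset permission and tenant,
# and raises NotFound/Forbidden consistently.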
class DocumentResource(Resource):
    def get_document(self, dataset_id: str, document_id: str) -> Document:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        document = DocumentService.get_document(dataset_id, document_id)
        if not document:
            raise NotFound('Document not found.')

        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden('No permission.')

        return document

    def get_batch_documents(self, dataset_id: str, batch: str) -> list[Document]:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        documents = DocumentService.get_batch_documents(dataset_id, batch)
        if not documents:
            raise NotFound('Documents not found.')

        return documents


class GetProcessRuleApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        req_data = request.args

        document_id = req_data.get('document_id')

        # get default rules
        mode = DocumentService.DEFAULT_RULES['mode']
        rules = DocumentService.DEFAULT_RULES['rules']

        if document_id:
            # get the latest process rule
            document = Document.query.get_or_404(document_id)

            dataset = DatasetService.get_dataset(document.dataset_id)
            if not dataset:
                raise NotFound('Dataset not found.')

            try:
                DatasetService.check_dataset_permission(dataset, current_user)
            except services.errors.account.NoPermissionError as e:
                raise Forbidden(str(e))

            # get the latest process rule
            dataset_process_rule = db.session.query(DatasetProcessRule). \
                filter(DatasetProcessRule.dataset_id == document.dataset_id). \
                order_by(DatasetProcessRule.created_at.desc()). \
                limit(1). \
                one_or_none()
            if dataset_process_rule:
                mode = dataset_process_rule.mode
                rules = dataset_process_rule.rules_dict

        return {
            'mode': mode,
            'rules': rules
        }
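

# GET lists a dataset's documents with pagination, optional keyword search on
# the document name, and sorting ('-' prefix means descending; 'hit_count'
# sorts by summed segment hit counts via an outer join). With fetch=true,
# each document also carries its segment progress counts. POST adds new
# documents to the dataset.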
class DatasetDocumentListApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id):
        dataset_id = str(dataset_id)
        page = request.args.get('page', default=1, type=int)
        limit = request.args.get('limit', default=20, type=int)
        search = request.args.get('keyword', default=None, type=str)
        sort = request.args.get('sort', default='-created_at', type=str)
        # "yes", "true", "t", "y" and "1" convert to True; everything else to False.
        try:
            fetch = string_to_bool(request.args.get('fetch', default='false'))
        except (ArgumentTypeError, ValueError, Exception):
            fetch = False

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        query = Document.query.filter_by(
            dataset_id=str(dataset_id), tenant_id=current_user.current_tenant_id)

        if search:
            search = f'%{search}%'
            query = query.filter(Document.name.like(search))

        if sort.startswith('-'):
            sort_logic = desc
            sort = sort[1:]
        else:
            sort_logic = asc

        if sort == 'hit_count':
            sub_query = db.select(DocumentSegment.document_id,
                                  db.func.sum(DocumentSegment.hit_count).label("total_hit_count")) \
                .group_by(DocumentSegment.document_id) \
                .subquery()

            query = query.outerjoin(sub_query, sub_query.c.document_id == Document.id) \
                .order_by(sort_logic(db.func.coalesce(sub_query.c.total_hit_count, 0)))
        elif sort == 'created_at':
            query = query.order_by(sort_logic(Document.created_at))
        else:
            query = query.order_by(desc(Document.created_at))

        paginated_documents = query.paginate(
            page=page, per_page=limit, max_per_page=100, error_out=False)
        documents = paginated_documents.items
        if fetch:
            for document in documents:
                completed_segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
                                                                  DocumentSegment.document_id == str(document.id),
                                                                  DocumentSegment.status != 're_segment').count()
                total_segments = DocumentSegment.query.filter(DocumentSegment.document_id == str(document.id),
                                                              DocumentSegment.status != 're_segment').count()
                document.completed_segments = completed_segments
                document.total_segments = total_segments
            data = marshal(documents, document_with_segments_fields)
        else:
            data = marshal(documents, document_fields)

        response = {
            'data': data,
            'has_more': len(documents) == limit,
            'limit': limit,
            'total': paginated_documents.total,
            'page': page
        }

        return response

    documents_and_batch_fields = {
        'documents': fields.List(fields.Nested(document_fields)),
        'batch': fields.String
    }

    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(documents_and_batch_fields)
    @cloud_edition_billing_resource_check('vector_space')
    def post(self, dataset_id):
        dataset_id = str(dataset_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        parser = reqparse.RequestParser()
        parser.add_argument('indexing_technique', type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False,
                            location='json')
        parser.add_argument('data_source', type=dict, required=False, location='json')
        parser.add_argument('process_rule', type=dict, required=False, location='json')
        parser.add_argument('duplicate', type=bool, default=True, nullable=False, location='json')
        parser.add_argument('original_document_id', type=str, required=False, location='json')
        parser.add_argument('doc_form', type=str, default='text_model', required=False, nullable=False, location='json')
        parser.add_argument('doc_language', type=str, default='English', required=False, nullable=False,
                            location='json')
        parser.add_argument('retrieval_model', type=dict, required=False, nullable=False,
                            location='json')
        args = parser.parse_args()

        if not dataset.indexing_technique and not args['indexing_technique']:
            raise ValueError('indexing_technique is required.')

        # validate args
        DocumentService.document_create_args_validate(args)

        try:
            documents, batch = DocumentService.save_document_with_dataset_id(dataset, args, current_user)
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        return {
            'documents': documents,
            'batch': batch
        }
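

# Creates a dataset together with its first batch of documents in one call.
# For the 'high_quality' indexing technique, a default text-embedding model
# must already be configured for the tenant.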
class DatasetInitApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(dataset_and_document_fields)
    @cloud_edition_billing_resource_check('vector_space')
    def post(self):
        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument('indexing_technique', type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, required=True,
                            nullable=False, location='json')
        parser.add_argument('data_source', type=dict, required=True, nullable=True, location='json')
        parser.add_argument('process_rule', type=dict, required=True, nullable=True, location='json')
        parser.add_argument('doc_form', type=str, default='text_model', required=False, nullable=False, location='json')
        parser.add_argument('doc_language', type=str, default='English', required=False, nullable=False,
                            location='json')
        parser.add_argument('retrieval_model', type=dict, required=False, nullable=False,
                            location='json')
        args = parser.parse_args()

        if args['indexing_technique'] == 'high_quality':
            try:
                model_manager = ModelManager()
                model_manager.get_default_model_instance(
                    tenant_id=current_user.current_tenant_id,
                    model_type=ModelType.TEXT_EMBEDDING
                )
            except InvokeAuthorizationError:
                raise ProviderNotInitializeError(
                    "No Embedding Model available. Please configure a valid provider "
                    "in Settings -> Model Provider.")
            except ProviderTokenNotInitError as ex:
                raise ProviderNotInitializeError(ex.description)

        # validate args
        DocumentService.document_create_args_validate(args)

        try:
            dataset, documents, batch = DocumentService.save_document_without_dataset_id(
                tenant_id=current_user.current_tenant_id,
                document_data=args,
                account=current_user
            )
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        response = {
            'dataset': dataset,
            'documents': documents,
            'batch': batch
        }

        return response


class DocumentIndexingEstimateApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        if document.indexing_status in ['completed', 'error']:
            raise DocumentAlreadyFinishedError()

        data_process_rule = document.dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()

        response = {
            "tokens": 0,
            "total_price": 0,
            "currency": "USD",
            "total_segments": 0,
            "preview": []
        }

        if document.data_source_type == 'upload_file':
            data_source_info = document.data_source_info_dict
            if data_source_info and 'upload_file_id' in data_source_info:
                file_id = data_source_info['upload_file_id']

                file = db.session.query(UploadFile).filter(
                    UploadFile.tenant_id == document.tenant_id,
                    UploadFile.id == file_id
                ).first()

                # raise error if file not found
                if not file:
                    raise NotFound('File not found.')

                extract_setting = ExtractSetting(
                    datasource_type="upload_file",
                    upload_file=file,
                    document_model=document.doc_form
                )

                indexing_runner = IndexingRunner()

                try:
                    response = indexing_runner.indexing_estimate(current_user.current_tenant_id, [extract_setting],
                                                                 data_process_rule_dict, document.doc_form,
                                                                 'English', dataset_id)
                except LLMBadRequestError:
                    raise ProviderNotInitializeError(
                        "No Embedding Model available. Please configure a valid provider "
                        "in Settings -> Model Provider.")
                except ProviderTokenNotInitError as ex:
                    raise ProviderNotInitializeError(ex.description)

        return response
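

# Estimates indexing cost for every document in an upload batch, building one
# ExtractSetting per document according to its data source type (upload_file,
# notion_import, or website_crawl).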
class DocumentBatchIndexingEstimateApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)

        documents = self.get_batch_documents(dataset_id, batch)
        response = {
            "tokens": 0,
            "total_price": 0,
            "currency": "USD",
            "total_segments": 0,
            "preview": []
        }
        if not documents:
            return response

        data_process_rule = documents[0].dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()
        info_list = []
        extract_settings = []
        for document in documents:
            if document.indexing_status in ['completed', 'error']:
                raise DocumentAlreadyFinishedError()
            data_source_info = document.data_source_info_dict
            # format document files info
            if data_source_info and 'upload_file_id' in data_source_info:
                file_id = data_source_info['upload_file_id']
                info_list.append(file_id)
            # format document notion info
            elif data_source_info and 'notion_workspace_id' in data_source_info \
                    and 'notion_page_id' in data_source_info:
                pages = []
                page = {
                    'page_id': data_source_info['notion_page_id'],
                    'type': data_source_info['type']
                }
                pages.append(page)
                notion_info = {
                    'workspace_id': data_source_info['notion_workspace_id'],
                    'pages': pages
                }
                info_list.append(notion_info)

            if document.data_source_type == 'upload_file':
                file_id = data_source_info['upload_file_id']
                file_detail = db.session.query(UploadFile).filter(
                    UploadFile.tenant_id == current_user.current_tenant_id,
                    UploadFile.id == file_id
                ).first()

                if file_detail is None:
                    raise NotFound("File not found.")

                extract_setting = ExtractSetting(
                    datasource_type="upload_file",
                    upload_file=file_detail,
                    document_model=document.doc_form
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == 'notion_import':
                extract_setting = ExtractSetting(
                    datasource_type="notion_import",
                    notion_info={
                        "notion_workspace_id": data_source_info['notion_workspace_id'],
                        "notion_obj_id": data_source_info['notion_page_id'],
                        "notion_page_type": data_source_info['type'],
                        "tenant_id": current_user.current_tenant_id
                    },
                    document_model=document.doc_form
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == 'website_crawl':
                extract_setting = ExtractSetting(
                    datasource_type="website_crawl",
                    website_info={
                        "provider": data_source_info['provider'],
                        "job_id": data_source_info['job_id'],
                        "url": data_source_info['url'],
                        "tenant_id": current_user.current_tenant_id,
                        "mode": data_source_info['mode'],
                        "only_main_content": data_source_info['only_main_content']
                    },
                    document_model=document.doc_form
                )
                extract_settings.append(extract_setting)
            else:
                raise ValueError('Data source type not supported.')

        indexing_runner = IndexingRunner()
        try:
            response = indexing_runner.indexing_estimate(current_user.current_tenant_id, extract_settings,
                                                         data_process_rule_dict, document.doc_form,
                                                         'English', dataset_id)
        except LLMBadRequestError:
            raise ProviderNotInitializeError(
                "No Embedding Model available. Please configure a valid provider "
                "in Settings -> Model Provider.")
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        return response


class DocumentBatchIndexingStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        documents_status = []
        for document in documents:
            completed_segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
                                                              DocumentSegment.document_id == str(document.id),
                                                              DocumentSegment.status != 're_segment').count()
            total_segments = DocumentSegment.query.filter(DocumentSegment.document_id == str(document.id),
                                                          DocumentSegment.status != 're_segment').count()
            document.completed_segments = completed_segments
            document.total_segments = total_segments
            if document.is_paused:
                document.indexing_status = 'paused'
            documents_status.append(marshal(document, document_status_fields))
        data = {
            'data': documents_status
        }
        return data


class DocumentIndexingStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        completed_segments = DocumentSegment.query \
            .filter(DocumentSegment.completed_at.isnot(None),
                    DocumentSegment.document_id == str(document_id),
                    DocumentSegment.status != 're_segment') \
            .count()
        total_segments = DocumentSegment.query \
            .filter(DocumentSegment.document_id == str(document_id),
                    DocumentSegment.status != 're_segment') \
            .count()

        document.completed_segments = completed_segments
        document.total_segments = total_segments
        if document.is_paused:
            document.indexing_status = 'paused'
        return marshal(document, document_status_fields)


class DocumentDetailApi(DocumentResource):
    METADATA_CHOICES = {'all', 'only', 'without'}

    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        metadata = request.args.get('metadata', 'all')
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f'Invalid metadata value: {metadata}')

        if metadata == 'only':
            response = {
                'id': document.id,
                'doc_type': document.doc_type,
                'doc_metadata': document.doc_metadata
            }
        elif metadata == 'without':
            process_rules = DatasetService.get_process_rules(dataset_id)
            data_source_info = document.data_source_detail_dict
            response = {
                'id': document.id,
                'position': document.position,
                'data_source_type': document.data_source_type,
                'data_source_info': data_source_info,
                'dataset_process_rule_id': document.dataset_process_rule_id,
                'dataset_process_rule': process_rules,
                'name': document.name,
                'created_from': document.created_from,
                'created_by': document.created_by,
                'created_at': document.created_at.timestamp(),
                'tokens': document.tokens,
                'indexing_status': document.indexing_status,
                'completed_at': int(document.completed_at.timestamp()) if document.completed_at else None,
                'updated_at': int(document.updated_at.timestamp()) if document.updated_at else None,
                'indexing_latency': document.indexing_latency,
                'error': document.error,
                'enabled': document.enabled,
                'disabled_at': int(document.disabled_at.timestamp()) if document.disabled_at else None,
                'disabled_by': document.disabled_by,
                'archived': document.archived,
                'segment_count': document.segment_count,
                'average_segment_length': document.average_segment_length,
                'hit_count': document.hit_count,
                'display_status': document.display_status,
                'doc_form': document.doc_form
            }
        else:
            process_rules = DatasetService.get_process_rules(dataset_id)
            data_source_info = document.data_source_detail_dict
            response = {
                'id': document.id,
                'position': document.position,
                'data_source_type': document.data_source_type,
                'data_source_info': data_source_info,
                'dataset_process_rule_id': document.dataset_process_rule_id,
                'dataset_process_rule': process_rules,
                'name': document.name,
                'created_from': document.created_from,
                'created_by': document.created_by,
                'created_at': document.created_at.timestamp(),
                'tokens': document.tokens,
                'indexing_status': document.indexing_status,
                'completed_at': int(document.completed_at.timestamp()) if document.completed_at else None,
                'updated_at': int(document.updated_at.timestamp()) if document.updated_at else None,
                'indexing_latency': document.indexing_latency,
                'error': document.error,
                'enabled': document.enabled,
                'disabled_at': int(document.disabled_at.timestamp()) if document.disabled_at else None,
                'disabled_by': document.disabled_by,
                'archived': document.archived,
                'doc_type': document.doc_type,
                'doc_metadata': document.doc_metadata,
                'segment_count': document.segment_count,
                'average_segment_length': document.average_segment_length,
                'hit_count': document.hit_count,
                'display_status': document.display_status,
                'doc_form': document.doc_form
            }

        return response, 200
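

# Pauses or resumes an in-flight indexing run. This is distinct from
# DocumentStatusApi below, which enables/disables an already indexed document.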
class DocumentProcessingApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id, action):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        if action == "pause":
            if document.indexing_status != "indexing":
                raise InvalidActionError('Document not in indexing state.')

            document.paused_by = current_user.id
            document.paused_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.is_paused = True
            db.session.commit()
        elif action == "resume":
            if document.indexing_status not in ["paused", "error"]:
                raise InvalidActionError('Document not in paused or error state.')

            document.paused_by = None
            document.paused_at = None
            document.is_paused = False
            db.session.commit()
        else:
            raise InvalidActionError()

        return {'result': 'success'}, 200


class DocumentDeleteApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def delete(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")
        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        document = self.get_document(dataset_id, document_id)

        try:
            DocumentService.delete_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError('Cannot delete document during indexing.')

        return {'result': 'success'}, 204


class DocumentMetadataApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def put(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        req_data = request.get_json()

        doc_type = req_data.get('doc_type')
        doc_metadata = req_data.get('doc_metadata')

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        if doc_type is None or doc_metadata is None:
            raise ValueError('Both doc_type and doc_metadata must be provided.')

        if doc_type not in DocumentService.DOCUMENT_METADATA_SCHEMA:
            raise ValueError('Invalid doc_type.')

        if not isinstance(doc_metadata, dict):
            raise ValueError('doc_metadata must be a dictionary.')

        metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[doc_type]

        document.doc_metadata = {}
        if doc_type == 'others':
            document.doc_metadata = doc_metadata
        else:
            for key, value_type in metadata_schema.items():
                value = doc_metadata.get(key)
                if value is not None and isinstance(value, value_type):
                    document.doc_metadata[key] = value

        document.doc_type = doc_type
        document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
        db.session.commit()

        return {'result': 'success', 'message': 'Document metadata updated.'}, 200
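

# Enables, disables, archives, or un-archives a document. A short-lived Redis
# key ('document_<id>_indexing', 600-second TTL) guards against queueing the
# same document for (re-)indexing more than once.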
class DocumentStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_resource_check('vector_space')
    def patch(self, dataset_id, document_id, action):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")
        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        document = self.get_document(dataset_id, document_id)

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        indexing_cache_key = 'document_{}_indexing'.format(document.id)
        cache_result = redis_client.get(indexing_cache_key)
        if cache_result is not None:
            raise InvalidActionError("Document is being indexed, please try again later.")

        if action == "enable":
            if document.enabled:
                raise InvalidActionError('Document already enabled.')

            document.enabled = True
            document.disabled_at = None
            document.disabled_by = None
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            add_document_to_index_task.delay(document_id)

            return {'result': 'success'}, 200
        elif action == "disable":
            if not document.completed_at or document.indexing_status != 'completed':
                raise InvalidActionError('Document is not completed.')
            if not document.enabled:
                raise InvalidActionError('Document already disabled.')

            document.enabled = False
            document.disabled_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.disabled_by = current_user.id
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            remove_document_from_index_task.delay(document_id)

            return {'result': 'success'}, 200
        elif action == "archive":
            if document.archived:
                raise InvalidActionError('Document already archived.')

            document.archived = True
            document.archived_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.archived_by = current_user.id
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            if document.enabled:
                # Set cache to prevent indexing the same document multiple times
                redis_client.setex(indexing_cache_key, 600, 1)

                remove_document_from_index_task.delay(document_id)

            return {'result': 'success'}, 200
        elif action == "un_archive":
            if not document.archived:
                raise InvalidActionError('Document is not archived.')

            document.archived = False
            document.archived_at = None
            document.archived_by = None
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            add_document_to_index_task.delay(document_id)

            return {'result': 'success'}, 200
        else:
            raise InvalidActionError()


class DocumentPauseApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id):
        """pause document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document does not exist.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # pause document
            DocumentService.pause_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError('Cannot pause completed document.')

        return {'result': 'success'}, 204


class DocumentRecoverApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id):
        """recover document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document does not exist.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # recover document
            DocumentService.recover_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError('Document is not in paused status.')

        return {'result': 'success'}, 204


class DocumentRetryApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def post(self, dataset_id):
        """retry document."""
        parser = reqparse.RequestParser()
        parser.add_argument('document_ids', type=list, required=True, nullable=False,
                            location='json')
        args = parser.parse_args()
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        retry_documents = []
        if not dataset:
            raise NotFound('Dataset not found.')
        for document_id in args['document_ids']:
            try:
                document_id = str(document_id)

                document = DocumentService.get_document(dataset.id, document_id)

                # 404 if document not found
                if document is None:
                    raise NotFound("Document does not exist.")

                # 403 if document is archived
                if DocumentService.check_archived(document):
                    raise ArchivedDocumentImmutableError()

                # 400 if document is completed
                if document.indexing_status == 'completed':
                    raise DocumentAlreadyFinishedError()
                retry_documents.append(document)
            except Exception as e:
                logging.error(f"Document {document_id} retry failed: {str(e)}")
                continue
        # retry document
        DocumentService.retry_document(dataset_id, retry_documents)

        return {'result': 'success'}, 204


class DocumentRenameApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(document_fields)
    def post(self, dataset_id, document_id):
        # The role of the current user in the ta table must be admin or owner
        if not current_user.is_admin_or_owner:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument('name', type=str, required=True, nullable=False, location='json')
        args = parser.parse_args()

        try:
            document = DocumentService.rename_document(dataset_id, document_id, args['name'])
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError('Cannot rename document during indexing.')

        return document


class WebsiteDocumentSyncApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        """sync website document."""
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')
        document_id = str(document_id)
        document = DocumentService.get_document(dataset.id, document_id)
        if not document:
            raise NotFound('Document not found.')
        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden('No permission.')
        if document.data_source_type != 'website_crawl':
            raise ValueError('Document is not a website document.')
        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()
        # sync document
        DocumentService.sync_website_document(dataset_id, document)

        return {'result': 'success'}, 200


api.add_resource(GetProcessRuleApi, '/datasets/process-rule')
api.add_resource(DatasetDocumentListApi,
                 '/datasets/<uuid:dataset_id>/documents')
api.add_resource(DatasetInitApi,
                 '/datasets/init')
api.add_resource(DocumentIndexingEstimateApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-estimate')
api.add_resource(DocumentBatchIndexingEstimateApi,
                 '/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-estimate')
api.add_resource(DocumentBatchIndexingStatusApi,
                 '/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-status')
api.add_resource(DocumentIndexingStatusApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-status')
api.add_resource(DocumentDetailApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>')
api.add_resource(DocumentProcessingApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/<string:action>')
api.add_resource(DocumentDeleteApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>')
api.add_resource(DocumentMetadataApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/metadata')
api.add_resource(DocumentStatusApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/status/<string:action>')
api.add_resource(DocumentPauseApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/pause')
api.add_resource(DocumentRecoverApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/resume')
api.add_resource(DocumentRetryApi, '/datasets/<uuid:dataset_id>/retry')
api.add_resource(DocumentRenameApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/rename')
api.add_resource(WebsiteDocumentSyncApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/website-sync')
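
# Example calls (a sketch, not part of this module; it assumes the console API
# blueprint is mounted under /console/api and that the request is already
# authenticated, both of which are assumptions about the surrounding app):
#
#   GET   /console/api/datasets/<dataset_id>/documents?page=1&limit=20&keyword=faq&sort=-hit_count&fetch=true
#   POST  /console/api/datasets/init
#         {"indexing_technique": "high_quality", "data_source": {...}, "process_rule": {...}}
#   PATCH /console/api/datasets/<dataset_id>/documents/<document_id>/status/disable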