dataset.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520
  1. import json
  2. import logging
  3. import pickle
  4. from json import JSONDecodeError
  5. from sqlalchemy import func
  6. from sqlalchemy.dialects.postgresql import JSONB, UUID
  7. from extensions.ext_database import db
  8. from extensions.ext_storage import storage
  9. from models.account import Account
  10. from models.model import App, UploadFile
  11. class Dataset(db.Model):
  12. __tablename__ = 'datasets'
  13. __table_args__ = (
  14. db.PrimaryKeyConstraint('id', name='dataset_pkey'),
  15. db.Index('dataset_tenant_idx', 'tenant_id'),
  16. db.Index('retrieval_model_idx', "retrieval_model", postgresql_using='gin')
  17. )
  18. INDEXING_TECHNIQUE_LIST = ['high_quality', 'economy', None]
  19. id = db.Column(UUID, server_default=db.text('uuid_generate_v4()'))
  20. tenant_id = db.Column(UUID, nullable=False)
  21. name = db.Column(db.String(255), nullable=False)
  22. description = db.Column(db.Text, nullable=True)
  23. provider = db.Column(db.String(255), nullable=False,
  24. server_default=db.text("'vendor'::character varying"))
  25. permission = db.Column(db.String(255), nullable=False,
  26. server_default=db.text("'only_me'::character varying"))
  27. data_source_type = db.Column(db.String(255))
  28. indexing_technique = db.Column(db.String(255), nullable=True)
  29. index_struct = db.Column(db.Text, nullable=True)
  30. created_by = db.Column(UUID, nullable=False)
  31. created_at = db.Column(db.DateTime, nullable=False,
  32. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  33. updated_by = db.Column(UUID, nullable=True)
  34. updated_at = db.Column(db.DateTime, nullable=False,
  35. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  36. embedding_model = db.Column(db.String(255), nullable=True)
  37. embedding_model_provider = db.Column(db.String(255), nullable=True)
  38. collection_binding_id = db.Column(UUID, nullable=True)
  39. retrieval_model = db.Column(JSONB, nullable=True)
  40. @property
  41. def dataset_keyword_table(self):
  42. dataset_keyword_table = db.session.query(DatasetKeywordTable).filter(
  43. DatasetKeywordTable.dataset_id == self.id).first()
  44. if dataset_keyword_table:
  45. return dataset_keyword_table
  46. return None
  47. @property
  48. def index_struct_dict(self):
  49. return json.loads(self.index_struct) if self.index_struct else None
  50. @property
  51. def created_by_account(self):
  52. return Account.query.get(self.created_by)
  53. @property
  54. def latest_process_rule(self):
  55. return DatasetProcessRule.query.filter(DatasetProcessRule.dataset_id == self.id) \
  56. .order_by(DatasetProcessRule.created_at.desc()).first()
  57. @property
  58. def app_count(self):
  59. return db.session.query(func.count(AppDatasetJoin.id)).filter(AppDatasetJoin.dataset_id == self.id).scalar()
  60. @property
  61. def document_count(self):
  62. return db.session.query(func.count(Document.id)).filter(Document.dataset_id == self.id).scalar()
  63. @property
  64. def available_document_count(self):
  65. return db.session.query(func.count(Document.id)).filter(
  66. Document.dataset_id == self.id,
  67. Document.indexing_status == 'completed',
  68. Document.enabled == True,
  69. Document.archived == False
  70. ).scalar()
  71. @property
  72. def available_segment_count(self):
  73. return db.session.query(func.count(DocumentSegment.id)).filter(
  74. DocumentSegment.dataset_id == self.id,
  75. DocumentSegment.status == 'completed',
  76. DocumentSegment.enabled == True
  77. ).scalar()
  78. @property
  79. def word_count(self):
  80. return Document.query.with_entities(func.coalesce(func.sum(Document.word_count))) \
  81. .filter(Document.dataset_id == self.id).scalar()
  82. @property
  83. def doc_form(self):
  84. document = db.session.query(Document).filter(
  85. Document.dataset_id == self.id).first()
  86. if document:
  87. return document.doc_form
  88. return None
  89. @property
  90. def retrieval_model_dict(self):
  91. default_retrieval_model = {
  92. 'search_method': 'semantic_search',
  93. 'reranking_enable': False,
  94. 'reranking_model': {
  95. 'reranking_provider_name': '',
  96. 'reranking_model_name': ''
  97. },
  98. 'top_k': 2,
  99. 'score_threshold_enabled': False
  100. }
  101. return self.retrieval_model if self.retrieval_model else default_retrieval_model
  102. @staticmethod
  103. def gen_collection_name_by_id(dataset_id: str) -> str:
  104. normalized_dataset_id = dataset_id.replace("-", "_")
  105. return f'Vector_index_{normalized_dataset_id}_Node'
  106. class DatasetProcessRule(db.Model):
  107. __tablename__ = 'dataset_process_rules'
  108. __table_args__ = (
  109. db.PrimaryKeyConstraint('id', name='dataset_process_rule_pkey'),
  110. db.Index('dataset_process_rule_dataset_id_idx', 'dataset_id'),
  111. )
  112. id = db.Column(UUID, nullable=False,
  113. server_default=db.text('uuid_generate_v4()'))
  114. dataset_id = db.Column(UUID, nullable=False)
  115. mode = db.Column(db.String(255), nullable=False,
  116. server_default=db.text("'automatic'::character varying"))
  117. rules = db.Column(db.Text, nullable=True)
  118. created_by = db.Column(UUID, nullable=False)
  119. created_at = db.Column(db.DateTime, nullable=False,
  120. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  121. MODES = ['automatic', 'custom']
  122. PRE_PROCESSING_RULES = ['remove_stopwords', 'remove_extra_spaces', 'remove_urls_emails']
  123. AUTOMATIC_RULES = {
  124. 'pre_processing_rules': [
  125. {'id': 'remove_extra_spaces', 'enabled': True},
  126. {'id': 'remove_urls_emails', 'enabled': False}
  127. ],
  128. 'segmentation': {
  129. 'delimiter': '\n',
  130. 'max_tokens': 500,
  131. 'chunk_overlap': 50
  132. }
  133. }
  134. def to_dict(self):
  135. return {
  136. 'id': self.id,
  137. 'dataset_id': self.dataset_id,
  138. 'mode': self.mode,
  139. 'rules': self.rules_dict,
  140. 'created_by': self.created_by,
  141. 'created_at': self.created_at,
  142. }
  143. @property
  144. def rules_dict(self):
  145. try:
  146. return json.loads(self.rules) if self.rules else None
  147. except JSONDecodeError:
  148. return None
  149. class Document(db.Model):
  150. __tablename__ = 'documents'
  151. __table_args__ = (
  152. db.PrimaryKeyConstraint('id', name='document_pkey'),
  153. db.Index('document_dataset_id_idx', 'dataset_id'),
  154. db.Index('document_is_paused_idx', 'is_paused'),
  155. db.Index('document_tenant_idx', 'tenant_id'),
  156. )
  157. # initial fields
  158. id = db.Column(UUID, nullable=False,
  159. server_default=db.text('uuid_generate_v4()'))
  160. tenant_id = db.Column(UUID, nullable=False)
  161. dataset_id = db.Column(UUID, nullable=False)
  162. position = db.Column(db.Integer, nullable=False)
  163. data_source_type = db.Column(db.String(255), nullable=False)
  164. data_source_info = db.Column(db.Text, nullable=True)
  165. dataset_process_rule_id = db.Column(UUID, nullable=True)
  166. batch = db.Column(db.String(255), nullable=False)
  167. name = db.Column(db.String(255), nullable=False)
  168. created_from = db.Column(db.String(255), nullable=False)
  169. created_by = db.Column(UUID, nullable=False)
  170. created_api_request_id = db.Column(UUID, nullable=True)
  171. created_at = db.Column(db.DateTime, nullable=False,
  172. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  173. # start processing
  174. processing_started_at = db.Column(db.DateTime, nullable=True)
  175. # parsing
  176. file_id = db.Column(db.Text, nullable=True)
  177. word_count = db.Column(db.Integer, nullable=True)
  178. parsing_completed_at = db.Column(db.DateTime, nullable=True)
  179. # cleaning
  180. cleaning_completed_at = db.Column(db.DateTime, nullable=True)
  181. # split
  182. splitting_completed_at = db.Column(db.DateTime, nullable=True)
  183. # indexing
  184. tokens = db.Column(db.Integer, nullable=True)
  185. indexing_latency = db.Column(db.Float, nullable=True)
  186. completed_at = db.Column(db.DateTime, nullable=True)
  187. # pause
  188. is_paused = db.Column(db.Boolean, nullable=True, server_default=db.text('false'))
  189. paused_by = db.Column(UUID, nullable=True)
  190. paused_at = db.Column(db.DateTime, nullable=True)
  191. # error
  192. error = db.Column(db.Text, nullable=True)
  193. stopped_at = db.Column(db.DateTime, nullable=True)
  194. # basic fields
  195. indexing_status = db.Column(db.String(
  196. 255), nullable=False, server_default=db.text("'waiting'::character varying"))
  197. enabled = db.Column(db.Boolean, nullable=False,
  198. server_default=db.text('true'))
  199. disabled_at = db.Column(db.DateTime, nullable=True)
  200. disabled_by = db.Column(UUID, nullable=True)
  201. archived = db.Column(db.Boolean, nullable=False,
  202. server_default=db.text('false'))
  203. archived_reason = db.Column(db.String(255), nullable=True)
  204. archived_by = db.Column(UUID, nullable=True)
  205. archived_at = db.Column(db.DateTime, nullable=True)
  206. updated_at = db.Column(db.DateTime, nullable=False,
  207. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  208. doc_type = db.Column(db.String(40), nullable=True)
  209. doc_metadata = db.Column(db.JSON, nullable=True)
  210. doc_form = db.Column(db.String(
  211. 255), nullable=False, server_default=db.text("'text_model'::character varying"))
  212. doc_language = db.Column(db.String(255), nullable=True)
  213. DATA_SOURCES = ['upload_file', 'notion_import']
  214. @property
  215. def display_status(self):
  216. status = None
  217. if self.indexing_status == 'waiting':
  218. status = 'queuing'
  219. elif self.indexing_status not in ['completed', 'error', 'waiting'] and self.is_paused:
  220. status = 'paused'
  221. elif self.indexing_status in ['parsing', 'cleaning', 'splitting', 'indexing']:
  222. status = 'indexing'
  223. elif self.indexing_status == 'error':
  224. status = 'error'
  225. elif self.indexing_status == 'completed' and not self.archived and self.enabled:
  226. status = 'available'
  227. elif self.indexing_status == 'completed' and not self.archived and not self.enabled:
  228. status = 'disabled'
  229. elif self.indexing_status == 'completed' and self.archived:
  230. status = 'archived'
  231. return status
  232. @property
  233. def data_source_info_dict(self):
  234. if self.data_source_info:
  235. try:
  236. data_source_info_dict = json.loads(self.data_source_info)
  237. except JSONDecodeError:
  238. data_source_info_dict = {}
  239. return data_source_info_dict
  240. return None
  241. @property
  242. def data_source_detail_dict(self):
  243. if self.data_source_info:
  244. if self.data_source_type == 'upload_file':
  245. data_source_info_dict = json.loads(self.data_source_info)
  246. file_detail = db.session.query(UploadFile). \
  247. filter(UploadFile.id == data_source_info_dict['upload_file_id']). \
  248. one_or_none()
  249. if file_detail:
  250. return {
  251. 'upload_file': {
  252. 'id': file_detail.id,
  253. 'name': file_detail.name,
  254. 'size': file_detail.size,
  255. 'extension': file_detail.extension,
  256. 'mime_type': file_detail.mime_type,
  257. 'created_by': file_detail.created_by,
  258. 'created_at': file_detail.created_at.timestamp()
  259. }
  260. }
  261. elif self.data_source_type == 'notion_import':
  262. return json.loads(self.data_source_info)
  263. return {}
  264. @property
  265. def average_segment_length(self):
  266. if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
  267. return self.word_count // self.segment_count
  268. return 0
  269. @property
  270. def dataset_process_rule(self):
  271. if self.dataset_process_rule_id:
  272. return DatasetProcessRule.query.get(self.dataset_process_rule_id)
  273. return None
  274. @property
  275. def dataset(self):
  276. return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).one_or_none()
  277. @property
  278. def segment_count(self):
  279. return DocumentSegment.query.filter(DocumentSegment.document_id == self.id).count()
  280. @property
  281. def hit_count(self):
  282. return DocumentSegment.query.with_entities(func.coalesce(func.sum(DocumentSegment.hit_count))) \
  283. .filter(DocumentSegment.document_id == self.id).scalar()
  284. class DocumentSegment(db.Model):
  285. __tablename__ = 'document_segments'
  286. __table_args__ = (
  287. db.PrimaryKeyConstraint('id', name='document_segment_pkey'),
  288. db.Index('document_segment_dataset_id_idx', 'dataset_id'),
  289. db.Index('document_segment_document_id_idx', 'document_id'),
  290. db.Index('document_segment_tenant_dataset_idx', 'dataset_id', 'tenant_id'),
  291. db.Index('document_segment_tenant_document_idx', 'document_id', 'tenant_id'),
  292. db.Index('document_segment_dataset_node_idx', 'dataset_id', 'index_node_id'),
  293. db.Index('document_segment_tenant_idx', 'tenant_id'),
  294. )
  295. # initial fields
  296. id = db.Column(UUID, nullable=False,
  297. server_default=db.text('uuid_generate_v4()'))
  298. tenant_id = db.Column(UUID, nullable=False)
  299. dataset_id = db.Column(UUID, nullable=False)
  300. document_id = db.Column(UUID, nullable=False)
  301. position = db.Column(db.Integer, nullable=False)
  302. content = db.Column(db.Text, nullable=False)
  303. answer = db.Column(db.Text, nullable=True)
  304. word_count = db.Column(db.Integer, nullable=False)
  305. tokens = db.Column(db.Integer, nullable=False)
  306. # indexing fields
  307. keywords = db.Column(db.JSON, nullable=True)
  308. index_node_id = db.Column(db.String(255), nullable=True)
  309. index_node_hash = db.Column(db.String(255), nullable=True)
  310. # basic fields
  311. hit_count = db.Column(db.Integer, nullable=False, default=0)
  312. enabled = db.Column(db.Boolean, nullable=False,
  313. server_default=db.text('true'))
  314. disabled_at = db.Column(db.DateTime, nullable=True)
  315. disabled_by = db.Column(UUID, nullable=True)
  316. status = db.Column(db.String(255), nullable=False,
  317. server_default=db.text("'waiting'::character varying"))
  318. created_by = db.Column(UUID, nullable=False)
  319. created_at = db.Column(db.DateTime, nullable=False,
  320. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  321. updated_by = db.Column(UUID, nullable=True)
  322. updated_at = db.Column(db.DateTime, nullable=False,
  323. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  324. indexing_at = db.Column(db.DateTime, nullable=True)
  325. completed_at = db.Column(db.DateTime, nullable=True)
  326. error = db.Column(db.Text, nullable=True)
  327. stopped_at = db.Column(db.DateTime, nullable=True)
  328. @property
  329. def dataset(self):
  330. return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).first()
  331. @property
  332. def document(self):
  333. return db.session.query(Document).filter(Document.id == self.document_id).first()
  334. @property
  335. def previous_segment(self):
  336. return db.session.query(DocumentSegment).filter(
  337. DocumentSegment.document_id == self.document_id,
  338. DocumentSegment.position == self.position - 1
  339. ).first()
  340. @property
  341. def next_segment(self):
  342. return db.session.query(DocumentSegment).filter(
  343. DocumentSegment.document_id == self.document_id,
  344. DocumentSegment.position == self.position + 1
  345. ).first()
  346. class AppDatasetJoin(db.Model):
  347. __tablename__ = 'app_dataset_joins'
  348. __table_args__ = (
  349. db.PrimaryKeyConstraint('id', name='app_dataset_join_pkey'),
  350. db.Index('app_dataset_join_app_dataset_idx', 'dataset_id', 'app_id'),
  351. )
  352. id = db.Column(UUID, primary_key=True, nullable=False, server_default=db.text('uuid_generate_v4()'))
  353. app_id = db.Column(UUID, nullable=False)
  354. dataset_id = db.Column(UUID, nullable=False)
  355. created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())
  356. @property
  357. def app(self):
  358. return App.query.get(self.app_id)
  359. class DatasetQuery(db.Model):
  360. __tablename__ = 'dataset_queries'
  361. __table_args__ = (
  362. db.PrimaryKeyConstraint('id', name='dataset_query_pkey'),
  363. db.Index('dataset_query_dataset_id_idx', 'dataset_id'),
  364. )
  365. id = db.Column(UUID, primary_key=True, nullable=False, server_default=db.text('uuid_generate_v4()'))
  366. dataset_id = db.Column(UUID, nullable=False)
  367. content = db.Column(db.Text, nullable=False)
  368. source = db.Column(db.String(255), nullable=False)
  369. source_app_id = db.Column(UUID, nullable=True)
  370. created_by_role = db.Column(db.String, nullable=False)
  371. created_by = db.Column(UUID, nullable=False)
  372. created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())
  373. class DatasetKeywordTable(db.Model):
  374. __tablename__ = 'dataset_keyword_tables'
  375. __table_args__ = (
  376. db.PrimaryKeyConstraint('id', name='dataset_keyword_table_pkey'),
  377. db.Index('dataset_keyword_table_dataset_id_idx', 'dataset_id'),
  378. )
  379. id = db.Column(UUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))
  380. dataset_id = db.Column(UUID, nullable=False, unique=True)
  381. keyword_table = db.Column(db.Text, nullable=False)
  382. data_source_type = db.Column(db.String(255), nullable=False,
  383. server_default=db.text("'database'::character varying"))
  384. @property
  385. def keyword_table_dict(self):
  386. class SetDecoder(json.JSONDecoder):
  387. def __init__(self, *args, **kwargs):
  388. super().__init__(object_hook=self.object_hook, *args, **kwargs)
  389. def object_hook(self, dct):
  390. if isinstance(dct, dict):
  391. for keyword, node_idxs in dct.items():
  392. if isinstance(node_idxs, list):
  393. dct[keyword] = set(node_idxs)
  394. return dct
  395. # get dataset
  396. dataset = Dataset.query.filter_by(
  397. id=self.dataset_id
  398. ).first()
  399. if not dataset:
  400. return None
  401. if self.data_source_type == 'database':
  402. return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None
  403. else:
  404. file_key = 'keyword_files/' + dataset.tenant_id + '/' + self.dataset_id + '.txt'
  405. try:
  406. keyword_table_text = storage.load_once(file_key)
  407. if keyword_table_text:
  408. return json.loads(keyword_table_text.decode('utf-8'), cls=SetDecoder)
  409. return None
  410. except Exception as e:
  411. logging.exception(str(e))
  412. return None
  413. class Embedding(db.Model):
  414. __tablename__ = 'embeddings'
  415. __table_args__ = (
  416. db.PrimaryKeyConstraint('id', name='embedding_pkey'),
  417. db.UniqueConstraint('model_name', 'hash', 'provider_name', name='embedding_hash_idx')
  418. )
  419. id = db.Column(UUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))
  420. model_name = db.Column(db.String(40), nullable=False,
  421. server_default=db.text("'text-embedding-ada-002'::character varying"))
  422. hash = db.Column(db.String(64), nullable=False)
  423. embedding = db.Column(db.LargeBinary, nullable=False)
  424. created_at = db.Column(db.DateTime, nullable=False, server_default=db.text('CURRENT_TIMESTAMP(0)'))
  425. provider_name = db.Column(db.String(40), nullable=False,
  426. server_default=db.text("''::character varying"))
  427. def set_embedding(self, embedding_data: list[float]):
  428. self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL)
  429. def get_embedding(self) -> list[float]:
  430. return pickle.loads(self.embedding)
  431. class DatasetCollectionBinding(db.Model):
  432. __tablename__ = 'dataset_collection_bindings'
  433. __table_args__ = (
  434. db.PrimaryKeyConstraint('id', name='dataset_collection_bindings_pkey'),
  435. db.Index('provider_model_name_idx', 'provider_name', 'model_name')
  436. )
  437. id = db.Column(UUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))
  438. provider_name = db.Column(db.String(40), nullable=False)
  439. model_name = db.Column(db.String(40), nullable=False)
  440. type = db.Column(db.String(40), server_default=db.text("'dataset'::character varying"), nullable=False)
  441. collection_name = db.Column(db.String(64), nullable=False)
  442. created_at = db.Column(db.DateTime, nullable=False, server_default=db.text('CURRENT_TIMESTAMP(0)'))