dataset.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466
  1. import json
  2. import pickle
  3. from json import JSONDecodeError
  4. from sqlalchemy import func
  5. from sqlalchemy.dialects.postgresql import UUID
  6. from extensions.ext_database import db
  7. from models.account import Account
  8. from models.model import App, UploadFile
  9. class Dataset(db.Model):
  10. __tablename__ = 'datasets'
  11. __table_args__ = (
  12. db.PrimaryKeyConstraint('id', name='dataset_pkey'),
  13. db.Index('dataset_tenant_idx', 'tenant_id'),
  14. )
  15. INDEXING_TECHNIQUE_LIST = ['high_quality', 'economy']
  16. id = db.Column(UUID, server_default=db.text('uuid_generate_v4()'))
  17. tenant_id = db.Column(UUID, nullable=False)
  18. name = db.Column(db.String(255), nullable=False)
  19. description = db.Column(db.Text, nullable=True)
  20. provider = db.Column(db.String(255), nullable=False,
  21. server_default=db.text("'vendor'::character varying"))
  22. permission = db.Column(db.String(255), nullable=False,
  23. server_default=db.text("'only_me'::character varying"))
  24. data_source_type = db.Column(db.String(255))
  25. indexing_technique = db.Column(db.String(255), nullable=True)
  26. index_struct = db.Column(db.Text, nullable=True)
  27. created_by = db.Column(UUID, nullable=False)
  28. created_at = db.Column(db.DateTime, nullable=False,
  29. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  30. updated_by = db.Column(UUID, nullable=True)
  31. updated_at = db.Column(db.DateTime, nullable=False,
  32. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  33. embedding_model = db.Column(db.String(255), nullable=True)
  34. embedding_model_provider = db.Column(db.String(255), nullable=True)
  35. collection_binding_id = db.Column(UUID, nullable=True)
  36. @property
  37. def dataset_keyword_table(self):
  38. dataset_keyword_table = db.session.query(DatasetKeywordTable).filter(
  39. DatasetKeywordTable.dataset_id == self.id).first()
  40. if dataset_keyword_table:
  41. return dataset_keyword_table
  42. return None
  43. @property
  44. def index_struct_dict(self):
  45. return json.loads(self.index_struct) if self.index_struct else None
  46. @property
  47. def created_by_account(self):
  48. return Account.query.get(self.created_by)
  49. @property
  50. def latest_process_rule(self):
  51. return DatasetProcessRule.query.filter(DatasetProcessRule.dataset_id == self.id) \
  52. .order_by(DatasetProcessRule.created_at.desc()).first()
  53. @property
  54. def app_count(self):
  55. return db.session.query(func.count(AppDatasetJoin.id)).filter(AppDatasetJoin.dataset_id == self.id).scalar()
  56. @property
  57. def document_count(self):
  58. return db.session.query(func.count(Document.id)).filter(Document.dataset_id == self.id).scalar()
  59. @property
  60. def available_document_count(self):
  61. return db.session.query(func.count(Document.id)).filter(
  62. Document.dataset_id == self.id,
  63. Document.indexing_status == 'completed',
  64. Document.enabled == True,
  65. Document.archived == False
  66. ).scalar()
  67. @property
  68. def available_segment_count(self):
  69. return db.session.query(func.count(DocumentSegment.id)).filter(
  70. DocumentSegment.dataset_id == self.id,
  71. DocumentSegment.status == 'completed',
  72. DocumentSegment.enabled == True
  73. ).scalar()
  74. @property
  75. def word_count(self):
  76. return Document.query.with_entities(func.coalesce(func.sum(Document.word_count))) \
  77. .filter(Document.dataset_id == self.id).scalar()
  78. class DatasetProcessRule(db.Model):
  79. __tablename__ = 'dataset_process_rules'
  80. __table_args__ = (
  81. db.PrimaryKeyConstraint('id', name='dataset_process_rule_pkey'),
  82. db.Index('dataset_process_rule_dataset_id_idx', 'dataset_id'),
  83. )
  84. id = db.Column(UUID, nullable=False,
  85. server_default=db.text('uuid_generate_v4()'))
  86. dataset_id = db.Column(UUID, nullable=False)
  87. mode = db.Column(db.String(255), nullable=False,
  88. server_default=db.text("'automatic'::character varying"))
  89. rules = db.Column(db.Text, nullable=True)
  90. created_by = db.Column(UUID, nullable=False)
  91. created_at = db.Column(db.DateTime, nullable=False,
  92. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  93. MODES = ['automatic', 'custom']
  94. PRE_PROCESSING_RULES = ['remove_stopwords', 'remove_extra_spaces', 'remove_urls_emails']
  95. AUTOMATIC_RULES = {
  96. 'pre_processing_rules': [
  97. {'id': 'remove_extra_spaces', 'enabled': True},
  98. {'id': 'remove_urls_emails', 'enabled': False}
  99. ],
  100. 'segmentation': {
  101. 'delimiter': '\n',
  102. 'max_tokens': 1000
  103. }
  104. }
  105. def to_dict(self):
  106. return {
  107. 'id': self.id,
  108. 'dataset_id': self.dataset_id,
  109. 'mode': self.mode,
  110. 'rules': self.rules_dict,
  111. 'created_by': self.created_by,
  112. 'created_at': self.created_at,
  113. }
  114. @property
  115. def rules_dict(self):
  116. try:
  117. return json.loads(self.rules) if self.rules else None
  118. except JSONDecodeError:
  119. return None
  120. class Document(db.Model):
  121. __tablename__ = 'documents'
  122. __table_args__ = (
  123. db.PrimaryKeyConstraint('id', name='document_pkey'),
  124. db.Index('document_dataset_id_idx', 'dataset_id'),
  125. db.Index('document_is_paused_idx', 'is_paused'),
  126. )
  127. # initial fields
  128. id = db.Column(UUID, nullable=False,
  129. server_default=db.text('uuid_generate_v4()'))
  130. tenant_id = db.Column(UUID, nullable=False)
  131. dataset_id = db.Column(UUID, nullable=False)
  132. position = db.Column(db.Integer, nullable=False)
  133. data_source_type = db.Column(db.String(255), nullable=False)
  134. data_source_info = db.Column(db.Text, nullable=True)
  135. dataset_process_rule_id = db.Column(UUID, nullable=True)
  136. batch = db.Column(db.String(255), nullable=False)
  137. name = db.Column(db.String(255), nullable=False)
  138. created_from = db.Column(db.String(255), nullable=False)
  139. created_by = db.Column(UUID, nullable=False)
  140. created_api_request_id = db.Column(UUID, nullable=True)
  141. created_at = db.Column(db.DateTime, nullable=False,
  142. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  143. # start processing
  144. processing_started_at = db.Column(db.DateTime, nullable=True)
  145. # parsing
  146. file_id = db.Column(db.Text, nullable=True)
  147. word_count = db.Column(db.Integer, nullable=True)
  148. parsing_completed_at = db.Column(db.DateTime, nullable=True)
  149. # cleaning
  150. cleaning_completed_at = db.Column(db.DateTime, nullable=True)
  151. # split
  152. splitting_completed_at = db.Column(db.DateTime, nullable=True)
  153. # indexing
  154. tokens = db.Column(db.Integer, nullable=True)
  155. indexing_latency = db.Column(db.Float, nullable=True)
  156. completed_at = db.Column(db.DateTime, nullable=True)
  157. # pause
  158. is_paused = db.Column(db.Boolean, nullable=True, server_default=db.text('false'))
  159. paused_by = db.Column(UUID, nullable=True)
  160. paused_at = db.Column(db.DateTime, nullable=True)
  161. # error
  162. error = db.Column(db.Text, nullable=True)
  163. stopped_at = db.Column(db.DateTime, nullable=True)
  164. # basic fields
  165. indexing_status = db.Column(db.String(
  166. 255), nullable=False, server_default=db.text("'waiting'::character varying"))
  167. enabled = db.Column(db.Boolean, nullable=False,
  168. server_default=db.text('true'))
  169. disabled_at = db.Column(db.DateTime, nullable=True)
  170. disabled_by = db.Column(UUID, nullable=True)
  171. archived = db.Column(db.Boolean, nullable=False,
  172. server_default=db.text('false'))
  173. archived_reason = db.Column(db.String(255), nullable=True)
  174. archived_by = db.Column(UUID, nullable=True)
  175. archived_at = db.Column(db.DateTime, nullable=True)
  176. updated_at = db.Column(db.DateTime, nullable=False,
  177. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  178. doc_type = db.Column(db.String(40), nullable=True)
  179. doc_metadata = db.Column(db.JSON, nullable=True)
  180. doc_form = db.Column(db.String(
  181. 255), nullable=False, server_default=db.text("'text_model'::character varying"))
  182. doc_language = db.Column(db.String(255), nullable=True)
  183. DATA_SOURCES = ['upload_file', 'notion_import']
  184. @property
  185. def display_status(self):
  186. status = None
  187. if self.indexing_status == 'waiting':
  188. status = 'queuing'
  189. elif self.indexing_status not in ['completed', 'error', 'waiting'] and self.is_paused:
  190. status = 'paused'
  191. elif self.indexing_status in ['parsing', 'cleaning', 'splitting', 'indexing']:
  192. status = 'indexing'
  193. elif self.indexing_status == 'error':
  194. status = 'error'
  195. elif self.indexing_status == 'completed' and not self.archived and self.enabled:
  196. status = 'available'
  197. elif self.indexing_status == 'completed' and not self.archived and not self.enabled:
  198. status = 'disabled'
  199. elif self.indexing_status == 'completed' and self.archived:
  200. status = 'archived'
  201. return status
  202. @property
  203. def data_source_info_dict(self):
  204. if self.data_source_info:
  205. try:
  206. data_source_info_dict = json.loads(self.data_source_info)
  207. except JSONDecodeError:
  208. data_source_info_dict = {}
  209. return data_source_info_dict
  210. return None
  211. @property
  212. def data_source_detail_dict(self):
  213. if self.data_source_info:
  214. if self.data_source_type == 'upload_file':
  215. data_source_info_dict = json.loads(self.data_source_info)
  216. file_detail = db.session.query(UploadFile). \
  217. filter(UploadFile.id == data_source_info_dict['upload_file_id']). \
  218. one_or_none()
  219. if file_detail:
  220. return {
  221. 'upload_file': {
  222. 'id': file_detail.id,
  223. 'name': file_detail.name,
  224. 'size': file_detail.size,
  225. 'extension': file_detail.extension,
  226. 'mime_type': file_detail.mime_type,
  227. 'created_by': file_detail.created_by,
  228. 'created_at': file_detail.created_at.timestamp()
  229. }
  230. }
  231. elif self.data_source_type == 'notion_import':
  232. return json.loads(self.data_source_info)
  233. return {}
  234. @property
  235. def average_segment_length(self):
  236. if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
  237. return self.word_count // self.segment_count
  238. return 0
  239. @property
  240. def dataset_process_rule(self):
  241. if self.dataset_process_rule_id:
  242. return DatasetProcessRule.query.get(self.dataset_process_rule_id)
  243. return None
  244. @property
  245. def dataset(self):
  246. return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).one_or_none()
  247. @property
  248. def segment_count(self):
  249. return DocumentSegment.query.filter(DocumentSegment.document_id == self.id).count()
  250. @property
  251. def hit_count(self):
  252. return DocumentSegment.query.with_entities(func.coalesce(func.sum(DocumentSegment.hit_count))) \
  253. .filter(DocumentSegment.document_id == self.id).scalar()
  254. class DocumentSegment(db.Model):
  255. __tablename__ = 'document_segments'
  256. __table_args__ = (
  257. db.PrimaryKeyConstraint('id', name='document_segment_pkey'),
  258. db.Index('document_segment_dataset_id_idx', 'dataset_id'),
  259. db.Index('document_segment_document_id_idx', 'document_id'),
  260. db.Index('document_segment_tenant_dataset_idx', 'dataset_id', 'tenant_id'),
  261. db.Index('document_segment_tenant_document_idx', 'document_id', 'tenant_id'),
  262. db.Index('document_segment_dataset_node_idx', 'dataset_id', 'index_node_id'),
  263. )
  264. # initial fields
  265. id = db.Column(UUID, nullable=False,
  266. server_default=db.text('uuid_generate_v4()'))
  267. tenant_id = db.Column(UUID, nullable=False)
  268. dataset_id = db.Column(UUID, nullable=False)
  269. document_id = db.Column(UUID, nullable=False)
  270. position = db.Column(db.Integer, nullable=False)
  271. content = db.Column(db.Text, nullable=False)
  272. answer = db.Column(db.Text, nullable=True)
  273. word_count = db.Column(db.Integer, nullable=False)
  274. tokens = db.Column(db.Integer, nullable=False)
  275. # indexing fields
  276. keywords = db.Column(db.JSON, nullable=True)
  277. index_node_id = db.Column(db.String(255), nullable=True)
  278. index_node_hash = db.Column(db.String(255), nullable=True)
  279. # basic fields
  280. hit_count = db.Column(db.Integer, nullable=False, default=0)
  281. enabled = db.Column(db.Boolean, nullable=False,
  282. server_default=db.text('true'))
  283. disabled_at = db.Column(db.DateTime, nullable=True)
  284. disabled_by = db.Column(UUID, nullable=True)
  285. status = db.Column(db.String(255), nullable=False,
  286. server_default=db.text("'waiting'::character varying"))
  287. created_by = db.Column(UUID, nullable=False)
  288. created_at = db.Column(db.DateTime, nullable=False,
  289. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  290. updated_by = db.Column(UUID, nullable=True)
  291. updated_at = db.Column(db.DateTime, nullable=False,
  292. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  293. indexing_at = db.Column(db.DateTime, nullable=True)
  294. completed_at = db.Column(db.DateTime, nullable=True)
  295. error = db.Column(db.Text, nullable=True)
  296. stopped_at = db.Column(db.DateTime, nullable=True)
  297. @property
  298. def dataset(self):
  299. return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).first()
  300. @property
  301. def document(self):
  302. return db.session.query(Document).filter(Document.id == self.document_id).first()
  303. @property
  304. def previous_segment(self):
  305. return db.session.query(DocumentSegment).filter(
  306. DocumentSegment.document_id == self.document_id,
  307. DocumentSegment.position == self.position - 1
  308. ).first()
  309. @property
  310. def next_segment(self):
  311. return db.session.query(DocumentSegment).filter(
  312. DocumentSegment.document_id == self.document_id,
  313. DocumentSegment.position == self.position + 1
  314. ).first()
  315. class AppDatasetJoin(db.Model):
  316. __tablename__ = 'app_dataset_joins'
  317. __table_args__ = (
  318. db.PrimaryKeyConstraint('id', name='app_dataset_join_pkey'),
  319. db.Index('app_dataset_join_app_dataset_idx', 'dataset_id', 'app_id'),
  320. )
  321. id = db.Column(UUID, primary_key=True, nullable=False, server_default=db.text('uuid_generate_v4()'))
  322. app_id = db.Column(UUID, nullable=False)
  323. dataset_id = db.Column(UUID, nullable=False)
  324. created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())
  325. @property
  326. def app(self):
  327. return App.query.get(self.app_id)
  328. class DatasetQuery(db.Model):
  329. __tablename__ = 'dataset_queries'
  330. __table_args__ = (
  331. db.PrimaryKeyConstraint('id', name='dataset_query_pkey'),
  332. db.Index('dataset_query_dataset_id_idx', 'dataset_id'),
  333. )
  334. id = db.Column(UUID, primary_key=True, nullable=False, server_default=db.text('uuid_generate_v4()'))
  335. dataset_id = db.Column(UUID, nullable=False)
  336. content = db.Column(db.Text, nullable=False)
  337. source = db.Column(db.String(255), nullable=False)
  338. source_app_id = db.Column(UUID, nullable=True)
  339. created_by_role = db.Column(db.String, nullable=False)
  340. created_by = db.Column(UUID, nullable=False)
  341. created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())
  342. class DatasetKeywordTable(db.Model):
  343. __tablename__ = 'dataset_keyword_tables'
  344. __table_args__ = (
  345. db.PrimaryKeyConstraint('id', name='dataset_keyword_table_pkey'),
  346. db.Index('dataset_keyword_table_dataset_id_idx', 'dataset_id'),
  347. )
  348. id = db.Column(UUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))
  349. dataset_id = db.Column(UUID, nullable=False, unique=True)
  350. keyword_table = db.Column(db.Text, nullable=False)
  351. @property
  352. def keyword_table_dict(self):
  353. class SetDecoder(json.JSONDecoder):
  354. def __init__(self, *args, **kwargs):
  355. super().__init__(object_hook=self.object_hook, *args, **kwargs)
  356. def object_hook(self, dct):
  357. if isinstance(dct, dict):
  358. for keyword, node_idxs in dct.items():
  359. if isinstance(node_idxs, list):
  360. dct[keyword] = set(node_idxs)
  361. return dct
  362. return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None
  363. class Embedding(db.Model):
  364. __tablename__ = 'embeddings'
  365. __table_args__ = (
  366. db.PrimaryKeyConstraint('id', name='embedding_pkey'),
  367. db.UniqueConstraint('model_name', 'hash', name='embedding_hash_idx')
  368. )
  369. id = db.Column(UUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))
  370. model_name = db.Column(db.String(40), nullable=False,
  371. server_default=db.text("'text-embedding-ada-002'::character varying"))
  372. hash = db.Column(db.String(64), nullable=False)
  373. embedding = db.Column(db.LargeBinary, nullable=False)
  374. created_at = db.Column(db.DateTime, nullable=False, server_default=db.text('CURRENT_TIMESTAMP(0)'))
  375. def set_embedding(self, embedding_data: list[float]):
  376. self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL)
  377. def get_embedding(self) -> list[float]:
  378. return pickle.loads(self.embedding)
  379. class DatasetCollectionBinding(db.Model):
  380. __tablename__ = 'dataset_collection_bindings'
  381. __table_args__ = (
  382. db.PrimaryKeyConstraint('id', name='dataset_collection_bindings_pkey'),
  383. db.Index('provider_model_name_idx', 'provider_name', 'model_name')
  384. )
  385. id = db.Column(UUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))
  386. provider_name = db.Column(db.String(40), nullable=False)
  387. model_name = db.Column(db.String(40), nullable=False)
  388. collection_name = db.Column(db.String(64), nullable=False)
  389. created_at = db.Column(db.DateTime, nullable=False, server_default=db.text('CURRENT_TIMESTAMP(0)'))