# dataset.py

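"""SQLAlchemy models for Dify knowledge datasets: datasets, process rules,
documents, segments, child chunks, keyword tables, embeddings, and the
permission/binding tables that hang off them."""
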
import base64
import enum
import hashlib
import hmac
import json
import logging
import os
import pickle
import re
import time
from json import JSONDecodeError
from typing import Any, cast

from sqlalchemy import func
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Mapped

from configs import dify_config
from core.rag.index_processor.constant.built_in_field import BuiltInField, MetadataDataSource
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from extensions.ext_storage import storage
from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule

from .account import Account
from .engine import db
from .model import App, Tag, TagBinding, UploadFile
from .types import StringUUID


class DatasetPermissionEnum(enum.StrEnum):
    ONLY_ME = "only_me"
    ALL_TEAM = "all_team_members"
    PARTIAL_TEAM = "partial_members"


class Dataset(db.Model):  # type: ignore[name-defined]
    __tablename__ = "datasets"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_pkey"),
        db.Index("dataset_tenant_idx", "tenant_id"),
        db.Index("retrieval_model_idx", "retrieval_model", postgresql_using="gin"),
    )

    INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None]
    PROVIDER_LIST = ["vendor", "external", None]

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.Text, nullable=True)
    provider = db.Column(db.String(255), nullable=False, server_default=db.text("'vendor'::character varying"))
    permission = db.Column(db.String(255), nullable=False, server_default=db.text("'only_me'::character varying"))
    data_source_type = db.Column(db.String(255))
    indexing_technique = db.Column(db.String(255), nullable=True)
    index_struct = db.Column(db.Text, nullable=True)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    embedding_model = db.Column(db.String(255), nullable=True)
    embedding_model_provider = db.Column(db.String(255), nullable=True)
    collection_binding_id = db.Column(StringUUID, nullable=True)
    retrieval_model = db.Column(JSONB, nullable=True)
    built_in_field_enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    dept_id = db.Column(StringUUID, nullable=True)
    edit_auth = db.Column(db.Integer, nullable=False)

    @property
    def dataset_keyword_table(self):
        dataset_keyword_table = (
            db.session.query(DatasetKeywordTable).filter(DatasetKeywordTable.dataset_id == self.id).first()
        )
        if dataset_keyword_table:
            return dataset_keyword_table
        return None

    @property
    def index_struct_dict(self):
        return json.loads(self.index_struct) if self.index_struct else None

    @property
    def external_retrieval_model(self):
        default_retrieval_model = {
            "top_k": 2,
            "score_threshold": 0.0,
        }
        return self.retrieval_model or default_retrieval_model

    @property
    def created_by_account(self):
        return db.session.get(Account, self.created_by)

    @property
    def latest_process_rule(self):
        return (
            DatasetProcessRule.query.filter(DatasetProcessRule.dataset_id == self.id)
            .order_by(DatasetProcessRule.created_at.desc())
            .first()
        )

    @property
    def app_count(self):
        return (
            db.session.query(func.count(AppDatasetJoin.id))
            .filter(AppDatasetJoin.dataset_id == self.id, App.id == AppDatasetJoin.app_id)
            .scalar()
        )

    @property
    def document_count(self):
        return db.session.query(func.count(Document.id)).filter(Document.dataset_id == self.id).scalar()

    @property
    def available_document_count(self):
        return (
            db.session.query(func.count(Document.id))
            .filter(
                Document.dataset_id == self.id,
                Document.indexing_status == "completed",
                Document.enabled == True,
                Document.archived == False,
            )
            .scalar()
        )

    @property
    def available_segment_count(self):
        return (
            db.session.query(func.count(DocumentSegment.id))
            .filter(
                DocumentSegment.dataset_id == self.id,
                DocumentSegment.status == "completed",
                DocumentSegment.enabled == True,
            )
            .scalar()
        )

    @property
    def word_count(self):
        return (
            Document.query.with_entities(func.coalesce(func.sum(Document.word_count)))
            .filter(Document.dataset_id == self.id)
            .scalar()
        )

    @property
    def doc_form(self):
        document = db.session.query(Document).filter(Document.dataset_id == self.id).first()
        if document:
            return document.doc_form
        return None

    @property
    def retrieval_model_dict(self):
        default_retrieval_model = {
            "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
            "reranking_enable": False,
            "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
            "top_k": 2,
            "score_threshold_enabled": False,
        }
        return self.retrieval_model or default_retrieval_model

    @property
    def tags(self):
        tags = (
            db.session.query(Tag)
            .join(TagBinding, Tag.id == TagBinding.tag_id)
            .filter(
                TagBinding.target_id == self.id,
                TagBinding.tenant_id == self.tenant_id,
                Tag.tenant_id == self.tenant_id,
                Tag.type == "knowledge",
            )
            .all()
        )
        return tags or []

    @property
    def categories(self):
        categories = (
            db.session.query(Tag)
            .join(TagBinding, Tag.id == TagBinding.tag_id)
            .filter(
                TagBinding.target_id == self.id,
                TagBinding.tenant_id == self.tenant_id,
                Tag.tenant_id == self.tenant_id,
                Tag.type == "knowledge_category",
            )
            .all()
        )
        return categories or []

    @property
    def external_knowledge_info(self):
        if self.provider != "external":
            return None
        external_knowledge_binding = (
            db.session.query(ExternalKnowledgeBindings).filter(ExternalKnowledgeBindings.dataset_id == self.id).first()
        )
        if not external_knowledge_binding:
            return None
        external_knowledge_api = (
            db.session.query(ExternalKnowledgeApis)
            .filter(ExternalKnowledgeApis.id == external_knowledge_binding.external_knowledge_api_id)
            .first()
        )
        if not external_knowledge_api:
            return None
        return {
            "external_knowledge_id": external_knowledge_binding.external_knowledge_id,
            "external_knowledge_api_id": external_knowledge_api.id,
            "external_knowledge_api_name": external_knowledge_api.name,
            "external_knowledge_api_endpoint": json.loads(external_knowledge_api.settings).get("endpoint", ""),
        }

    @property
    def doc_metadata(self):
        dataset_metadatas = db.session.query(DatasetMetadata).filter(DatasetMetadata.dataset_id == self.id).all()
        doc_metadata = [
            {
                "id": dataset_metadata.id,
                "name": dataset_metadata.name,
                "type": dataset_metadata.type,
            }
            for dataset_metadata in dataset_metadatas
        ]
        if self.built_in_field_enabled:
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.document_name.value,
                    "type": "string",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.uploader.value,
                    "type": "string",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.upload_date.value,
                    "type": "time",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.last_update_date.value,
                    "type": "time",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.source.value,
                    "type": "string",
                }
            )
        return doc_metadata
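
    # Vector-store collection name derived from the dataset id: hyphens in the
    # UUID become underscores, e.g. "ab-12" -> "Vector_index_ab_12_Node".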
    @staticmethod
    def gen_collection_name_by_id(dataset_id: str) -> str:
        normalized_dataset_id = dataset_id.replace("-", "_")
        return f"Vector_index_{normalized_dataset_id}_Node"
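

# Usage sketch (illustrative only, not part of the model layer): most Dataset
# members are lazy properties that issue their own queries, so they need an
# active Flask-SQLAlchemy session/app context; "tid" below is a placeholder:
#
#     dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tid).first()
#     dataset.retrieval_model_dict          # stored config or semantic-search defaults
#     Dataset.gen_collection_name_by_id(dataset.id)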


class DatasetProcessRule(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_process_rules"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_process_rule_pkey"),
        db.Index("dataset_process_rule_dataset_id_idx", "dataset_id"),
    )

    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    dataset_id = db.Column(StringUUID, nullable=False)
    mode = db.Column(db.String(255), nullable=False, server_default=db.text("'automatic'::character varying"))
    rules = db.Column(db.Text, nullable=True)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    MODES = ["automatic", "custom", "hierarchical"]
    PRE_PROCESSING_RULES = ["remove_stopwords", "remove_extra_spaces", "remove_urls_emails"]
    AUTOMATIC_RULES: dict[str, Any] = {
        "pre_processing_rules": [
            {"id": "remove_extra_spaces", "enabled": True},
            {"id": "remove_urls_emails", "enabled": False},
        ],
        "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
    }

    def to_dict(self):
        return {
            "id": self.id,
            "dataset_id": self.dataset_id,
            "mode": self.mode,
            "rules": self.rules_dict,
        }

    @property
    def rules_dict(self):
        try:
            return json.loads(self.rules) if self.rules else None
        except JSONDecodeError:
            return None


class Document(db.Model):  # type: ignore[name-defined]
    __tablename__ = "documents"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="document_pkey"),
        db.Index("document_dataset_id_idx", "dataset_id"),
        db.Index("document_is_paused_idx", "is_paused"),
        db.Index("document_tenant_idx", "tenant_id"),
        db.Index("document_metadata_idx", "doc_metadata", postgresql_using="gin"),
    )

    # initial fields
    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    position = db.Column(db.Integer, nullable=False)
    data_source_type = db.Column(db.String(255), nullable=False)
    data_source_info = db.Column(db.Text, nullable=True)
    dataset_process_rule_id = db.Column(StringUUID, nullable=True)
    batch = db.Column(db.String(255), nullable=False)
    name = db.Column(db.String(255), nullable=False)
    created_from = db.Column(db.String(255), nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    created_api_request_id = db.Column(StringUUID, nullable=True)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    # start processing
    processing_started_at = db.Column(db.DateTime, nullable=True)

    # parsing
    file_id = db.Column(db.Text, nullable=True)
    word_count = db.Column(db.Integer, nullable=True)
    parsing_completed_at = db.Column(db.DateTime, nullable=True)

    # cleaning
    cleaning_completed_at = db.Column(db.DateTime, nullable=True)

    # split
    splitting_completed_at = db.Column(db.DateTime, nullable=True)

    # indexing
    tokens = db.Column(db.Integer, nullable=True)
    indexing_latency = db.Column(db.Float, nullable=True)
    completed_at = db.Column(db.DateTime, nullable=True)

    # pause
    is_paused = db.Column(db.Boolean, nullable=True, server_default=db.text("false"))
    paused_by = db.Column(StringUUID, nullable=True)
    paused_at = db.Column(db.DateTime, nullable=True)

    # error
    error = db.Column(db.Text, nullable=True)
    stopped_at = db.Column(db.DateTime, nullable=True)

    # basic fields
    indexing_status = db.Column(db.String(255), nullable=False, server_default=db.text("'waiting'::character varying"))
    enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    disabled_at = db.Column(db.DateTime, nullable=True)
    disabled_by = db.Column(StringUUID, nullable=True)
    archived = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    archived_reason = db.Column(db.String(255), nullable=True)
    archived_by = db.Column(StringUUID, nullable=True)
    archived_at = db.Column(db.DateTime, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    doc_type = db.Column(db.String(40), nullable=True)
    doc_metadata = db.Column(JSONB, nullable=True)
    doc_form = db.Column(db.String(255), nullable=False, server_default=db.text("'text_model'::character varying"))
    doc_language = db.Column(db.String(255), nullable=True)
    check_status = db.Column(db.Integer, nullable=False)
    check_by = db.Column(db.String(40), nullable=True)
    check_at = db.Column(db.DateTime, nullable=True)
    disable_applicant = db.Column(StringUUID, nullable=True)
    enable_applicant = db.Column(db.String(40), nullable=False)

    DATA_SOURCES = ["upload_file", "notion_import", "website_crawl"]
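
    # Display status is derived, not stored: indexing_status plus the
    # is_paused/enabled/archived flags collapse into one user-facing value.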
    @property
    def display_status(self):
        status = None
        if self.indexing_status == "waiting":
            status = "queuing"
        elif self.indexing_status not in {"completed", "error", "waiting"} and self.is_paused:
            status = "paused"
        elif self.indexing_status in {"parsing", "cleaning", "splitting", "indexing"}:
            status = "indexing"
        elif self.indexing_status == "error":
            status = "error"
        elif self.indexing_status == "completed" and not self.archived and self.enabled:
            status = "available"
        elif self.indexing_status == "completed" and not self.archived and not self.enabled:
            status = "disabled"
        elif self.indexing_status == "completed" and self.archived:
            status = "archived"
        return status

    @property
    def data_source_info_dict(self):
        if self.data_source_info:
            try:
                data_source_info_dict = json.loads(self.data_source_info)
            except JSONDecodeError:
                data_source_info_dict = {}
            return data_source_info_dict
        return None

    @property
    def data_source_detail_dict(self):
        if self.data_source_info:
            if self.data_source_type == "upload_file":
                data_source_info_dict = json.loads(self.data_source_info)
                file_detail = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.id == data_source_info_dict["upload_file_id"])
                    .one_or_none()
                )
                if file_detail:
                    return {
                        "upload_file": {
                            "id": file_detail.id,
                            "name": file_detail.name,
                            "size": file_detail.size,
                            "extension": file_detail.extension,
                            "mime_type": file_detail.mime_type,
                            "created_by": file_detail.created_by,
                            "created_at": file_detail.created_at.timestamp(),
                        }
                    }
            elif self.data_source_type in {"notion_import", "website_crawl"}:
                return json.loads(self.data_source_info)
        return {}

    @property
    def average_segment_length(self):
        if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
            return self.word_count // self.segment_count
        return 0

    @property
    def dataset_process_rule(self):
        if self.dataset_process_rule_id:
            return db.session.get(DatasetProcessRule, self.dataset_process_rule_id)
        return None

    @property
    def dataset(self):
        return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).one_or_none()

    @property
    def segment_count(self):
        return DocumentSegment.query.filter(DocumentSegment.document_id == self.id).count()

    @property
    def hit_count(self):
        return (
            DocumentSegment.query.with_entities(func.coalesce(func.sum(DocumentSegment.hit_count)))
            .filter(DocumentSegment.document_id == self.id)
            .scalar()
        )

    @property
    def uploader(self):
        user = db.session.query(Account).filter(Account.id == self.created_by).first()
        return user.name if user else None

    @property
    def upload_date(self):
        return self.created_at

    @property
    def last_update_date(self):
        return self.updated_at

    @property
    def doc_metadata_details(self):
        if self.doc_metadata:
            document_metadatas = (
                db.session.query(DatasetMetadata)
                .join(DatasetMetadataBinding, DatasetMetadataBinding.metadata_id == DatasetMetadata.id)
                .filter(
                    DatasetMetadataBinding.dataset_id == self.dataset_id, DatasetMetadataBinding.document_id == self.id
                )
                .all()
            )
            metadata_list = []
            for metadata in document_metadatas:
                metadata_dict = {
                    "id": metadata.id,
                    "name": metadata.name,
                    "type": metadata.type,
                    "value": self.doc_metadata.get(metadata.name),
                }
                metadata_list.append(metadata_dict)
            # deal with built-in fields
            metadata_list.extend(self.get_built_in_fields())
            return metadata_list
        return None

    @property
    def process_rule_dict(self):
        if self.dataset_process_rule_id:
            return self.dataset_process_rule.to_dict()
        return None

    def get_built_in_fields(self):
        built_in_fields = []
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.document_name,
                "type": "string",
                "value": self.name,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.uploader,
                "type": "string",
                "value": self.uploader,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.upload_date,
                "type": "time",
                "value": self.created_at.timestamp(),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.last_update_date,
                "type": "time",
                "value": self.updated_at.timestamp(),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.source,
                "type": "string",
                "value": MetadataDataSource[self.data_source_type].value,
            }
        )
        return built_in_fields

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "dataset_id": self.dataset_id,
            "position": self.position,
            "data_source_type": self.data_source_type,
            "data_source_info": self.data_source_info,
            "dataset_process_rule_id": self.dataset_process_rule_id,
            "batch": self.batch,
            "name": self.name,
            "created_from": self.created_from,
            "created_by": self.created_by,
            "created_api_request_id": self.created_api_request_id,
            "created_at": self.created_at,
            "processing_started_at": self.processing_started_at,
            "file_id": self.file_id,
            "word_count": self.word_count,
            "parsing_completed_at": self.parsing_completed_at,
            "cleaning_completed_at": self.cleaning_completed_at,
            "splitting_completed_at": self.splitting_completed_at,
            "tokens": self.tokens,
            "indexing_latency": self.indexing_latency,
            "completed_at": self.completed_at,
            "is_paused": self.is_paused,
            "paused_by": self.paused_by,
            "paused_at": self.paused_at,
            "error": self.error,
            "stopped_at": self.stopped_at,
            "indexing_status": self.indexing_status,
            "enabled": self.enabled,
            "disabled_at": self.disabled_at,
            "disabled_by": self.disabled_by,
            "archived": self.archived,
            "archived_reason": self.archived_reason,
            "archived_by": self.archived_by,
            "archived_at": self.archived_at,
            "updated_at": self.updated_at,
            "doc_type": self.doc_type,
            "doc_metadata": self.doc_metadata,
            "doc_form": self.doc_form,
            "doc_language": self.doc_language,
            "display_status": self.display_status,
            "data_source_info_dict": self.data_source_info_dict,
            "average_segment_length": self.average_segment_length,
            "dataset_process_rule": self.dataset_process_rule.to_dict() if self.dataset_process_rule else None,
            "dataset": self.dataset.to_dict() if self.dataset else None,
            "segment_count": self.segment_count,
            "hit_count": self.hit_count,
        }

    @classmethod
    def from_dict(cls, data: dict):
        return cls(
            id=data.get("id"),
            tenant_id=data.get("tenant_id"),
            dataset_id=data.get("dataset_id"),
            position=data.get("position"),
            data_source_type=data.get("data_source_type"),
            data_source_info=data.get("data_source_info"),
            dataset_process_rule_id=data.get("dataset_process_rule_id"),
            batch=data.get("batch"),
            name=data.get("name"),
            created_from=data.get("created_from"),
            created_by=data.get("created_by"),
            created_api_request_id=data.get("created_api_request_id"),
            created_at=data.get("created_at"),
            processing_started_at=data.get("processing_started_at"),
            file_id=data.get("file_id"),
            word_count=data.get("word_count"),
            parsing_completed_at=data.get("parsing_completed_at"),
            cleaning_completed_at=data.get("cleaning_completed_at"),
            splitting_completed_at=data.get("splitting_completed_at"),
            tokens=data.get("tokens"),
            indexing_latency=data.get("indexing_latency"),
            completed_at=data.get("completed_at"),
            is_paused=data.get("is_paused"),
            paused_by=data.get("paused_by"),
            paused_at=data.get("paused_at"),
            error=data.get("error"),
            stopped_at=data.get("stopped_at"),
            indexing_status=data.get("indexing_status"),
            enabled=data.get("enabled"),
            disabled_at=data.get("disabled_at"),
            disabled_by=data.get("disabled_by"),
            archived=data.get("archived"),
            archived_reason=data.get("archived_reason"),
            archived_by=data.get("archived_by"),
            archived_at=data.get("archived_at"),
            updated_at=data.get("updated_at"),
            doc_type=data.get("doc_type"),
            doc_metadata=data.get("doc_metadata"),
            doc_form=data.get("doc_form"),
            doc_language=data.get("doc_language"),
        )
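

# Template mirrors Document nearly field-for-field (adding file_url and omitting
# the check_*/applicant columns) and repeats the same derived properties.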
class Template(db.Model):  # type: ignore[name-defined]
    __tablename__ = "template"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="template_pkey"),
        db.Index("template_dataset_id_idx", "dataset_id"),
        db.Index("template_is_paused_idx", "is_paused"),
        db.Index("template_tenant_idx", "tenant_id"),
        db.Index("template_metadata_idx", "doc_metadata", postgresql_using="gin"),
    )

    # initial fields
    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    position = db.Column(db.Integer, nullable=False)
    data_source_type = db.Column(db.String(255), nullable=False)
    data_source_info = db.Column(db.Text, nullable=True)
    dataset_process_rule_id = db.Column(StringUUID, nullable=True)
    batch = db.Column(db.String(255), nullable=False)
    name = db.Column(db.String(255), nullable=False)
    created_from = db.Column(db.String(255), nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    created_api_request_id = db.Column(StringUUID, nullable=True)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    # start processing
    processing_started_at = db.Column(db.DateTime, nullable=True)

    # parsing
    file_id = db.Column(db.Text, nullable=True)
    file_url = db.Column(db.Text, nullable=True)
    word_count = db.Column(db.Integer, nullable=True)
    parsing_completed_at = db.Column(db.DateTime, nullable=True)

    # cleaning
    cleaning_completed_at = db.Column(db.DateTime, nullable=True)

    # split
    splitting_completed_at = db.Column(db.DateTime, nullable=True)

    # indexing
    tokens = db.Column(db.Integer, nullable=True)
    indexing_latency = db.Column(db.Float, nullable=True)
    completed_at = db.Column(db.DateTime, nullable=True)

    # pause
    is_paused = db.Column(db.Boolean, nullable=True, server_default=db.text("false"))
    paused_by = db.Column(StringUUID, nullable=True)
    paused_at = db.Column(db.DateTime, nullable=True)

    # error
    error = db.Column(db.Text, nullable=True)
    stopped_at = db.Column(db.DateTime, nullable=True)

    # basic fields
    indexing_status = db.Column(db.String(255), nullable=False, server_default=db.text("'waiting'::character varying"))
    enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    disabled_at = db.Column(db.DateTime, nullable=True)
    disabled_by = db.Column(StringUUID, nullable=True)
    archived = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    archived_reason = db.Column(db.String(255), nullable=True)
    archived_by = db.Column(StringUUID, nullable=True)
    archived_at = db.Column(db.DateTime, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    doc_type = db.Column(db.String(40), nullable=True)
    doc_metadata = db.Column(JSONB, nullable=True)
    doc_form = db.Column(db.String(255), nullable=False, server_default=db.text("'text_model'::character varying"))
    doc_language = db.Column(db.String(255), nullable=True)

    DATA_SOURCES = ["upload_file"]

    @property
    def display_status(self):
        status = None
        if self.indexing_status == "waiting":
            status = "queuing"
        elif self.indexing_status not in {"completed", "error", "waiting"} and self.is_paused:
            status = "paused"
        elif self.indexing_status in {"parsing", "cleaning", "splitting", "indexing"}:
            status = "indexing"
        elif self.indexing_status == "error":
            status = "error"
        elif self.indexing_status == "completed" and not self.archived and self.enabled:
            status = "available"
        elif self.indexing_status == "completed" and not self.archived and not self.enabled:
            status = "disabled"
        elif self.indexing_status == "completed" and self.archived:
            status = "archived"
        return status

    @property
    def data_source_info_dict(self):
        if self.data_source_info:
            try:
                data_source_info_dict = json.loads(self.data_source_info)
            except JSONDecodeError:
                data_source_info_dict = {}
            return data_source_info_dict
        return None

    @property
    def data_source_detail_dict(self):
        if self.data_source_info:
            if self.data_source_type == "upload_file":
                data_source_info_dict = json.loads(self.data_source_info)
                file_detail = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.id == data_source_info_dict["upload_file_id"])
                    .one_or_none()
                )
                if file_detail:
                    return {
                        "upload_file": {
                            "id": file_detail.id,
                            "name": file_detail.name,
                            "size": file_detail.size,
                            "extension": file_detail.extension,
                            "mime_type": file_detail.mime_type,
                            "created_by": file_detail.created_by,
                            "created_at": file_detail.created_at.timestamp(),
                        }
                    }
            elif self.data_source_type in {"notion_import", "website_crawl"}:
                return json.loads(self.data_source_info)
        return {}

    @property
    def average_segment_length(self):
        if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
            return self.word_count // self.segment_count
        return 0

    @property
    def dataset_process_rule(self):
        if self.dataset_process_rule_id:
            return db.session.get(DatasetProcessRule, self.dataset_process_rule_id)
        return None

    @property
    def dataset(self):
        return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).one_or_none()

    @property
    def segment_count(self):
        return DocumentSegment.query.filter(DocumentSegment.document_id == self.id).count()

    @property
    def hit_count(self):
        return (
            DocumentSegment.query.with_entities(func.coalesce(func.sum(DocumentSegment.hit_count)))
            .filter(DocumentSegment.document_id == self.id)
            .scalar()
        )

    @property
    def uploader(self):
        user = db.session.query(Account).filter(Account.id == self.created_by).first()
        return user.name if user else None

    @property
    def upload_date(self):
        return self.created_at

    @property
    def last_update_date(self):
        return self.updated_at

    @property
    def doc_metadata_details(self):
        if self.doc_metadata:
            document_metadatas = (
                db.session.query(DatasetMetadata)
                .join(DatasetMetadataBinding, DatasetMetadataBinding.metadata_id == DatasetMetadata.id)
                .filter(
                    DatasetMetadataBinding.dataset_id == self.dataset_id, DatasetMetadataBinding.document_id == self.id
                )
                .all()
            )
            metadata_list = []
            for metadata in document_metadatas:
                metadata_dict = {
                    "id": metadata.id,
                    "name": metadata.name,
                    "type": metadata.type,
                    "value": self.doc_metadata.get(metadata.name),
                }
                metadata_list.append(metadata_dict)
            # deal with built-in fields
            metadata_list.extend(self.get_built_in_fields())
            return metadata_list
        return None

    @property
    def process_rule_dict(self):
        if self.dataset_process_rule_id:
            return self.dataset_process_rule.to_dict()
        return None

    def get_built_in_fields(self):
        built_in_fields = []
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.document_name,
                "type": "string",
                "value": self.name,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.uploader,
                "type": "string",
                "value": self.uploader,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.upload_date,
                "type": "time",
                "value": self.created_at.timestamp(),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.last_update_date,
                "type": "time",
                "value": self.updated_at.timestamp(),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.source,
                "type": "string",
                "value": MetadataDataSource[self.data_source_type].value,
            }
        )
        return built_in_fields

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "dataset_id": self.dataset_id,
            "position": self.position,
            "data_source_type": self.data_source_type,
            "data_source_info": self.data_source_info,
            "dataset_process_rule_id": self.dataset_process_rule_id,
            "batch": self.batch,
            "name": self.name,
            "created_from": self.created_from,
            "created_by": self.created_by,
            "created_api_request_id": self.created_api_request_id,
            "created_at": self.created_at,
            "processing_started_at": self.processing_started_at,
            "file_id": self.file_id,
            "word_count": self.word_count,
            "parsing_completed_at": self.parsing_completed_at,
            "cleaning_completed_at": self.cleaning_completed_at,
            "splitting_completed_at": self.splitting_completed_at,
            "tokens": self.tokens,
            "indexing_latency": self.indexing_latency,
            "completed_at": self.completed_at,
            "is_paused": self.is_paused,
            "paused_by": self.paused_by,
            "paused_at": self.paused_at,
            "error": self.error,
            "stopped_at": self.stopped_at,
            "indexing_status": self.indexing_status,
            "enabled": self.enabled,
            "disabled_at": self.disabled_at,
            "disabled_by": self.disabled_by,
            "archived": self.archived,
            "archived_reason": self.archived_reason,
            "archived_by": self.archived_by,
            "archived_at": self.archived_at,
            "updated_at": self.updated_at,
            "doc_type": self.doc_type,
            "doc_metadata": self.doc_metadata,
            "doc_form": self.doc_form,
            "doc_language": self.doc_language,
            "display_status": self.display_status,
            "data_source_info_dict": self.data_source_info_dict,
            "average_segment_length": self.average_segment_length,
            "dataset_process_rule": self.dataset_process_rule.to_dict() if self.dataset_process_rule else None,
            "dataset": self.dataset.to_dict() if self.dataset else None,
            "segment_count": self.segment_count,
            "hit_count": self.hit_count,
        }

    @classmethod
    def from_dict(cls, data: dict):
        return cls(
            id=data.get("id"),
            tenant_id=data.get("tenant_id"),
            dataset_id=data.get("dataset_id"),
            position=data.get("position"),
            data_source_type=data.get("data_source_type"),
            data_source_info=data.get("data_source_info"),
            dataset_process_rule_id=data.get("dataset_process_rule_id"),
            batch=data.get("batch"),
            name=data.get("name"),
            created_from=data.get("created_from"),
            created_by=data.get("created_by"),
            created_api_request_id=data.get("created_api_request_id"),
            created_at=data.get("created_at"),
            processing_started_at=data.get("processing_started_at"),
            file_id=data.get("file_id"),
            word_count=data.get("word_count"),
            parsing_completed_at=data.get("parsing_completed_at"),
            cleaning_completed_at=data.get("cleaning_completed_at"),
            splitting_completed_at=data.get("splitting_completed_at"),
            tokens=data.get("tokens"),
            indexing_latency=data.get("indexing_latency"),
            completed_at=data.get("completed_at"),
            is_paused=data.get("is_paused"),
            paused_by=data.get("paused_by"),
            paused_at=data.get("paused_at"),
            error=data.get("error"),
            stopped_at=data.get("stopped_at"),
            indexing_status=data.get("indexing_status"),
            enabled=data.get("enabled"),
            disabled_at=data.get("disabled_at"),
            disabled_by=data.get("disabled_by"),
            archived=data.get("archived"),
            archived_reason=data.get("archived_reason"),
            archived_by=data.get("archived_by"),
            archived_at=data.get("archived_at"),
            updated_at=data.get("updated_at"),
            doc_type=data.get("doc_type"),
            doc_metadata=data.get("doc_metadata"),
            doc_form=data.get("doc_form"),
            doc_language=data.get("doc_language"),
        )


class DocumentSegment(db.Model):  # type: ignore[name-defined]
    __tablename__ = "document_segments"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="document_segment_pkey"),
        db.Index("document_segment_dataset_id_idx", "dataset_id"),
        db.Index("document_segment_document_id_idx", "document_id"),
        db.Index("document_segment_tenant_dataset_idx", "dataset_id", "tenant_id"),
        db.Index("document_segment_tenant_document_idx", "document_id", "tenant_id"),
        db.Index("document_segment_dataset_node_idx", "dataset_id", "index_node_id"),
        db.Index("document_segment_tenant_idx", "tenant_id"),
    )

    # initial fields
    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    position: Mapped[int]
    content = db.Column(db.Text, nullable=False)
    answer = db.Column(db.Text, nullable=True)
    word_count = db.Column(db.Integer, nullable=False)
    tokens = db.Column(db.Integer, nullable=False)

    # indexing fields
    keywords = db.Column(db.JSON, nullable=True)
    index_node_id = db.Column(db.String(255), nullable=True)
    index_node_hash = db.Column(db.String(255), nullable=True)

    # basic fields
    hit_count = db.Column(db.Integer, nullable=False, default=0)
    enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    disabled_at = db.Column(db.DateTime, nullable=True)
    disabled_by = db.Column(StringUUID, nullable=True)
    status = db.Column(db.String(255), nullable=False, server_default=db.text("'waiting'::character varying"))
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    indexing_at = db.Column(db.DateTime, nullable=True)
    completed_at = db.Column(db.DateTime, nullable=True)
    error = db.Column(db.Text, nullable=True)
    stopped_at = db.Column(db.DateTime, nullable=True)

    @property
    def dataset(self):
        return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).first()

    @property
    def document(self):
        return db.session.query(Document).filter(Document.id == self.document_id).first()

    @property
    def previous_segment(self):
        return (
            db.session.query(DocumentSegment)
            .filter(DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position - 1)
            .first()
        )

    @property
    def next_segment(self):
        return (
            db.session.query(DocumentSegment)
            .filter(DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position + 1)
            .first()
        )
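
    # Parent-child chunking: only documents processed with the "hierarchical"
    # rule mode own ChildChunk rows. The property below excludes
    # ParentMode.FULL_DOC; get_child_chunks() returns children for any
    # configured parent mode.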
    @property
    def child_chunks(self):
        process_rule = self.document.dataset_process_rule
        if process_rule.mode == "hierarchical":
            rules = Rule(**process_rule.rules_dict)
            if rules.parent_mode and rules.parent_mode != ParentMode.FULL_DOC:
                child_chunks = (
                    db.session.query(ChildChunk)
                    .filter(ChildChunk.segment_id == self.id)
                    .order_by(ChildChunk.position.asc())
                    .all()
                )
                return child_chunks or []
            else:
                return []
        else:
            return []

    def get_child_chunks(self):
        process_rule = self.document.dataset_process_rule
        if process_rule.mode == "hierarchical":
            rules = Rule(**process_rule.rules_dict)
            if rules.parent_mode:
                child_chunks = (
                    db.session.query(ChildChunk)
                    .filter(ChildChunk.segment_id == self.id)
                    .order_by(ChildChunk.position.asc())
                    .all()
                )
                return child_chunks or []
            else:
                return []
        else:
            return []

    @property
    def sign_content(self):
        return self.get_sign_content()
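
    # Signs /files/<uuid>/image-preview (pre-v0.10.0 data) and
    # /files/<uuid>/file-preview URLs embedded in the content: an HMAC-SHA256
    # of "<resource>|<file_id>|<timestamp>|<nonce>" under SECRET_KEY is
    # appended as timestamp/nonce/sign query parameters, then the text is
    # reassembled with the signed URLs spliced in.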
    def get_sign_content(self):
        signed_urls = []
        text = self.content

        # For data before v0.10.0
        pattern = r"/files/([a-f0-9\-]+)/image-preview"
        matches = re.finditer(pattern, text)
        for match in matches:
            upload_file_id = match.group(1)
            nonce = os.urandom(16).hex()
            timestamp = str(int(time.time()))
            data_to_sign = f"image-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()
            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
            signed_url = f"{match.group(0)}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))

        # For data after v0.10.0
        pattern = r"/files/([a-f0-9\-]+)/file-preview"
        matches = re.finditer(pattern, text)
        for match in matches:
            upload_file_id = match.group(1)
            nonce = os.urandom(16).hex()
            timestamp = str(int(time.time()))
            data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()
            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
            signed_url = f"{match.group(0)}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))

        # Reconstruct the text with signed URLs
        offset = 0
        for start, end, signed_url in signed_urls:
            text = text[: start + offset] + signed_url + text[end + offset :]
            offset += len(signed_url) - (end - start)
        return text


class ChildChunk(db.Model):  # type: ignore[name-defined]
    __tablename__ = "child_chunks"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="child_chunk_pkey"),
        db.Index("child_chunk_dataset_id_idx", "tenant_id", "dataset_id", "document_id", "segment_id", "index_node_id"),
    )

    # initial fields
    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    segment_id = db.Column(StringUUID, nullable=False)
    position = db.Column(db.Integer, nullable=False)
    content = db.Column(db.Text, nullable=False)
    word_count = db.Column(db.Integer, nullable=False)

    # indexing fields
    index_node_id = db.Column(db.String(255), nullable=True)
    index_node_hash = db.Column(db.String(255), nullable=True)
    type = db.Column(db.String(255), nullable=False, server_default=db.text("'automatic'::character varying"))
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    indexing_at = db.Column(db.DateTime, nullable=True)
    completed_at = db.Column(db.DateTime, nullable=True)
    error = db.Column(db.Text, nullable=True)

    @property
    def dataset(self):
        return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).first()

    @property
    def document(self):
        return db.session.query(Document).filter(Document.id == self.document_id).first()

    @property
    def segment(self):
        return db.session.query(DocumentSegment).filter(DocumentSegment.id == self.segment_id).first()


class AppDatasetJoin(db.Model):  # type: ignore[name-defined]
    __tablename__ = "app_dataset_joins"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="app_dataset_join_pkey"),
        db.Index("app_dataset_join_app_dataset_idx", "dataset_id", "app_id"),
    )

    id = db.Column(StringUUID, primary_key=True, nullable=False, server_default=db.text("uuid_generate_v4()"))
    app_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())

    @property
    def app(self):
        return db.session.get(App, self.app_id)


class DatasetQuery(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_queries"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_query_pkey"),
        db.Index("dataset_query_dataset_id_idx", "dataset_id"),
    )

    id = db.Column(StringUUID, primary_key=True, nullable=False, server_default=db.text("uuid_generate_v4()"))
    dataset_id = db.Column(StringUUID, nullable=False)
    content = db.Column(db.Text, nullable=False)
    source = db.Column(db.String(255), nullable=False)
    source_app_id = db.Column(StringUUID, nullable=True)
    created_by_role = db.Column(db.String, nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())


class DatasetKeywordTable(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_keyword_tables"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_keyword_table_pkey"),
        db.Index("dataset_keyword_table_dataset_id_idx", "dataset_id"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    dataset_id = db.Column(StringUUID, nullable=False, unique=True)
    keyword_table = db.Column(db.Text, nullable=False)
    data_source_type = db.Column(
        db.String(255), nullable=False, server_default=db.text("'database'::character varying")
    )
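
    # The keyword table maps keyword -> set of index-node ids. It is persisted
    # as JSON (lists instead of sets), either inline in keyword_table
    # ("database") or in object storage under
    # keyword_files/<tenant_id>/<dataset_id>.txt; SetDecoder restores the
    # lists to sets on load.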
    @property
    def keyword_table_dict(self):
        class SetDecoder(json.JSONDecoder):
            def __init__(self, *args, **kwargs):
                super().__init__(object_hook=self.object_hook, *args, **kwargs)

            def object_hook(self, dct):
                if isinstance(dct, dict):
                    for keyword, node_idxs in dct.items():
                        if isinstance(node_idxs, list):
                            dct[keyword] = set(node_idxs)
                return dct

        # get dataset
        dataset = Dataset.query.filter_by(id=self.dataset_id).first()
        if not dataset:
            return None
        if self.data_source_type == "database":
            return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None
        else:
            file_key = "keyword_files/" + dataset.tenant_id + "/" + self.dataset_id + ".txt"
            try:
                keyword_table_text = storage.load_once(file_key)
                if keyword_table_text:
                    return json.loads(keyword_table_text.decode("utf-8"), cls=SetDecoder)
                return None
            except Exception:
                logging.exception("Failed to load keyword table from file: %s", file_key)
                return None


class Embedding(db.Model):  # type: ignore[name-defined]
    __tablename__ = "embeddings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="embedding_pkey"),
        db.UniqueConstraint("model_name", "hash", "provider_name", name="embedding_hash_idx"),
        db.Index("created_at_idx", "created_at"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    model_name = db.Column(
        db.String(255), nullable=False, server_default=db.text("'text-embedding-ada-002'::character varying")
    )
    hash = db.Column(db.String(64), nullable=False)
    embedding = db.Column(db.LargeBinary, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    provider_name = db.Column(db.String(255), nullable=False, server_default=db.text("''::character varying"))
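
    # Cached embedding vectors are stored as pickled list[float] blobs; the
    # unpickling below only ever touches data this model wrote itself, hence
    # the S301 waiver.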
    def set_embedding(self, embedding_data: list[float]):
        self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL)

    def get_embedding(self) -> list[float]:
        return cast(list[float], pickle.loads(self.embedding))  # noqa: S301


class DatasetCollectionBinding(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_collection_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_collection_bindings_pkey"),
        db.Index("provider_model_name_idx", "provider_name", "model_name"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    provider_name = db.Column(db.String(255), nullable=False)
    model_name = db.Column(db.String(255), nullable=False)
    type = db.Column(db.String(40), server_default=db.text("'dataset'::character varying"), nullable=False)
    collection_name = db.Column(db.String(64), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())


class TidbAuthBinding(db.Model):  # type: ignore[name-defined]
    __tablename__ = "tidb_auth_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="tidb_auth_bindings_pkey"),
        db.Index("tidb_auth_bindings_tenant_idx", "tenant_id"),
        db.Index("tidb_auth_bindings_active_idx", "active"),
        db.Index("tidb_auth_bindings_created_at_idx", "created_at"),
        db.Index("tidb_auth_bindings_status_idx", "status"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=True)
    cluster_id = db.Column(db.String(255), nullable=False)
    cluster_name = db.Column(db.String(255), nullable=False)
    active = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    # The default must be a quoted SQL literal; a bare CREATING would be
    # interpreted as a column reference rather than the string 'CREATING'.
    status = db.Column(db.String(255), nullable=False, server_default=db.text("'CREATING'::character varying"))
    account = db.Column(db.String(255), nullable=False)
    password = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())


class Whitelist(db.Model):  # type: ignore[name-defined]
    __tablename__ = "whitelists"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="whitelists_pkey"),
        db.Index("whitelists_tenant_idx", "tenant_id"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=True)
    category = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())


class DatasetPermission(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_permissions"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_permission_pkey"),
        db.Index("idx_dataset_permissions_dataset_id", "dataset_id"),
        db.Index("idx_dataset_permissions_account_id", "account_id"),
        db.Index("idx_dataset_permissions_tenant_id", "tenant_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"), primary_key=True)
    dataset_id = db.Column(StringUUID, nullable=False)
    account_id = db.Column(StringUUID, nullable=False)
    tenant_id = db.Column(StringUUID, nullable=False)
    has_permission = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
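
# Usage sketch (illustrative; `dataset_id` and `account_id` are hypothetical):
#
#     permitted = (
#         db.session.query(DatasetPermission)
#         .filter_by(dataset_id=dataset_id, account_id=account_id, has_permission=True)
#         .first()
#     ) is not None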


class DatasetPermissionAll(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_permissions_all"
    __table_args__ = (
        # Constraint and index names are schema-wide in PostgreSQL, so this
        # table cannot reuse DatasetPermission's names.
        db.PrimaryKeyConstraint("id", name="dataset_permission_all_pkey"),
        db.Index("idx_dataset_permissions_all_dataset_id", "dataset_id"),
        db.Index("idx_dataset_permissions_all_account_id", "account_id"),
        db.Index("idx_dataset_permissions_all_tenant_id", "tenant_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"), primary_key=True)
    dataset_id = db.Column(StringUUID, nullable=False)
    account_id = db.Column(StringUUID, nullable=False)
    tenant_id = db.Column(StringUUID, nullable=False)
    has_edit_permission = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    has_read_permission = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    created_by = db.Column(StringUUID, nullable=False)
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    email = db.Column(db.String(255), nullable=False)


class ExternalKnowledgeApis(db.Model):  # type: ignore[name-defined]
    __tablename__ = "external_knowledge_apis"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="external_knowledge_apis_pkey"),
        db.Index("external_knowledge_apis_tenant_idx", "tenant_id"),
        db.Index("external_knowledge_apis_name_idx", "name"),
    )

    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.String(255), nullable=False)
    tenant_id = db.Column(StringUUID, nullable=False)
    settings = db.Column(db.Text, nullable=True)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "name": self.name,
            "description": self.description,
            "settings": self.settings_dict,
            "dataset_bindings": self.dataset_bindings,
            "created_by": self.created_by,
            "created_at": self.created_at.isoformat(),
        }

    @property
    def settings_dict(self):
        try:
            return json.loads(self.settings) if self.settings else None
        except JSONDecodeError:
            return None

    @property
    def dataset_bindings(self):
        external_knowledge_bindings = (
            db.session.query(ExternalKnowledgeBindings)
            .filter(ExternalKnowledgeBindings.external_knowledge_api_id == self.id)
            .all()
        )
        dataset_ids = [binding.dataset_id for binding in external_knowledge_bindings]
        if not dataset_ids:
            # Avoid issuing a query with an empty IN () clause.
            return []
        datasets = db.session.query(Dataset).filter(Dataset.id.in_(dataset_ids)).all()
        return [{"id": dataset.id, "name": dataset.name} for dataset in datasets]
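
# Usage sketch (illustrative; `api_id` is a hypothetical identifier):
#
#     api = db.session.query(ExternalKnowledgeApis).filter_by(id=api_id).first()
#     if api:
#         payload = api.to_dict()
#
# Note that settings_dict deliberately swallows malformed JSON (returning None
# rather than raising), and dataset_bindings issues two queries: one for the
# bindings, then one for the matching Dataset rows.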


class ExternalKnowledgeBindings(db.Model):  # type: ignore[name-defined]
    __tablename__ = "external_knowledge_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="external_knowledge_bindings_pkey"),
        db.Index("external_knowledge_bindings_tenant_idx", "tenant_id"),
        db.Index("external_knowledge_bindings_dataset_idx", "dataset_id"),
        db.Index("external_knowledge_bindings_external_knowledge_idx", "external_knowledge_id"),
        db.Index("external_knowledge_bindings_external_knowledge_api_idx", "external_knowledge_api_id"),
    )

    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    external_knowledge_api_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    external_knowledge_id = db.Column(db.Text, nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
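
# Usage sketch (illustrative; `tenant_id` and `dataset_id` are hypothetical):
#
#     binding = (
#         db.session.query(ExternalKnowledgeBindings)
#         .filter_by(tenant_id=tenant_id, dataset_id=dataset_id)
#         .first()
#     )
#     external_id = binding.external_knowledge_id if binding else None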


class DatasetAutoDisableLog(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_auto_disable_logs"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_auto_disable_log_pkey"),
        db.Index("dataset_auto_disable_log_tenant_idx", "tenant_id"),
        db.Index("dataset_auto_disable_log_dataset_idx", "dataset_id"),
        db.Index("dataset_auto_disable_log_created_at_idx", "created_at"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    notified = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))


class RateLimitLog(db.Model):  # type: ignore[name-defined]
    __tablename__ = "rate_limit_logs"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="rate_limit_log_pkey"),
        db.Index("rate_limit_log_tenant_idx", "tenant_id"),
        db.Index("rate_limit_log_operation_idx", "operation"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    subscription_plan = db.Column(db.String(255), nullable=False)
    operation = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
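
# Usage sketch (illustrative): count a tenant's recent operations within a
# time window. `tenant_id`, `window_start`, and the operation name are all
# hypothetical values, not constants defined by this module.
#
#     recent = (
#         db.session.query(RateLimitLog)
#         .filter(
#             RateLimitLog.tenant_id == tenant_id,
#             RateLimitLog.operation == "knowledge",
#             RateLimitLog.created_at >= window_start,
#         )
#         .count()
#     )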


class DatasetMetadata(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_metadatas"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_metadata_pkey"),
        db.Index("dataset_metadata_tenant_idx", "tenant_id"),
        db.Index("dataset_metadata_dataset_idx", "dataset_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    type = db.Column(db.String(255), nullable=False)
    name = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    created_by = db.Column(StringUUID, nullable=False)
    updated_by = db.Column(StringUUID, nullable=True)


class DatasetMetadataBinding(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_metadata_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_metadata_binding_pkey"),
        db.Index("dataset_metadata_binding_tenant_idx", "tenant_id"),
        db.Index("dataset_metadata_binding_dataset_idx", "dataset_id"),
        db.Index("dataset_metadata_binding_metadata_idx", "metadata_id"),
        db.Index("dataset_metadata_binding_document_idx", "document_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    metadata_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    created_by = db.Column(StringUUID, nullable=False)
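
# Usage sketch (illustrative): resolve the metadata entries attached to a
# document by joining through the binding table. `document_id` is hypothetical.
#
#     rows = (
#         db.session.query(DatasetMetadata)
#         .join(DatasetMetadataBinding, DatasetMetadataBinding.metadata_id == DatasetMetadata.id)
#         .filter(DatasetMetadataBinding.document_id == document_id)
#         .all()
#     )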