import base64
import enum
import hashlib
import hmac
import json
import logging
import os
import pickle
import re
import time
from json import JSONDecodeError
from typing import Any, cast

from sqlalchemy import func
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Mapped

from configs import dify_config
from core.rag.index_processor.constant.built_in_field import BuiltInField, MetadataDataSource
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from extensions.ext_storage import storage
from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule

from .account import Account
from .engine import db
from .model import App, Tag, TagBinding, UploadFile
from .types import StringUUID

 
class DatasetPermissionEnum(enum.StrEnum):
    ONLY_ME = "only_me"
    ALL_TEAM = "all_team_members"
    PARTIAL_TEAM = "partial_members"

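# Note (illustrative): as an enum.StrEnum, members compare equal to their raw string
# values, e.g. DatasetPermissionEnum.ONLY_ME == "only_me" is True, matching the
# server default of Dataset.permission below.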
 
class Dataset(db.Model):  # type: ignore[name-defined]
    __tablename__ = "datasets"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_pkey"),
        db.Index("dataset_tenant_idx", "tenant_id"),
        db.Index("retrieval_model_idx", "retrieval_model", postgresql_using="gin"),
    )

    INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None]
    PROVIDER_LIST = ["vendor", "external", None]

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.Text, nullable=True)
    provider = db.Column(db.String(255), nullable=False, server_default=db.text("'vendor'::character varying"))
    permission = db.Column(db.String(255), nullable=False, server_default=db.text("'only_me'::character varying"))
    data_source_type = db.Column(db.String(255))
    indexing_technique = db.Column(db.String(255), nullable=True)
    index_struct = db.Column(db.Text, nullable=True)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    embedding_model = db.Column(db.String(255), nullable=True)
    embedding_model_provider = db.Column(db.String(255), nullable=True)
    collection_binding_id = db.Column(StringUUID, nullable=True)
    retrieval_model = db.Column(JSONB, nullable=True)
    built_in_field_enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    dept_id = db.Column(StringUUID, nullable=True)
    edit_auth = db.Column(db.Integer, nullable=False)

    @property
    def dataset_keyword_table(self):
        dataset_keyword_table = (
            db.session.query(DatasetKeywordTable).filter(DatasetKeywordTable.dataset_id == self.id).first()
        )
        if dataset_keyword_table:
            return dataset_keyword_table
        return None

    @property
    def index_struct_dict(self):
        return json.loads(self.index_struct) if self.index_struct else None

    @property
    def external_retrieval_model(self):
        default_retrieval_model = {
            "top_k": 2,
            "score_threshold": 0.0,
        }
        return self.retrieval_model or default_retrieval_model

    @property
    def created_by_account(self):
        return db.session.get(Account, self.created_by)

    @property
    def latest_process_rule(self):
        return (
            DatasetProcessRule.query.filter(DatasetProcessRule.dataset_id == self.id)
            .order_by(DatasetProcessRule.created_at.desc())
            .first()
        )

    @property
    def app_count(self):
        return (
            db.session.query(func.count(AppDatasetJoin.id))
            .filter(AppDatasetJoin.dataset_id == self.id, App.id == AppDatasetJoin.app_id)
            .scalar()
        )

    @property
    def document_count(self):
        return db.session.query(func.count(Document.id)).filter(Document.dataset_id == self.id).scalar()

    @property
    def available_document_count(self):
        return (
            db.session.query(func.count(Document.id))
            .filter(
                Document.dataset_id == self.id,
                Document.indexing_status == "completed",
                Document.enabled == True,
                Document.archived == False,
            )
            .scalar()
        )

    @property
    def available_segment_count(self):
        return (
            db.session.query(func.count(DocumentSegment.id))
            .filter(
                DocumentSegment.dataset_id == self.id,
                DocumentSegment.status == "completed",
                DocumentSegment.enabled == True,
            )
            .scalar()
        )

    @property
    def word_count(self):
        return (
            Document.query.with_entities(func.coalesce(func.sum(Document.word_count)))
            .filter(Document.dataset_id == self.id)
            .scalar()
        )

    @property
    def doc_form(self):
        document = db.session.query(Document).filter(Document.dataset_id == self.id).first()
        if document:
            return document.doc_form
        return None

    @property
    def retrieval_model_dict(self):
        default_retrieval_model = {
            "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
            "reranking_enable": False,
            "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
            "top_k": 2,
            "score_threshold_enabled": False,
        }
        return self.retrieval_model or default_retrieval_model

    @property
    def tags(self):
        tags = (
            db.session.query(Tag)
            .join(TagBinding, Tag.id == TagBinding.tag_id)
            .filter(
                TagBinding.target_id == self.id,
                TagBinding.tenant_id == self.tenant_id,
                Tag.tenant_id == self.tenant_id,
                Tag.type == "knowledge",
            )
            .all()
        )
        return tags or []

    @property
    def categories(self):
        categories = (
            db.session.query(Tag)
            .join(TagBinding, Tag.id == TagBinding.tag_id)
            .filter(
                TagBinding.target_id == self.id,
                TagBinding.tenant_id == self.tenant_id,
                Tag.tenant_id == self.tenant_id,
                Tag.type == "knowledge_category",
            )
            .all()
        )
        return categories or []

    @property
    def external_knowledge_info(self):
        if self.provider != "external":
            return None
        external_knowledge_binding = (
            db.session.query(ExternalKnowledgeBindings).filter(ExternalKnowledgeBindings.dataset_id == self.id).first()
        )
        if not external_knowledge_binding:
            return None
        external_knowledge_api = (
            db.session.query(ExternalKnowledgeApis)
            .filter(ExternalKnowledgeApis.id == external_knowledge_binding.external_knowledge_api_id)
            .first()
        )
        if not external_knowledge_api:
            return None
        return {
            "external_knowledge_id": external_knowledge_binding.external_knowledge_id,
            "external_knowledge_api_id": external_knowledge_api.id,
            "external_knowledge_api_name": external_knowledge_api.name,
            "external_knowledge_api_endpoint": json.loads(external_knowledge_api.settings).get("endpoint", ""),
        }

    @property
    def doc_metadata(self):
        dataset_metadatas = db.session.query(DatasetMetadata).filter(DatasetMetadata.dataset_id == self.id).all()

        doc_metadata = [
            {
                "id": dataset_metadata.id,
                "name": dataset_metadata.name,
                "type": dataset_metadata.type,
            }
            for dataset_metadata in dataset_metadatas
        ]
        if self.built_in_field_enabled:
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.document_name.value,
                    "type": "string",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.uploader.value,
                    "type": "string",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.upload_date.value,
                    "type": "time",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.last_update_date.value,
                    "type": "time",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.source.value,
                    "type": "string",
                }
            )
        return doc_metadata

    @staticmethod
    def gen_collection_name_by_id(dataset_id: str) -> str:
        normalized_dataset_id = dataset_id.replace("-", "_")
        return f"Vector_index_{normalized_dataset_id}_Node"

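# Illustrative usage (hypothetical id, not part of the model): the vector collection
# name is a pure function of the dataset id, so it can be recomputed anywhere:
#   Dataset.gen_collection_name_by_id("0b3769d4-0d7f-4f3e-b0c0-1f32b4b2a9e1")
#   -> "Vector_index_0b3769d4_0d7f_4f3e_b0c0_1f32b4b2a9e1_Node"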
 
class DatasetProcessRule(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_process_rules"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_process_rule_pkey"),
        db.Index("dataset_process_rule_dataset_id_idx", "dataset_id"),
    )

    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    dataset_id = db.Column(StringUUID, nullable=False)
    mode = db.Column(db.String(255), nullable=False, server_default=db.text("'automatic'::character varying"))
    rules = db.Column(db.Text, nullable=True)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    MODES = ["automatic", "custom", "hierarchical"]
    PRE_PROCESSING_RULES = ["remove_stopwords", "remove_extra_spaces", "remove_urls_emails"]
    AUTOMATIC_RULES: dict[str, Any] = {
        "pre_processing_rules": [
            {"id": "remove_extra_spaces", "enabled": True},
            {"id": "remove_urls_emails", "enabled": False},
        ],
        "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
    }

    def to_dict(self):
        return {
            "id": self.id,
            "dataset_id": self.dataset_id,
            "mode": self.mode,
            "rules": self.rules_dict,
        }

    @property
    def rules_dict(self):
        try:
            return json.loads(self.rules) if self.rules else None
        except JSONDecodeError:
            return None

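# Illustrative round-trip (not part of the model): `rules` is stored as a JSON string,
# so a payload shaped like AUTOMATIC_RULES survives `rules_dict` intact:
#   rule = DatasetProcessRule(mode="automatic", rules=json.dumps(DatasetProcessRule.AUTOMATIC_RULES))
#   rule.rules_dict["segmentation"]["max_tokens"]     # -> 500
#   rule.rules_dict["pre_processing_rules"][0]["id"]  # -> "remove_extra_spaces"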
 
class Document(db.Model):  # type: ignore[name-defined]
    __tablename__ = "documents"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="document_pkey"),
        db.Index("document_dataset_id_idx", "dataset_id"),
        db.Index("document_is_paused_idx", "is_paused"),
        db.Index("document_tenant_idx", "tenant_id"),
        db.Index("document_metadata_idx", "doc_metadata", postgresql_using="gin"),
    )

    # initial fields
    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    position = db.Column(db.Integer, nullable=False)
    data_source_type = db.Column(db.String(255), nullable=False)
    data_source_info = db.Column(db.Text, nullable=True)
    dataset_process_rule_id = db.Column(StringUUID, nullable=True)
    batch = db.Column(db.String(255), nullable=False)
    name = db.Column(db.String(255), nullable=False)
    created_from = db.Column(db.String(255), nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    created_api_request_id = db.Column(StringUUID, nullable=True)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    # start processing
    processing_started_at = db.Column(db.DateTime, nullable=True)

    # parsing
    file_id = db.Column(db.Text, nullable=True)
    word_count = db.Column(db.Integer, nullable=True)
    parsing_completed_at = db.Column(db.DateTime, nullable=True)

    # cleaning
    cleaning_completed_at = db.Column(db.DateTime, nullable=True)

    # split
    splitting_completed_at = db.Column(db.DateTime, nullable=True)

    # indexing
    tokens = db.Column(db.Integer, nullable=True)
    indexing_latency = db.Column(db.Float, nullable=True)
    completed_at = db.Column(db.DateTime, nullable=True)

    # pause
    is_paused = db.Column(db.Boolean, nullable=True, server_default=db.text("false"))
    paused_by = db.Column(StringUUID, nullable=True)
    paused_at = db.Column(db.DateTime, nullable=True)

    # error
    error = db.Column(db.Text, nullable=True)
    stopped_at = db.Column(db.DateTime, nullable=True)

    # basic fields
    indexing_status = db.Column(db.String(255), nullable=False, server_default=db.text("'waiting'::character varying"))
    enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    disabled_at = db.Column(db.DateTime, nullable=True)
    disabled_by = db.Column(StringUUID, nullable=True)
    archived = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    archived_reason = db.Column(db.String(255), nullable=True)
    archived_by = db.Column(StringUUID, nullable=True)
    archived_at = db.Column(db.DateTime, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    doc_type = db.Column(db.String(40), nullable=True)
    doc_metadata = db.Column(JSONB, nullable=True)
    doc_form = db.Column(db.String(255), nullable=False, server_default=db.text("'text_model'::character varying"))
    doc_language = db.Column(db.String(255), nullable=True)
    check_status = db.Column(db.Integer, nullable=False)
    check_by = db.Column(db.String(40), nullable=True)
    check_at = db.Column(db.DateTime, nullable=True)
    disable_applicant = db.Column(StringUUID, nullable=True)
    enable_applicant = db.Column(db.String(40), nullable=False)

    DATA_SOURCES = ["upload_file", "notion_import", "website_crawl"]

    @property
    def display_status(self):
        status = None
        if self.indexing_status == "waiting":
            status = "queuing"
        elif self.indexing_status not in {"completed", "error", "waiting"} and self.is_paused:
            status = "paused"
        elif self.indexing_status in {"parsing", "cleaning", "splitting", "indexing"}:
            status = "indexing"
        elif self.indexing_status == "error":
            status = "error"
        elif self.indexing_status == "completed" and not self.archived and self.enabled:
            status = "available"
        elif self.indexing_status == "completed" and not self.archived and not self.enabled:
            status = "disabled"
        elif self.indexing_status == "completed" and self.archived:
            status = "archived"
        return status

    @property
    def data_source_info_dict(self):
        if self.data_source_info:
            try:
                data_source_info_dict = json.loads(self.data_source_info)
            except JSONDecodeError:
                data_source_info_dict = {}
            return data_source_info_dict
        return None

    @property
    def data_source_detail_dict(self):
        if self.data_source_info:
            if self.data_source_type == "upload_file":
                data_source_info_dict = json.loads(self.data_source_info)
                file_detail = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.id == data_source_info_dict["upload_file_id"])
                    .one_or_none()
                )
                if file_detail:
                    return {
                        "upload_file": {
                            "id": file_detail.id,
                            "name": file_detail.name,
                            "size": file_detail.size,
                            "extension": file_detail.extension,
                            "mime_type": file_detail.mime_type,
                            "created_by": file_detail.created_by,
                            "created_at": file_detail.created_at.timestamp(),
                        }
                    }
            elif self.data_source_type in {"notion_import", "website_crawl"}:
                return json.loads(self.data_source_info)
        return {}

    @property
    def average_segment_length(self):
        if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
            return self.word_count // self.segment_count
        return 0

    @property
    def dataset_process_rule(self):
        if self.dataset_process_rule_id:
            return db.session.get(DatasetProcessRule, self.dataset_process_rule_id)
        return None

    @property
    def dataset(self):
        return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).one_or_none()

    @property
    def segment_count(self):
        return DocumentSegment.query.filter(DocumentSegment.document_id == self.id).count()

    @property
    def hit_count(self):
        return (
            DocumentSegment.query.with_entities(func.coalesce(func.sum(DocumentSegment.hit_count)))
            .filter(DocumentSegment.document_id == self.id)
            .scalar()
        )

    @property
    def uploader(self):
        user = db.session.query(Account).filter(Account.id == self.created_by).first()
        return user.name if user else None

    @property
    def upload_date(self):
        return self.created_at

    @property
    def last_update_date(self):
        return self.updated_at

    @property
    def doc_metadata_details(self):
        if self.doc_metadata:
            document_metadatas = (
                db.session.query(DatasetMetadata)
                .join(DatasetMetadataBinding, DatasetMetadataBinding.metadata_id == DatasetMetadata.id)
                .filter(
                    DatasetMetadataBinding.dataset_id == self.dataset_id, DatasetMetadataBinding.document_id == self.id
                )
                .all()
            )
            metadata_list = []
            for metadata in document_metadatas:
                metadata_dict = {
                    "id": metadata.id,
                    "name": metadata.name,
                    "type": metadata.type,
                    "value": self.doc_metadata.get(metadata.name),
                }
                metadata_list.append(metadata_dict)
            # deal built-in fields
            metadata_list.extend(self.get_built_in_fields())
            return metadata_list
        return None

    @property
    def process_rule_dict(self):
        if self.dataset_process_rule_id:
            return self.dataset_process_rule.to_dict()
        return None

    def get_built_in_fields(self):
        built_in_fields = []
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.document_name,
                "type": "string",
                "value": self.name,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.uploader,
                "type": "string",
                "value": self.uploader,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.upload_date,
                "type": "time",
                "value": self.created_at.timestamp(),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.last_update_date,
                "type": "time",
                "value": self.updated_at.timestamp(),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.source,
                "type": "string",
                "value": MetadataDataSource[self.data_source_type].value,
            }
        )
        return built_in_fields

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "dataset_id": self.dataset_id,
            "position": self.position,
            "data_source_type": self.data_source_type,
            "data_source_info": self.data_source_info,
            "dataset_process_rule_id": self.dataset_process_rule_id,
            "batch": self.batch,
            "name": self.name,
            "created_from": self.created_from,
            "created_by": self.created_by,
            "created_api_request_id": self.created_api_request_id,
            "created_at": self.created_at,
            "processing_started_at": self.processing_started_at,
            "file_id": self.file_id,
            "word_count": self.word_count,
            "parsing_completed_at": self.parsing_completed_at,
            "cleaning_completed_at": self.cleaning_completed_at,
            "splitting_completed_at": self.splitting_completed_at,
            "tokens": self.tokens,
            "indexing_latency": self.indexing_latency,
            "completed_at": self.completed_at,
            "is_paused": self.is_paused,
            "paused_by": self.paused_by,
            "paused_at": self.paused_at,
            "error": self.error,
            "stopped_at": self.stopped_at,
            "indexing_status": self.indexing_status,
            "enabled": self.enabled,
            "disabled_at": self.disabled_at,
            "disabled_by": self.disabled_by,
            "archived": self.archived,
            "archived_reason": self.archived_reason,
            "archived_by": self.archived_by,
            "archived_at": self.archived_at,
            "updated_at": self.updated_at,
            "doc_type": self.doc_type,
            "doc_metadata": self.doc_metadata,
            "doc_form": self.doc_form,
            "doc_language": self.doc_language,
            "display_status": self.display_status,
            "data_source_info_dict": self.data_source_info_dict,
            "average_segment_length": self.average_segment_length,
            "dataset_process_rule": self.dataset_process_rule.to_dict() if self.dataset_process_rule else None,
            "dataset": self.dataset.to_dict() if self.dataset else None,
            "segment_count": self.segment_count,
            "hit_count": self.hit_count,
        }

    @classmethod
    def from_dict(cls, data: dict):
        return cls(
            id=data.get("id"),
            tenant_id=data.get("tenant_id"),
            dataset_id=data.get("dataset_id"),
            position=data.get("position"),
            data_source_type=data.get("data_source_type"),
            data_source_info=data.get("data_source_info"),
            dataset_process_rule_id=data.get("dataset_process_rule_id"),
            batch=data.get("batch"),
            name=data.get("name"),
            created_from=data.get("created_from"),
            created_by=data.get("created_by"),
            created_api_request_id=data.get("created_api_request_id"),
            created_at=data.get("created_at"),
            processing_started_at=data.get("processing_started_at"),
            file_id=data.get("file_id"),
            word_count=data.get("word_count"),
            parsing_completed_at=data.get("parsing_completed_at"),
            cleaning_completed_at=data.get("cleaning_completed_at"),
            splitting_completed_at=data.get("splitting_completed_at"),
            tokens=data.get("tokens"),
            indexing_latency=data.get("indexing_latency"),
            completed_at=data.get("completed_at"),
            is_paused=data.get("is_paused"),
            paused_by=data.get("paused_by"),
            paused_at=data.get("paused_at"),
            error=data.get("error"),
            stopped_at=data.get("stopped_at"),
            indexing_status=data.get("indexing_status"),
            enabled=data.get("enabled"),
            disabled_at=data.get("disabled_at"),
            disabled_by=data.get("disabled_by"),
            archived=data.get("archived"),
            archived_reason=data.get("archived_reason"),
            archived_by=data.get("archived_by"),
            archived_at=data.get("archived_at"),
            updated_at=data.get("updated_at"),
            doc_type=data.get("doc_type"),
            doc_metadata=data.get("doc_metadata"),
            doc_form=data.get("doc_form"),
            doc_language=data.get("doc_language"),
        )

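# Illustrative round-trip (not part of the model; `existing_document` is a placeholder):
# `from_dict` reads only column-backed keys, so a dict produced by `to_dict` can seed a
# new instance, and derived entries such as "display_status" or "segment_count" are
# simply ignored:
#   clone = Document.from_dict(existing_document.to_dict())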
 
class Template(db.Model):  # type: ignore[name-defined]
    __tablename__ = "template"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="template_pkey"),
        db.Index("template_dataset_id_idx", "dataset_id"),
        db.Index("template_is_paused_idx", "is_paused"),
        db.Index("template_tenant_idx", "tenant_id"),
        db.Index("template_metadata_idx", "doc_metadata", postgresql_using="gin"),
    )

    # initial fields
    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    position = db.Column(db.Integer, nullable=False)
    data_source_type = db.Column(db.String(255), nullable=False)
    data_source_info = db.Column(db.Text, nullable=True)
    dataset_process_rule_id = db.Column(StringUUID, nullable=True)
    batch = db.Column(db.String(255), nullable=False)
    name = db.Column(db.String(255), nullable=False)
    created_from = db.Column(db.String(255), nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    created_api_request_id = db.Column(StringUUID, nullable=True)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    # start processing
    processing_started_at = db.Column(db.DateTime, nullable=True)

    # parsing
    file_id = db.Column(db.Text, nullable=True)
    file_url = db.Column(db.Text, nullable=True)
    word_count = db.Column(db.Integer, nullable=True)
    parsing_completed_at = db.Column(db.DateTime, nullable=True)

    # cleaning
    cleaning_completed_at = db.Column(db.DateTime, nullable=True)

    # split
    splitting_completed_at = db.Column(db.DateTime, nullable=True)

    # indexing
    tokens = db.Column(db.Integer, nullable=True)
    indexing_latency = db.Column(db.Float, nullable=True)
    completed_at = db.Column(db.DateTime, nullable=True)

    # pause
    is_paused = db.Column(db.Boolean, nullable=True, server_default=db.text("false"))
    paused_by = db.Column(StringUUID, nullable=True)
    paused_at = db.Column(db.DateTime, nullable=True)

    # error
    error = db.Column(db.Text, nullable=True)
    stopped_at = db.Column(db.DateTime, nullable=True)

    # basic fields
    indexing_status = db.Column(db.String(255), nullable=False, server_default=db.text("'waiting'::character varying"))
    enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    disabled_at = db.Column(db.DateTime, nullable=True)
    disabled_by = db.Column(StringUUID, nullable=True)
    archived = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    archived_reason = db.Column(db.String(255), nullable=True)
    archived_by = db.Column(StringUUID, nullable=True)
    archived_at = db.Column(db.DateTime, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    doc_type = db.Column(db.String(40), nullable=True)
    doc_metadata = db.Column(JSONB, nullable=True)
    doc_form = db.Column(db.String(255), nullable=False, server_default=db.text("'text_model'::character varying"))
    doc_language = db.Column(db.String(255), nullable=True)

    DATA_SOURCES = ["upload_file"]

    @property
    def display_status(self):
        status = None
        if self.indexing_status == "waiting":
            status = "queuing"
        elif self.indexing_status not in {"completed", "error", "waiting"} and self.is_paused:
            status = "paused"
        elif self.indexing_status in {"parsing", "cleaning", "splitting", "indexing"}:
            status = "indexing"
        elif self.indexing_status == "error":
            status = "error"
        elif self.indexing_status == "completed" and not self.archived and self.enabled:
            status = "available"
        elif self.indexing_status == "completed" and not self.archived and not self.enabled:
            status = "disabled"
        elif self.indexing_status == "completed" and self.archived:
            status = "archived"
        return status

    @property
    def data_source_info_dict(self):
        if self.data_source_info:
            try:
                data_source_info_dict = json.loads(self.data_source_info)
            except JSONDecodeError:
                data_source_info_dict = {}
            return data_source_info_dict
        return None

    @property
    def data_source_detail_dict(self):
        if self.data_source_info:
            if self.data_source_type == "upload_file":
                data_source_info_dict = json.loads(self.data_source_info)
                file_detail = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.id == data_source_info_dict["upload_file_id"])
                    .one_or_none()
                )
                if file_detail:
                    return {
                        "upload_file": {
                            "id": file_detail.id,
                            "name": file_detail.name,
                            "size": file_detail.size,
                            "extension": file_detail.extension,
                            "mime_type": file_detail.mime_type,
                            "created_by": file_detail.created_by,
                            "created_at": file_detail.created_at.timestamp(),
                        }
                    }
            elif self.data_source_type in {"notion_import", "website_crawl"}:
                return json.loads(self.data_source_info)
        return {}

    @property
    def average_segment_length(self):
        if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
            return self.word_count // self.segment_count
        return 0

    @property
    def dataset_process_rule(self):
        if self.dataset_process_rule_id:
            return db.session.get(DatasetProcessRule, self.dataset_process_rule_id)
        return None

    @property
    def dataset(self):
        return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).one_or_none()

    @property
    def segment_count(self):
        return DocumentSegment.query.filter(DocumentSegment.document_id == self.id).count()

    @property
    def hit_count(self):
        return (
            DocumentSegment.query.with_entities(func.coalesce(func.sum(DocumentSegment.hit_count)))
            .filter(DocumentSegment.document_id == self.id)
            .scalar()
        )

    @property
    def uploader(self):
        user = db.session.query(Account).filter(Account.id == self.created_by).first()
        return user.name if user else None

    @property
    def upload_date(self):
        return self.created_at

    @property
    def last_update_date(self):
        return self.updated_at

    @property
    def doc_metadata_details(self):
        if self.doc_metadata:
            document_metadatas = (
                db.session.query(DatasetMetadata)
                .join(DatasetMetadataBinding, DatasetMetadataBinding.metadata_id == DatasetMetadata.id)
                .filter(
                    DatasetMetadataBinding.dataset_id == self.dataset_id, DatasetMetadataBinding.document_id == self.id
                )
                .all()
            )
            metadata_list = []
            for metadata in document_metadatas:
                metadata_dict = {
                    "id": metadata.id,
                    "name": metadata.name,
                    "type": metadata.type,
                    "value": self.doc_metadata.get(metadata.name),
                }
                metadata_list.append(metadata_dict)
            # deal built-in fields
            metadata_list.extend(self.get_built_in_fields())
            return metadata_list
        return None

    @property
    def process_rule_dict(self):
        if self.dataset_process_rule_id:
            return self.dataset_process_rule.to_dict()
        return None

    def get_built_in_fields(self):
        built_in_fields = []
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.document_name,
                "type": "string",
                "value": self.name,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.uploader,
                "type": "string",
                "value": self.uploader,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.upload_date,
                "type": "time",
                "value": self.created_at.timestamp(),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.last_update_date,
                "type": "time",
                "value": self.updated_at.timestamp(),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.source,
                "type": "string",
                "value": MetadataDataSource[self.data_source_type].value,
            }
        )
        return built_in_fields

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "dataset_id": self.dataset_id,
            "position": self.position,
            "data_source_type": self.data_source_type,
            "data_source_info": self.data_source_info,
            "dataset_process_rule_id": self.dataset_process_rule_id,
            "batch": self.batch,
            "name": self.name,
            "created_from": self.created_from,
            "created_by": self.created_by,
            "created_api_request_id": self.created_api_request_id,
            "created_at": self.created_at,
            "processing_started_at": self.processing_started_at,
            "file_id": self.file_id,
            "word_count": self.word_count,
            "parsing_completed_at": self.parsing_completed_at,
            "cleaning_completed_at": self.cleaning_completed_at,
            "splitting_completed_at": self.splitting_completed_at,
            "tokens": self.tokens,
            "indexing_latency": self.indexing_latency,
            "completed_at": self.completed_at,
            "is_paused": self.is_paused,
            "paused_by": self.paused_by,
            "paused_at": self.paused_at,
            "error": self.error,
            "stopped_at": self.stopped_at,
            "indexing_status": self.indexing_status,
            "enabled": self.enabled,
            "disabled_at": self.disabled_at,
            "disabled_by": self.disabled_by,
            "archived": self.archived,
            "archived_reason": self.archived_reason,
            "archived_by": self.archived_by,
            "archived_at": self.archived_at,
            "updated_at": self.updated_at,
            "doc_type": self.doc_type,
            "doc_metadata": self.doc_metadata,
            "doc_form": self.doc_form,
            "doc_language": self.doc_language,
            "display_status": self.display_status,
            "data_source_info_dict": self.data_source_info_dict,
            "average_segment_length": self.average_segment_length,
            "dataset_process_rule": self.dataset_process_rule.to_dict() if self.dataset_process_rule else None,
            "dataset": self.dataset.to_dict() if self.dataset else None,
            "segment_count": self.segment_count,
            "hit_count": self.hit_count,
        }

    @classmethod
    def from_dict(cls, data: dict):
        return cls(
            id=data.get("id"),
            tenant_id=data.get("tenant_id"),
            dataset_id=data.get("dataset_id"),
            position=data.get("position"),
            data_source_type=data.get("data_source_type"),
            data_source_info=data.get("data_source_info"),
            dataset_process_rule_id=data.get("dataset_process_rule_id"),
            batch=data.get("batch"),
            name=data.get("name"),
            created_from=data.get("created_from"),
            created_by=data.get("created_by"),
            created_api_request_id=data.get("created_api_request_id"),
            created_at=data.get("created_at"),
            processing_started_at=data.get("processing_started_at"),
            file_id=data.get("file_id"),
            word_count=data.get("word_count"),
            parsing_completed_at=data.get("parsing_completed_at"),
            cleaning_completed_at=data.get("cleaning_completed_at"),
            splitting_completed_at=data.get("splitting_completed_at"),
            tokens=data.get("tokens"),
            indexing_latency=data.get("indexing_latency"),
            completed_at=data.get("completed_at"),
            is_paused=data.get("is_paused"),
            paused_by=data.get("paused_by"),
            paused_at=data.get("paused_at"),
            error=data.get("error"),
            stopped_at=data.get("stopped_at"),
            indexing_status=data.get("indexing_status"),
            enabled=data.get("enabled"),
            disabled_at=data.get("disabled_at"),
            disabled_by=data.get("disabled_by"),
            archived=data.get("archived"),
            archived_reason=data.get("archived_reason"),
            archived_by=data.get("archived_by"),
            archived_at=data.get("archived_at"),
            updated_at=data.get("updated_at"),
            doc_type=data.get("doc_type"),
            doc_metadata=data.get("doc_metadata"),
            doc_form=data.get("doc_form"),
            doc_language=data.get("doc_language"),
        )

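# Note: Template mirrors Document's columns and helpers, adding a `file_url` column and
# narrowing DATA_SOURCES to "upload_file"; the Document round-trip sketch above applies
# here unchanged.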
 
class DocumentSegment(db.Model):  # type: ignore[name-defined]
    __tablename__ = "document_segments"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="document_segment_pkey"),
        db.Index("document_segment_dataset_id_idx", "dataset_id"),
        db.Index("document_segment_document_id_idx", "document_id"),
        db.Index("document_segment_tenant_dataset_idx", "dataset_id", "tenant_id"),
        db.Index("document_segment_tenant_document_idx", "document_id", "tenant_id"),
        db.Index("document_segment_dataset_node_idx", "dataset_id", "index_node_id"),
        db.Index("document_segment_tenant_idx", "tenant_id"),
    )

    # initial fields
    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    position: Mapped[int]
    content = db.Column(db.Text, nullable=False)
    answer = db.Column(db.Text, nullable=True)
    word_count = db.Column(db.Integer, nullable=False)
    tokens = db.Column(db.Integer, nullable=False)

    # indexing fields
    keywords = db.Column(db.JSON, nullable=True)
    index_node_id = db.Column(db.String(255), nullable=True)
    index_node_hash = db.Column(db.String(255), nullable=True)

    # basic fields
    hit_count = db.Column(db.Integer, nullable=False, default=0)
    enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    disabled_at = db.Column(db.DateTime, nullable=True)
    disabled_by = db.Column(StringUUID, nullable=True)
    status = db.Column(db.String(255), nullable=False, server_default=db.text("'waiting'::character varying"))
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    indexing_at = db.Column(db.DateTime, nullable=True)
    completed_at = db.Column(db.DateTime, nullable=True)
    error = db.Column(db.Text, nullable=True)
    stopped_at = db.Column(db.DateTime, nullable=True)

    @property
    def dataset(self):
        return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).first()

    @property
    def document(self):
        return db.session.query(Document).filter(Document.id == self.document_id).first()

    @property
    def previous_segment(self):
        return (
            db.session.query(DocumentSegment)
            .filter(DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position - 1)
            .first()
        )

    @property
    def next_segment(self):
        return (
            db.session.query(DocumentSegment)
            .filter(DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position + 1)
            .first()
        )

    @property
    def child_chunks(self):
        process_rule = self.document.dataset_process_rule
        if process_rule.mode == "hierarchical":
            rules = Rule(**process_rule.rules_dict)
            if rules.parent_mode and rules.parent_mode != ParentMode.FULL_DOC:
                child_chunks = (
                    db.session.query(ChildChunk)
                    .filter(ChildChunk.segment_id == self.id)
                    .order_by(ChildChunk.position.asc())
                    .all()
                )
                return child_chunks or []
            else:
                return []
        else:
            return []

    def get_child_chunks(self):
        process_rule = self.document.dataset_process_rule
        if process_rule.mode == "hierarchical":
            rules = Rule(**process_rule.rules_dict)
            if rules.parent_mode:
                child_chunks = (
                    db.session.query(ChildChunk)
                    .filter(ChildChunk.segment_id == self.id)
                    .order_by(ChildChunk.position.asc())
                    .all()
                )
                return child_chunks or []
            else:
                return []
        else:
            return []

    @property
    def sign_content(self):
        return self.get_sign_content()

    def get_sign_content(self):
        signed_urls = []
        text = self.content

        # For data before v0.10.0
        pattern = r"/files/([a-f0-9\-]+)/image-preview"
        matches = re.finditer(pattern, text)
        for match in matches:
            upload_file_id = match.group(1)
            nonce = os.urandom(16).hex()
            timestamp = str(int(time.time()))
            data_to_sign = f"image-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()

            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
            signed_url = f"{match.group(0)}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))

        # For data after v0.10.0
        pattern = r"/files/([a-f0-9\-]+)/file-preview"
        matches = re.finditer(pattern, text)
        for match in matches:
            upload_file_id = match.group(1)
            nonce = os.urandom(16).hex()
            timestamp = str(int(time.time()))
            data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()

            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
            signed_url = f"{match.group(0)}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))

        # Reconstruct the text with signed URLs, applying replacements in ascending
        # positional order so the running offset stays correct even when both
        # patterns matched (the two scans above each start from the beginning).
        offset = 0
        for start, end, signed_url in sorted(signed_urls):
            text = text[: start + offset] + signed_url + text[end + offset :]
            offset += len(signed_url) - (end - start)

        return text

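# Illustrative output (not part of the model): for content embedding
# "/files/<uuid>/file-preview", `sign_content` rewrites each link in place to
# "/files/<uuid>/file-preview?timestamp=<unix>&nonce=<hex>&sign=<urlsafe-b64-hmac>",
# where the signature is HMAC-SHA256 over "file-preview|<uuid>|<timestamp>|<nonce>"
# keyed with dify_config.SECRET_KEY; legacy "image-preview" links are signed the
# same way.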
 
class ChildChunk(db.Model):  # type: ignore[name-defined]
    __tablename__ = "child_chunks"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="child_chunk_pkey"),
        db.Index("child_chunk_dataset_id_idx", "tenant_id", "dataset_id", "document_id", "segment_id", "index_node_id"),
    )

    # initial fields
    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    segment_id = db.Column(StringUUID, nullable=False)
    position = db.Column(db.Integer, nullable=False)
    content = db.Column(db.Text, nullable=False)
    word_count = db.Column(db.Integer, nullable=False)

    # indexing fields
    index_node_id = db.Column(db.String(255), nullable=True)
    index_node_hash = db.Column(db.String(255), nullable=True)
    type = db.Column(db.String(255), nullable=False, server_default=db.text("'automatic'::character varying"))
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    indexing_at = db.Column(db.DateTime, nullable=True)
    completed_at = db.Column(db.DateTime, nullable=True)
    error = db.Column(db.Text, nullable=True)

    @property
    def dataset(self):
        return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).first()

    @property
    def document(self):
        return db.session.query(Document).filter(Document.id == self.document_id).first()

    @property
    def segment(self):
        return db.session.query(DocumentSegment).filter(DocumentSegment.id == self.segment_id).first()

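# Note (illustrative): rows in this table back the "hierarchical" process rule mode;
# DocumentSegment.child_chunks and get_child_chunks above read them back ordered by
# `position`, and `type` defaults to "automatic" at the database level.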
 
class AppDatasetJoin(db.Model):  # type: ignore[name-defined]
    __tablename__ = "app_dataset_joins"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="app_dataset_join_pkey"),
        db.Index("app_dataset_join_app_dataset_idx", "dataset_id", "app_id"),
    )

    id = db.Column(StringUUID, primary_key=True, nullable=False, server_default=db.text("uuid_generate_v4()"))
    app_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())

    @property
    def app(self):
        return db.session.get(App, self.app_id)

 
class DatasetQuery(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_queries"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_query_pkey"),
        db.Index("dataset_query_dataset_id_idx", "dataset_id"),
    )

    id = db.Column(StringUUID, primary_key=True, nullable=False, server_default=db.text("uuid_generate_v4()"))
    dataset_id = db.Column(StringUUID, nullable=False)
    content = db.Column(db.Text, nullable=False)
    source = db.Column(db.String(255), nullable=False)
    source_app_id = db.Column(StringUUID, nullable=True)
    created_by_role = db.Column(db.String, nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())
 
class DatasetKeywordTable(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_keyword_tables"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_keyword_table_pkey"),
        db.Index("dataset_keyword_table_dataset_id_idx", "dataset_id"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    dataset_id = db.Column(StringUUID, nullable=False, unique=True)
    keyword_table = db.Column(db.Text, nullable=False)
    data_source_type = db.Column(
        db.String(255), nullable=False, server_default=db.text("'database'::character varying")
    )

    @property
    def keyword_table_dict(self):
        class SetDecoder(json.JSONDecoder):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, object_hook=self.object_hook, **kwargs)

            def object_hook(self, dct):
                # Keyword tables map each keyword to a set of node ids; JSON
                # stores those sets as lists, so convert them back on load.
                if isinstance(dct, dict):
                    for keyword, node_idxs in dct.items():
                        if isinstance(node_idxs, list):
                            dct[keyword] = set(node_idxs)
                return dct

        # get dataset
        dataset = Dataset.query.filter_by(id=self.dataset_id).first()
        if not dataset:
            return None
        if self.data_source_type == "database":
            return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None
        else:
            file_key = f"keyword_files/{dataset.tenant_id}/{self.dataset_id}.txt"
            try:
                keyword_table_text = storage.load_once(file_key)
                if keyword_table_text:
                    return json.loads(keyword_table_text.decode("utf-8"), cls=SetDecoder)
                return None
            except Exception:
                logging.exception("Failed to load keyword table from file: %s", file_key)
                return None
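# The SetDecoder above turns JSON lists back into sets, so whatever writes
# keyword_table must perform the inverse conversion. A minimal sketch of that
# counterpart (the class name and placement are assumptions; the real
# serializer may live elsewhere):
class _ExampleSetEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, set):
            # JSON has no set type; persist sets as sorted lists.
            return sorted(obj)
        return super().default(obj)
# e.g. json.dumps({"keyword": {"node-1", "node-2"}}, cls=_ExampleSetEncoder)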
 
class Embedding(db.Model):  # type: ignore[name-defined]
    __tablename__ = "embeddings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="embedding_pkey"),
        db.UniqueConstraint("model_name", "hash", "provider_name", name="embedding_hash_idx"),
        db.Index("created_at_idx", "created_at"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    model_name = db.Column(
        db.String(255), nullable=False, server_default=db.text("'text-embedding-ada-002'::character varying")
    )
    hash = db.Column(db.String(64), nullable=False)
    embedding = db.Column(db.LargeBinary, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    provider_name = db.Column(db.String(255), nullable=False, server_default=db.text("''::character varying"))

    def set_embedding(self, embedding_data: list[float]):
        # Vectors are stored pickled in the LargeBinary column.
        self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL)

    def get_embedding(self) -> list[float]:
        # Unpickling is acceptable here because the payload was produced by
        # set_embedding above, not by untrusted input.
        return cast(list[float], pickle.loads(self.embedding))  # noqa: S301
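# Illustrative cache lookup (assumed usage, not part of this module): the
# unique constraint on (model_name, hash, provider_name) makes this table a
# per-model embedding cache keyed by content hash. The helper name is an
# assumption for the example.
def _example_cached_embedding(provider: str, model: str, content_hash: str) -> list[float] | None:
    record = (
        db.session.query(Embedding)
        .filter_by(provider_name=provider, model_name=model, hash=content_hash)
        .first()
    )
    return record.get_embedding() if record else None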
 
class DatasetCollectionBinding(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_collection_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_collection_bindings_pkey"),
        db.Index("provider_model_name_idx", "provider_name", "model_name"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    provider_name = db.Column(db.String(255), nullable=False)
    model_name = db.Column(db.String(255), nullable=False)
    type = db.Column(db.String(40), server_default=db.text("'dataset'::character varying"), nullable=False)
    collection_name = db.Column(db.String(64), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 
class TidbAuthBinding(db.Model):  # type: ignore[name-defined]
    __tablename__ = "tidb_auth_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="tidb_auth_bindings_pkey"),
        db.Index("tidb_auth_bindings_tenant_idx", "tenant_id"),
        db.Index("tidb_auth_bindings_active_idx", "active"),
        db.Index("tidb_auth_bindings_created_at_idx", "created_at"),
        db.Index("tidb_auth_bindings_status_idx", "status"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=True)
    cluster_id = db.Column(db.String(255), nullable=False)
    cluster_name = db.Column(db.String(255), nullable=False)
    active = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    # The default must be a quoted SQL string literal, not a bare identifier.
    status = db.Column(db.String(255), nullable=False, server_default=db.text("'CREATING'::character varying"))
    account = db.Column(db.String(255), nullable=False)
    password = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 
class Whitelist(db.Model):  # type: ignore[name-defined]
    __tablename__ = "whitelists"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="whitelists_pkey"),
        db.Index("whitelists_tenant_idx", "tenant_id"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=True)
    category = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 
class DatasetPermission(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_permissions"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_permission_pkey"),
        db.Index("idx_dataset_permissions_dataset_id", "dataset_id"),
        db.Index("idx_dataset_permissions_account_id", "account_id"),
        db.Index("idx_dataset_permissions_tenant_id", "tenant_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"), primary_key=True)
    dataset_id = db.Column(StringUUID, nullable=False)
    account_id = db.Column(StringUUID, nullable=False)
    tenant_id = db.Column(StringUUID, nullable=False)
    has_permission = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
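# Illustrative check (assumed usage, not part of this module): with
# partial-member permissions, access resolves to whether an explicit row
# grants it. The helper name is an assumption for the example.
def _example_account_can_access(dataset_id: str, account_id: str, tenant_id: str) -> bool:
    permission = (
        db.session.query(DatasetPermission)
        .filter_by(dataset_id=dataset_id, account_id=account_id, tenant_id=tenant_id)
        .first()
    )
    return bool(permission and permission.has_permission)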
 
class DatasetPermissionAll(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_permissions_all"
    __table_args__ = (
        # Constraint and index names are schema-global in PostgreSQL, so they
        # cannot reuse the names already taken by dataset_permissions above.
        db.PrimaryKeyConstraint("id", name="dataset_permission_all_pkey"),
        db.Index("idx_dataset_permissions_all_dataset_id", "dataset_id"),
        db.Index("idx_dataset_permissions_all_account_id", "account_id"),
        db.Index("idx_dataset_permissions_all_tenant_id", "tenant_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"), primary_key=True)
    dataset_id = db.Column(StringUUID, nullable=False)
    account_id = db.Column(StringUUID, nullable=False)
    tenant_id = db.Column(StringUUID, nullable=False)
    has_edit_permission = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    has_read_permission = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    created_by = db.Column(StringUUID, nullable=False)
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    email = db.Column(db.String(255), nullable=False)
 
class ExternalKnowledgeApis(db.Model):  # type: ignore[name-defined]
    __tablename__ = "external_knowledge_apis"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="external_knowledge_apis_pkey"),
        db.Index("external_knowledge_apis_tenant_idx", "tenant_id"),
        db.Index("external_knowledge_apis_name_idx", "name"),
    )

    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.String(255), nullable=False)
    tenant_id = db.Column(StringUUID, nullable=False)
    settings = db.Column(db.Text, nullable=True)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "name": self.name,
            "description": self.description,
            "settings": self.settings_dict,
            "dataset_bindings": self.dataset_bindings,
            "created_by": self.created_by,
            "created_at": self.created_at.isoformat(),
        }

    @property
    def settings_dict(self):
        try:
            return json.loads(self.settings) if self.settings else None
        except JSONDecodeError:
            return None

    @property
    def dataset_bindings(self):
        external_knowledge_bindings = (
            db.session.query(ExternalKnowledgeBindings)
            .filter(ExternalKnowledgeBindings.external_knowledge_api_id == self.id)
            .all()
        )
        dataset_ids = [binding.dataset_id for binding in external_knowledge_bindings]
        datasets = db.session.query(Dataset).filter(Dataset.id.in_(dataset_ids)).all()
        return [{"id": dataset.id, "name": dataset.name} for dataset in datasets]
 
class ExternalKnowledgeBindings(db.Model):  # type: ignore[name-defined]
    __tablename__ = "external_knowledge_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="external_knowledge_bindings_pkey"),
        db.Index("external_knowledge_bindings_tenant_idx", "tenant_id"),
        db.Index("external_knowledge_bindings_dataset_idx", "dataset_id"),
        db.Index("external_knowledge_bindings_external_knowledge_idx", "external_knowledge_id"),
        db.Index("external_knowledge_bindings_external_knowledge_api_idx", "external_knowledge_api_id"),
    )

    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    external_knowledge_api_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    external_knowledge_id = db.Column(db.Text, nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 
class DatasetAutoDisableLog(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_auto_disable_logs"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_auto_disable_log_pkey"),
        db.Index("dataset_auto_disable_log_tenant_idx", "tenant_id"),
        db.Index("dataset_auto_disable_log_dataset_idx", "dataset_id"),
        db.Index("dataset_auto_disable_log_created_atx", "created_at"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    notified = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
 
class RateLimitLog(db.Model):  # type: ignore[name-defined]
    __tablename__ = "rate_limit_logs"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="rate_limit_log_pkey"),
        db.Index("rate_limit_log_tenant_idx", "tenant_id"),
        db.Index("rate_limit_log_operation_idx", "operation"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    subscription_plan = db.Column(db.String(255), nullable=False)
    operation = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
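# Illustrative sliding-window check (assumed usage, not part of this module):
# count a tenant's recent operations and compare against a plan limit. The
# helper name and the one-minute window are assumptions for the example.
def _example_over_rate_limit(tenant_id: str, operation: str, limit: int) -> bool:
    from datetime import datetime, timedelta

    window_start = datetime.utcnow() - timedelta(minutes=1)
    recent = (
        db.session.query(RateLimitLog)
        .filter(
            RateLimitLog.tenant_id == tenant_id,
            RateLimitLog.operation == operation,
            RateLimitLog.created_at >= window_start,
        )
        .count()
    )
    return recent >= limit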
 
class DatasetMetadata(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_metadatas"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_metadata_pkey"),
        db.Index("dataset_metadata_tenant_idx", "tenant_id"),
        db.Index("dataset_metadata_dataset_idx", "dataset_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    type = db.Column(db.String(255), nullable=False)
    name = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    created_by = db.Column(StringUUID, nullable=False)
    updated_by = db.Column(StringUUID, nullable=True)
 
class DatasetMetadataBinding(db.Model):  # type: ignore[name-defined]
    __tablename__ = "dataset_metadata_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_metadata_binding_pkey"),
        db.Index("dataset_metadata_binding_tenant_idx", "tenant_id"),
        db.Index("dataset_metadata_binding_dataset_idx", "dataset_id"),
        db.Index("dataset_metadata_binding_metadata_idx", "metadata_id"),
        db.Index("dataset_metadata_binding_document_idx", "document_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    metadata_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    created_by = db.Column(StringUUID, nullable=False)
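# Illustrative metadata filter (assumed usage, not part of this module):
# bindings join documents to metadata entries, so "documents tagged with a
# given metadata entry" is a lookup on this table. The helper name is an
# assumption for the example.
def _example_document_ids_with_metadata(dataset_id: str, metadata_id: str) -> list[str]:
    bindings = (
        db.session.query(DatasetMetadataBinding)
        .filter_by(dataset_id=dataset_id, metadata_id=metadata_id)
        .all()
    )
    return [binding.document_id for binding in bindings]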
 
 