@@ -634,7 +634,345 @@ class Document(db.Model): # type: ignore[name-defined]
             doc_language=data.get("doc_language"),
         )


+class Template(db.Model):  # type: ignore[name-defined]
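+    # Ingestion record for a dataset template file. The columns below follow the
+    # same lifecycle as Document: parsing -> cleaning -> splitting -> indexing,
+    # with pause, error and archival bookkeeping.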
+    __tablename__ = "template"
+    __table_args__ = (
+        db.PrimaryKeyConstraint("id", name="template_pkey"),
+        db.Index("template_dataset_id_idx", "dataset_id"),
+        db.Index("template_is_paused_idx", "is_paused"),
+        db.Index("template_tenant_idx", "tenant_id"),
+        db.Index("template_metadata_idx", "doc_metadata", postgresql_using="gin"),
+    )
+
+    # initial fields
+    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
+    tenant_id = db.Column(StringUUID, nullable=False)
+    dataset_id = db.Column(StringUUID, nullable=False)
+    position = db.Column(db.Integer, nullable=False)
+    data_source_type = db.Column(db.String(255), nullable=False)
+    data_source_info = db.Column(db.Text, nullable=True)
+    dataset_process_rule_id = db.Column(StringUUID, nullable=True)
+    batch = db.Column(db.String(255), nullable=False)
+    name = db.Column(db.String(255), nullable=False)
+    created_from = db.Column(db.String(255), nullable=False)
+    created_by = db.Column(StringUUID, nullable=False)
+    created_api_request_id = db.Column(StringUUID, nullable=True)
+    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
+
+    # start processing
+    processing_started_at = db.Column(db.DateTime, nullable=True)
+
+    # parsing
+    file_id = db.Column(db.Text, nullable=True)
+    file_url = db.Column(db.Text, nullable=True)
+    word_count = db.Column(db.Integer, nullable=True)
+    parsing_completed_at = db.Column(db.DateTime, nullable=True)
+
+    # cleaning
+    cleaning_completed_at = db.Column(db.DateTime, nullable=True)
+
+    # split
+    splitting_completed_at = db.Column(db.DateTime, nullable=True)
+
+    # indexing
+    tokens = db.Column(db.Integer, nullable=True)
+    indexing_latency = db.Column(db.Float, nullable=True)
+    completed_at = db.Column(db.DateTime, nullable=True)
+
+    # pause
+    is_paused = db.Column(db.Boolean, nullable=True, server_default=db.text("false"))
+    paused_by = db.Column(StringUUID, nullable=True)
+    paused_at = db.Column(db.DateTime, nullable=True)
+
+    # error
+    error = db.Column(db.Text, nullable=True)
+    stopped_at = db.Column(db.DateTime, nullable=True)
+
+    # basic fields
+    indexing_status = db.Column(db.String(255), nullable=False, server_default=db.text("'waiting'::character varying"))
+    enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
+    disabled_at = db.Column(db.DateTime, nullable=True)
+    disabled_by = db.Column(StringUUID, nullable=True)
+    archived = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
+    archived_reason = db.Column(db.String(255), nullable=True)
+    archived_by = db.Column(StringUUID, nullable=True)
+    archived_at = db.Column(db.DateTime, nullable=True)
+    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
+    doc_type = db.Column(db.String(40), nullable=True)
+    doc_metadata = db.Column(JSONB, nullable=True)
+    doc_form = db.Column(db.String(255), nullable=False, server_default=db.text("'text_model'::character varying"))
+    doc_language = db.Column(db.String(255), nullable=True)
+
+    DATA_SOURCES = ["upload_file"]
+
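+    # display_status collapses indexing_status together with the is_paused,
+    # enabled and archived flags into a single UI-facing value:
+    # queuing, paused, indexing, error, available, disabled or archived.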
+    @property
+    def display_status(self):
+        status = None
+        if self.indexing_status == "waiting":
+            status = "queuing"
+        elif self.indexing_status not in {"completed", "error", "waiting"} and self.is_paused:
+            status = "paused"
+        elif self.indexing_status in {"parsing", "cleaning", "splitting", "indexing"}:
+            status = "indexing"
+        elif self.indexing_status == "error":
+            status = "error"
+        elif self.indexing_status == "completed" and not self.archived and self.enabled:
+            status = "available"
+        elif self.indexing_status == "completed" and not self.archived and not self.enabled:
+            status = "disabled"
+        elif self.indexing_status == "completed" and self.archived:
+            status = "archived"
+        return status
+
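+    # data_source_info is stored as a JSON string; this helper decodes it,
+    # falling back to an empty dict when the payload is not valid JSON.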
+    @property
+    def data_source_info_dict(self):
+        if self.data_source_info:
+            try:
+                data_source_info_dict = json.loads(self.data_source_info)
+            except JSONDecodeError:
+                data_source_info_dict = {}
+
+            return data_source_info_dict
+        return None
+
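+    # Resolves data_source_info into a richer payload: for "upload_file" sources
+    # the referenced UploadFile row is loaded and summarised; for notion_import /
+    # website_crawl the decoded JSON is returned unchanged; anything else yields {}.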
+    @property
+    def data_source_detail_dict(self):
+        if self.data_source_info:
+            if self.data_source_type == "upload_file":
+                data_source_info_dict = json.loads(self.data_source_info)
+                file_detail = (
+                    db.session.query(UploadFile)
+                    .filter(UploadFile.id == data_source_info_dict["upload_file_id"])
+                    .one_or_none()
+                )
+                if file_detail:
+                    return {
+                        "upload_file": {
+                            "id": file_detail.id,
+                            "name": file_detail.name,
+                            "size": file_detail.size,
+                            "extension": file_detail.extension,
+                            "mime_type": file_detail.mime_type,
+                            "created_by": file_detail.created_by,
+                            "created_at": file_detail.created_at.timestamp(),
+                        }
+                    }
+            elif self.data_source_type in {"notion_import", "website_crawl"}:
+                return json.loads(self.data_source_info)
+        return {}
+
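+    # Average words per segment, computed with integer division; 0 when either
+    # word_count or segment_count is missing or zero.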
+    @property
+    def average_segment_length(self):
+        if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
+            return self.word_count // self.segment_count
+        return 0
+
+    @property
+    def dataset_process_rule(self):
+        if self.dataset_process_rule_id:
+            return db.session.get(DatasetProcessRule, self.dataset_process_rule_id)
+        return None
+
+    @property
+    def dataset(self):
+        return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).one_or_none()
+
+    @property
+    def segment_count(self):
+        return DocumentSegment.query.filter(DocumentSegment.document_id == self.id).count()
+
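+    # Sum of hit_count over the segments whose document_id matches this record's id.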
+    @property
+    def hit_count(self):
+        return (
+            DocumentSegment.query.with_entities(func.coalesce(func.sum(DocumentSegment.hit_count)))
+            .filter(DocumentSegment.document_id == self.id)
+            .scalar()
+        )
+
+    @property
+    def uploader(self):
+        user = db.session.query(Account).filter(Account.id == self.created_by).first()
+        return user.name if user else None
+
+    @property
+    def upload_date(self):
+        return self.created_at
+
+    @property
+    def last_update_date(self):
+        return self.updated_at
+
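+    # Joins DatasetMetadata through DatasetMetadataBinding to pair each bound
+    # metadata definition with its value in doc_metadata, then appends the
+    # built-in fields; returns None when no doc_metadata is set.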
+    @property
+    def doc_metadata_details(self):
+        if self.doc_metadata:
+            document_metadatas = (
+                db.session.query(DatasetMetadata)
+                .join(DatasetMetadataBinding, DatasetMetadataBinding.metadata_id == DatasetMetadata.id)
+                .filter(
+                    DatasetMetadataBinding.dataset_id == self.dataset_id, DatasetMetadataBinding.document_id == self.id
+                )
+                .all()
+            )
+            metadata_list = []
+            for metadata in document_metadatas:
+                metadata_dict = {
+                    "id": metadata.id,
+                    "name": metadata.name,
+                    "type": metadata.type,
+                    "value": self.doc_metadata.get(metadata.name),
+                }
+                metadata_list.append(metadata_dict)
+            # deal with built-in fields
+            metadata_list.extend(self.get_built_in_fields())
+
+            return metadata_list
+        return None
+
+    @property
+    def process_rule_dict(self):
+        if self.dataset_process_rule_id:
+            return self.dataset_process_rule.to_dict()
+        return None
+
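+    # Built-in metadata entries exposed alongside user-defined metadata:
+    # document name, uploader, upload/update timestamps and data source.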
+    def get_built_in_fields(self):
+        built_in_fields = []
+        built_in_fields.append(
+            {
+                "id": "built-in",
+                "name": BuiltInField.document_name,
+                "type": "string",
+                "value": self.name,
+            }
+        )
+        built_in_fields.append(
+            {
+                "id": "built-in",
+                "name": BuiltInField.uploader,
+                "type": "string",
+                "value": self.uploader,
+            }
+        )
+        built_in_fields.append(
+            {
+                "id": "built-in",
+                "name": BuiltInField.upload_date,
+                "type": "time",
+                "value": self.created_at.timestamp(),
+            }
+        )
+        built_in_fields.append(
+            {
+                "id": "built-in",
+                "name": BuiltInField.last_update_date,
+                "type": "time",
+                "value": self.updated_at.timestamp(),
+            }
+        )
+        built_in_fields.append(
+            {
+                "id": "built-in",
+                "name": BuiltInField.source,
+                "type": "string",
+                "value": MetadataDataSource[self.data_source_type].value,
+            }
+        )
+        return built_in_fields
+
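+    # Serialises the row, including derived values such as display_status,
+    # average_segment_length, segment_count and hit_count.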
+    def to_dict(self):
+        return {
+            "id": self.id,
+            "tenant_id": self.tenant_id,
+            "dataset_id": self.dataset_id,
+            "position": self.position,
+            "data_source_type": self.data_source_type,
+            "data_source_info": self.data_source_info,
+            "dataset_process_rule_id": self.dataset_process_rule_id,
+            "batch": self.batch,
+            "name": self.name,
+            "created_from": self.created_from,
+            "created_by": self.created_by,
+            "created_api_request_id": self.created_api_request_id,
+            "created_at": self.created_at,
+            "processing_started_at": self.processing_started_at,
+            "file_id": self.file_id,
+            "word_count": self.word_count,
+            "parsing_completed_at": self.parsing_completed_at,
+            "cleaning_completed_at": self.cleaning_completed_at,
+            "splitting_completed_at": self.splitting_completed_at,
+            "tokens": self.tokens,
+            "indexing_latency": self.indexing_latency,
+            "completed_at": self.completed_at,
+            "is_paused": self.is_paused,
+            "paused_by": self.paused_by,
+            "paused_at": self.paused_at,
+            "error": self.error,
+            "stopped_at": self.stopped_at,
+            "indexing_status": self.indexing_status,
+            "enabled": self.enabled,
+            "disabled_at": self.disabled_at,
+            "disabled_by": self.disabled_by,
+            "archived": self.archived,
+            "archived_reason": self.archived_reason,
+            "archived_by": self.archived_by,
+            "archived_at": self.archived_at,
+            "updated_at": self.updated_at,
+            "doc_type": self.doc_type,
+            "doc_metadata": self.doc_metadata,
+            "doc_form": self.doc_form,
+            "doc_language": self.doc_language,
+            "display_status": self.display_status,
+            "data_source_info_dict": self.data_source_info_dict,
+            "average_segment_length": self.average_segment_length,
+            "dataset_process_rule": self.dataset_process_rule.to_dict() if self.dataset_process_rule else None,
+            "dataset": self.dataset.to_dict() if self.dataset else None,
+            "segment_count": self.segment_count,
+            "hit_count": self.hit_count,
+        }
+
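+    # Constructs a Template from a plain dict; absent keys default to None.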
+    @classmethod
+    def from_dict(cls, data: dict):
+        return cls(
+            id=data.get("id"),
+            tenant_id=data.get("tenant_id"),
+            dataset_id=data.get("dataset_id"),
+            position=data.get("position"),
+            data_source_type=data.get("data_source_type"),
+            data_source_info=data.get("data_source_info"),
+            dataset_process_rule_id=data.get("dataset_process_rule_id"),
+            batch=data.get("batch"),
+            name=data.get("name"),
+            created_from=data.get("created_from"),
+            created_by=data.get("created_by"),
+            created_api_request_id=data.get("created_api_request_id"),
+            created_at=data.get("created_at"),
+            processing_started_at=data.get("processing_started_at"),
+            file_id=data.get("file_id"),
+            word_count=data.get("word_count"),
+            parsing_completed_at=data.get("parsing_completed_at"),
+            cleaning_completed_at=data.get("cleaning_completed_at"),
+            splitting_completed_at=data.get("splitting_completed_at"),
+            tokens=data.get("tokens"),
+            indexing_latency=data.get("indexing_latency"),
+            completed_at=data.get("completed_at"),
+            is_paused=data.get("is_paused"),
+            paused_by=data.get("paused_by"),
+            paused_at=data.get("paused_at"),
+            error=data.get("error"),
+            stopped_at=data.get("stopped_at"),
+            indexing_status=data.get("indexing_status"),
+            enabled=data.get("enabled"),
+            disabled_at=data.get("disabled_at"),
+            disabled_by=data.get("disabled_by"),
+            archived=data.get("archived"),
+            archived_reason=data.get("archived_reason"),
+            archived_by=data.get("archived_by"),
+            archived_at=data.get("archived_at"),
+            updated_at=data.get("updated_at"),
+            doc_type=data.get("doc_type"),
+            doc_metadata=data.get("doc_metadata"),
+            doc_form=data.get("doc_form"),
+            doc_language=data.get("doc_language"),
+        )
+
+
 class DocumentSegment(db.Model): # type: ignore[name-defined]
     __tablename__ = "document_segments"
     __table_args__ = (