import datetime
import logging
import time

import click
from celery import shared_task  # type: ignore

from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.rag.models.document import ChildDocument, Document
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Dataset, DocumentSegment
from models.dataset import Document as DatasetDocument


@shared_task(queue="dataset")
def enable_segments_to_index_task(segment_ids: list[str], dataset_id: str, document_id: str):
    """
    Async task that re-enables document segments in the vector index.

    :param segment_ids: IDs of the segments to enable
    :param dataset_id: ID of the dataset the segments belong to
    :param document_id: ID of the document the segments belong to

    Usage: enable_segments_to_index_task.delay(segment_ids, dataset_id, document_id)
    """
    start_at = time.perf_counter()

    dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
    if not dataset:
        logging.info(click.style("Dataset {} not found, pass.".format(dataset_id), fg="cyan"))
        return

    dataset_document = db.session.query(DatasetDocument).filter(DatasetDocument.id == document_id).first()
    if not dataset_document:
        logging.info(click.style("Document {} not found, pass.".format(document_id), fg="cyan"))
        return

    # only index documents that are enabled, not archived, and fully indexed
    if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != "completed":
        logging.info(click.style("Document {} status is invalid, pass.".format(document_id), fg="cyan"))
        return

    # sync index processor
    index_processor = IndexProcessorFactory(dataset_document.doc_form).init_index_processor()

    segments = (
        db.session.query(DocumentSegment)
        .filter(
            DocumentSegment.id.in_(segment_ids),
            DocumentSegment.dataset_id == dataset_id,
            DocumentSegment.document_id == document_id,
        )
        .all()
    )
    if not segments:
        return

    try:
        documents = []
        for segment in segments:
            document = Document(
                page_content=segment.content,
                metadata={
                    "doc_id": segment.index_node_id,
                    "doc_hash": segment.index_node_hash,
                    "document_id": document_id,
                    "dataset_id": dataset_id,
                },
            )

            # for parent-child indexes, attach child chunks as child documents
            if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX:
                child_chunks = segment.child_chunks
                if child_chunks:
                    child_documents = []
                    for child_chunk in child_chunks:
                        child_document = ChildDocument(
                            page_content=child_chunk.content,
                            metadata={
                                "doc_id": child_chunk.index_node_id,
                                "doc_hash": child_chunk.index_node_hash,
                                "document_id": document_id,
                                "dataset_id": dataset_id,
                            },
                        )
                        child_documents.append(child_document)
                    document.children = child_documents
            documents.append(document)

        # save vector index
        index_processor.load(dataset, documents)

        end_at = time.perf_counter()
        logging.info(click.style("Segments enabled to index latency: {}".format(end_at - start_at), fg="green"))
    except Exception as e:
        logging.exception("enable segments to index failed")
        # update segment error msg
        db.session.query(DocumentSegment).filter(
            DocumentSegment.id.in_(segment_ids),
            DocumentSegment.dataset_id == dataset_id,
            DocumentSegment.document_id == document_id,
        ).update(
            {
                "error": str(e),
                "status": "error",
                "disabled_at": datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None),
                "enabled": False,
            }
        )
        db.session.commit()
    finally:
        # always clear the per-segment indexing flags in Redis, on success or failure
        for segment in segments:
            indexing_cache_key = "segment_{}_indexing".format(segment.id)
            redis_client.delete(indexing_cache_key)
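
# --- Example invocation (illustrative sketch, not part of the original module) ---
# A minimal example of dispatching this task from application code, assuming a
# configured Celery broker and a worker consuming the "dataset" queue. The UUID
# values below are placeholders, not real records.
#
#   enable_segments_to_index_task.delay(
#       segment_ids=["<segment-uuid-1>", "<segment-uuid-2>"],
#       dataset_id="<dataset-uuid>",
#       document_id="<document-uuid>",
#   )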