'use client' import React, { useEffect, useLayoutEffect, useRef, useState } from 'react' import { useTranslation } from 'react-i18next' import { useContext } from 'use-context-selector' import { useBoolean } from 'ahooks' import { XMarkIcon } from '@heroicons/react/20/solid' import { RocketLaunchIcon } from '@heroicons/react/24/outline' import cn from 'classnames' import Link from 'next/link' import { groupBy } from 'lodash-es' import RetrievalMethodInfo from '../../common/retrieval-method-info' import PreviewItem, { PreviewType } from './preview-item' import LanguageSelect from './language-select' import s from './index.module.css' import type { CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, IndexingEstimateResponse, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets' import { createDocument, createFirstDocument, fetchFileIndexingEstimate as didFetchFileIndexingEstimate, fetchDefaultProcessRule, } from '@/service/datasets' import Button from '@/app/components/base/button' import Loading from '@/app/components/base/loading' import FloatRightContainer from '@/app/components/base/float-right-container' import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config' import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config' import { type RetrievalConfig } from '@/types/app' import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model' import Toast from '@/app/components/base/toast' import { formatNumber } from '@/utils/format' import type { NotionPage } from '@/models/common' import { DataSourceType, DocForm } from '@/models/datasets' import NotionIcon from '@/app/components/base/notion-icon' import Switch from '@/app/components/base/switch' import { MessageChatSquare } from '@/app/components/base/icons/src/public/common' 
/*
 * StepTwo: second step of the dataset document creation flow.
 * Collects segmentation rules, index mode, QA form and language, and retrieval settings,
 * requests indexing estimates for preview, and submits via createFirstDocument or createDocument.
 *
 * NOTE(review): in this copy, generic type arguments (see ValueOf and several useState calls)
 * and the JSX markup of the returned tree appear to be missing, and line breaks are collapsed
 * so inline line comments run into the code that follows them. Confirm against the original
 * file before making code changes.
 */
import { XClose } from '@/app/components/base/icons/src/vender/line/general' import { useDatasetDetailContext } from '@/context/dataset-detail' import I18n from '@/context/i18n' import { IS_CE_EDITION } from '@/config' import { RETRIEVE_METHOD } from '@/types/app' import { useProviderContext } from '@/context/provider-context' import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints' import Tooltip from '@/app/components/base/tooltip' /* Indexed-access helper: the type of the property values of T. */ type ValueOf = T[keyof T] /* Props accepted by StepTwo; notionPages defaults to an empty array in the component signature. */ type StepTwoProps = { isSetting?: boolean documentDetail?: FullDocumentDetail hasSetAPIKEY: boolean onSetting: () => void datasetId?: string indexingType?: ValueOf dataSourceType: DataSourceType files: CustomFile[] notionPages?: NotionPage[] onStepChange?: (delta: number) => void updateIndexingTypeCache?: (type: string) => void updateResultCache?: (res: createDocumentResponse) => void onSave?: () => void onCancel?: () => void } /* Segmentation modes; the value is sent as process_rule.mode (see getProcessRule). */ enum SegmentType { AUTO = 'automatic', CUSTOM = 'custom', } /* Indexing techniques selectable in this step. */ enum IndexingType { QUALIFIED = 'high_quality', ECONOMICAL = 'economy', } /* Component entry point. */ const StepTwo = ({ isSetting, documentDetail, hasSetAPIKEY, onSetting, datasetId, indexingType, dataSourceType, files, notionPages = [], onStepChange, updateIndexingTypeCache, updateResultCache, onSave, onCancel, }: StepTwoProps) => { const { t } = useTranslation() const { locale } = useContext(I18n) const media = useBreakpoints() const isMobile = media === MediaType.mobile const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext() const scrollRef = useRef(null) const [scrolled, setScrolled] = useState(false) const previewScrollRef = useRef(null) const [previewScrolled, setPreviewScrolled] = useState(false) const [segmentationType, setSegmentationType] = useState(SegmentType.AUTO) /* A newline separator is kept in state in its escaped two-character form. */ const [segmentIdentifier, setSegmentIdentifier] = useState('\\n') const [max, setMax] = useState(1000) const [rules, setRules] = useState([]) const [defaultConfig, setDefaultConfig] = useState() /* True when the indexing type was supplied through props. */ const hasSetIndexType = !!indexingType const 
[indexType, setIndexType] = useState>( (indexingType || hasSetAPIKEY) ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL, ) const [docForm, setDocForm] = useState( (datasetId && documentDetail) ? documentDetail.doc_form : DocForm.TEXT, ) const [docLanguage, setDocLanguage] = useState(locale === 'en' ? 'English' : 'Chinese') const [QATipHide, setQATipHide] = useState(false) const [previewSwitched, setPreviewSwitched] = useState(false) const [showPreview, { setTrue: setShowPreview, setFalse: hidePreview }] = useBoolean() const [customFileIndexingEstimate, setCustomFileIndexingEstimate] = useState(null) const [automaticFileIndexingEstimate, setAutomaticFileIndexingEstimate] = useState(null) const [estimateTokes, setEstimateTokes] = useState | null>(null) /* Estimate that matches the active segmentation mode. */ const fileIndexingEstimate = (() => { return segmentationType === SegmentType.AUTO ? automaticFileIndexingEstimate : customFileIndexingEstimate })() const [isCreating, setIsCreating] = useState(false) /* Scroll listeners: record whether the scrolled element has scrollTop above zero. */ const scrollHandle = (e: Event) => { if ((e.target as HTMLDivElement).scrollTop > 0) setScrolled(true) else setScrolled(false) } const previewScrollHandle = (e: Event) => { if ((e.target as HTMLDivElement).scrollTop > 0) setPreviewScrolled(true) else setPreviewScrolled(false) } /* Drops the last dot-separated part of the name. NOTE(review): a name containing no dot yields an empty string. */ const getFileName = (name: string) => { const arr = name.split('.') return arr.slice(0, -1).join('.') } /* Maps a known rule key to its translated label; falls through (undefined) for any other key. */ const getRuleName = (key: string) => { if (key === 'remove_extra_spaces') return t('datasetCreation.stepTwo.removeExtraSpaces') if (key === 'remove_urls_emails') return t('datasetCreation.stepTwo.removeUrlEmails') if (key === 'remove_stopwords') return t('datasetCreation.stepTwo.removeStopwords') } /* Flips the enabled flag of the rule with the given id; other rules are passed through unchanged. */ const ruleChangeHandle = (id: string) => { const newRules = rules.map((rule) => { if (rule.id === id) { return { id: rule.id, enabled: !rule.enabled, } } return rule }) setRules(newRules) } /* Restores separator, max tokens and rules from defaultConfig; does nothing until defaultConfig is set. */ const resetRules = () => { if (defaultConfig) { setSegmentIdentifier((defaultConfig.segmentation.separator === '\n' ? 
'\\n' : defaultConfig.segmentation.separator) || '\\n') setMax(defaultConfig.segmentation.max_tokens) setRules(defaultConfig.pre_processing_rules) } } /* Requests an indexing estimate and stores it in the slot for the current segmentation mode; in AUTO mode with QUALIFIED indexing it also records tokens and total_price. NOTE(review): no error handling here, so a rejected request propagates to the caller. */ const fetchFileIndexingEstimate = async (docForm = DocForm.TEXT) => { // eslint-disable-next-line @typescript-eslint/no-use-before-define const res = await didFetchFileIndexingEstimate(getFileIndexingEstimateParams(docForm)!) if (segmentationType === SegmentType.CUSTOM) { setCustomFileIndexingEstimate(res) } else { setAutomaticFileIndexingEstimate(res) indexType === IndexingType.QUALIFIED && setEstimateTokes({ tokens: res.tokens, total_price: res.total_price }) } } const confirmChangeCustomConfig = () => { setCustomFileIndexingEstimate(null) setShowPreview() fetchFileIndexingEstimate() setPreviewSwitched(false) } const getIndexing_technique = () => indexingType || indexType const getProcessRule = () => { const processRule: ProcessRule = { rules: {} as any, // api will check this. It will be removed after api refactored. mode: segmentationType, } if (segmentationType === SegmentType.CUSTOM) { const ruleObj = { pre_processing_rules: rules, segmentation: { separator: segmentIdentifier === '\\n' ? 
'\n' : segmentIdentifier, max_tokens: max, }, } processRule.rules = ruleObj } return processRule } /* Groups notionPages by workspace_id and keeps page_id, page_name, page_icon and type for each page. */ const getNotionInfo = () => { const workspacesMap = groupBy(notionPages, 'workspace_id') const workspaces = Object.keys(workspacesMap).map((workspaceId) => { return { workspaceId, pages: workspacesMap[workspaceId], } }) return workspaces.map((workspace) => { return { workspace_id: workspace.workspaceId, pages: workspace.pages.map((page) => { const { page_id, page_name, page_icon, type } = page return { page_id, page_name, page_icon, type, } }), } }) as NotionInfo[] } /* Builds the estimate request for FILE or NOTION sources; returns undefined for any other source type. */ const getFileIndexingEstimateParams = (docForm: DocForm): IndexingEstimateParams | undefined => { if (dataSourceType === DataSourceType.FILE) { return { info_list: { data_source_type: dataSourceType, file_info_list: { file_ids: files.map(file => file.id) as string[], }, }, indexing_technique: getIndexing_technique() as string, process_rule: getProcessRule(), doc_form: docForm, doc_language: docLanguage, dataset_id: datasetId as string, } } if (dataSourceType === DataSourceType.NOTION) { return { info_list: { data_source_type: dataSourceType, notion_info_list: getNotionInfo(), }, indexing_technique: getIndexing_technique() as string, process_rule: getProcessRule(), doc_form: docForm, doc_language: docLanguage, dataset_id: datasetId as string, } } } const { rerankDefaultModel, isRerankDefaultModelVaild, rerankModelList, } = useProviderContext() /* Builds the request body for createHandle. When isSetting it targets the existing document id; otherwise it builds a creation payload and returns undefined after an error toast if the rerank model check fails. */ const getCreationParams = () => { let params if (isSetting) { params = { original_document_id: documentDetail?.id, doc_form: docForm, doc_language: docLanguage, process_rule: getProcessRule(), // eslint-disable-next-line @typescript-eslint/no-use-before-define retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page. 
} as CreateDocumentReq } else { // create const indexMethod = getIndexing_technique() if ( !isReRankModelSelected({ rerankDefaultModel, isRerankDefaultModelVaild, rerankModelList, // eslint-disable-next-line @typescript-eslint/no-use-before-define retrievalConfig, indexMethod: indexMethod as string, }) ) { Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') }) return } const postRetrievalConfig = ensureRerankModelSelected({ rerankDefaultModel: rerankDefaultModel!, // eslint-disable-next-line @typescript-eslint/no-use-before-define retrievalConfig, indexMethod: indexMethod as string, }) params = { data_source: { type: dataSourceType, info_list: { data_source_type: dataSourceType, }, }, indexing_technique: getIndexing_technique(), process_rule: getProcessRule(), doc_form: docForm, doc_language: docLanguage, retrieval_model: postRetrievalConfig, } as CreateDocumentReq if (dataSourceType === DataSourceType.FILE) { params.data_source.info_list.file_info_list = { file_ids: files.map(file => file.id || '').filter(Boolean), } } if (dataSourceType === DataSourceType.NOTION) params.data_source.info_list.notion_info_list = getNotionInfo() } return params } const getRules = async () => { try { const res = await fetchDefaultProcessRule({ url: '/datasets/process-rule' }) const separator = res.rules.segmentation.separator setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n') setMax(res.rules.segmentation.max_tokens) setRules(res.rules.pre_processing_rules) setDefaultConfig(res.rules) } catch (err) { console.log(err) } } const getRulesFromDetail = () => { if (documentDetail) { const rules = documentDetail.dataset_process_rule.rules const separator = rules.segmentation.separator const max = rules.segmentation.max_tokens setSegmentIdentifier((separator === '\n' ? 
'\\n' : separator) || '\\n') setMax(max) setRules(rules.pre_processing_rules) setDefaultConfig(rules) } } /* Adopts the segmentation mode stored on documentDetail, when one is provided. */ const getDefaultMode = () => { if (documentDetail) setSegmentationType(documentDetail.dataset_process_rule.mode) } /* Submits the document. Returns early while a submission is in flight; isCreating is cleared in finally, including on the early return when no params were built. Uses createFirstDocument when there is no datasetId, createDocument otherwise; errors are shown as a toast. NOTE(review): setIsCreating(true) is called twice. */ const createHandle = async () => { if (isCreating) return setIsCreating(true) try { let res const params = getCreationParams() if (!params) return false setIsCreating(true) if (!datasetId) { res = await createFirstDocument({ body: params as CreateDocumentReq, }) updateIndexingTypeCache && updateIndexingTypeCache(indexType as string) updateResultCache && updateResultCache(res) } else { res = await createDocument({ datasetId, body: params as CreateDocumentReq, }) updateIndexingTypeCache && updateIndexingTypeCache(indexType as string) updateResultCache && updateResultCache(res) } if (mutateDatasetRes) mutateDatasetRes() onStepChange && onStepChange(+1) isSetting && onSave && onSave() } catch (err) { Toast.notify({ type: 'error', message: `${err}`, }) } finally { setIsCreating(false) } } /* Maps a boolean switch state to the QA or TEXT doc form. */ const handleSwitch = (state: boolean) => { if (state) setDocForm(DocForm.QA) else setDocForm(DocForm.TEXT) } const handleSelect = (language: string) => { setDocLanguage(language) } /* Selecting economical indexing also resets the doc form to TEXT; no-op when indexingType came from props. */ const changeToEconomicalType = () => { if (!hasSetIndexType) { setIndexType(IndexingType.ECONOMICAL) setDocForm(DocForm.TEXT) } } /* Clears the estimate for the active segmentation mode and re-fetches it with DocForm.QA. */ const previewSwitch = async () => { setPreviewSwitched(true) if (segmentationType === SegmentType.AUTO) setAutomaticFileIndexingEstimate(null) else setCustomFileIndexingEstimate(null) await fetchFileIndexingEstimate(DocForm.QA) } /* Runs once (empty dependency list): loads default rules, or rules and mode from documentDetail when isSetting. */ useEffect(() => { // fetch rules if (!isSetting) { getRules() } else { getRulesFromDetail() getDefaultMode() } }, []) useEffect(() => { scrollRef.current?.addEventListener('scroll', scrollHandle) return () => { scrollRef.current?.removeEventListener('scroll', scrollHandle) } }, []) useLayoutEffect(() => { if (showPreview) { previewScrollRef.current?.addEventListener('scroll', previewScrollHandle) return () => { 
previewScrollRef.current?.removeEventListener('scroll', previewScrollHandle) } } }, [showPreview]) /* Resets the QA doc form to TEXT when the indexingType prop is economical. NOTE(review): this checks the prop, not the indexType state; confirm that is intended. */ useEffect(() => { if (indexingType === IndexingType.ECONOMICAL && docForm === DocForm.QA) setDocForm(DocForm.TEXT) }, [indexingType, docForm]) useEffect(() => { // get indexing type by props if (indexingType) setIndexType(indexingType as IndexingType) else setIndexType(hasSetAPIKEY ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL) }, [hasSetAPIKEY, indexingType, datasetId]) useEffect(() => { if (segmentationType === SegmentType.AUTO) { setAutomaticFileIndexingEstimate(null) !isMobile && setShowPreview() fetchFileIndexingEstimate() setPreviewSwitched(false) } else { hidePreview() setCustomFileIndexingEstimate(null) setPreviewSwitched(false) } }, [segmentationType, indexType]) const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || { search_method: RETRIEVE_METHOD.semantic, reranking_enable: false, reranking_model: { reranking_provider_name: rerankDefaultModel?.model_provider.provider_name, reranking_model_name: rerankDefaultModel?.model_name, }, top_k: 3, score_threshold_enabled: false, score_threshold: 0.5, } as RetrievalConfig) return (
{t('datasetCreation.steps.two')} {isMobile && ( )}
{t('datasetCreation.stepTwo.segmentation')}
setSegmentationType(SegmentType.AUTO)} >
{t('datasetCreation.stepTwo.auto')}
{t('datasetCreation.stepTwo.autoDescription')}
setSegmentationType(SegmentType.CUSTOM)} >
{t('datasetCreation.stepTwo.custom')}
{t('datasetCreation.stepTwo.customDescription')}
{segmentationType === SegmentType.CUSTOM && (
{t('datasetCreation.stepTwo.separator')}
setSegmentIdentifier(e.target.value)} />
{t('datasetCreation.stepTwo.maxLength')}
setMax(parseInt(e.target.value.replace(/^0+/, ''), 10))} />
{t('datasetCreation.stepTwo.rules')}
{rules.map(rule => (
ruleChangeHandle(rule.id)} className="w-4 h-4 rounded border-gray-300 text-blue-700 focus:ring-blue-700" />
))}
)}
{t('datasetCreation.stepTwo.indexMode')}
{(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && (
{ if (hasSetAPIKEY) setIndexType(IndexingType.QUALIFIED) }} > {!hasSetIndexType && }
{t('datasetCreation.stepTwo.qualified')} {!hasSetIndexType && {t('datasetCreation.stepTwo.recommend')}}
{t('datasetCreation.stepTwo.qualifiedTip')}
{t('datasetCreation.stepTwo.emstimateCost')}
{ estimateTokes ? (
{formatNumber(estimateTokes.tokens)} tokens(${formatNumber(estimateTokes.total_price)})
) : (
{t('datasetCreation.stepTwo.calculating')}
) }
{!hasSetAPIKEY && (
{t('datasetCreation.stepTwo.warning')}  {t('datasetCreation.stepTwo.click')}
)}
)} {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && (
{!hasSetIndexType && }
{t('datasetCreation.stepTwo.economical')}
{t('datasetCreation.stepTwo.economicalTip')}
{t('datasetCreation.stepTwo.emstimateCost')}
0 tokens
)}
{hasSetIndexType && (
{t('datasetCreation.stepTwo.indexSettedTip')} {t('datasetCreation.stepTwo.datasetSettingLink')}
)} {IS_CE_EDITION && indexType === IndexingType.QUALIFIED && (
{t('datasetCreation.stepTwo.QATitle')}
{t('datasetCreation.stepTwo.QALanguage')}
{docForm === DocForm.QA && !QATipHide && (
{t('datasetCreation.stepTwo.QATip')} setQATipHide(true)} />
)}
)} {/* Retrieval Method Config */}
{!datasetId ? (
{t('datasetSettings.form.retrievalSetting.title')}
{t('datasetSettings.form.retrievalSetting.learnMore')} {t('datasetSettings.form.retrievalSetting.longDescription')}
) : (
{t('datasetSettings.form.retrievalSetting.title')}
)}
{!datasetId ? (<> {getIndexing_technique() === IndexingType.QUALIFIED ? ( ) : ( )} ) : (
{t('datasetCreation.stepTwo.retrivalSettedTip')} {t('datasetCreation.stepTwo.datasetSettingLink')}
)}
{dataSourceType === DataSourceType.FILE && ( <>
{t('datasetCreation.stepTwo.fileSource')}
{getFileName(files[0].name || '')} {files.length > 1 && ( {t('datasetCreation.stepTwo.other')} {files.length - 1} {t('datasetCreation.stepTwo.fileUnit')} )}
)} {dataSourceType === DataSourceType.NOTION && ( <>
{t('datasetCreation.stepTwo.notionSource')}
{notionPages[0]?.page_name} {notionPages.length > 1 && ( {t('datasetCreation.stepTwo.other')} {notionPages.length - 1} {t('datasetCreation.stepTwo.notionUnit')} )}
)}
{t('datasetCreation.stepTwo.emstimateSegment')}
{ fileIndexingEstimate ? (
{formatNumber(fileIndexingEstimate.total_segments)}
) : (
{t('datasetCreation.stepTwo.calculating')}
) }
{!isSetting ? (
) : (
)}
{showPreview &&
{t('datasetCreation.stepTwo.previewTitle')}
{docForm === DocForm.QA && !previewSwitched && ( )}
{docForm === DocForm.QA && !previewSwitched && (
{t('datasetCreation.stepTwo.previewSwitchTipStart')} {t('datasetCreation.stepTwo.previewSwitchTipEnd')}
)}
{previewSwitched && docForm === DocForm.QA && fileIndexingEstimate?.qa_preview && ( <> {fileIndexingEstimate?.qa_preview.map((item, index) => ( ))} )} {(docForm === DocForm.TEXT || !previewSwitched) && fileIndexingEstimate?.preview && ( <> {fileIndexingEstimate?.preview.map((item, index) => ( ))} )} {previewSwitched && docForm === DocForm.QA && !fileIndexingEstimate?.qa_preview && (
)} {!previewSwitched && !fileIndexingEstimate?.preview && (
)}
} {!showPreview && (
{t('datasetCreation.stepTwo.sideTipTitle')}

{t('datasetCreation.stepTwo.sideTipP1')}

{t('datasetCreation.stepTwo.sideTipP2')}

{t('datasetCreation.stepTwo.sideTipP3')}

{t('datasetCreation.stepTwo.sideTipP4')}

)}
) } export default StepTwo