'use client'
import React, { useState, useRef, useEffect, useLayoutEffect } from 'react'
import { useTranslation } from 'react-i18next'
import { useBoolean } from 'ahooks'
import type { File, PreProcessingRule, Rules, FileIndexingEstimateResponse as IndexingEstimateResponse } from '@/models/datasets'
import {
  fetchDefaultProcessRule,
  createFirstDocument,
  createDocument,
  fetchFileIndexingEstimate as didFetchFileIndexingEstimate,
} from '@/service/datasets'
import type { CreateDocumentReq, createDocumentResponse } from '@/models/datasets'
import Button from '@/app/components/base/button'
import PreviewItem from './preview-item'
import Loading from '@/app/components/base/loading'
import { XMarkIcon } from '@heroicons/react/20/solid'
import cn from 'classnames'
import s from './index.module.css'
import Link from 'next/link'
import Toast from '@/app/components/base/toast'
import { formatNumber } from '@/utils/format'

/** Props accepted by the {@link StepTwo} component. */
type StepTwoProps = {
  hasSetAPIKEY: boolean,
  onSetting: () => void,
  datasetId?: string,
  indexingType?: string,
  file?: File,
  onStepChange: (delta: number) => void,
  updateIndexingTypeCache: (type: string) => void,
  updateResultCache: (res: createDocumentResponse) => void
}

/** Segmentation modes; the value is sent as `process_rule.mode`. */
enum SegmentType {
  AUTO = 'automatic',
  CUSTOM = 'custom',
}

/** Indexing techniques; the value is sent as `indexing_technique`. */
enum IndexingType {
  QUALIFIED = 'high_quality',
  ECONOMICAL = 'economy',
}

/**
 * Second step of the document-creation flow: lets the user choose a
 * segmentation mode and an indexing technique for the uploaded file, requests
 * an indexing estimate for the current settings, and creates the document.
 *
 * When `indexingType` is provided it takes precedence over the locally
 * selected index type in every request built by this component.
 */
const StepTwo = ({
  hasSetAPIKEY,
  onSetting,
  datasetId,
  indexingType,
  file,
  onStepChange,
  updateIndexingTypeCache,
  updateResultCache,
}: StepTwoProps) => {
  const { t } = useTranslation()

  // NOTE(review): the element type is assumed to be a <div>; the markup these
  // refs are attached to is not visible in this part of the file — confirm.
  const scrollRef = useRef<HTMLDivElement>(null)
  const [scrolled, setScrolled] = useState(false)
  const previewScrollRef = useRef<HTMLDivElement>(null)
  const [previewScrolled, setPreviewScrolled] = useState(false)

  const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
  // The separator is kept in its escaped, displayable form ('\\n'); it is
  // converted back to a real newline in getProcessRule().
  const [segmentIdentifier, setSegmentIdentifier] = useState('\\n')
  const [max, setMax] = useState(1000)
  const [rules, setRules] = useState<PreProcessingRule[]>([])
  const [defaultConfig, setDefaultConfig] = useState<Rules>()

  const hasSetIndexType = !!indexingType
  // Parenthesised on purpose: `a || b ? x : y` parses as `(a || b) ? x : y`,
  // which would ignore a preset `indexingType` and always start as QUALIFIED.
  const [indexType, setIndexType] = useState<IndexingType>(
    (indexingType as IndexingType) || (hasSetAPIKEY ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL),
  )

  const [showPreview, { setTrue: setShowPreview, setFalse: hidePreview }] = useBoolean()
  const [customFileIndexingEstimate, setCustomFileIndexingEstimate] = useState<IndexingEstimateResponse | null>(null)
  const [automaticFileIndexingEstimate, setAutomaticFileIndexingEstimate] = useState<IndexingEstimateResponse | null>(null)
  // Estimate that matches the currently selected segmentation mode.
  const fileIndexingEstimate = segmentationType === SegmentType.AUTO
    ? automaticFileIndexingEstimate
    : customFileIndexingEstimate

  /** Records whether the element behind `scrollRef` is scrolled away from the top. */
  const scrollHandle = (e: Event) => {
    setScrolled((e.target as HTMLElement).scrollTop > 0)
  }

  /** Records whether the element behind `previewScrollRef` is scrolled away from the top. */
  const previewScrollHandle = (e: Event) => {
    setPreviewScrolled((e.target as HTMLElement).scrollTop > 0)
  }

  /**
   * Strips the last extension from a file name ("a.b.txt" -> "a.b").
   * Names without an extension ("README") and dot-files (".env") are returned
   * unchanged instead of collapsing to an empty string.
   */
  const getFileName = (name: string): string => {
    const lastDot = name.lastIndexOf('.')
    return lastDot > 0 ? name.slice(0, lastDot) : name
  }

  /** Maps a pre-processing rule id to its translated label; unknown ids yield `undefined`. */
  const getRuleName = (key: string): string | undefined => {
    if (key === 'remove_extra_spaces')
      return t('datasetCreation.stepTwo.removeExtraSpaces')
    if (key === 'remove_urls_emails')
      return t('datasetCreation.stepTwo.removeUrlEmails')
    if (key === 'remove_stopwords')
      return t('datasetCreation.stepTwo.removeStopwords')
    return undefined
  }

  /** Toggles the `enabled` flag of the rule with the given id. */
  const ruleChangeHandle = (id: string) => {
    const newRules = rules.map((rule) => {
      if (rule.id === id) {
        return {
          id: rule.id,
          enabled: !rule.enabled,
        }
      }
      return rule
    })
    setRules(newRules)
  }

  /** Restores separator, max tokens and rules from the fetched default config, if it has been loaded. */
  const resetRules = () => {
    if (defaultConfig) {
      setSegmentIdentifier(defaultConfig.segmentation.separator === '\n' ? '\\n' : defaultConfig.segmentation.separator || '\\n')
      setMax(defaultConfig.segmentation.max_tokens)
      setRules(defaultConfig.pre_processing_rules)
    }
  }

  /** Clears the custom estimate, opens the preview and requests a fresh estimate. */
  const confirmChangeCustomConfig = async () => {
    setCustomFileIndexingEstimate(null)
    setShowPreview()
    await fetchFileIndexingEstimate()
  }

  /** Indexing technique to send: the `indexingType` prop when set, otherwise the local selection. */
  const getIndexing_technique = () => indexingType ? indexingType : indexType

  /** Builds the `process_rule` payload for the current segmentation settings. */
  const getProcessRule = () => {
    const processRule: any = {
      rules: {}, // api will check this. It will be removed after api refactored.
      mode: segmentationType,
    }
    if (segmentationType === SegmentType.CUSTOM) {
      const ruleObj = {
        pre_processing_rules: rules,
        segmentation: {
          // Convert the escaped, displayable form back into a real newline.
          separator: segmentIdentifier === '\\n' ? '\n' : segmentIdentifier,
          max_tokens: max,
        },
      }
      processRule.rules = ruleObj
    }
    return processRule
  }

  /** Builds the request parameters for the indexing-estimate call. */
  const getFileIndexingEstimateParams = () => {
    const params = {
      file_id: file?.id,
      dataset_id: datasetId,
      indexing_technique: getIndexing_technique(),
      process_rule: getProcessRule(),
    }
    return params
  }

  /**
   * Requests an indexing estimate and stores it in the slot for the
   * segmentation mode that was selected when the request was issued.
   */
  const fetchFileIndexingEstimate = async () => {
    const res = await didFetchFileIndexingEstimate(getFileIndexingEstimateParams())
    if (segmentationType === SegmentType.CUSTOM)
      setCustomFileIndexingEstimate(res)
    else
      setAutomaticFileIndexingEstimate(res)
  }

  /** Builds the request body used to create the document. */
  const getCreationParams = () => {
    const params = {
      data_source: {
        type: 'upload_file',
        info: file?.id,
        name: file?.name,
      },
      indexing_technique: getIndexing_technique(),
      process_rule: getProcessRule(),
    } as CreateDocumentReq
    return params
  }

  /** Loads the default process rule and seeds the custom-segmentation form with it. */
  const getRules = async () => {
    try {
      const res = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
      const separator = res.rules.segmentation.separator
      setSegmentIdentifier(separator === '\n' ? '\\n' : separator || '\\n')
      setMax(res.rules.segmentation.max_tokens)
      setRules(res.rules.pre_processing_rules)
      setDefaultConfig(res.rules)
    }
    catch (err) {
      // Failure is only logged; the form keeps its built-in initial values.
      console.log(err)
    }
  }

  /**
   * Creates the document (via `createFirstDocument` when no `datasetId` is
   * given, otherwise via `createDocument`), caches the index type and result,
   * and advances one step. Errors are surfaced through a toast.
   */
  const createHandle = async () => {
    try {
      const params = getCreationParams()
      if (!datasetId) {
        const res = await createFirstDocument({ body: params })
        updateIndexingTypeCache(indexType)
        updateResultCache(res)
      }
      else {
        const res = await createDocument({ datasetId, body: params })
        updateIndexingTypeCache(indexType)
        updateResultCache({
          document: res,
        })
      }
      onStepChange(+1)
    }
    catch (err) {
      Toast.notify({
        type: 'error',
        message: err + '',
      })
    }
  }

  useEffect(() => {
    // fetch rules
    getRules()
  }, [])

  useEffect(() => {
    // Capture the node now so the cleanup detaches from the same element the
    // listener was attached to, even if the ref has changed by then.
    const node = scrollRef.current
    node?.addEventListener('scroll', scrollHandle)
    return () => {
      node?.removeEventListener('scroll', scrollHandle)
    }
  }, [])

  useLayoutEffect(() => {
    if (!showPreview)
      return
    const node = previewScrollRef.current
    node?.addEventListener('scroll', previewScrollHandle)
    return () => {
      node?.removeEventListener('scroll', previewScrollHandle)
    }
  }, [showPreview])

  useEffect(() => {
    // get indexing type by props
    if (indexingType)
      setIndexType(indexingType as IndexingType)
    else
      setIndexType(hasSetAPIKEY ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)
  }, [hasSetAPIKEY, indexingType, datasetId])

  useEffect(() => {
    if (segmentationType === SegmentType.AUTO) {
      // Automatic mode: refresh the estimate immediately and show the preview.
      setAutomaticFileIndexingEstimate(null)
      setShowPreview()
      fetchFileIndexingEstimate()
    }
    else {
      // Custom mode: hide the preview and drop the stale custom estimate.
      hidePreview()
      setCustomFileIndexingEstimate(null)
    }
  }, [segmentationType, indexType])

  return (
{t('datasetCreation.steps.two')}
{t('datasetCreation.stepTwo.segmentation')}
setSegmentationType(SegmentType.AUTO)} >
{t('datasetCreation.stepTwo.auto')}
{t('datasetCreation.stepTwo.autoDescription')}
setSegmentationType(SegmentType.CUSTOM)} >
{t('datasetCreation.stepTwo.custom')}
{t('datasetCreation.stepTwo.customDescription')}
{segmentationType === SegmentType.CUSTOM && (
{t('datasetCreation.stepTwo.separator')}
setSegmentIdentifier(e.target.value)} />
{t('datasetCreation.stepTwo.maxLength')}
setMax(Number(e.target.value))} />
{t('datasetCreation.stepTwo.rules')}
{rules.map(rule => (
ruleChangeHandle(rule.id)} className="w-4 h-4 rounded border-gray-300 text-blue-700 focus:ring-blue-700" />
))}
)}
{t('datasetCreation.stepTwo.indexMode')}
{(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && (
{ if (hasSetAPIKEY) { setIndexType(IndexingType.QUALIFIED) } }} > {!hasSetIndexType && }
{t('datasetCreation.stepTwo.qualified')} {!hasSetIndexType && {t('datasetCreation.stepTwo.recommend')}}
{t('datasetCreation.stepTwo.qualifiedTip')}
{t('datasetCreation.stepTwo.emstimateCost')}
{ !!fileIndexingEstimate ? (
{formatNumber(fileIndexingEstimate.tokens)} tokens(${formatNumber(fileIndexingEstimate.total_price)})
) : (
{t('datasetCreation.stepTwo.calculating')}
) }
{!hasSetAPIKEY && (
{t('datasetCreation.stepTwo.warning')}  {t('datasetCreation.stepTwo.click')}
)}
)} {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && (
!hasSetIndexType && setIndexType(IndexingType.ECONOMICAL)} > {!hasSetIndexType && }
{t('datasetCreation.stepTwo.economical')}
{t('datasetCreation.stepTwo.economicalTip')}
{t('datasetCreation.stepTwo.emstimateCost')}
0 tokens
)}
{hasSetIndexType && (
{t('datasetCreation.stepTwo.indexSettedTip')} {t('datasetCreation.stepTwo.datasetSettingLink')}
)}
{t('datasetCreation.stepTwo.fileName')}
{getFileName(file?.name || '')}
{t('datasetCreation.stepTwo.emstimateSegment')}
{ !!fileIndexingEstimate ? (
{formatNumber(fileIndexingEstimate.total_segments)}
) : (
{t('datasetCreation.stepTwo.calculating')}
) }
{(showPreview) ? (
{t('datasetCreation.stepTwo.previewTitle')}
{fileIndexingEstimate?.preview ? ( <> {fileIndexingEstimate?.preview.map((item, index) => ( ))} ) :
}
) : (
{t('datasetCreation.stepTwo.sideTipTitle')}

{t('datasetCreation.stepTwo.sideTipP1')}

{t('datasetCreation.stepTwo.sideTipP2')}

{t('datasetCreation.stepTwo.sideTipP3')}

{t('datasetCreation.stepTwo.sideTipP4')}

)}
) } export default StepTwo