|
@@ -1,5 +1,5 @@
|
|
|
'use client'
|
|
|
-import React, { useEffect, useLayoutEffect, useRef, useState } from 'react'
|
|
|
+import React, { useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react'
|
|
|
import { useTranslation } from 'react-i18next'
|
|
|
import { useContext } from 'use-context-selector'
|
|
|
import { useBoolean } from 'ahooks'
|
|
@@ -13,6 +13,8 @@ import { groupBy } from 'lodash-es'
|
|
|
import PreviewItem, { PreviewType } from './preview-item'
|
|
|
import LanguageSelect from './language-select'
|
|
|
import s from './index.module.css'
|
|
|
+import unescape from './unescape'
|
|
|
+import escape from './escape'
|
|
|
import cn from '@/utils/classnames'
|
|
|
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
|
|
|
import {
|
|
@@ -78,6 +80,8 @@ enum IndexingType {
|
|
|
ECONOMICAL = 'economy',
|
|
|
}
|
|
|
|
|
|
+const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n' // display form: the characters backslash-n twice, not two real newlines; initial value and empty-value fallback for the separator state
|
|
|
+
|
|
|
const StepTwo = ({
|
|
|
isSetting,
|
|
|
documentDetail,
|
|
@@ -110,8 +114,11 @@ const StepTwo = ({
|
|
|
const previewScrollRef = useRef<HTMLDivElement>(null)
|
|
|
const [previewScrolled, setPreviewScrolled] = useState(false)
|
|
|
const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
|
|
|
- const [segmentIdentifier, setSegmentIdentifier] = useState('\\n')
|
|
|
- const [max, setMax] = useState(5000) // default chunk length
|
|
|
+ const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER) // doSetSegmentIdentifier is the raw state setter: it stores the string exactly as given
|
|
|
+ const setSegmentIdentifier = useCallback((value: string) => { // normalizing setter: converts a separator value before storing it in state
|
|
|
+ doSetSegmentIdentifier(value ? escape(value) : DEFAULT_SEGMENT_IDENTIFIER) // empty value falls back to the default; `escape` is the local ./escape import, not the global — presumably maps control characters (e.g. a newline) to backslash form, confirm in ./escape
|
|
|
+ }, []) // empty dependency list: the callback is created once and keeps the same identity across renders
|
|
|
+ const [max, setMax] = useState(4000) // default chunk length
|
|
|
const [overlap, setOverlap] = useState(50)
|
|
|
const [rules, setRules] = useState<PreProcessingRule[]>([])
|
|
|
const [defaultConfig, setDefaultConfig] = useState<Rules>()
|
|
@@ -183,7 +190,7 @@ const StepTwo = ({
|
|
|
}
|
|
|
const resetRules = () => {
|
|
|
if (defaultConfig) {
|
|
|
- setSegmentIdentifier((defaultConfig.segmentation.separator === '\n' ? '\\n' : defaultConfig.segmentation.separator) || '\\n')
|
|
|
+ setSegmentIdentifier(defaultConfig.segmentation.separator)
|
|
|
setMax(defaultConfig.segmentation.max_tokens)
|
|
|
setOverlap(defaultConfig.segmentation.chunk_overlap)
|
|
|
setRules(defaultConfig.pre_processing_rules)
|
|
@@ -217,7 +224,7 @@ const StepTwo = ({
|
|
|
const ruleObj = {
|
|
|
pre_processing_rules: rules,
|
|
|
segmentation: {
|
|
|
- separator: segmentIdentifier === '\\n' ? '\n' : segmentIdentifier,
|
|
|
+ separator: unescape(segmentIdentifier),
|
|
|
max_tokens: max,
|
|
|
chunk_overlap: overlap,
|
|
|
},
|
|
@@ -394,7 +401,7 @@ const StepTwo = ({
|
|
|
try {
|
|
|
const res = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
|
|
|
const separator = res.rules.segmentation.separator
|
|
|
- setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
|
|
|
+ setSegmentIdentifier(separator)
|
|
|
setMax(res.rules.segmentation.max_tokens)
|
|
|
setOverlap(res.rules.segmentation.chunk_overlap)
|
|
|
setRules(res.rules.pre_processing_rules)
|
|
@@ -411,7 +418,7 @@ const StepTwo = ({
|
|
|
const separator = rules.segmentation.separator
|
|
|
const max = rules.segmentation.max_tokens
|
|
|
const overlap = rules.segmentation.chunk_overlap
|
|
|
- setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
|
|
|
+ setSegmentIdentifier(separator)
|
|
|
setMax(max)
|
|
|
setOverlap(overlap)
|
|
|
setRules(rules.pre_processing_rules)
|
|
@@ -616,12 +623,22 @@ const StepTwo = ({
|
|
|
<div className={s.typeFormBody}>
|
|
|
<div className={s.formRow}>
|
|
|
<div className='w-full'>
|
|
|
- <div className={s.label}>{t('datasetCreation.stepTwo.separator')}</div>
|
|
|
+ <div className={s.label}>
|
|
|
+ {t('datasetCreation.stepTwo.separator')}
|
|
|
+ <Tooltip
|
|
|
+ popupContent={
|
|
|
+ <div className='max-w-[200px]'>
|
|
|
+ {t('datasetCreation.stepTwo.separatorTip')}
|
|
|
+ </div>
|
|
|
+ }
|
|
|
+ />
|
|
|
+ </div>
|
|
|
<input
|
|
|
type="text"
|
|
|
className={s.input}
|
|
|
- placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''} value={segmentIdentifier}
|
|
|
- onChange={e => setSegmentIdentifier(e.target.value)}
|
|
|
+ placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''}
|
|
|
+ value={segmentIdentifier}
|
|
|
+ onChange={e => doSetSegmentIdentifier(e.target.value)}
|
|
|
/>
|
|
|
</div>
|
|
|
</div>
|