'use client'import type { FC, PropsWithChildren } from 'react'import React, { useCallback, useEffect, useRef, useState } from 'react'import { useTranslation } from 'react-i18next'import { useContext } from 'use-context-selector'import {  RiAlertFill,  RiArrowLeftLine,  RiSearchEyeLine,} from '@remixicon/react'import Link from 'next/link'import Image from 'next/image'import { useHover } from 'ahooks'import 
SettingCog from '../assets/setting-gear-mod.svg'import OrangeEffect from '../assets/option-card-effect-orange.svg'import FamilyMod from '../assets/family-mod.svg'import Note from '../assets/note-mod.svg'import FileList from '../assets/file-list-3-fill.svg'import { indexMethodIcon } from '../icons'import { PreviewContainer } from '../../preview/container'import { ChunkContainer, QAPreview } from '../../chunk'import { PreviewHeader } from '../../preview/header'import { FormattedText } from '../../formatted-text/formatted'import { PreviewSlice } from '../../formatted-text/flavours/preview-slice'import PreviewDocumentPicker from '../../common/document-picker/preview-document-picker'import s from './index.module.css'import unescape from './unescape'import escape from './escape'import { OptionCard } from './option-card'import LanguageSelect from './language-select'import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs'import cn from '@/utils/classnames'import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, DocumentItem, FullDocumentDetail, ParentMode, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'import { ChunkingMode, DataSourceType, ProcessMode } from '@/models/datasets'import Button from '@/app/components/base/button'import FloatRightContainer from '@/app/components/base/float-right-container'import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'import type { RetrievalConfig } from '@/types/app'import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'import Toast from '@/app/components/base/toast'import type { NotionPage } from '@/models/common'import { DataSourceProvider } from '@/models/common'import { useDatasetDetailContext } from '@/context/dataset-detail'import I18n from '@/context/i18n'import { 
RETRIEVE_METHOD } from '@/types/app'
import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
import { LanguagesSupported } from '@/i18n/language'
import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations'
import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
import Checkbox from '@/app/components/base/checkbox'
import RadioCard from '@/app/components/base/radio-card'
import { FULL_DOC_PREVIEW_LENGTH, IS_CE_EDITION } from '@/config'
import Divider from '@/app/components/base/divider'
import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument, useFetchDefaultProcessRule, useFetchFileIndexingEstimateForFile, useFetchFileIndexingEstimateForNotion, useFetchFileIndexingEstimateForWeb } from '@/service/knowledge/use-create-dataset'
import Badge from '@/app/components/base/badge'
import { SkeletonContainer, SkeletonPoint, SkeletonRectangle, SkeletonRow } from '@/app/components/base/skeleton'
import Tooltip from '@/app/components/base/tooltip'
import CustomDialog from '@/app/components/base/dialog'
import { PortalToFollowElem, PortalToFollowElemContent, PortalToFollowElemTrigger } from '@/app/components/base/portal-to-follow-elem'
import { AlertTriangle } from '@/app/components/base/icons/src/vender/solid/alertsAndFeedback'

/** Small section label: renders its children inside a styled `<label>`. */
const TextLabel: FC<PropsWithChildren> = (props) => {
  return <label className='system-sm-semibold text-text-secondary'>{props.children}</label>
}

/**
 * Props accepted by the `StepTwo` component.
 *
 * - `indexingType`: when provided it takes precedence over the locally
 *   selected index type when building requests.
 * - `onStepChange`: invoked with `+1` once document creation has finished.
 * - `notionPages`, `websitePages`, `websiteCrawlProvider` and
 *   `websiteCrawlJobId` have defaults in the component signature.
 */
type StepTwoProps = {
  isSetting?: boolean
  documentDetail?: FullDocumentDetail
  isAPIKeySet: boolean
  onSetting: () => void
  datasetId?: string
  indexingType?: IndexingType
  retrievalMethod?: string
  dataSourceType: DataSourceType
  files: CustomFile[]
  notionPages?: NotionPage[]
  websitePages?: CrawlResultItem[]
  crawlOptions?: CrawlOptions
  websiteCrawlProvider?: DataSourceProvider
  websiteCrawlJobId?: string
  onStepChange?: (delta: number) => void
  updateIndexingTypeCache?: (type: string) => void
  updateRetrievalMethodCache?: (method: string) => void
  updateResultCache?: (res: createDocumentResponse) => void
  onSave?: () => void
  onCancel?: () => void
}

/** Indexing techniques selectable in this step; the values are the strings sent as `indexing_technique`. */
export enum IndexingType {
  QUALIFIED = 'high_quality',
  ECONOMICAL = 'economy',
}

// Default delimiter for general chunking, kept as the escaped text `\n\n`;
// it is passed through `unescape` before being used as the separator.
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
// Initial maximum chunk length and overlap shown in the general chunking inputs.
const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500
const DEFAULT_OVERLAP = 50

// Used when the page does not supply a usable limit.
const FALLBACK_MAXIMUM_CHUNK_TOKEN_LENGTH = 4000
// Limit read from the `data-public-indexing-max-segmentation-tokens-length`
// attribute on `<body>` (absent when there is no document, e.g. outside the browser).
const parsedMaximumChunkTokenLength = Number.parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '', 10)
// `parseInt` yields NaN for a missing or non-numeric attribute, and every
// `length > NaN` comparison is false, which would silently switch the limit
// check off. A non-positive limit would reject every length. Both cases fall
// back to the default instead.
const MAXIMUM_CHUNK_TOKEN_LENGTH = parsedMaximumChunkTokenLength > 0
  ? parsedMaximumChunkTokenLength
  : FALLBACK_MAXIMUM_CHUNK_TOKEN_LENGTH

/**
 * Settings for parent-child chunking: which parent mode to use for context,
 * plus the delimiter and maximum length for parent and child chunks.
 * Delimiters are kept in escaped form and unescaped when the process rule is built.
 */
type ParentChildConfig = {
  chunkForContext: ParentMode
  parent: {
    delimiter: string
    maxLength: number
  }
  child: {
    delimiter: string
    maxLength: number
  }
}

/** Initial parent-child settings; also what the reset action restores. */
const defaultParentChildConfig: ParentChildConfig = {
  chunkForContext: 'paragraph',
  parent: {
    delimiter: '\\n\\n',
    maxLength: 500,
  },
  child: {
    delimiter: '\\n',
    maxLength: 200,
  },
}

const StepTwo = ({
  isSetting,
  documentDetail,
  isAPIKeySet,
  datasetId,
  indexingType,
  dataSourceType: inCreatePageDataSourceType,
  files,
  notionPages = [],
  websitePages = [],
  crawlOptions,
  websiteCrawlProvider = DataSourceProvider.fireCrawl,
  websiteCrawlJobId = '',
  onStepChange,
  updateIndexingTypeCache,
  updateResultCache,
  onSave,
  onCancel,
  updateRetrievalMethodCache,
}: StepTwoProps) => {
  const { t } = useTranslation()
  const { locale } = useContext(I18n)
  const media = useBreakpoints()
  const isMobile = media === MediaType.mobile
  const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
  const isInUpload = Boolean(currentDataset)
  const isUploadInEmptyDataset = isInUpload && !currentDataset?.doc_form
  const isNotUploadInEmptyDataset = !isUploadInEmptyDataset
  const isInInit = !isInUpload
&& !isSetting  const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)  const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type  const [segmentationType, setSegmentationType] = useState<ProcessMode>(ProcessMode.general)  const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)  const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => {    doSetSegmentIdentifier(value ? escape(value) : (canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER))  }, [])  const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) // default chunk length  const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH)  const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)  const [rules, setRules] = useState<PreProcessingRule[]>([])  const [defaultConfig, setDefaultConfig] = useState<Rules>()  const hasSetIndexType = !!indexingType  const [indexType, setIndexType] = useState<IndexingType>(    (indexingType      || isAPIKeySet)      ? IndexingType.QUALIFIED      : IndexingType.ECONOMICAL,  )  const [previewFile, setPreviewFile] = useState<DocumentItem>(    (datasetId && documentDetail)      ? documentDetail.file      : files[0],  )  const [previewNotionPage, setPreviewNotionPage] = useState<NotionPage>(    (datasetId && documentDetail)      ? documentDetail.notion_page      : notionPages[0],  )  const [previewWebsitePage, setPreviewWebsitePage] = useState<CrawlResultItem>(    (datasetId && documentDetail)      ? documentDetail.website_page      : websitePages[0],  )  // QA Related  const [isQAConfirmDialogOpen, setIsQAConfirmDialogOpen] = useState(false)  const [docForm, setDocForm] = useState<ChunkingMode>(    (datasetId && documentDetail) ? 
documentDetail.doc_form as ChunkingMode : ChunkingMode.text,  )  const handleChangeDocform = (value: ChunkingMode) => {    if (value === ChunkingMode.qa && indexType === IndexingType.ECONOMICAL) {      setIsQAConfirmDialogOpen(true)      return    }    if (value === ChunkingMode.parentChild && indexType === IndexingType.ECONOMICAL)      setIndexType(IndexingType.QUALIFIED)    setDocForm(value)    // eslint-disable-next-line ts/no-use-before-define    currentEstimateMutation.reset()  }  const [docLanguage, setDocLanguage] = useState<string>(    (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese'),  )  const [parentChildConfig, setParentChildConfig] = useState<ParentChildConfig>(defaultParentChildConfig)  const getIndexing_technique = () => indexingType || indexType  const currentDocForm = currentDataset?.doc_form || docForm  const getProcessRule = (): ProcessRule => {    if (currentDocForm === ChunkingMode.parentChild) {      return {        rules: {          pre_processing_rules: rules,          segmentation: {            separator: unescape(              parentChildConfig.parent.delimiter,            ),            max_tokens: parentChildConfig.parent.maxLength,          },          parent_mode: parentChildConfig.chunkForContext,          subchunk_segmentation: {            separator: unescape(parentChildConfig.child.delimiter),            max_tokens: parentChildConfig.child.maxLength,          },        },        mode: 'hierarchical',      } as ProcessRule    }    return {      rules: {        pre_processing_rules: rules,        segmentation: {          separator: unescape(segmentIdentifier),          max_tokens: maxChunkLength,          chunk_overlap: overlap,        },      }, // api will check this. It will be removed after api refactored.      
mode: segmentationType,    } as ProcessRule  }  const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile({    docForm: currentDocForm,    docLanguage,    dataSourceType: DataSourceType.FILE,    files: previewFile      ? [files.find(file => file.name === previewFile.name)!]      : files,    indexingTechnique: getIndexing_technique() as any,    processRule: getProcessRule(),    dataset_id: datasetId!,  })  const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion({    docForm: currentDocForm,    docLanguage,    dataSourceType: DataSourceType.NOTION,    notionPages: [previewNotionPage],    indexingTechnique: getIndexing_technique() as any,    processRule: getProcessRule(),    dataset_id: datasetId || '',  })  const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb({    docForm: currentDocForm,    docLanguage,    dataSourceType: DataSourceType.WEB,    websitePages: [previewWebsitePage],    crawlOptions,    websiteCrawlProvider,    websiteCrawlJobId,    indexingTechnique: getIndexing_technique() as any,    processRule: getProcessRule(),    dataset_id: datasetId || '',  })  const currentEstimateMutation = dataSourceType === DataSourceType.FILE    ? fileIndexingEstimateQuery    : dataSourceType === DataSourceType.NOTION      ? notionIndexingEstimateQuery      : websiteIndexingEstimateQuery  const fetchEstimate = useCallback(() => {    if (dataSourceType === DataSourceType.FILE)      fileIndexingEstimateQuery.mutate()    if (dataSourceType === DataSourceType.NOTION)      notionIndexingEstimateQuery.mutate()    if (dataSourceType === DataSourceType.WEB)      websiteIndexingEstimateQuery.mutate()  }, [dataSourceType, fileIndexingEstimateQuery, notionIndexingEstimateQuery, websiteIndexingEstimateQuery])  const estimate    = dataSourceType === DataSourceType.FILE      ? fileIndexingEstimateQuery.data      : dataSourceType === DataSourceType.NOTION        ? 
notionIndexingEstimateQuery.data        : websiteIndexingEstimateQuery.data  const getRuleName = (key: string) => {    if (key === 'remove_extra_spaces')      return t('datasetCreation.stepTwo.removeExtraSpaces')    if (key === 'remove_urls_emails')      return t('datasetCreation.stepTwo.removeUrlEmails')    if (key === 'remove_stopwords')      return t('datasetCreation.stepTwo.removeStopwords')  }  const ruleChangeHandle = (id: string) => {    const newRules = rules.map((rule) => {      if (rule.id === id) {        return {          id: rule.id,          enabled: !rule.enabled,        }      }      return rule    })    setRules(newRules)  }  const resetRules = () => {    if (defaultConfig) {      setSegmentIdentifier(defaultConfig.segmentation.separator)      setMaxChunkLength(defaultConfig.segmentation.max_tokens)      setOverlap(defaultConfig.segmentation.chunk_overlap!)      setRules(defaultConfig.pre_processing_rules)    }    setParentChildConfig(defaultParentChildConfig)  }  const updatePreview = () => {    if (segmentationType === ProcessMode.general && maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) {      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: MAXIMUM_CHUNK_TOKEN_LENGTH }) })      return    }    fetchEstimate()  }  const {    modelList: rerankModelList,    defaultModel: rerankDefaultModel,    currentModel: isRerankDefaultModelValid,  } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank)  const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding)  const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding)  const [embeddingModel, setEmbeddingModel] = useState<DefaultModel>(    currentDataset?.embedding_model      ? 
{        provider: currentDataset.embedding_model_provider,        model: currentDataset.embedding_model,      }      : {        provider: defaultEmbeddingModel?.provider.provider || '',        model: defaultEmbeddingModel?.model || '',      },  )  const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {    search_method: RETRIEVE_METHOD.semantic,    reranking_enable: false,    reranking_model: {      reranking_provider_name: '',      reranking_model_name: '',    },    top_k: 3,    score_threshold_enabled: false,    score_threshold: 0.5,  } as RetrievalConfig)  useEffect(() => {    if (currentDataset?.retrieval_model_dict)      return    setRetrievalConfig({      search_method: RETRIEVE_METHOD.semantic,      reranking_enable: !!isRerankDefaultModelValid,      reranking_model: {        reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider.provider ?? '' : '',        reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '',      },      top_k: 3,      score_threshold_enabled: false,      score_threshold: 0.5,    })    // eslint-disable-next-line react-hooks/exhaustive-deps  }, [rerankDefaultModel, isRerankDefaultModelValid])  const getCreationParams = () => {    let params    if (segmentationType === ProcessMode.general && overlap > maxChunkLength) {      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })      return    }    if (segmentationType === ProcessMode.general && maxChunkLength > limitMaxChunkLength) {      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: limitMaxChunkLength }) })      return    }    if (isSetting) {      params = {        original_document_id: documentDetail?.id,        doc_form: currentDocForm,        doc_language: docLanguage,        process_rule: getProcessRule(),        retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page.      
  embedding_model: embeddingModel.model, // Readonly        embedding_model_provider: embeddingModel.provider, // Readonly        indexing_technique: getIndexing_technique(),      } as CreateDocumentReq    }    else { // create      const indexMethod = getIndexing_technique()      if (        !isReRankModelSelected({          rerankModelList,          retrievalConfig,          indexMethod: indexMethod as string,        })      ) {        Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })        return      }      params = {        data_source: {          type: dataSourceType,          info_list: {            data_source_type: dataSourceType,          },        },        indexing_technique: getIndexing_technique(),        process_rule: getProcessRule(),        doc_form: currentDocForm,        doc_language: docLanguage,        retrieval_model: retrievalConfig,        embedding_model: embeddingModel.model,        embedding_model_provider: embeddingModel.provider,      } as CreateDocumentReq      if (dataSourceType === DataSourceType.FILE) {        params.data_source.info_list.file_info_list = {          file_ids: files.map(file => file.id || '').filter(Boolean),        }      }      if (dataSourceType === DataSourceType.NOTION)        params.data_source.info_list.notion_info_list = getNotionInfo(notionPages)      if (dataSourceType === DataSourceType.WEB) {        params.data_source.info_list.website_info_list = getWebsiteInfo({          websiteCrawlProvider,          websiteCrawlJobId,          websitePages,        })      }    }    return params  }  const fetchDefaultProcessRuleMutation = useFetchDefaultProcessRule({    onSuccess(data) {      const separator = data.rules.segmentation.separator      setSegmentIdentifier(separator)      setMaxChunkLength(data.rules.segmentation.max_tokens)      setOverlap(data.rules.segmentation.chunk_overlap!)      
setRules(data.rules.pre_processing_rules)      setDefaultConfig(data.rules)      setLimitMaxChunkLength(data.limits.indexing_max_segmentation_tokens_length)    },    onError(error) {      Toast.notify({        type: 'error',        message: `${error}`,      })    },  })  const getRulesFromDetail = () => {    if (documentDetail) {      const rules = documentDetail.dataset_process_rule.rules      const separator = rules.segmentation.separator      const max = rules.segmentation.max_tokens      const overlap = rules.segmentation.chunk_overlap      setSegmentIdentifier(separator)      setMaxChunkLength(max)      setOverlap(overlap!)      setRules(rules.pre_processing_rules)      setDefaultConfig(rules)    }  }  const getDefaultMode = () => {    if (documentDetail)      setSegmentationType(documentDetail.dataset_process_rule.mode)  }  const createFirstDocumentMutation = useCreateFirstDocument({    onError(error) {      Toast.notify({        type: 'error',        message: `${error}`,      })    },  })  const createDocumentMutation = useCreateDocument(datasetId!, {    onError(error) {      Toast.notify({        type: 'error',        message: `${error}`,      })    },  })  const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending  const createHandle = async () => {    const params = getCreationParams()    if (!params)      return false    if (!datasetId) {      await createFirstDocumentMutation.mutateAsync(        params,        {          onSuccess(data) {            updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)            updateResultCache && updateResultCache(data)            updateRetrievalMethodCache && updateRetrievalMethodCache(retrievalConfig.search_method as string)          },        },      )    }    else {      await createDocumentMutation.mutateAsync(params, {        onSuccess(data) {          updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)          updateResultCache && 
updateResultCache(data)        },      })    }    if (mutateDatasetRes)      mutateDatasetRes()    onStepChange && onStepChange(+1)    isSetting && onSave && onSave()  }  useEffect(() => {    // fetch rules    if (!isSetting) {      fetchDefaultProcessRuleMutation.mutate('/datasets/process-rule')    }    else {      getRulesFromDetail()      getDefaultMode()    }    // eslint-disable-next-line react-hooks/exhaustive-deps  }, [])  useEffect(() => {    // get indexing type by props    if (indexingType)      setIndexType(indexingType as IndexingType)    else      setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)  }, [isAPIKeySet, indexingType, datasetId])  const economyDomRef = useRef<HTMLDivElement>(null)  const isHoveringEconomy = useHover(economyDomRef)  const isModelAndRetrievalConfigDisabled = !!datasetId && !!currentDataset?.data_source_type  return (    <div className='flex h-full w-full'>      <div className={cn('relative h-full w-1/2 overflow-y-auto py-6', isMobile ? 
'px-4' : 'px-12')}>        <div className={'system-md-semibold mb-1 text-text-secondary'}>{t('datasetCreation.stepTwo.segmentation')}</div>        {((isInUpload && [ChunkingMode.text, ChunkingMode.qa].includes(currentDataset!.doc_form))          || isUploadInEmptyDataset          || isInInit)          && <OptionCard            className='mb-2 bg-background-section'            title={t('datasetCreation.stepTwo.general')}            icon={<Image width={20} height={20} src={SettingCog} alt={t('datasetCreation.stepTwo.general')} />}            activeHeaderClassName='bg-dataset-option-card-blue-gradient'            description={t('datasetCreation.stepTwo.generalTip')}            isActive={              [ChunkingMode.text, ChunkingMode.qa].includes(currentDocForm)            }            onSwitched={() =>              handleChangeDocform(ChunkingMode.text)            }            actions={              <>                <Button variant={'secondary-accent'} onClick={() => updatePreview()}>                  <RiSearchEyeLine className='mr-0.5 h-4 w-4' />                  {t('datasetCreation.stepTwo.previewChunk')}                </Button>                <Button variant={'ghost'} onClick={resetRules}>                  {t('datasetCreation.stepTwo.reset')}                </Button>              </>            }            noHighlight={isInUpload && isNotUploadInEmptyDataset}          >            <div className='flex flex-col gap-y-4'>              <div className='flex gap-3'>                <DelimiterInput                  value={segmentIdentifier}                  onChange={e => setSegmentIdentifier(e.target.value, true)}                />                <MaxLengthInput                  unit='tokens'                  value={maxChunkLength}                  onChange={setMaxChunkLength}                />                <OverlapInput                  unit='tokens'                  value={overlap}                  min={1}                  onChange={setOverlap}                />   
           </div>              <div className='flex w-full flex-col'>                <div className='flex items-center gap-x-2'>                  <div className='inline-flex shrink-0'>                    <TextLabel>{t('datasetCreation.stepTwo.rules')}</TextLabel>                  </div>                  <Divider className='grow' bgStyle='gradient' />                </div>                <div className='mt-1'>                  {rules.map(rule => (                    <div key={rule.id} className={s.ruleItem} onClick={() => {                      ruleChangeHandle(rule.id)                    }}>                      <Checkbox                        checked={rule.enabled}                      />                      <label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">{getRuleName(rule.id)}</label>                    </div>                  ))}                  {IS_CE_EDITION && <>                    <Divider type='horizontal' className='my-4 bg-divider-subtle' />                    <div className='flex items-center py-0.5'>                      <div className='flex items-center' onClick={() => {                        if (currentDataset?.doc_form)                          return                        if (docForm === ChunkingMode.qa)                          handleChangeDocform(ChunkingMode.text)                        else                          handleChangeDocform(ChunkingMode.qa)                      }}>                        <Checkbox                          checked={currentDocForm === ChunkingMode.qa}                          disabled={!!currentDataset?.doc_form}                        />                        <label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">                          {t('datasetCreation.stepTwo.useQALanguage')}                        </label>                      </div>                      <LanguageSelect                        currentLanguage={docLanguage || locale}                        
onSelect={setDocLanguage}                        disabled={currentDocForm !== ChunkingMode.qa}                      />                      <Tooltip popupContent={t('datasetCreation.stepTwo.QATip')} />                    </div>                    {currentDocForm === ChunkingMode.qa && (                      <div                        style={{                          background: 'linear-gradient(92deg, rgba(247, 144, 9, 0.1) 0%, rgba(255, 255, 255, 0.00) 100%)',                        }}                        className='mt-2 flex h-10 items-center gap-2 rounded-xl border border-components-panel-border px-3 text-xs shadow-xs backdrop-blur-[5px]'                      >                        <RiAlertFill className='size-4 text-text-warning-secondary' />                        <span className='system-xs-medium text-text-primary'>                          {t('datasetCreation.stepTwo.QATip')}                        </span>                      </div>                    )}                  </>}                </div>              </div>            </div>          </OptionCard>}        {          (            (isInUpload && currentDataset!.doc_form === ChunkingMode.parentChild)            || isUploadInEmptyDataset            || isInInit          )          && <OptionCard            title={t('datasetCreation.stepTwo.parentChild')}            icon={<Image width={20} height={20} src={FamilyMod} alt={t('datasetCreation.stepTwo.parentChild')} />}            effectImg={OrangeEffect.src}            activeHeaderClassName='bg-dataset-option-card-orange-gradient'            description={t('datasetCreation.stepTwo.parentChildTip')}            isActive={currentDocForm === ChunkingMode.parentChild}            onSwitched={() => handleChangeDocform(ChunkingMode.parentChild)}            actions={              <>                <Button variant={'secondary-accent'} onClick={() => updatePreview()}>                  <RiSearchEyeLine className='mr-0.5 h-4 w-4' />                  
{t('datasetCreation.stepTwo.previewChunk')}                </Button>                <Button variant={'ghost'} onClick={resetRules}>                  {t('datasetCreation.stepTwo.reset')}                </Button>              </>            }            noHighlight={isInUpload && isNotUploadInEmptyDataset}          >            <div className='flex flex-col gap-4'>              <div>                <div className='flex items-center gap-x-2'>                  <div className='inline-flex shrink-0'>                    <TextLabel>{t('datasetCreation.stepTwo.parentChunkForContext')}</TextLabel>                  </div>                  <Divider className='grow' bgStyle='gradient' />                </div>                <RadioCard className='mt-1'                  icon={<Image src={Note} alt='' />}                  title={t('datasetCreation.stepTwo.paragraph')}                  description={t('datasetCreation.stepTwo.paragraphTip')}                  isChosen={parentChildConfig.chunkForContext === 'paragraph'}                  onChosen={() => setParentChildConfig(                    {                      ...parentChildConfig,                      chunkForContext: 'paragraph',                    },                  )}                  chosenConfig={                    <div className='flex gap-3'>                      <DelimiterInput                        value={parentChildConfig.parent.delimiter}                        tooltip={t('datasetCreation.stepTwo.parentChildDelimiterTip')!}                        onChange={e => setParentChildConfig({                          ...parentChildConfig,                          parent: {                            ...parentChildConfig.parent,                            delimiter: e.target.value ? 
escape(e.target.value) : '',                          },                        })}                      />                      <MaxLengthInput                        unit='tokens'                        value={parentChildConfig.parent.maxLength}                        onChange={value => setParentChildConfig({                          ...parentChildConfig,                          parent: {                            ...parentChildConfig.parent,                            maxLength: value,                          },                        })}                      />                    </div>                  }                />                <RadioCard className='mt-2'                  icon={<Image src={FileList} alt='' />}                  title={t('datasetCreation.stepTwo.fullDoc')}                  description={t('datasetCreation.stepTwo.fullDocTip')}                  onChosen={() => setParentChildConfig(                    {                      ...parentChildConfig,                      chunkForContext: 'full-doc',                    },                  )}                  isChosen={parentChildConfig.chunkForContext === 'full-doc'}                />              </div>              <div>                <div className='flex items-center gap-x-2'>                  <div className='inline-flex shrink-0'>                    <TextLabel>{t('datasetCreation.stepTwo.childChunkForRetrieval')}</TextLabel>                  </div>                  <Divider className='grow' bgStyle='gradient' />                </div>                <div className='mt-1 flex gap-3'>                  <DelimiterInput                    value={parentChildConfig.child.delimiter}                    tooltip={t('datasetCreation.stepTwo.parentChildChunkDelimiterTip')!}                    onChange={e => setParentChildConfig({                      ...parentChildConfig,                      child: {                        ...parentChildConfig.child,                        delimiter: e.target.value ? 
escape(e.target.value) : '',                      },                    })}                  />                  <MaxLengthInput                    unit='tokens'                    value={parentChildConfig.child.maxLength}                    onChange={value => setParentChildConfig({                      ...parentChildConfig,                      child: {                        ...parentChildConfig.child,                        maxLength: value,                      },                    })}                  />                </div>              </div>              <div>                <div className='flex items-center gap-x-2'>                  <div className='inline-flex shrink-0'>                    <TextLabel>{t('datasetCreation.stepTwo.rules')}</TextLabel>                  </div>                  <Divider className='grow' bgStyle='gradient' />                </div>                <div className='mt-1'>                  {rules.map(rule => (                    <div key={rule.id} className={s.ruleItem} onClick={() => {                      ruleChangeHandle(rule.id)                    }}>                      <Checkbox                        checked={rule.enabled}                      />                      <label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">{getRuleName(rule.id)}</label>                    </div>                  ))}                </div>              </div>            </div>          </OptionCard>}        <Divider className='my-5' />        <div className={'system-md-semibold mb-1 text-text-secondary'}>{t('datasetCreation.stepTwo.indexMode')}</div>        <div className='flex items-center gap-2'>          {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && (            <OptionCard className='flex-1 self-stretch'              title={<div className='flex items-center'>                {t('datasetCreation.stepTwo.qualified')}                <Badge className={cn('ml-1 h-[18px]', 
(!hasSetIndexType && indexType === IndexingType.QUALIFIED) ? 'border-text-accent-secondary text-text-accent-secondary' : '')} uppercase>                  {t('datasetCreation.stepTwo.recommend')}                </Badge>                <span className='ml-auto'>                  {!hasSetIndexType && <span className={cn(s.radio)} />}                </span>              </div>}              description={t('datasetCreation.stepTwo.qualifiedTip')}              icon={<Image src={indexMethodIcon.high_quality} alt='' />}              isActive={!hasSetIndexType && indexType === IndexingType.QUALIFIED}              disabled={!isAPIKeySet || hasSetIndexType}              onSwitched={() => {                if (isAPIKeySet)                  setIndexType(IndexingType.QUALIFIED)              }}            />          )}          {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && (            <>              <CustomDialog show={isQAConfirmDialogOpen} onClose={() => setIsQAConfirmDialogOpen(false)} className='w-[432px]'>                <header className='mb-4 pt-6'>                  <h2 className='text-lg font-semibold'>                    {t('datasetCreation.stepTwo.qaSwitchHighQualityTipTitle')}                  </h2>                  <p className='mt-2 text-sm font-normal'>                    {t('datasetCreation.stepTwo.qaSwitchHighQualityTipContent')}                  </p>                </header>                <div className='flex gap-2 pb-6'>                  <Button className='ml-auto' onClick={() => {                    setIsQAConfirmDialogOpen(false)                  }}>                    {t('datasetCreation.stepTwo.cancel')}                  </Button>                  <Button variant={'primary'} onClick={() => {                    setIsQAConfirmDialogOpen(false)                    setIndexType(IndexingType.QUALIFIED)                    setDocForm(ChunkingMode.qa)                  }}>                    
{t('datasetCreation.stepTwo.switch')}                  </Button>                </div>              </CustomDialog>              <PortalToFollowElem                open={                  isHoveringEconomy && docForm !== ChunkingMode.text                }                placement={'top'}              >                <PortalToFollowElemTrigger asChild>                  <OptionCard className='flex-1 self-stretch'                    title={t('datasetCreation.stepTwo.economical')}                    description={t('datasetCreation.stepTwo.economicalTip')}                    icon={<Image src={indexMethodIcon.economical} alt='' />}                    isActive={!hasSetIndexType && indexType === IndexingType.ECONOMICAL}                    disabled={!isAPIKeySet || hasSetIndexType || docForm !== ChunkingMode.text}                    ref={economyDomRef}                    onSwitched={() => {                      if (isAPIKeySet && docForm === ChunkingMode.text)                        setIndexType(IndexingType.ECONOMICAL)                    }}                  />                </PortalToFollowElemTrigger>                <PortalToFollowElemContent>                  <div className='rounded-lg border-components-panel-border bg-components-tooltip-bg p-3 text-xs font-medium text-text-secondary shadow-lg'>                    {                      docForm === ChunkingMode.qa                        ? 
t('datasetCreation.stepTwo.notAvailableForQA')                        : t('datasetCreation.stepTwo.notAvailableForParentChild')                    }                  </div>                </PortalToFollowElemContent>              </PortalToFollowElem>            </>)}        </div>        {!hasSetIndexType && indexType === IndexingType.QUALIFIED && (          <div className='mt-2 flex h-10 items-center gap-x-0.5 overflow-hidden rounded-xl border-[0.5px] border-components-panel-border bg-components-panel-bg-blur p-2 shadow-xs backdrop-blur-[5px]'>            <div className='absolute bottom-0 left-0 right-0 top-0 bg-dataset-warning-message-bg opacity-40'></div>            <div className='p-1'>              <AlertTriangle className='size-4 text-text-warning-secondary' />            </div>            <span className='system-xs-medium text-text-primary'>{t('datasetCreation.stepTwo.highQualityTip')}</span>          </div>        )}        {hasSetIndexType && indexType === IndexingType.ECONOMICAL && (          <div className='system-xs-medium mt-2'>            {t('datasetCreation.stepTwo.indexSettingTip')}            <Link className='text-text-accent' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>          </div>        )}        {/* Embedding model */}        {indexType === IndexingType.QUALIFIED && (          <div className='mt-5'>            <div className={cn('system-md-semibold mb-1 text-text-secondary', datasetId && 'flex items-center justify-between')}>{t('datasetSettings.form.embeddingModel')}</div>            <ModelSelector              readonly={isModelAndRetrievalConfigDisabled}              triggerClassName={isModelAndRetrievalConfigDisabled ? 
'opacity-50' : ''}              defaultModel={embeddingModel}              modelList={embeddingModelList}              onSelect={(model: DefaultModel) => {                setEmbeddingModel(model)              }}            />            {isModelAndRetrievalConfigDisabled && (              <div className='system-xs-medium mt-2 text-text-tertiary'>                {t('datasetCreation.stepTwo.indexSettingTip')}                <Link className='text-text-accent' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>              </div>            )}          </div>        )}        <Divider className='my-5' />        {/* Retrieval Method Config */}        <div>          {!isModelAndRetrievalConfigDisabled            ? (              <div className={'mb-1'}>                <div className='system-md-semibold mb-0.5 text-text-secondary'>{t('datasetSettings.form.retrievalSetting.title')}</div>                <div className='body-xs-regular text-text-tertiary'>                  <a target='_blank' rel='noopener noreferrer' href='https://docs.dify.ai/guides/knowledge-base/create-knowledge-and-upload-documents#id-4-retrieval-settings' className='text-text-accent'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>                  {t('datasetSettings.form.retrievalSetting.longDescription')}                </div>              </div>            )            : (              <div className={cn('system-md-semibold mb-0.5 text-text-secondary', 'flex items-center justify-between')}>                <div>{t('datasetSettings.form.retrievalSetting.title')}</div>              </div>            )}          <div className=''>            {              getIndexing_technique() === IndexingType.QUALIFIED                ? 
(                  <RetrievalMethodConfig                    disabled={isModelAndRetrievalConfigDisabled}                    value={retrievalConfig}                    onChange={setRetrievalConfig}                  />                )                : (                  <EconomicalRetrievalMethodConfig                    disabled={isModelAndRetrievalConfigDisabled}                    value={retrievalConfig}                    onChange={setRetrievalConfig}                  />                )            }          </div>        </div>        {!isSetting          ? (            <div className='mt-8 flex items-center py-2'>              <Button onClick={() => onStepChange && onStepChange(-1)}>                <RiArrowLeftLine className='mr-1 h-4 w-4' />                {t('datasetCreation.stepTwo.previousStep')}              </Button>              <Button className='ml-auto' loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.nextStep')}</Button>            </div>          )          : (            <div className='mt-8 flex items-center py-2'>              <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>              <Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>            </div>          )}      </div>      <FloatRightContainer isMobile={isMobile} isOpen={true} onClose={() => { }} footer={null}>        <PreviewContainer          header={<PreviewHeader            title={t('datasetCreation.stepTwo.preview')}          >            <div className='flex items-center gap-1'>              {dataSourceType === DataSourceType.FILE                && <PreviewDocumentPicker                  files={files as Array<Required<CustomFile>>}                  onChange={(selected) => {                    currentEstimateMutation.reset()                    setPreviewFile(selected)                    currentEstimateMutation.mutate()                
  }}                  // when it is from setting, it just has one file                  value={isSetting ? (files[0]! as Required<CustomFile>) : previewFile}                />              }              {dataSourceType === DataSourceType.NOTION                && <PreviewDocumentPicker                  files={                    notionPages.map(page => ({                      id: page.page_id,                      name: page.page_name,                      extension: 'md',                    }))                  }                  onChange={(selected) => {                    currentEstimateMutation.reset()                    const selectedPage = notionPages.find(page => page.page_id === selected.id)                    setPreviewNotionPage(selectedPage!)                    currentEstimateMutation.mutate()                  }}                  value={{                    id: previewNotionPage?.page_id || '',                    name: previewNotionPage?.page_name || '',                    extension: 'md',                  }}                />              }              {dataSourceType === DataSourceType.WEB                && <PreviewDocumentPicker                  files={                    websitePages.map(page => ({                      id: page.source_url,                      name: page.title,                      extension: 'md',                    }))                  }                  onChange={(selected) => {                    currentEstimateMutation.reset()                    const selectedPage = websitePages.find(page => page.source_url === selected.id)                    setPreviewWebsitePage(selectedPage!)                    
currentEstimateMutation.mutate()                  }}                  value={                    {                      id: previewWebsitePage?.source_url || '',                      name: previewWebsitePage?.title || '',                      extension: 'md',                    }                  }                />              }              {                currentDocForm !== ChunkingMode.qa                && <Badge text={t('datasetCreation.stepTwo.previewChunkCount', {                  count: estimate?.total_segments || 0,                }) as string}                />              }            </div>          </PreviewHeader>}          className={cn('relative flex h-full w-1/2 shrink-0 p-4 pr-0', isMobile && 'w-full max-w-[524px]')}          mainClassName='space-y-6'        >          {currentDocForm === ChunkingMode.qa && estimate?.qa_preview && (            estimate?.qa_preview.map((item, index) => (              <ChunkContainer                key={item.question}                label={`Chunk-${index + 1}`}                characterCount={item.question.length + item.answer.length}              >                <QAPreview qa={item} />              </ChunkContainer>            ))          )}          {currentDocForm === ChunkingMode.text && estimate?.preview && (            estimate?.preview.map((item, index) => (              <ChunkContainer                key={item.content}                label={`Chunk-${index + 1}`}                characterCount={item.content.length}              >                {item.content}              </ChunkContainer>            ))          )}          {currentDocForm === ChunkingMode.parentChild && currentEstimateMutation.data?.preview && (            estimate?.preview?.map((item, index) => {              const indexForLabel = index + 1              const childChunks = parentChildConfig.chunkForContext === 'full-doc'                ? 
item.child_chunks.slice(0, FULL_DOC_PREVIEW_LENGTH)                : item.child_chunks              return (                <ChunkContainer                  key={item.content}                  label={`Chunk-${indexForLabel}`}                  characterCount={item.content.length}                >                  <FormattedText>                    {childChunks.map((child, index) => {                      const indexForLabel = index + 1                      return (                        <PreviewSlice                          key={child}                          label={`C-${indexForLabel}`}                          text={child}                          tooltip={`Child-chunk-${indexForLabel} · ${child.length} Characters`}                          labelInnerClassName='text-[10px] font-semibold align-bottom leading-7'                          dividerClassName='leading-7'                        />                      )                    })}                  </FormattedText>                </ChunkContainer>              )            })          )}          {currentEstimateMutation.isIdle && (            <div className='flex h-full w-full items-center justify-center'>              <div className='flex flex-col items-center justify-center gap-3'>                <RiSearchEyeLine className='size-10 text-text-empty-state-icon' />                <p className='text-sm text-text-tertiary'>                  {t('datasetCreation.stepTwo.previewChunkTip')}                </p>              </div>            </div>          )}          {currentEstimateMutation.isPending && (            <div className='space-y-6'>              {Array.from({ length: 10 }, (_, i) => (                <SkeletonContainer key={i}>                  <SkeletonRow>                    <SkeletonRectangle className="w-20" />                    <SkeletonPoint />                    <SkeletonRectangle className="w-24" />                  </SkeletonRow>                  <SkeletonRectangle className="w-full" />        
          <SkeletonRectangle className="w-full" />                  <SkeletonRectangle className="w-[422px]" />                </SkeletonContainer>              ))}            </div>          )}        </PreviewContainer>      </FloatRightContainer>    </div>  )}export default StepTwo
 |