index.tsx 47 KB


  1. 'use client'
  2. import type { FC, PropsWithChildren } from 'react'
  3. import React, { useCallback, useEffect, useRef, useState } from 'react'
  4. import { useTranslation } from 'react-i18next'
  5. import { useContext } from 'use-context-selector'
  6. import {
  7. RiAlertFill,
  8. RiArrowLeftLine,
  9. RiSearchEyeLine,
  10. } from '@remixicon/react'
  11. import Link from 'next/link'
  12. import Image from 'next/image'
  13. import { useHover } from 'ahooks'
  14. import SettingCog from '../assets/setting-gear-mod.svg'
  15. import OrangeEffect from '../assets/option-card-effect-orange.svg'
  16. import FamilyMod from '../assets/family-mod.svg'
  17. import Note from '../assets/note-mod.svg'
  18. import FileList from '../assets/file-list-3-fill.svg'
  19. import { indexMethodIcon } from '../icons'
  20. import { PreviewContainer } from '../../preview/container'
  21. import { ChunkContainer, QAPreview } from '../../chunk'
  22. import { PreviewHeader } from '../../preview/header'
  23. import { FormattedText } from '../../formatted-text/formatted'
  24. import { PreviewSlice } from '../../formatted-text/flavours/preview-slice'
  25. import PreviewDocumentPicker from '../../common/document-picker/preview-document-picker'
  26. import s from './index.module.css'
  27. import unescape from './unescape'
  28. import escape from './escape'
  29. import { OptionCard } from './option-card'
  30. import LanguageSelect from './language-select'
  31. import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs'
  32. import cn from '@/utils/classnames'
  33. import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, DocumentItem, FullDocumentDetail, ParentMode, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
  34. import { ChunkingMode, DataSourceType, ProcessMode } from '@/models/datasets'
  35. import Button from '@/app/components/base/button'
  36. import FloatRightContainer from '@/app/components/base/float-right-container'
  37. import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
  38. import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
  39. import { type RetrievalConfig } from '@/types/app'
  40. import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
  41. import Toast from '@/app/components/base/toast'
  42. import type { NotionPage } from '@/models/common'
  43. import { DataSourceProvider } from '@/models/common'
  44. import { useDatasetDetailContext } from '@/context/dataset-detail'
  45. import I18n from '@/context/i18n'
  46. import { RETRIEVE_METHOD } from '@/types/app'
  47. import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
  48. import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
  49. import { LanguagesSupported } from '@/i18n/language'
  50. import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
  51. import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations'
  52. import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
  53. import Checkbox from '@/app/components/base/checkbox'
  54. import RadioCard from '@/app/components/base/radio-card'
  55. import { FULL_DOC_PREVIEW_LENGTH, IS_CE_EDITION } from '@/config'
  56. import Divider from '@/app/components/base/divider'
  57. import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument, useFetchDefaultProcessRule, useFetchFileIndexingEstimateForFile, useFetchFileIndexingEstimateForNotion, useFetchFileIndexingEstimateForWeb } from '@/service/knowledge/use-create-dataset'
  58. import Badge from '@/app/components/base/badge'
  59. import { SkeletonContainer, SkeletonPoint, SkeletonRectangle, SkeletonRow } from '@/app/components/base/skeleton'
  60. import Tooltip from '@/app/components/base/tooltip'
  61. import CustomDialog from '@/app/components/base/dialog'
  62. import { PortalToFollowElem, PortalToFollowElemContent, PortalToFollowElemTrigger } from '@/app/components/base/portal-to-follow-elem'
  63. import { AlertTriangle } from '@/app/components/base/icons/src/vender/solid/alertsAndFeedback'
  /**
   * Caption label for a settings section.
   *
   * Renders its children inside a `<label>` carrying the
   * `text-text-secondary system-sm-semibold` classes. It takes no props other
   * than `children`.
   */
  const TextLabel: FC<PropsWithChildren> = ({ children }) => (
    <label className='text-text-secondary system-sm-semibold'>{children}</label>
  )
  /**
   * Props accepted by the `StepTwo` component (chunking / indexing settings).
   */
  type StepTwoProps = {
    /**
     * When truthy the step edits an existing document: segmentation rules and
     * mode are read from `documentDetail` on mount instead of being fetched,
     * and the request built on submit carries `original_document_id`.
     */
    isSetting?: boolean
    /**
     * Existing document. When supplied together with `datasetId` it seeds the
     * initial preview item, doc form and doc language.
     */
    documentDetail?: FullDocumentDetail
    /**
     * Chooses the initial index type when `indexingType` is not provided
     * (`QUALIFIED` when true, `ECONOMICAL` otherwise) and gates switching
     * between the index-method cards.
     */
    isAPIKeySet: boolean
    /**
     * NOTE(review): required, but not referenced in the part of the component
     * that was reviewed — confirm where (and whether) it is still used.
     */
    onSetting: () => void
    /**
     * Target dataset id. When absent, submit goes through the
     * "create first document" mutation; otherwise the "create document"
     * mutation bound to this id is used.
     */
    datasetId?: string
    /**
     * Pre-determined indexing technique. When set it takes precedence over the
     * locally selected value and only the matching index-method card is shown.
     */
    indexingType?: IndexingType
    /**
     * NOTE(review): not referenced in the part of the component that was
     * reviewed — confirm usage.
     */
    retrievalMethod?: string
    /**
     * Data source type used while in the create flow; otherwise the current
     * dataset's `data_source_type` is used instead.
     */
    dataSourceType: DataSourceType
    /** Uploaded files; the first entry is the initial preview file. */
    files: CustomFile[]
    /** Selected Notion pages (the component defaults this to `[]`). */
    notionPages?: NotionPage[]
    /** Crawled website pages (the component defaults this to `[]`). */
    websitePages?: CrawlResultItem[]
    /** Crawl options forwarded to the website indexing-estimate request. */
    crawlOptions?: CrawlOptions
    /** Crawl provider (the component defaults this to `DataSourceProvider.fireCrawl`). */
    websiteCrawlProvider?: DataSourceProvider
    /** Crawl job id (the component defaults this to `''`). */
    websiteCrawlJobId?: string
    /** Invoked with `+1` once the create request has resolved. */
    onStepChange?: (delta: number) => void
    /** Invoked with the selected index type when creation succeeds. */
    updateIndexingTypeCache?: (type: string) => void
    /**
     * Invoked with the retrieval config's `search_method` when creation of the
     * first document (no `datasetId`) succeeds.
     */
    updateRetrievalMethodCache?: (method: string) => void
    /** Invoked with the creation response when creation succeeds. */
    updateResultCache?: (res: createDocumentResponse) => void
    /** Invoked after the create request has resolved when `isSetting` is truthy. */
    onSave?: () => void
    /**
     * NOTE(review): destructured by the component, but its call site is outside
     * the part that was reviewed — confirm when it fires.
     */
    onCancel?: () => void
  }
  /**
   * Indexing techniques selectable in this step.
   *
   * The string values are what the component places in the
   * `indexing_technique` field of the create-document request.
   */
  export enum IndexingType {
    /** Shown as the "qualified" card, which carries the "recommend" badge. */
    QUALIFIED = 'high_quality',
    /** Shown as the "economical" card. */
    ECONOMICAL = 'economy',
  }
  /**
   * Default chunk delimiter, kept in its escaped (display) form: the four
   * characters backslash, `n`, backslash, `n`. The component passes the
   * delimiter through `unescape` before putting it into the process rule.
   */
  const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
  /** Initial maximum chunk length; sent as `max_tokens` in the process rule. */
  const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500
  /** Initial chunk overlap; sent as `chunk_overlap` in the process rule. */
  const DEFAULT_OVERLAP = 50
  /**
   * Local form state for the parent-child chunking mode.
   *
   * Delimiters are held in escaped (display) form; the component runs them
   * through `unescape` when it builds the process rule.
   */
  type ParentChildConfig = {
    /** Parent mode; sent as `parent_mode` in the process rule. */
    chunkForContext: ParentMode
    /** Parent chunk settings; mapped to `segmentation` in the process rule. */
    parent: {
      /** Escaped delimiter; sent (unescaped) as `separator`. */
      delimiter: string
      /** Sent as `max_tokens`. */
      maxLength: number
    }
    /** Child chunk settings; mapped to `subchunk_segmentation` in the process rule. */
    child: {
      /** Escaped delimiter; sent (unescaped) as `separator`. */
      delimiter: string
      /** Sent as `max_tokens`. */
      maxLength: number
    }
  }
  /**
   * Initial parent-child chunking configuration. It seeds the component's
   * `parentChildConfig` state and is the value restored by the reset action.
   *
   * Delimiters are written in escaped form (a literal backslash followed by
   * `n`), matching what the delimiter inputs display.
   */
  const defaultParentChildConfig: ParentChildConfig = {
    chunkForContext: 'paragraph',
    parent: {
      delimiter: '\\n\\n',
      maxLength: 500,
    },
    child: {
      delimiter: '\\n',
      maxLength: 200,
    },
  }
  118. const StepTwo = ({
  119. isSetting,
  120. documentDetail,
  121. isAPIKeySet,
  122. datasetId,
  123. indexingType,
  124. dataSourceType: inCreatePageDataSourceType,
  125. files,
  126. notionPages = [],
  127. websitePages = [],
  128. crawlOptions,
  129. websiteCrawlProvider = DataSourceProvider.fireCrawl,
  130. websiteCrawlJobId = '',
  131. onStepChange,
  132. updateIndexingTypeCache,
  133. updateResultCache,
  134. onSave,
  135. onCancel,
  136. updateRetrievalMethodCache,
  137. }: StepTwoProps) => {
  138. const { t } = useTranslation()
  139. const { locale } = useContext(I18n)
  140. const media = useBreakpoints()
  141. const isMobile = media === MediaType.mobile
  142. const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
  143. const isInUpload = Boolean(currentDataset)
  144. const isUploadInEmptyDataset = isInUpload && !currentDataset?.doc_form
  145. const isNotUploadInEmptyDataset = !isUploadInEmptyDataset
  146. const isInInit = !isInUpload && !isSetting
  147. const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
  148. const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
  149. const [segmentationType, setSegmentationType] = useState<ProcessMode>(ProcessMode.general)
  150. const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
  151. const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => {
  152. doSetSegmentIdentifier(value ? escape(value) : (canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER))
  153. }, [])
  154. const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) // default chunk length
  155. const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(4000)
  156. const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)
  157. const [rules, setRules] = useState<PreProcessingRule[]>([])
  158. const [defaultConfig, setDefaultConfig] = useState<Rules>()
  159. const hasSetIndexType = !!indexingType
  160. const [indexType, setIndexType] = useState<IndexingType>(
  161. (indexingType
  162. || isAPIKeySet)
  163. ? IndexingType.QUALIFIED
  164. : IndexingType.ECONOMICAL,
  165. )
  166. const [previewFile, setPreviewFile] = useState<DocumentItem>(
  167. (datasetId && documentDetail)
  168. ? documentDetail.file
  169. : files[0],
  170. )
  171. const [previewNotionPage, setPreviewNotionPage] = useState<NotionPage>(
  172. (datasetId && documentDetail)
  173. ? documentDetail.notion_page
  174. : notionPages[0],
  175. )
  176. const [previewWebsitePage, setPreviewWebsitePage] = useState<CrawlResultItem>(
  177. (datasetId && documentDetail)
  178. ? documentDetail.website_page
  179. : websitePages[0],
  180. )
  181. // QA Related
  182. const [isQAConfirmDialogOpen, setIsQAConfirmDialogOpen] = useState(false)
  183. const [docForm, setDocForm] = useState<ChunkingMode>(
  184. (datasetId && documentDetail) ? documentDetail.doc_form as ChunkingMode : ChunkingMode.text,
  185. )
  186. const handleChangeDocform = (value: ChunkingMode) => {
  187. if (value === ChunkingMode.qa && indexType === IndexingType.ECONOMICAL) {
  188. setIsQAConfirmDialogOpen(true)
  189. return
  190. }
  191. if (value === ChunkingMode.parentChild && indexType === IndexingType.ECONOMICAL)
  192. setIndexType(IndexingType.QUALIFIED)
  193. setDocForm(value)
  194. // eslint-disable-next-line @typescript-eslint/no-use-before-define
  195. currentEstimateMutation.reset()
  196. }
  197. const [docLanguage, setDocLanguage] = useState<string>(
  198. (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese'),
  199. )
  200. const [parentChildConfig, setParentChildConfig] = useState<ParentChildConfig>(defaultParentChildConfig)
  201. const getIndexing_technique = () => indexingType || indexType
  202. const currentDocForm = currentDataset?.doc_form || docForm
  203. const getProcessRule = (): ProcessRule => {
  204. if (currentDocForm === ChunkingMode.parentChild) {
  205. return {
  206. rules: {
  207. pre_processing_rules: rules,
  208. segmentation: {
  209. separator: unescape(
  210. parentChildConfig.parent.delimiter,
  211. ),
  212. max_tokens: parentChildConfig.parent.maxLength,
  213. },
  214. parent_mode: parentChildConfig.chunkForContext,
  215. subchunk_segmentation: {
  216. separator: unescape(parentChildConfig.child.delimiter),
  217. max_tokens: parentChildConfig.child.maxLength,
  218. },
  219. },
  220. mode: 'hierarchical',
  221. } as ProcessRule
  222. }
  223. return {
  224. rules: {
  225. pre_processing_rules: rules,
  226. segmentation: {
  227. separator: unescape(segmentIdentifier),
  228. max_tokens: maxChunkLength,
  229. chunk_overlap: overlap,
  230. },
  231. }, // api will check this. It will be removed after api refactored.
  232. mode: segmentationType,
  233. } as ProcessRule
  234. }
  235. const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile({
  236. docForm: currentDocForm,
  237. docLanguage,
  238. dataSourceType: DataSourceType.FILE,
  239. files: previewFile
  240. ? [files.find(file => file.name === previewFile.name)!]
  241. : files,
  242. indexingTechnique: getIndexing_technique() as any,
  243. processRule: getProcessRule(),
  244. dataset_id: datasetId!,
  245. })
  246. const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion({
  247. docForm: currentDocForm,
  248. docLanguage,
  249. dataSourceType: DataSourceType.NOTION,
  250. notionPages: [previewNotionPage],
  251. indexingTechnique: getIndexing_technique() as any,
  252. processRule: getProcessRule(),
  253. dataset_id: datasetId || '',
  254. })
  255. const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb({
  256. docForm: currentDocForm,
  257. docLanguage,
  258. dataSourceType: DataSourceType.WEB,
  259. websitePages: [previewWebsitePage],
  260. crawlOptions,
  261. websiteCrawlProvider,
  262. websiteCrawlJobId,
  263. indexingTechnique: getIndexing_technique() as any,
  264. processRule: getProcessRule(),
  265. dataset_id: datasetId || '',
  266. })
  267. const currentEstimateMutation = dataSourceType === DataSourceType.FILE
  268. ? fileIndexingEstimateQuery
  269. : dataSourceType === DataSourceType.NOTION
  270. ? notionIndexingEstimateQuery
  271. : websiteIndexingEstimateQuery
  272. const fetchEstimate = useCallback(() => {
  273. if (dataSourceType === DataSourceType.FILE)
  274. fileIndexingEstimateQuery.mutate()
  275. if (dataSourceType === DataSourceType.NOTION)
  276. notionIndexingEstimateQuery.mutate()
  277. if (dataSourceType === DataSourceType.WEB)
  278. websiteIndexingEstimateQuery.mutate()
  279. }, [dataSourceType, fileIndexingEstimateQuery, notionIndexingEstimateQuery, websiteIndexingEstimateQuery])
  280. const estimate
  281. = dataSourceType === DataSourceType.FILE
  282. ? fileIndexingEstimateQuery.data
  283. : dataSourceType === DataSourceType.NOTION
  284. ? notionIndexingEstimateQuery.data
  285. : websiteIndexingEstimateQuery.data
  286. const getRuleName = (key: string) => {
  287. if (key === 'remove_extra_spaces')
  288. return t('datasetCreation.stepTwo.removeExtraSpaces')
  289. if (key === 'remove_urls_emails')
  290. return t('datasetCreation.stepTwo.removeUrlEmails')
  291. if (key === 'remove_stopwords')
  292. return t('datasetCreation.stepTwo.removeStopwords')
  293. }
  294. const ruleChangeHandle = (id: string) => {
  295. const newRules = rules.map((rule) => {
  296. if (rule.id === id) {
  297. return {
  298. id: rule.id,
  299. enabled: !rule.enabled,
  300. }
  301. }
  302. return rule
  303. })
  304. setRules(newRules)
  305. }
  306. const resetRules = () => {
  307. if (defaultConfig) {
  308. setSegmentIdentifier(defaultConfig.segmentation.separator)
  309. setMaxChunkLength(defaultConfig.segmentation.max_tokens)
  310. setOverlap(defaultConfig.segmentation.chunk_overlap!)
  311. setRules(defaultConfig.pre_processing_rules)
  312. }
  313. setParentChildConfig(defaultParentChildConfig)
  314. }
  315. const updatePreview = () => {
  316. if (segmentationType === ProcessMode.general && maxChunkLength > 4000) {
  317. Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
  318. return
  319. }
  320. fetchEstimate()
  321. }
  322. const {
  323. modelList: rerankModelList,
  324. defaultModel: rerankDefaultModel,
  325. currentModel: isRerankDefaultModelValid,
  326. } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank)
  327. const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding)
  328. const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding)
  329. const [embeddingModel, setEmbeddingModel] = useState<DefaultModel>(
  330. currentDataset?.embedding_model
  331. ? {
  332. provider: currentDataset.embedding_model_provider,
  333. model: currentDataset.embedding_model,
  334. }
  335. : {
  336. provider: defaultEmbeddingModel?.provider.provider || '',
  337. model: defaultEmbeddingModel?.model || '',
  338. },
  339. )
  340. const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
  341. search_method: RETRIEVE_METHOD.semantic,
  342. reranking_enable: false,
  343. reranking_model: {
  344. reranking_provider_name: '',
  345. reranking_model_name: '',
  346. },
  347. top_k: 3,
  348. score_threshold_enabled: false,
  349. score_threshold: 0.5,
  350. } as RetrievalConfig)
  351. useEffect(() => {
  352. if (currentDataset?.retrieval_model_dict)
  353. return
  354. setRetrievalConfig({
  355. search_method: RETRIEVE_METHOD.semantic,
  356. reranking_enable: !!isRerankDefaultModelValid,
  357. reranking_model: {
  358. reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider.provider ?? '' : '',
  359. reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '',
  360. },
  361. top_k: 3,
  362. score_threshold_enabled: false,
  363. score_threshold: 0.5,
  364. })
  365. // eslint-disable-next-line react-hooks/exhaustive-deps
  366. }, [rerankDefaultModel, isRerankDefaultModelValid])
  367. const getCreationParams = () => {
  368. let params
  369. if (segmentationType === ProcessMode.general && overlap > maxChunkLength) {
  370. Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
  371. return
  372. }
  373. if (segmentationType === ProcessMode.general && maxChunkLength > limitMaxChunkLength) {
  374. Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: limitMaxChunkLength }) })
  375. return
  376. }
  377. if (isSetting) {
  378. params = {
  379. original_document_id: documentDetail?.id,
  380. doc_form: currentDocForm,
  381. doc_language: docLanguage,
  382. process_rule: getProcessRule(),
  383. retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page.
  384. embedding_model: embeddingModel.model, // Readonly
  385. embedding_model_provider: embeddingModel.provider, // Readonly
  386. indexing_technique: getIndexing_technique(),
  387. } as CreateDocumentReq
  388. }
  389. else { // create
  390. const indexMethod = getIndexing_technique()
  391. if (
  392. !isReRankModelSelected({
  393. rerankModelList,
  394. retrievalConfig,
  395. indexMethod: indexMethod as string,
  396. })
  397. ) {
  398. Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })
  399. return
  400. }
  401. params = {
  402. data_source: {
  403. type: dataSourceType,
  404. info_list: {
  405. data_source_type: dataSourceType,
  406. },
  407. },
  408. indexing_technique: getIndexing_technique(),
  409. process_rule: getProcessRule(),
  410. doc_form: currentDocForm,
  411. doc_language: docLanguage,
  412. retrieval_model: retrievalConfig,
  413. embedding_model: embeddingModel.model,
  414. embedding_model_provider: embeddingModel.provider,
  415. } as CreateDocumentReq
  416. if (dataSourceType === DataSourceType.FILE) {
  417. params.data_source.info_list.file_info_list = {
  418. file_ids: files.map(file => file.id || '').filter(Boolean),
  419. }
  420. }
  421. if (dataSourceType === DataSourceType.NOTION)
  422. params.data_source.info_list.notion_info_list = getNotionInfo(notionPages)
  423. if (dataSourceType === DataSourceType.WEB) {
  424. params.data_source.info_list.website_info_list = getWebsiteInfo({
  425. websiteCrawlProvider,
  426. websiteCrawlJobId,
  427. websitePages,
  428. })
  429. }
  430. }
  431. return params
  432. }
  433. const fetchDefaultProcessRuleMutation = useFetchDefaultProcessRule({
  434. onSuccess(data) {
  435. const separator = data.rules.segmentation.separator
  436. setSegmentIdentifier(separator)
  437. setMaxChunkLength(data.rules.segmentation.max_tokens)
  438. setOverlap(data.rules.segmentation.chunk_overlap!)
  439. setRules(data.rules.pre_processing_rules)
  440. setDefaultConfig(data.rules)
  441. setLimitMaxChunkLength(data.limits.indexing_max_segmentation_tokens_length)
  442. },
  443. onError(error) {
  444. Toast.notify({
  445. type: 'error',
  446. message: `${error}`,
  447. })
  448. },
  449. })
  450. const getRulesFromDetail = () => {
  451. if (documentDetail) {
  452. const rules = documentDetail.dataset_process_rule.rules
  453. const separator = rules.segmentation.separator
  454. const max = rules.segmentation.max_tokens
  455. const overlap = rules.segmentation.chunk_overlap
  456. setSegmentIdentifier(separator)
  457. setMaxChunkLength(max)
  458. setOverlap(overlap!)
  459. setRules(rules.pre_processing_rules)
  460. setDefaultConfig(rules)
  461. }
  462. }
  463. const getDefaultMode = () => {
  464. if (documentDetail)
  465. setSegmentationType(documentDetail.dataset_process_rule.mode)
  466. }
  467. const createFirstDocumentMutation = useCreateFirstDocument({
  468. onError(error) {
  469. Toast.notify({
  470. type: 'error',
  471. message: `${error}`,
  472. })
  473. },
  474. })
  475. const createDocumentMutation = useCreateDocument(datasetId!, {
  476. onError(error) {
  477. Toast.notify({
  478. type: 'error',
  479. message: `${error}`,
  480. })
  481. },
  482. })
  483. const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending
  484. const createHandle = async () => {
  485. const params = getCreationParams()
  486. if (!params)
  487. return false
  488. if (!datasetId) {
  489. await createFirstDocumentMutation.mutateAsync(
  490. params,
  491. {
  492. onSuccess(data) {
  493. updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
  494. updateResultCache && updateResultCache(data)
  495. updateRetrievalMethodCache && updateRetrievalMethodCache(retrievalConfig.search_method as string)
  496. },
  497. },
  498. )
  499. }
  500. else {
  501. await createDocumentMutation.mutateAsync(params, {
  502. onSuccess(data) {
  503. updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
  504. updateResultCache && updateResultCache(data)
  505. },
  506. })
  507. }
  508. if (mutateDatasetRes)
  509. mutateDatasetRes()
  510. onStepChange && onStepChange(+1)
  511. isSetting && onSave && onSave()
  512. }
  513. useEffect(() => {
  514. // fetch rules
  515. if (!isSetting) {
  516. fetchDefaultProcessRuleMutation.mutate('/datasets/process-rule')
  517. }
  518. else {
  519. getRulesFromDetail()
  520. getDefaultMode()
  521. }
  522. // eslint-disable-next-line react-hooks/exhaustive-deps
  523. }, [])
  524. useEffect(() => {
  525. // get indexing type by props
  526. if (indexingType)
  527. setIndexType(indexingType as IndexingType)
  528. else
  529. setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)
  530. }, [isAPIKeySet, indexingType, datasetId])
  531. const economyDomRef = useRef<HTMLDivElement>(null)
  532. const isHoveringEconomy = useHover(economyDomRef)
  533. return (
  534. <div className='flex w-full h-full'>
  535. <div className={cn('relative h-full w-1/2 py-6 overflow-y-auto', isMobile ? 'px-4' : 'px-12')}>
  536. <div className={'system-md-semibold mb-1'}>{t('datasetCreation.stepTwo.segmentation')}</div>
  537. {((isInUpload && [ChunkingMode.text, ChunkingMode.qa].includes(currentDataset!.doc_form))
  538. || isUploadInEmptyDataset
  539. || isInInit)
  540. && <OptionCard
  541. className='bg-background-section mb-2'
  542. title={t('datasetCreation.stepTwo.general')}
  543. icon={<Image width={20} height={20} src={SettingCog} alt={t('datasetCreation.stepTwo.general')} />}
  544. activeHeaderClassName='bg-dataset-option-card-blue-gradient'
  545. description={t('datasetCreation.stepTwo.generalTip')}
  546. isActive={
  547. [ChunkingMode.text, ChunkingMode.qa].includes(currentDocForm)
  548. }
  549. onSwitched={() =>
  550. handleChangeDocform(ChunkingMode.text)
  551. }
  552. actions={
  553. <>
  554. <Button variant={'secondary-accent'} onClick={() => updatePreview()}>
  555. <RiSearchEyeLine className='h-4 w-4 mr-0.5' />
  556. {t('datasetCreation.stepTwo.previewChunk')}
  557. </Button>
  558. <Button variant={'ghost'} onClick={resetRules}>
  559. {t('datasetCreation.stepTwo.reset')}
  560. </Button>
  561. </>
  562. }
  563. noHighlight={isInUpload && isNotUploadInEmptyDataset}
  564. >
  565. <div className='flex flex-col gap-y-4'>
  566. <div className='flex gap-3'>
  567. <DelimiterInput
  568. value={segmentIdentifier}
  569. onChange={e => setSegmentIdentifier(e.target.value, true)}
  570. />
  571. <MaxLengthInput
  572. unit='tokens'
  573. value={maxChunkLength}
  574. onChange={setMaxChunkLength}
  575. />
  576. <OverlapInput
  577. unit='tokens'
  578. value={overlap}
  579. min={1}
  580. onChange={setOverlap}
  581. />
  582. </div>
  583. <div className='w-full flex flex-col'>
  584. <div className='flex items-center gap-x-2'>
  585. <div className='inline-flex shrink-0'>
  586. <TextLabel>{t('datasetCreation.stepTwo.rules')}</TextLabel>
  587. </div>
  588. <Divider className='grow' bgStyle='gradient' />
  589. </div>
  590. <div className='mt-1'>
  591. {rules.map(rule => (
  592. <div key={rule.id} className={s.ruleItem} onClick={() => {
  593. ruleChangeHandle(rule.id)
  594. }}>
  595. <Checkbox
  596. checked={rule.enabled}
  597. />
  598. <label className="ml-2 system-sm-regular cursor-pointer text-text-secondary">{getRuleName(rule.id)}</label>
  599. </div>
  600. ))}
  601. {IS_CE_EDITION && <>
  602. <Divider type='horizontal' className='my-4 bg-divider-subtle' />
  603. <div className='flex items-center py-0.5'>
  604. <div className='flex items-center' onClick={() => {
  605. if (currentDataset?.doc_form)
  606. return
  607. if (docForm === ChunkingMode.qa)
  608. handleChangeDocform(ChunkingMode.text)
  609. else
  610. handleChangeDocform(ChunkingMode.qa)
  611. }}>
  612. <Checkbox
  613. checked={currentDocForm === ChunkingMode.qa}
  614. disabled={!!currentDataset?.doc_form}
  615. />
  616. <label className="ml-2 system-sm-regular cursor-pointer text-text-secondary">
  617. {t('datasetCreation.stepTwo.useQALanguage')}
  618. </label>
  619. </div>
  620. <LanguageSelect
  621. currentLanguage={docLanguage || locale}
  622. onSelect={setDocLanguage}
  623. disabled={currentDocForm !== ChunkingMode.qa}
  624. />
  625. <Tooltip popupContent={t('datasetCreation.stepTwo.QATip')} />
  626. </div>
  627. {currentDocForm === ChunkingMode.qa && (
  628. <div
  629. style={{
  630. background: 'linear-gradient(92deg, rgba(247, 144, 9, 0.1) 0%, rgba(255, 255, 255, 0.00) 100%)',
  631. }}
  632. className='h-10 mt-2 flex items-center gap-2 rounded-xl backdrop-blur-[5px] border-components-panel-border border shadow-xs px-3 text-xs'
  633. >
  634. <RiAlertFill className='size-4 text-text-warning-secondary' />
  635. <span className='system-xs-medium text-text-primary'>
  636. {t('datasetCreation.stepTwo.QATip')}
  637. </span>
  638. </div>
  639. )}
  640. </>}
  641. </div>
  642. </div>
  643. </div>
  644. </OptionCard>}
  645. {
  646. (
  647. (isInUpload && currentDataset!.doc_form === ChunkingMode.parentChild)
  648. || isUploadInEmptyDataset
  649. || isInInit
  650. )
  651. && <OptionCard
  652. title={t('datasetCreation.stepTwo.parentChild')}
  653. icon={<Image width={20} height={20} src={FamilyMod} alt={t('datasetCreation.stepTwo.parentChild')} />}
  654. effectImg={OrangeEffect.src}
  655. activeHeaderClassName='bg-dataset-option-card-orange-gradient'
  656. description={t('datasetCreation.stepTwo.parentChildTip')}
  657. isActive={currentDocForm === ChunkingMode.parentChild}
  658. onSwitched={() => handleChangeDocform(ChunkingMode.parentChild)}
  659. actions={
  660. <>
  661. <Button variant={'secondary-accent'} onClick={() => updatePreview()}>
  662. <RiSearchEyeLine className='h-4 w-4 mr-0.5' />
  663. {t('datasetCreation.stepTwo.previewChunk')}
  664. </Button>
  665. <Button variant={'ghost'} onClick={resetRules}>
  666. {t('datasetCreation.stepTwo.reset')}
  667. </Button>
  668. </>
  669. }
  670. noHighlight={isInUpload && isNotUploadInEmptyDataset}
  671. >
  672. <div className='flex flex-col gap-4'>
  673. <div>
  674. <div className='flex items-center gap-x-2'>
  675. <div className='inline-flex shrink-0'>
  676. <TextLabel>{t('datasetCreation.stepTwo.parentChunkForContext')}</TextLabel>
  677. </div>
  678. <Divider className='grow' bgStyle='gradient' />
  679. </div>
  680. <RadioCard className='mt-1'
  681. icon={<Image src={Note} alt='' />}
  682. title={t('datasetCreation.stepTwo.paragraph')}
  683. description={t('datasetCreation.stepTwo.paragraphTip')}
  684. isChosen={parentChildConfig.chunkForContext === 'paragraph'}
  685. onChosen={() => setParentChildConfig(
  686. {
  687. ...parentChildConfig,
  688. chunkForContext: 'paragraph',
  689. },
  690. )}
  691. chosenConfig={
  692. <div className='flex gap-3'>
  693. <DelimiterInput
  694. value={parentChildConfig.parent.delimiter}
  695. tooltip={t('datasetCreation.stepTwo.parentChildDelimiterTip')!}
  696. onChange={e => setParentChildConfig({
  697. ...parentChildConfig,
  698. parent: {
  699. ...parentChildConfig.parent,
  700. delimiter: e.target.value ? escape(e.target.value) : '',
  701. },
  702. })}
  703. />
  704. <MaxLengthInput
  705. unit='tokens'
  706. value={parentChildConfig.parent.maxLength}
  707. onChange={value => setParentChildConfig({
  708. ...parentChildConfig,
  709. parent: {
  710. ...parentChildConfig.parent,
  711. maxLength: value,
  712. },
  713. })}
  714. />
  715. </div>
  716. }
  717. />
  718. <RadioCard className='mt-2'
  719. icon={<Image src={FileList} alt='' />}
  720. title={t('datasetCreation.stepTwo.fullDoc')}
  721. description={t('datasetCreation.stepTwo.fullDocTip')}
  722. onChosen={() => setParentChildConfig(
  723. {
  724. ...parentChildConfig,
  725. chunkForContext: 'full-doc',
  726. },
  727. )}
  728. isChosen={parentChildConfig.chunkForContext === 'full-doc'}
  729. />
  730. </div>
  731. <div>
  732. <div className='flex items-center gap-x-2'>
  733. <div className='inline-flex shrink-0'>
  734. <TextLabel>{t('datasetCreation.stepTwo.childChunkForRetrieval')}</TextLabel>
  735. </div>
  736. <Divider className='grow' bgStyle='gradient' />
  737. </div>
  738. <div className='flex gap-3 mt-1'>
  739. <DelimiterInput
  740. value={parentChildConfig.child.delimiter}
  741. tooltip={t('datasetCreation.stepTwo.parentChildChunkDelimiterTip')!}
  742. onChange={e => setParentChildConfig({
  743. ...parentChildConfig,
  744. child: {
  745. ...parentChildConfig.child,
  746. delimiter: e.target.value ? escape(e.target.value) : '',
  747. },
  748. })}
  749. />
  750. <MaxLengthInput
  751. unit='tokens'
  752. value={parentChildConfig.child.maxLength}
  753. onChange={value => setParentChildConfig({
  754. ...parentChildConfig,
  755. child: {
  756. ...parentChildConfig.child,
  757. maxLength: value,
  758. },
  759. })}
  760. />
  761. </div>
  762. </div>
  763. <div>
  764. <div className='flex items-center gap-x-2'>
  765. <div className='inline-flex shrink-0'>
  766. <TextLabel>{t('datasetCreation.stepTwo.rules')}</TextLabel>
  767. </div>
  768. <Divider className='grow' bgStyle='gradient' />
  769. </div>
  770. <div className='mt-1'>
  771. {rules.map(rule => (
  772. <div key={rule.id} className={s.ruleItem} onClick={() => {
  773. ruleChangeHandle(rule.id)
  774. }}>
  775. <Checkbox
  776. checked={rule.enabled}
  777. />
  778. <label className="ml-2 system-sm-regular cursor-pointer text-text-secondary">{getRuleName(rule.id)}</label>
  779. </div>
  780. ))}
  781. </div>
  782. </div>
  783. </div>
  784. </OptionCard>}
  785. <Divider className='my-5' />
  786. <div className={'system-md-semibold mb-1'}>{t('datasetCreation.stepTwo.indexMode')}</div>
  787. <div className='flex items-center gap-2'>
  788. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && (
  789. <OptionCard className='flex-1'
  790. title={<div className='flex items-center'>
  791. {t('datasetCreation.stepTwo.qualified')}
  792. <Badge className={cn('ml-1 h-[18px]', (!hasSetIndexType && indexType === IndexingType.QUALIFIED) ? 'border-text-accent-secondary text-text-accent-secondary' : '')} uppercase>
  793. {t('datasetCreation.stepTwo.recommend')}
  794. </Badge>
  795. <span className='ml-auto'>
  796. {!hasSetIndexType && <span className={cn(s.radio)} />}
  797. </span>
  798. </div>}
  799. description={t('datasetCreation.stepTwo.qualifiedTip')}
  800. icon={<Image src={indexMethodIcon.high_quality} alt='' />}
  801. isActive={!hasSetIndexType && indexType === IndexingType.QUALIFIED}
  802. disabled={!isAPIKeySet || hasSetIndexType}
  803. onSwitched={() => {
  804. if (isAPIKeySet)
  805. setIndexType(IndexingType.QUALIFIED)
  806. }}
  807. />
  808. )}
  809. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && (
  810. <>
  811. <CustomDialog show={isQAConfirmDialogOpen} onClose={() => setIsQAConfirmDialogOpen(false)} className='w-[432px]'>
  812. <header className='pt-6 mb-4'>
  813. <h2 className='text-lg font-semibold'>
  814. {t('datasetCreation.stepTwo.qaSwitchHighQualityTipTitle')}
  815. </h2>
  816. <p className='font-normal text-sm mt-2'>
  817. {t('datasetCreation.stepTwo.qaSwitchHighQualityTipContent')}
  818. </p>
  819. </header>
  820. <div className='flex gap-2 pb-6'>
  821. <Button className='ml-auto' onClick={() => {
  822. setIsQAConfirmDialogOpen(false)
  823. }}>
  824. {t('datasetCreation.stepTwo.cancel')}
  825. </Button>
  826. <Button variant={'primary'} onClick={() => {
  827. setIsQAConfirmDialogOpen(false)
  828. setIndexType(IndexingType.QUALIFIED)
  829. setDocForm(ChunkingMode.qa)
  830. }}>
  831. {t('datasetCreation.stepTwo.switch')}
  832. </Button>
  833. </div>
  834. </CustomDialog>
  835. <PortalToFollowElem
  836. open={
  837. isHoveringEconomy && docForm !== ChunkingMode.text
  838. }
  839. placement={'top'}
  840. >
  841. <PortalToFollowElemTrigger asChild>
  842. <OptionCard className='flex-1'
  843. title={t('datasetCreation.stepTwo.economical')}
  844. description={t('datasetCreation.stepTwo.economicalTip')}
  845. icon={<Image src={indexMethodIcon.economical} alt='' />}
  846. isActive={!hasSetIndexType && indexType === IndexingType.ECONOMICAL}
  847. disabled={!isAPIKeySet || hasSetIndexType || docForm !== ChunkingMode.text}
  848. ref={economyDomRef}
  849. onSwitched={() => {
  850. if (isAPIKeySet && docForm === ChunkingMode.text)
  851. setIndexType(IndexingType.ECONOMICAL)
  852. }}
  853. />
  854. </PortalToFollowElemTrigger>
  855. <PortalToFollowElemContent>
  856. <div className='p-3 bg-components-tooltip-bg border-components-panel-border text-xs font-medium text-text-secondary rounded-lg shadow-lg'>
  857. {
  858. docForm === ChunkingMode.qa
  859. ? t('datasetCreation.stepTwo.notAvailableForQA')
  860. : t('datasetCreation.stepTwo.notAvailableForParentChild')
  861. }
  862. </div>
  863. </PortalToFollowElemContent>
  864. </PortalToFollowElem>
  865. </>)}
  866. </div>
  867. {!hasSetIndexType && indexType === IndexingType.QUALIFIED && (
  868. <div className='mt-2 h-10 p-2 flex items-center gap-x-0.5 rounded-xl border-[0.5px] border-components-panel-border overflow-hidden bg-components-panel-bg-blur backdrop-blur-[5px] shadow-xs'>
  869. <div className='absolute top-0 left-0 right-0 bottom-0 bg-[linear-gradient(92deg,rgba(247,144,9,0.25)_0%,rgba(255,255,255,0.00)_100%)] opacity-40'></div>
  870. <div className='p-1'>
  871. <AlertTriangle className='size-4 text-text-warning-secondary' />
  872. </div>
  873. <span className='system-xs-medium'>{t('datasetCreation.stepTwo.highQualityTip')}</span>
  874. </div>
  875. )}
  876. {hasSetIndexType && indexType === IndexingType.ECONOMICAL && (
  877. <div className='mt-2 system-xs-medium'>
  878. {t('datasetCreation.stepTwo.indexSettingTip')}
  879. <Link className='text-text-accent' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  880. </div>
  881. )}
  882. {/* Embedding model */}
  883. {indexType === IndexingType.QUALIFIED && (
  884. <div className='mt-5'>
  885. <div className={cn('system-md-semibold mb-1', datasetId && 'flex justify-between items-center')}>{t('datasetSettings.form.embeddingModel')}</div>
  886. <ModelSelector
  887. readonly={!!datasetId}
  888. triggerClassName={datasetId ? 'opacity-50' : ''}
  889. defaultModel={embeddingModel}
  890. modelList={embeddingModelList}
  891. onSelect={(model: DefaultModel) => {
  892. setEmbeddingModel(model)
  893. }}
  894. />
  895. {!!datasetId && (
  896. <div className='mt-2 system-xs-medium'>
  897. {t('datasetCreation.stepTwo.indexSettingTip')}
  898. <Link className='text-text-accent' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  899. </div>
  900. )}
  901. </div>
  902. )}
  903. <Divider className='my-5' />
  904. {/* Retrieval Method Config */}
  905. <div>
  906. {!datasetId
  907. ? (
  908. <div className={'mb-1'}>
  909. <div className='system-md-semibold mb-0.5'>{t('datasetSettings.form.retrievalSetting.title')}</div>
  910. <div className='body-xs-regular text-text-tertiary'>
  911. <a target='_blank' rel='noopener noreferrer' href='https://docs.dify.ai/guides/knowledge-base/create-knowledge-and-upload-documents#id-4-retrieval-settings' className='text-text-accent'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
  912. {t('datasetSettings.form.retrievalSetting.longDescription')}
  913. </div>
  914. </div>
  915. )
  916. : (
  917. <div className={cn('system-md-semibold mb-0.5', 'flex justify-between items-center')}>
  918. <div>{t('datasetSettings.form.retrievalSetting.title')}</div>
  919. </div>
  920. )}
  921. <div className=''>
  922. {
  923. getIndexing_technique() === IndexingType.QUALIFIED
  924. ? (
  925. <RetrievalMethodConfig
  926. disabled={!!datasetId}
  927. value={retrievalConfig}
  928. onChange={setRetrievalConfig}
  929. />
  930. )
  931. : (
  932. <EconomicalRetrievalMethodConfig
  933. disabled={!!datasetId}
  934. value={retrievalConfig}
  935. onChange={setRetrievalConfig}
  936. />
  937. )
  938. }
  939. </div>
  940. </div>
  941. {!isSetting
  942. ? (
  943. <div className='flex items-center mt-8 py-2'>
  944. <Button onClick={() => onStepChange && onStepChange(-1)}>
  945. <RiArrowLeftLine className='w-4 h-4 mr-1' />
  946. {t('datasetCreation.stepTwo.previousStep')}
  947. </Button>
  948. <Button className='ml-auto' loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.nextStep')}</Button>
  949. </div>
  950. )
  951. : (
  952. <div className='flex items-center mt-8 py-2'>
  953. {!datasetId && <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>}
  954. <Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>
  955. </div>
  956. )}
  957. </div>
  958. <FloatRightContainer isMobile={isMobile} isOpen={true} onClose={() => { }} footer={null}>
  959. <PreviewContainer
  960. header={<PreviewHeader
  961. title={t('datasetCreation.stepTwo.preview')}
  962. >
  963. <div className='flex items-center gap-1'>
  964. {dataSourceType === DataSourceType.FILE
  965. && <PreviewDocumentPicker
  966. files={files as Array<Required<CustomFile>>}
  967. onChange={(selected) => {
  968. currentEstimateMutation.reset()
  969. setPreviewFile(selected)
  970. currentEstimateMutation.mutate()
  971. }}
  972. // when it is from setting, it just has one file
  973. value={isSetting ? (files[0]! as Required<CustomFile>) : previewFile}
  974. />
  975. }
  976. {dataSourceType === DataSourceType.NOTION
  977. && <PreviewDocumentPicker
  978. files={
  979. notionPages.map(page => ({
  980. id: page.page_id,
  981. name: page.page_name,
  982. extension: 'md',
  983. }))
  984. }
  985. onChange={(selected) => {
  986. currentEstimateMutation.reset()
  987. const selectedPage = notionPages.find(page => page.page_id === selected.id)
  988. setPreviewNotionPage(selectedPage!)
  989. currentEstimateMutation.mutate()
  990. }}
  991. value={{
  992. id: previewNotionPage?.page_id || '',
  993. name: previewNotionPage?.page_name || '',
  994. extension: 'md',
  995. }}
  996. />
  997. }
  998. {dataSourceType === DataSourceType.WEB
  999. && <PreviewDocumentPicker
  1000. files={
  1001. websitePages.map(page => ({
  1002. id: page.source_url,
  1003. name: page.title,
  1004. extension: 'md',
  1005. }))
  1006. }
  1007. onChange={(selected) => {
  1008. currentEstimateMutation.reset()
  1009. const selectedPage = websitePages.find(page => page.source_url === selected.id)
  1010. setPreviewWebsitePage(selectedPage!)
  1011. currentEstimateMutation.mutate()
  1012. }}
  1013. value={
  1014. {
  1015. id: previewWebsitePage?.source_url || '',
  1016. name: previewWebsitePage?.title || '',
  1017. extension: 'md',
  1018. }
  1019. }
  1020. />
  1021. }
  1022. {
  1023. currentDocForm !== ChunkingMode.qa
  1024. && <Badge text={t(
  1025. 'datasetCreation.stepTwo.previewChunkCount', {
  1026. count: estimate?.total_segments || 0,
  1027. }) as string}
  1028. />
  1029. }
  1030. </div>
  1031. </PreviewHeader>}
  1032. className={cn('flex shrink-0 w-1/2 p-4 pr-0 relative h-full', isMobile && 'w-full max-w-[524px]')}
  1033. mainClassName='space-y-6'
  1034. >
  1035. {currentDocForm === ChunkingMode.qa && estimate?.qa_preview && (
  1036. estimate?.qa_preview.map((item, index) => (
  1037. <ChunkContainer
  1038. key={item.question}
  1039. label={`Chunk-${index + 1}`}
  1040. characterCount={item.question.length + item.answer.length}
  1041. >
  1042. <QAPreview qa={item} />
  1043. </ChunkContainer>
  1044. ))
  1045. )}
  1046. {currentDocForm === ChunkingMode.text && estimate?.preview && (
  1047. estimate?.preview.map((item, index) => (
  1048. <ChunkContainer
  1049. key={item.content}
  1050. label={`Chunk-${index + 1}`}
  1051. characterCount={item.content.length}
  1052. >
  1053. {item.content}
  1054. </ChunkContainer>
  1055. ))
  1056. )}
  1057. {currentDocForm === ChunkingMode.parentChild && currentEstimateMutation.data?.preview && (
  1058. estimate?.preview?.map((item, index) => {
  1059. const indexForLabel = index + 1
  1060. const childChunks = parentChildConfig.chunkForContext === 'full-doc'
  1061. ? item.child_chunks.slice(0, FULL_DOC_PREVIEW_LENGTH)
  1062. : item.child_chunks
  1063. return (
  1064. <ChunkContainer
  1065. key={item.content}
  1066. label={`Chunk-${indexForLabel}`}
  1067. characterCount={item.content.length}
  1068. >
  1069. <FormattedText>
  1070. {childChunks.map((child, index) => {
  1071. const indexForLabel = index + 1
  1072. return (
  1073. <PreviewSlice
  1074. key={child}
  1075. label={`C-${indexForLabel}`}
  1076. text={child}
  1077. tooltip={`Child-chunk-${indexForLabel} · ${child.length} Characters`}
  1078. labelInnerClassName='text-[10px] font-semibold align-bottom leading-7'
  1079. dividerClassName='leading-7'
  1080. />
  1081. )
  1082. })}
  1083. </FormattedText>
  1084. </ChunkContainer>
  1085. )
  1086. })
  1087. )}
  1088. {currentEstimateMutation.isIdle && (
  1089. <div className='h-full w-full flex items-center justify-center'>
  1090. <div className='flex flex-col items-center justify-center gap-3'>
  1091. <RiSearchEyeLine className='size-10 text-text-empty-state-icon' />
  1092. <p className='text-sm text-text-tertiary'>
  1093. {t('datasetCreation.stepTwo.previewChunkTip')}
  1094. </p>
  1095. </div>
  1096. </div>
  1097. )}
  1098. {currentEstimateMutation.isPending && (
  1099. <div className='space-y-6'>
  1100. {Array.from({ length: 10 }, (_, i) => (
  1101. <SkeletonContainer key={i}>
  1102. <SkeletonRow>
  1103. <SkeletonRectangle className="w-20" />
  1104. <SkeletonPoint />
  1105. <SkeletonRectangle className="w-24" />
  1106. </SkeletonRow>
  1107. <SkeletonRectangle className="w-full" />
  1108. <SkeletonRectangle className="w-full" />
  1109. <SkeletonRectangle className="w-[422px]" />
  1110. </SkeletonContainer>
  1111. ))}
  1112. </div>
  1113. )}
  1114. </PreviewContainer>
  1115. </FloatRightContainer>
  1116. </div>
  1117. )
  1118. }
  1119. export default StepTwo