index.tsx 41 KB


  1. 'use client'
  2. import React, { useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react'
  3. import { useTranslation } from 'react-i18next'
  4. import { useContext } from 'use-context-selector'
  5. import { useBoolean } from 'ahooks'
  6. import { XMarkIcon } from '@heroicons/react/20/solid'
  7. import { RocketLaunchIcon } from '@heroicons/react/24/outline'
  8. import {
  9. RiCloseLine,
  10. } from '@remixicon/react'
  11. import Link from 'next/link'
  12. import { groupBy } from 'lodash-es'
  13. import PreviewItem, { PreviewType } from './preview-item'
  14. import LanguageSelect from './language-select'
  15. import s from './index.module.css'
  16. import unescape from './unescape'
  17. import escape from './escape'
  18. import cn from '@/utils/classnames'
  19. import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
  20. import {
  21. createDocument,
  22. createFirstDocument,
  23. fetchFileIndexingEstimate as didFetchFileIndexingEstimate,
  24. fetchDefaultProcessRule,
  25. } from '@/service/datasets'
  26. import Button from '@/app/components/base/button'
  27. import Input from '@/app/components/base/input'
  28. import Loading from '@/app/components/base/loading'
  29. import FloatRightContainer from '@/app/components/base/float-right-container'
  30. import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
  31. import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
  32. import { type RetrievalConfig } from '@/types/app'
  33. import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
  34. import Toast from '@/app/components/base/toast'
  35. import { formatNumber } from '@/utils/format'
  36. import type { NotionPage } from '@/models/common'
  37. import { DataSourceProvider } from '@/models/common'
  38. import { DataSourceType, DocForm } from '@/models/datasets'
  39. import NotionIcon from '@/app/components/base/notion-icon'
  40. import Switch from '@/app/components/base/switch'
  41. import { MessageChatSquare } from '@/app/components/base/icons/src/public/common'
  42. import { useDatasetDetailContext } from '@/context/dataset-detail'
  43. import I18n from '@/context/i18n'
  44. import { IS_CE_EDITION } from '@/config'
  45. import { RETRIEVE_METHOD } from '@/types/app'
  46. import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
  47. import Tooltip from '@/app/components/base/tooltip'
  48. import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
  49. import { LanguagesSupported } from '@/i18n/language'
  50. import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
  51. import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations'
  52. import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
  53. import { Globe01 } from '@/app/components/base/icons/src/vender/line/mapsAndTravel'
  // Indexes T by all of its keys, yielding the union of T's property value types.
  54. type ValueOf<T> = T[keyof T]
  // Props accepted by the StepTwo component (segmentation / indexing configuration step).
  55. type StepTwoProps = {
  // When true, the component edits an existing document: rules are read from
  // `documentDetail` on mount and the request carries `original_document_id`.
  56. isSetting?: boolean
  57. documentDetail?: FullDocumentDetail
  // When false, the high-quality index option is disabled and a warning link
  // that triggers `onSetting` is shown.
  58. isAPIKeySet: boolean
  59. onSetting: () => void
  // When absent, saving goes through `createFirstDocument`; otherwise `createDocument`.
  60. datasetId?: string
  // When provided, it takes precedence over the locally selected index type.
  61. indexingType?: ValueOf<IndexingType>
  62. dataSourceType: DataSourceType
  63. files: CustomFile[]
  64. notionPages?: NotionPage[]
  65. websitePages?: CrawlResultItem[]
  66. crawlOptions?: CrawlOptions
  67. websiteCrawlProvider?: DataSourceProvider
  68. websiteCrawlJobId?: string
  // Called with +1 after a document has been created successfully.
  69. onStepChange?: (delta: number) => void
  // Both cache callbacks are invoked after a successful create request.
  70. updateIndexingTypeCache?: (type: string) => void
  71. updateResultCache?: (res: createDocumentResponse) => void
  // Called after a successful save when `isSetting` is true.
  72. onSave?: () => void
  73. onCancel?: () => void
  74. }
  // Segmentation modes; the selected value is sent as `mode` of the process rule.
  75. enum SegmentType {
  76. AUTO = 'automatic',
  77. CUSTOM = 'custom',
  78. }
  // Index modes; the selected value is sent as `indexing_technique` in requests.
  79. enum IndexingType {
  80. QUALIFIED = 'high_quality',
  81. ECONOMICAL = 'economy',
  82. }
  // Default separator in its escaped (display) form: the four characters `\n\n`.
  // It is passed through `unescape` before being sent in the process rule.
  83. const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
  84. const StepTwo = ({
  85. isSetting,
  86. documentDetail,
  87. isAPIKeySet,
  88. onSetting,
  89. datasetId,
  90. indexingType,
  91. dataSourceType: inCreatePageDataSourceType,
  92. files,
  93. notionPages = [],
  94. websitePages = [],
  95. crawlOptions,
  96. websiteCrawlProvider = DataSourceProvider.fireCrawl,
  97. websiteCrawlJobId = '',
  98. onStepChange,
  99. updateIndexingTypeCache,
  100. updateResultCache,
  101. onSave,
  102. onCancel,
  103. }: StepTwoProps) => {
  104. const { t } = useTranslation()
  105. const { locale } = useContext(I18n)
  106. const media = useBreakpoints()
  107. const isMobile = media === MediaType.mobile
  108. const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
  109. const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
  110. const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
  111. const scrollRef = useRef<HTMLDivElement>(null)
  112. const [scrolled, setScrolled] = useState(false)
  113. const previewScrollRef = useRef<HTMLDivElement>(null)
  114. const [previewScrolled, setPreviewScrolled] = useState(false)
  115. const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
  116. const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
  // Stores the separator after passing it through `escape`; an empty value
  // falls back to the default separator.
  const setSegmentIdentifier = useCallback((value: string) => {
    if (!value) {
      doSetSegmentIdentifier(DEFAULT_SEGMENT_IDENTIFIER)
      return
    }
    doSetSegmentIdentifier(escape(value))
  }, [])
  120. const [max, setMax] = useState(4000) // default chunk length
  121. const [overlap, setOverlap] = useState(50)
  122. const [rules, setRules] = useState<PreProcessingRule[]>([])
  123. const [defaultConfig, setDefaultConfig] = useState<Rules>()
  124. const hasSetIndexType = !!indexingType
  125. const [indexType, setIndexType] = useState<ValueOf<IndexingType>>(
  126. (indexingType
  127. || isAPIKeySet)
  128. ? IndexingType.QUALIFIED
  129. : IndexingType.ECONOMICAL,
  130. )
  131. const [isLanguageSelectDisabled, setIsLanguageSelectDisabled] = useState(false)
  132. const [docForm, setDocForm] = useState<DocForm | string>(
  133. (datasetId && documentDetail) ? documentDetail.doc_form : DocForm.TEXT,
  134. )
  135. const [docLanguage, setDocLanguage] = useState<string>(
  136. (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese'),
  137. )
  138. const [QATipHide, setQATipHide] = useState(false)
  139. const [previewSwitched, setPreviewSwitched] = useState(false)
  140. const [showPreview, { setTrue: setShowPreview, setFalse: hidePreview }] = useBoolean()
  141. const [customFileIndexingEstimate, setCustomFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
  142. const [automaticFileIndexingEstimate, setAutomaticFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
  // Estimate for the currently selected segmentation mode (one cached result per mode).
  const fileIndexingEstimate = segmentationType === SegmentType.AUTO
    ? automaticFileIndexingEstimate
    : customFileIndexingEstimate
  146. const [isCreating, setIsCreating] = useState(false)
  // Scroll listener for the form container: records whether it is scrolled away from the top.
  const scrollHandle = (e: Event) => {
    const container = e.target as HTMLDivElement
    setScrolled(container.scrollTop > 0)
  }
  // Scroll listener for the preview container: records whether it is scrolled away from the top.
  const previewScrollHandle = (e: Event) => {
    const container = e.target as HTMLDivElement
    setPreviewScrolled(container.scrollTop > 0)
  }
  /**
   * Returns `name` without its last extension.
   *
   * Names that have no extension (`README`) and dotfiles (`.gitignore`) are
   * returned unchanged; previously they produced an empty string because the
   * whole name was treated as the extension.
   *
   * @param name File name, possibly containing several dots.
   * @returns The name up to (not including) the last dot, or `name` itself
   *          when there is no dot after the first character.
   */
  const getFileName = (name: string) => {
    const lastDot = name.lastIndexOf('.')
    // lastDot === -1: no extension; lastDot === 0: dotfile — keep the full name.
    if (lastDot <= 0)
      return name
    return name.slice(0, lastDot)
  }
  // Maps a pre-processing rule id to its translated label; unknown ids yield undefined.
  const getRuleName = (key: string) => {
    switch (key) {
      case 'remove_extra_spaces':
        return t('datasetCreation.stepTwo.removeExtraSpaces')
      case 'remove_urls_emails':
        return t('datasetCreation.stepTwo.removeUrlEmails')
      case 'remove_stopwords':
        return t('datasetCreation.stepTwo.removeStopwords')
      default:
        return undefined
    }
  }
  // Toggles the `enabled` flag of the rule with the given id; other rules are kept as-is.
  const ruleChangeHandle = (id: string) => {
    setRules(rules.map(rule => (
      rule.id === id
        ? { id: rule.id, enabled: !rule.enabled }
        : rule
    )))
  }
  // Restores separator, chunk length, overlap and pre-processing rules from
  // the stored default config; does nothing while no default config is loaded.
  const resetRules = () => {
    if (!defaultConfig)
      return
    const { segmentation, pre_processing_rules } = defaultConfig
    setSegmentIdentifier(segmentation.separator)
    setMax(segmentation.max_tokens)
    setOverlap(segmentation.chunk_overlap)
    setRules(pre_processing_rules)
  }
  // Requests an indexing estimate and stores it in the slot that matches the
  // current segmentation mode (custom vs. automatic).
  const fetchFileIndexingEstimate = async (docForm = DocForm.TEXT, language?: string) => {
    // eslint-disable-next-line @typescript-eslint/no-use-before-define
    const estimateParams = getFileIndexingEstimateParams(docForm, language)!
    const estimate = await didFetchFileIndexingEstimate(estimateParams)
    const storeEstimate = segmentationType === SegmentType.CUSTOM
      ? setCustomFileIndexingEstimate
      : setAutomaticFileIndexingEstimate
    storeEstimate(estimate)
  }
  // Handler of the "preview" button in the custom segmentation form: clears the
  // cached custom estimate, opens the preview and requests a fresh estimate.
  199. const confirmChangeCustomConfig = () => {
  200. setCustomFileIndexingEstimate(null)
  201. setShowPreview()
  // NOTE(review): the returned promise is neither awaited nor caught here.
  202. fetchFileIndexingEstimate()
  203. setPreviewSwitched(false)
  204. }
  // Indexing technique in effect: the one passed via props wins over the locally selected one.
  const getIndexing_technique = () => {
    return indexingType || indexType
  }
  // Builds the process rule for requests. Only custom segmentation carries
  // concrete rules; otherwise `rules` stays an empty placeholder object.
  const getProcessRule = () => {
    const processRule: ProcessRule = {
      rules: {} as any, // api will check this. It will be removed after api refactored.
      mode: segmentationType,
    }
    if (segmentationType !== SegmentType.CUSTOM)
      return processRule

    processRule.rules = {
      pre_processing_rules: rules,
      segmentation: {
        // The UI keeps the separator escaped; convert it back before sending.
        separator: unescape(segmentIdentifier),
        max_tokens: max,
        chunk_overlap: overlap,
      },
    }
    return processRule
  }
  // Groups the selected Notion pages by workspace and keeps only the page
  // fields that are sent in requests.
  const getNotionInfo = () => {
    const pagesByWorkspace = groupBy(notionPages, 'workspace_id')
    return Object.entries(pagesByWorkspace).map(([workspace_id, pages]) => ({
      workspace_id,
      pages: pages.map(({ page_id, page_name, page_icon, type }) => ({
        page_id,
        page_name,
        page_icon,
        type,
      })),
    })) as NotionInfo[]
  }
  // Describes the crawled website source: provider, crawl job id, page URLs
  // and the `only_main_content` crawl option.
  const getWebsiteInfo = () => {
    const urls = websitePages.map(({ source_url }) => source_url)
    return {
      provider: websiteCrawlProvider,
      job_id: websiteCrawlJobId,
      urls,
      only_main_content: crawlOptions?.only_main_content,
    }
  }
  // Builds the indexing-estimate request for the current data source type.
  // Returns undefined when the data source type is not file, Notion or web.
  const getFileIndexingEstimateParams = (docForm: DocForm, language?: string): IndexingEstimateParams | undefined => {
    // Fields shared by every data source; only `info_list` differs.
    const withInfoList = (info_list: IndexingEstimateParams['info_list']): IndexingEstimateParams => ({
      info_list,
      indexing_technique: getIndexing_technique() as string,
      process_rule: getProcessRule(),
      doc_form: docForm,
      doc_language: language || docLanguage,
      dataset_id: datasetId as string,
    })

    switch (dataSourceType) {
      case DataSourceType.FILE:
        return withInfoList({
          data_source_type: dataSourceType,
          file_info_list: {
            file_ids: files.map(file => file.id) as string[],
          },
        })
      case DataSourceType.NOTION:
        return withInfoList({
          data_source_type: dataSourceType,
          notion_info_list: getNotionInfo(),
        })
      case DataSourceType.WEB:
        return withInfoList({
          data_source_type: dataSourceType,
          website_info_list: getWebsiteInfo(),
        })
      default:
        return undefined
    }
  }
  298. const {
  299. modelList: rerankModelList,
  300. defaultModel: rerankDefaultModel,
  301. currentModel: isRerankDefaultModelValid,
  302. } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank)
  303. const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding)
  304. const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding)
  305. const [embeddingModel, setEmbeddingModel] = useState<DefaultModel>(
  306. currentDataset?.embedding_model
  307. ? {
  308. provider: currentDataset.embedding_model_provider,
  309. model: currentDataset.embedding_model,
  310. }
  311. : {
  312. provider: defaultEmbeddingModel?.provider.provider || '',
  313. model: defaultEmbeddingModel?.model || '',
  314. },
  315. )
  // Builds the request body for creating a document (or for re-processing an
  // existing one when `isSetting` is true).
  // Returns undefined after showing an error toast when validation fails:
  // custom overlap larger than max chunk length, or (create mode only) the
  // rerank model check does not pass.
  316. const getCreationParams = () => {
  317. let params
  318. if (segmentationType === SegmentType.CUSTOM && overlap > max) {
  319. Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
  320. return
  321. }
  // Setting mode: update an existing document identified by `original_document_id`.
  322. if (isSetting) {
  323. params = {
  324. original_document_id: documentDetail?.id,
  325. doc_form: docForm,
  326. doc_language: docLanguage,
  327. process_rule: getProcessRule(),
  328. // eslint-disable-next-line @typescript-eslint/no-use-before-define
  329. retrieval_model: retrievalConfig, // Read-only here; change it on the settings page.
  330. embedding_model: embeddingModel.model, // Readonly
  331. embedding_model_provider: embeddingModel.provider, // Readonly
  332. } as CreateDocumentReq
  333. }
  334. else { // create
  335. const indexMethod = getIndexing_technique()
  336. if (
  337. !isReRankModelSelected({
  338. rerankDefaultModel,
  339. isRerankDefaultModelValid: !!isRerankDefaultModelValid,
  340. rerankModelList,
  341. // eslint-disable-next-line @typescript-eslint/no-use-before-define
  342. retrievalConfig,
  343. indexMethod: indexMethod as string,
  344. })
  345. ) {
  346. Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })
  347. return
  348. }
  // The retrieval config actually sent is the result of ensureRerankModelSelected.
  349. const postRetrievalConfig = ensureRerankModelSelected({
  350. rerankDefaultModel: rerankDefaultModel!,
  351. // eslint-disable-next-line @typescript-eslint/no-use-before-define
  352. retrievalConfig,
  353. indexMethod: indexMethod as string,
  354. })
  355. params = {
  356. data_source: {
  357. type: dataSourceType,
  358. info_list: {
  359. data_source_type: dataSourceType,
  360. },
  361. },
  362. indexing_technique: getIndexing_technique(),
  363. process_rule: getProcessRule(),
  364. doc_form: docForm,
  365. doc_language: docLanguage,
  366. retrieval_model: postRetrievalConfig,
  367. embedding_model: embeddingModel.model,
  368. embedding_model_provider: embeddingModel.provider,
  369. } as CreateDocumentReq
  // Attach the source-specific info list; files without an id are dropped.
  370. if (dataSourceType === DataSourceType.FILE) {
  371. params.data_source.info_list.file_info_list = {
  372. file_ids: files.map(file => file.id || '').filter(Boolean),
  373. }
  374. }
  375. if (dataSourceType === DataSourceType.NOTION)
  376. params.data_source.info_list.notion_info_list = getNotionInfo()
  377. if (dataSourceType === DataSourceType.WEB)
  378. params.data_source.info_list.website_info_list = getWebsiteInfo()
  379. }
  380. return params
  381. }
  // Fetches the default process rule and seeds the form state with it.
  // Failures are only logged to the console.
  const getRules = async () => {
    try {
      const { rules: defaultRules } = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
      const { separator, max_tokens, chunk_overlap } = defaultRules.segmentation
      setSegmentIdentifier(separator)
      setMax(max_tokens)
      setOverlap(chunk_overlap)
      setRules(defaultRules.pre_processing_rules)
      setDefaultConfig(defaultRules)
    }
    catch (err) {
      console.log(err)
    }
  }
  // Seeds the form state from the process rule stored on the document being
  // edited; does nothing when no document detail is available.
  const getRulesFromDetail = () => {
    if (!documentDetail)
      return
    const detailRules = documentDetail.dataset_process_rule.rules
    const { separator, max_tokens, chunk_overlap } = detailRules.segmentation
    setSegmentIdentifier(separator)
    setMax(max_tokens)
    setOverlap(chunk_overlap)
    setRules(detailRules.pre_processing_rules)
    setDefaultConfig(detailRules)
  }
  // Adopts the segmentation mode stored on the document being edited, if any.
  const getDefaultMode = () => {
    if (!documentDetail)
      return
    setSegmentationType(documentDetail.dataset_process_rule.mode)
  }
  /**
   * Submits the document: creates the first document of a new dataset when
   * there is no `datasetId`, otherwise adds/updates a document in the
   * existing dataset.
   *
   * Ignores the call while a previous submission is still running. On
   * success it updates the caches, refreshes the dataset detail, advances the
   * step and (in setting mode) calls `onSave`. Any error is shown as a toast.
   *
   * @returns `false` when the params could not be built (validation failed),
   *          otherwise `undefined`.
   */
  const createHandle = async () => {
    if (isCreating)
      return
    // Set once here; the `finally` below always resets it, including on the
    // early validation return.
    setIsCreating(true)
    try {
      const params = getCreationParams()
      if (!params)
        return false

      const res = datasetId
        ? await createDocument({ datasetId, body: params as CreateDocumentReq })
        : await createFirstDocument({ body: params as CreateDocumentReq })

      // Bookkeeping shared by both creation paths.
      updateIndexingTypeCache?.(indexType as string)
      updateResultCache?.(res)
      if (mutateDatasetRes)
        mutateDatasetRes()
      onStepChange?.(+1)
      if (isSetting)
        onSave?.()
    }
    catch (err) {
      Toast.notify({
        type: 'error',
        message: `${err}`,
      })
    }
    finally {
      setIsCreating(false)
    }
  }
  // Q&A switch handler: on selects the QA document form, off the plain text form.
  const handleSwitch = (state: boolean) => {
    setDocForm(state ? DocForm.QA : DocForm.TEXT)
  }
  // Switches the preview to the QA form: clears the cached estimate of the
  // current segmentation mode and fetches a QA estimate. The language select
  // is disabled while the request is running.
  const previewSwitch = async (language?: string) => {
    setPreviewSwitched(true)
    setIsLanguageSelectDisabled(true)
    const clearEstimate = segmentationType === SegmentType.AUTO
      ? setAutomaticFileIndexingEstimate
      : setCustomFileIndexingEstimate
    clearEstimate(null)
    try {
      await fetchFileIndexingEstimate(DocForm.QA, language)
    }
    finally {
      setIsLanguageSelectDisabled(false)
    }
  }
  // Language select handler: stores the language and, when the QA preview is
  // already showing, re-runs the preview with the new language.
  const handleSelect = (language: string) => {
    setDocLanguage(language)
    const isQAPreviewActive = docForm === DocForm.QA && previewSwitched
    if (isQAPreviewActive)
      previewSwitch(language)
  }
  // Selects the economical index type and resets the document form to plain
  // text. Does nothing when the index type is fixed by props.
  const changeToEconomicalType = () => {
    if (hasSetIndexType)
      return
    setIndexType(IndexingType.ECONOMICAL)
    setDocForm(DocForm.TEXT)
  }
  // On mount: in setting mode read rules and mode from the document detail,
  // otherwise fetch the default rules.
  useEffect(() => {
    if (isSetting) {
      getRulesFromDetail()
      getDefaultMode()
      return
    }
    getRules()
  }, [])
  // Subscribes to scroll events of the form container for the lifetime of the
  // component. The node is captured once so the cleanup removes the listener
  // from the same element it was added to; reading `scrollRef.current` during
  // cleanup is unreliable because the ref may have changed by then.
  useEffect(() => {
    const scrollNode = scrollRef.current
    scrollNode?.addEventListener('scroll', scrollHandle)
    return () => {
      scrollNode?.removeEventListener('scroll', scrollHandle)
    }
  }, [])
  // While the preview is shown, subscribes to scroll events of the preview
  // container. The node is captured once so the cleanup detaches the listener
  // from the element it was attached to; `previewScrollRef.current` may
  // already be null or point elsewhere when the cleanup runs.
  useLayoutEffect(() => {
    if (!showPreview)
      return
    const previewNode = previewScrollRef.current
    previewNode?.addEventListener('scroll', previewScrollHandle)
    return () => {
      previewNode?.removeEventListener('scroll', previewScrollHandle)
    }
  }, [showPreview])
  // The QA form is not kept when the index type from props is economical:
  // fall back to plain text.
  useEffect(() => {
    const isEconomicalQA = indexingType === IndexingType.ECONOMICAL && docForm === DocForm.QA
    if (isEconomicalQA)
      setDocForm(DocForm.TEXT)
  }, [indexingType, docForm])
  // Keeps the local index type in sync with props: the explicit indexing type
  // wins, otherwise the choice depends on whether an API key is set.
  useEffect(() => {
    if (indexingType) {
      setIndexType(indexingType as IndexingType)
      return
    }
    const fallbackType = isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL
    setIndexType(fallbackType)
  }, [isAPIKeySet, indexingType, datasetId])
  // Reacts to segmentation mode / index type changes. Automatic mode clears
  // the cached estimate, opens the preview (except on mobile) and refetches;
  // custom mode hides the preview and clears its cached estimate.
  useEffect(() => {
    if (segmentationType !== SegmentType.AUTO) {
      hidePreview()
      setCustomFileIndexingEstimate(null)
      setPreviewSwitched(false)
      return
    }
    setAutomaticFileIndexingEstimate(null)
    if (!isMobile)
      setShowPreview()
    fetchFileIndexingEstimate()
    setPreviewSwitched(false)
  }, [segmentationType, indexType])
  533. const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
  534. search_method: RETRIEVE_METHOD.semantic,
  535. reranking_enable: false,
  536. reranking_model: {
  537. reranking_provider_name: rerankDefaultModel?.provider.provider,
  538. reranking_model_name: rerankDefaultModel?.model,
  539. },
  540. top_k: 3,
  541. score_threshold_enabled: false,
  542. score_threshold: 0.5,
  543. } as RetrievalConfig)
  544. return (
  545. <div className='flex w-full h-full'>
  546. <div ref={scrollRef} className='relative h-full w-full overflow-y-scroll'>
  547. <div className={cn(s.pageHeader, scrolled && s.fixed, isMobile && '!px-6')}>
  548. <span>{t('datasetCreation.steps.two')}</span>
  549. {(isMobile || !showPreview) && (
  550. <Button
  551. className='border-[0.5px] !h-8 hover:outline hover:outline-[0.5px] hover:outline-gray-300 text-gray-700 font-medium bg-white shadow-[0px_1px_2px_0px_rgba(16,24,40,0.05)]'
  552. onClick={setShowPreview}
  553. >
  554. <Tooltip>
  555. <div className="flex flex-row items-center">
  556. <RocketLaunchIcon className="h-4 w-4 mr-1.5 stroke-[1.8px]" />
  557. <span className="text-[13px]">{t('datasetCreation.stepTwo.previewTitleButton')}</span>
  558. </div>
  559. </Tooltip>
  560. </Button>
  561. )}
  562. </div>
  563. <div className={cn(s.form, isMobile && '!px-4')}>
  564. <div className={s.label}>{t('datasetCreation.stepTwo.segmentation')}</div>
  565. <div className='max-w-[640px]'>
  566. <div
  567. className={cn(
  568. s.radioItem,
  569. s.segmentationItem,
  570. segmentationType === SegmentType.AUTO && s.active,
  571. )}
  572. onClick={() => setSegmentationType(SegmentType.AUTO)}
  573. >
  574. <span className={cn(s.typeIcon, s.auto)} />
  575. <span className={cn(s.radio)} />
  576. <div className={s.typeHeader}>
  577. <div className={s.title}>{t('datasetCreation.stepTwo.auto')}</div>
  578. <div className={s.tip}>{t('datasetCreation.stepTwo.autoDescription')}</div>
  579. </div>
  580. </div>
  581. <div
  582. className={cn(
  583. s.radioItem,
  584. s.segmentationItem,
  585. segmentationType === SegmentType.CUSTOM && s.active,
  586. segmentationType === SegmentType.CUSTOM && s.custom,
  587. )}
  588. onClick={() => setSegmentationType(SegmentType.CUSTOM)}
  589. >
  590. <span className={cn(s.typeIcon, s.customize)} />
  591. <span className={cn(s.radio)} />
  592. <div className={s.typeHeader}>
  593. <div className={s.title}>{t('datasetCreation.stepTwo.custom')}</div>
  594. <div className={s.tip}>{t('datasetCreation.stepTwo.customDescription')}</div>
  595. </div>
  596. {segmentationType === SegmentType.CUSTOM && (
  597. <div className={s.typeFormBody}>
  598. <div className={s.formRow}>
  599. <div className='w-full'>
  600. <div className={s.label}>
  601. {t('datasetCreation.stepTwo.separator')}
  602. <Tooltip
  603. popupContent={
  604. <div className='max-w-[200px]'>
  605. {t('datasetCreation.stepTwo.separatorTip')}
  606. </div>
  607. }
  608. />
  609. </div>
  610. <Input
  611. type="text"
  612. className='h-9'
  613. placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''} value={segmentIdentifier}
  614. onChange={e => setSegmentIdentifier(e.target.value)}
  615. />
  616. </div>
  617. </div>
  618. <div className={s.formRow}>
  619. <div className='w-full'>
  620. <div className={s.label}>{t('datasetCreation.stepTwo.maxLength')}</div>
  621. <Input
  622. type="number"
  623. className='h-9'
  624. placeholder={t('datasetCreation.stepTwo.maxLength') || ''}
  625. value={max}
  626. min={1}
  627. onChange={e => setMax(parseInt(e.target.value.replace(/^0+/, ''), 10))}
  628. />
  629. </div>
  630. </div>
  631. <div className={s.formRow}>
  632. <div className='w-full'>
  633. <div className={s.label}>
  634. {t('datasetCreation.stepTwo.overlap')}
  635. <Tooltip
  636. popupContent={
  637. <div className='max-w-[200px]'>
  638. {t('datasetCreation.stepTwo.overlapTip')}
  639. </div>
  640. }
  641. />
  642. </div>
  643. <Input
  644. type="number"
  645. className='h-9'
  646. placeholder={t('datasetCreation.stepTwo.overlap') || ''}
  647. value={overlap}
  648. min={1}
  649. onChange={e => setOverlap(parseInt(e.target.value.replace(/^0+/, ''), 10))}
  650. />
  651. </div>
  652. </div>
  653. <div className={s.formRow}>
  654. <div className='w-full flex flex-col gap-1'>
  655. <div className={s.label}>{t('datasetCreation.stepTwo.rules')}</div>
  656. {rules.map(rule => (
  657. <div key={rule.id} className={s.ruleItem}>
  658. <input id={rule.id} type="checkbox" checked={rule.enabled} onChange={() => ruleChangeHandle(rule.id)} className="w-4 h-4 rounded border-gray-300 text-blue-700 focus:ring-blue-700" />
  659. <label htmlFor={rule.id} className="ml-2 text-sm font-normal cursor-pointer text-gray-800">{getRuleName(rule.id)}</label>
  660. </div>
  661. ))}
  662. </div>
  663. </div>
  664. <div className={s.formFooter}>
  665. <Button variant="primary" className={cn(s.button)} onClick={confirmChangeCustomConfig}>{t('datasetCreation.stepTwo.preview')}</Button>
  666. <Button className={cn(s.button, 'ml-2')} onClick={resetRules}>{t('datasetCreation.stepTwo.reset')}</Button>
  667. </div>
  668. </div>
  669. )}
  670. </div>
  671. </div>
  672. <div className={s.label}>{t('datasetCreation.stepTwo.indexMode')}</div>
  673. <div className='max-w-[640px]'>
  674. <div className='flex items-center gap-3 flex-wrap sm:flex-nowrap'>
  675. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && (
  676. <div
  677. className={cn(
  678. s.radioItem,
  679. s.indexItem,
  680. !isAPIKeySet && s.disabled,
  681. !hasSetIndexType && indexType === IndexingType.QUALIFIED && s.active,
  682. hasSetIndexType && s.disabled,
  683. hasSetIndexType && '!w-full !min-h-[96px]',
  684. )}
  685. onClick={() => {
  686. if (isAPIKeySet)
  687. setIndexType(IndexingType.QUALIFIED)
  688. }}
  689. >
  690. <span className={cn(s.typeIcon, s.qualified)} />
  691. {!hasSetIndexType && <span className={cn(s.radio)} />}
  692. <div className={s.typeHeader}>
  693. <div className={s.title}>
  694. {t('datasetCreation.stepTwo.qualified')}
  695. {!hasSetIndexType && <span className={s.recommendTag}>{t('datasetCreation.stepTwo.recommend')}</span>}
  696. </div>
  697. <div className={s.tip}>{t('datasetCreation.stepTwo.qualifiedTip')}</div>
  698. </div>
  699. {!isAPIKeySet && (
  700. <div className={s.warningTip}>
  701. <span>{t('datasetCreation.stepTwo.warning')}&nbsp;</span>
  702. <span className={s.click} onClick={onSetting}>{t('datasetCreation.stepTwo.click')}</span>
  703. </div>
  704. )}
  705. </div>
  706. )}
  707. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && (
  708. <div
  709. className={cn(
  710. s.radioItem,
  711. s.indexItem,
  712. !hasSetIndexType && indexType === IndexingType.ECONOMICAL && s.active,
  713. hasSetIndexType && s.disabled,
  714. hasSetIndexType && '!w-full !min-h-[96px]',
  715. )}
  716. onClick={changeToEconomicalType}
  717. >
  718. <span className={cn(s.typeIcon, s.economical)} />
  719. {!hasSetIndexType && <span className={cn(s.radio)} />}
  720. <div className={s.typeHeader}>
  721. <div className={s.title}>{t('datasetCreation.stepTwo.economical')}</div>
  722. <div className={s.tip}>{t('datasetCreation.stepTwo.economicalTip')}</div>
  723. </div>
  724. </div>
  725. )}
  726. </div>
  727. {hasSetIndexType && indexType === IndexingType.ECONOMICAL && (
  728. <div className='mt-2 text-xs text-gray-500 font-medium'>
  729. {t('datasetCreation.stepTwo.indexSettingTip')}
  730. <Link className='text-[#155EEF]' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  731. </div>
  732. )}
  733. {IS_CE_EDITION && indexType === IndexingType.QUALIFIED && (
  734. <div className='mt-3 rounded-xl bg-gray-50 border border-gray-100'>
  735. <div className='flex justify-between items-center px-5 py-4'>
  736. <div className='flex justify-center items-center w-8 h-8 rounded-lg bg-indigo-50'>
  737. <MessageChatSquare className='w-4 h-4' />
  738. </div>
  739. <div className='grow mx-3'>
  740. <div className='mb-[2px] text-md font-medium text-gray-900'>{t('datasetCreation.stepTwo.QATitle')}</div>
  741. <div className='inline-flex items-center text-[13px] leading-[18px] text-gray-500'>
  742. <span className='pr-1'>{t('datasetCreation.stepTwo.QALanguage')}</span>
  743. <LanguageSelect currentLanguage={docLanguage} onSelect={handleSelect} disabled={isLanguageSelectDisabled} />
  744. </div>
  745. </div>
  746. <div className='shrink-0'>
  747. <Switch
  748. defaultValue={docForm === DocForm.QA}
  749. onChange={handleSwitch}
  750. size='md'
  751. />
  752. </div>
  753. </div>
  754. {docForm === DocForm.QA && !QATipHide && (
  755. <div className='flex justify-between items-center px-5 py-2 bg-orange-50 border-t border-amber-100 rounded-b-xl text-[13px] leading-[18px] text-medium text-amber-500'>
  756. {t('datasetCreation.stepTwo.QATip')}
  757. <RiCloseLine className='w-4 h-4 text-gray-500 cursor-pointer' onClick={() => setQATipHide(true)} />
  758. </div>
  759. )}
  760. </div>
  761. )}
  762. {/* Embedding model */}
  763. {indexType === IndexingType.QUALIFIED && (
  764. <div className='mb-2'>
  765. <div className={cn(s.label, datasetId && 'flex justify-between items-center')}>{t('datasetSettings.form.embeddingModel')}</div>
  766. <ModelSelector
  767. readonly={!!datasetId}
  768. defaultModel={embeddingModel}
  769. modelList={embeddingModelList}
  770. onSelect={(model: DefaultModel) => {
  771. setEmbeddingModel(model)
  772. }}
  773. />
  774. {!!datasetId && (
  775. <div className='mt-2 text-xs text-gray-500 font-medium'>
  776. {t('datasetCreation.stepTwo.indexSettingTip')}
  777. <Link className='text-[#155EEF]' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  778. </div>
  779. )}
  780. </div>
  781. )}
  782. {/* Retrieval Method Config */}
  783. <div>
  784. {!datasetId
  785. ? (
  786. <div className={s.label}>
  787. <div className='shrink-0 mr-4'>{t('datasetSettings.form.retrievalSetting.title')}</div>
  788. <div className='leading-[18px] text-xs font-normal text-gray-500'>
  789. <a target='_blank' rel='noopener noreferrer' href='https://docs.dify.ai/guides/knowledge-base/create-knowledge-and-upload-documents#id-4-retrieval-settings' className='text-[#155eef]'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
  790. {t('datasetSettings.form.retrievalSetting.longDescription')}
  791. </div>
  792. </div>
  793. )
  794. : (
  795. <div className={cn(s.label, 'flex justify-between items-center')}>
  796. <div>{t('datasetSettings.form.retrievalSetting.title')}</div>
  797. </div>
  798. )}
  799. <div className='max-w-[640px]'>
  800. {
  801. getIndexing_technique() === IndexingType.QUALIFIED
  802. ? (
  803. <RetrievalMethodConfig
  804. value={retrievalConfig}
  805. onChange={setRetrievalConfig}
  806. />
  807. )
  808. : (
  809. <EconomicalRetrievalMethodConfig
  810. value={retrievalConfig}
  811. onChange={setRetrievalConfig}
  812. />
  813. )
  814. }
  815. </div>
  816. </div>
  817. <div className={s.source}>
  818. <div className={s.sourceContent}>
  819. {dataSourceType === DataSourceType.FILE && (
  820. <>
  821. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.fileSource')}</div>
  822. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  823. <span className={cn(s.fileIcon, files.length && s[files[0].extension || ''])} />
  824. {getFileName(files[0].name || '')}
  825. {files.length > 1 && (
  826. <span className={s.sourceCount}>
  827. <span>{t('datasetCreation.stepTwo.other')}</span>
  828. <span>{files.length - 1}</span>
  829. <span>{t('datasetCreation.stepTwo.fileUnit')}</span>
  830. </span>
  831. )}
  832. </div>
  833. </>
  834. )}
  835. {dataSourceType === DataSourceType.NOTION && (
  836. <>
  837. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.notionSource')}</div>
  838. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  839. <NotionIcon
  840. className='shrink-0 mr-1'
  841. type='page'
  842. src={notionPages[0]?.page_icon}
  843. />
  844. {notionPages[0]?.page_name}
  845. {notionPages.length > 1 && (
  846. <span className={s.sourceCount}>
  847. <span>{t('datasetCreation.stepTwo.other')}</span>
  848. <span>{notionPages.length - 1}</span>
  849. <span>{t('datasetCreation.stepTwo.notionUnit')}</span>
  850. </span>
  851. )}
  852. </div>
  853. </>
  854. )}
  855. {dataSourceType === DataSourceType.WEB && (
  856. <>
  857. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.websiteSource')}</div>
  858. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  859. <Globe01 className='shrink-0 mr-1' />
  860. <span className='grow w-0 truncate'>{websitePages[0].source_url}</span>
  861. {websitePages.length > 1 && (
  862. <span className={s.sourceCount}>
  863. <span>{t('datasetCreation.stepTwo.other')}</span>
  864. <span>{websitePages.length - 1}</span>
  865. <span>{t('datasetCreation.stepTwo.webpageUnit')}</span>
  866. </span>
  867. )}
  868. </div>
  869. </>
  870. )}
  871. </div>
  872. <div className={s.divider} />
  873. <div className={s.segmentCount}>
  874. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.estimateSegment')}</div>
  875. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  876. {
  877. fileIndexingEstimate
  878. ? (
  879. <div className='text-xs font-medium text-gray-800'>{formatNumber(fileIndexingEstimate.total_segments)} </div>
  880. )
  881. : (
  882. <div className={s.calculating}>{t('datasetCreation.stepTwo.calculating')}</div>
  883. )
  884. }
  885. </div>
  886. </div>
  887. </div>
  888. {!isSetting
  889. ? (
  890. <div className='flex items-center mt-8 py-2'>
  891. <Button onClick={() => onStepChange && onStepChange(-1)}>{t('datasetCreation.stepTwo.previousStep')}</Button>
  892. <div className={s.divider} />
  893. <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.nextStep')}</Button>
  894. </div>
  895. )
  896. : (
  897. <div className='flex items-center mt-8 py-2'>
  898. <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>
  899. <Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>
  900. </div>
  901. )}
  902. </div>
  903. </div>
  904. </div>
  905. <FloatRightContainer isMobile={isMobile} isOpen={showPreview} onClose={hidePreview} footer={null}>
  906. {showPreview && <div ref={previewScrollRef} className={cn(s.previewWrap, isMobile && s.isMobile, 'relative h-full overflow-y-scroll border-l border-[#F2F4F7]')}>
  907. <div className={cn(s.previewHeader, previewScrolled && `${s.fixed} pb-3`)}>
  908. <div className='flex items-center justify-between px-8'>
  909. <div className='grow flex items-center'>
  910. <div>{t('datasetCreation.stepTwo.previewTitle')}</div>
  911. {docForm === DocForm.QA && !previewSwitched && (
  912. <Button className='ml-2' variant='secondary-accent' onClick={() => previewSwitch()}>{t('datasetCreation.stepTwo.previewButton')}</Button>
  913. )}
  914. </div>
  915. <div className='flex items-center justify-center w-6 h-6 cursor-pointer' onClick={hidePreview}>
  916. <XMarkIcon className='h-4 w-4'></XMarkIcon>
  917. </div>
  918. </div>
  919. {docForm === DocForm.QA && !previewSwitched && (
  920. <div className='px-8 pr-12 text-xs text-gray-500'>
  921. <span>{t('datasetCreation.stepTwo.previewSwitchTipStart')}</span>
  922. <span className='text-amber-600'>{t('datasetCreation.stepTwo.previewSwitchTipEnd')}</span>
  923. </div>
  924. )}
  925. </div>
  926. <div className='my-4 px-8 space-y-4'>
  927. {previewSwitched && docForm === DocForm.QA && fileIndexingEstimate?.qa_preview && (
  928. <>
  929. {fileIndexingEstimate?.qa_preview.map((item, index) => (
  930. <PreviewItem type={PreviewType.QA} key={item.question} qa={item} index={index + 1} />
  931. ))}
  932. </>
  933. )}
  934. {(docForm === DocForm.TEXT || !previewSwitched) && fileIndexingEstimate?.preview && (
  935. <>
  936. {fileIndexingEstimate?.preview.map((item, index) => (
  937. <PreviewItem type={PreviewType.TEXT} key={item} content={item} index={index + 1} />
  938. ))}
  939. </>
  940. )}
  941. {previewSwitched && docForm === DocForm.QA && !fileIndexingEstimate?.qa_preview && (
  942. <div className='flex items-center justify-center h-[200px]'>
  943. <Loading type='area' />
  944. </div>
  945. )}
  946. {!previewSwitched && !fileIndexingEstimate?.preview && (
  947. <div className='flex items-center justify-center h-[200px]'>
  948. <Loading type='area' />
  949. </div>
  950. )}
  951. </div>
  952. </div>}
  953. {!showPreview && (
  954. <div className={cn(s.sideTip)}>
  955. <div className={s.tipCard}>
  956. <span className={s.icon} />
  957. <div className={s.title}>{t('datasetCreation.stepTwo.sideTipTitle')}</div>
  958. <div className={s.content}>
  959. <p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP1')}</p>
  960. <p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP2')}</p>
  961. <p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP3')}</p>
  962. <p>{t('datasetCreation.stepTwo.sideTipP4')}</p>
  963. </div>
  964. </div>
  965. </div>
  966. )}
  967. </FloatRightContainer>
  968. </div>
  969. )
  970. }
  971. export default StepTwo