index.tsx 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009
  1. 'use client'
  2. import React, { useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react'
  3. import { useTranslation } from 'react-i18next'
  4. import { useContext } from 'use-context-selector'
  5. import { useBoolean } from 'ahooks'
  6. import { XMarkIcon } from '@heroicons/react/20/solid'
  7. import { RocketLaunchIcon } from '@heroicons/react/24/outline'
  8. import {
  9. RiCloseLine,
  10. } from '@remixicon/react'
  11. import Link from 'next/link'
  12. import { groupBy } from 'lodash-es'
  13. import PreviewItem, { PreviewType } from './preview-item'
  14. import LanguageSelect from './language-select'
  15. import s from './index.module.css'
  16. import unescape from './unescape'
  17. import escape from './escape'
  18. import cn from '@/utils/classnames'
  19. import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
  20. import {
  21. createDocument,
  22. createFirstDocument,
  23. fetchFileIndexingEstimate as didFetchFileIndexingEstimate,
  24. fetchDefaultProcessRule,
  25. } from '@/service/datasets'
  26. import Button from '@/app/components/base/button'
  27. import Loading from '@/app/components/base/loading'
  28. import FloatRightContainer from '@/app/components/base/float-right-container'
  29. import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
  30. import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
  31. import { type RetrievalConfig } from '@/types/app'
  32. import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
  33. import Toast from '@/app/components/base/toast'
  34. import { formatNumber } from '@/utils/format'
  35. import type { NotionPage } from '@/models/common'
  36. import { DataSourceType, DocForm } from '@/models/datasets'
  37. import NotionIcon from '@/app/components/base/notion-icon'
  38. import Switch from '@/app/components/base/switch'
  39. import { MessageChatSquare } from '@/app/components/base/icons/src/public/common'
  40. import { useDatasetDetailContext } from '@/context/dataset-detail'
  41. import I18n from '@/context/i18n'
  42. import { IS_CE_EDITION } from '@/config'
  43. import { RETRIEVE_METHOD } from '@/types/app'
  44. import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
  45. import Tooltip from '@/app/components/base/tooltip'
  46. import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
  47. import { LanguagesSupported } from '@/i18n/language'
  48. import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
  49. import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations'
  50. import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
  51. import { Globe01 } from '@/app/components/base/icons/src/vender/line/mapsAndTravel'
  /** Union of the types of all properties of `T`. */
  type ValueOf<T> = T[keyof T]
  /** Props accepted by the `StepTwo` component. */
  type StepTwoProps = {
    /** When truthy, rules are loaded from `documentDetail` and the request carries `original_document_id`. */
    isSetting?: boolean
    /** Existing document; source of the initial doc form, doc language and process rule. */
    documentDetail?: FullDocumentDetail
    /** Gates selecting the high-quality index mode and drives the default index type. */
    isAPIKeySet: boolean
    /** Invoked when the user clicks the link inside the "API key missing" warning. */
    onSetting: () => void
    /** Target dataset; when absent the first document (and dataset) is created. */
    datasetId?: string
    /** When provided, the index mode is fixed and cannot be changed in this step. */
    indexingType?: ValueOf<IndexingType>
    /** Data source type used while in the create flow. */
    dataSourceType: DataSourceType
    /** Uploaded files; their ids are sent as `file_ids`. */
    files: CustomFile[]
    /** Selected Notion pages, grouped by `workspace_id` when building requests. */
    notionPages?: NotionPage[]
    /** Crawled pages; their `source_url` values are sent as `urls`. */
    websitePages?: CrawlResultItem[]
    /** Crawl options; only `only_main_content` is read in the code visible here. */
    crawlOptions?: CrawlOptions
    /** Sent as `job_id` of the website info. */
    fireCrawlJobId?: string
    /** Called with `+1` after a document has been created. */
    onStepChange?: (delta: number) => void
    /** Receives the selected index type after creation. */
    updateIndexingTypeCache?: (type: string) => void
    /** Receives the creation response. */
    updateResultCache?: (res: createDocumentResponse) => void
    /** Called after creation when `isSetting` is truthy. */
    onSave?: () => void
    /** NOTE(review): not referenced in the part of the component reviewed here — confirm usage further down. */
    onCancel?: () => void
  }
  /** Segmentation mode; the value is sent as `process_rule.mode`. */
  enum SegmentType {
    AUTO = 'automatic',
    CUSTOM = 'custom',
  }
  /** Index mode; the value is sent as `indexing_technique`. */
  enum IndexingType {
    QUALIFIED = 'high_quality',
    ECONOMICAL = 'economy',
  }
  // Default separator kept in its escaped (literal backslash-n) form, as shown in the
  // separator input; getProcessRule passes the stored value through unescape() before sending.
  const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
  81. const StepTwo = ({
  82. isSetting,
  83. documentDetail,
  84. isAPIKeySet,
  85. onSetting,
  86. datasetId,
  87. indexingType,
  88. dataSourceType: inCreatePageDataSourceType,
  89. files,
  90. notionPages = [],
  91. websitePages = [],
  92. crawlOptions,
  93. fireCrawlJobId = '',
  94. onStepChange,
  95. updateIndexingTypeCache,
  96. updateResultCache,
  97. onSave,
  98. onCancel,
  99. }: StepTwoProps) => {
  // NOTE: hook calls below must keep their order between renders.
  const { t } = useTranslation()
  const { locale } = useContext(I18n)
  const media = useBreakpoints()
  const isMobile = media === MediaType.mobile
  const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
  // Create flow: no dataset id yet, or the dataset in context has no data source type.
  const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
  // In the create flow the prop wins; otherwise the dataset's own source type is used.
  const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
  // Scroll container of the form column and whether it is scrolled away from the top.
  const scrollRef = useRef<HTMLDivElement>(null)
  const [scrolled, setScrolled] = useState(false)
  // Same pair for the preview panel.
  const previewScrollRef = useRef<HTMLDivElement>(null)
  const [previewScrolled, setPreviewScrolled] = useState(false)
  const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
  // Separator text as shown in the input; the raw setter is wrapped by setSegmentIdentifier.
  const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
  // Stores a separator coming from config/API in escaped form;
  // an empty value falls back to the default separator.
  const setSegmentIdentifier = useCallback((value: string) => {
    const nextIdentifier = value
      ? escape(value)
      : DEFAULT_SEGMENT_IDENTIFIER
    doSetSegmentIdentifier(nextIdentifier)
  }, [])
  // Custom segmentation settings; initial values are replaced once rules are loaded.
  const [max, setMax] = useState(4000) // default chunk length
  const [overlap, setOverlap] = useState(50)
  const [rules, setRules] = useState<PreProcessingRule[]>([])
  // Snapshot of the loaded rules, restored by resetRules.
  const [defaultConfig, setDefaultConfig] = useState<Rules>()
  // True when the index mode is dictated by the `indexingType` prop.
  const hasSetIndexType = !!indexingType
  // Selected index mode. An explicit `indexingType` prop always wins; otherwise the
  // high-quality mode is the default only when an API key is available.
  // (Previously `(indexingType || isAPIKeySet) ? QUALIFIED : ECONOMICAL` started as
  // QUALIFIED even for indexingType === 'economy', so the first render was wrong and
  // the estimate effect ran a second time after the sync effect corrected the state.)
  const [indexType, setIndexType] = useState<ValueOf<IndexingType>>(
    indexingType
      || (isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL),
  )
  // Document form: taken from the existing document when editing inside a dataset, else plain text.
  const [docForm, setDocForm] = useState<DocForm | string>(
    (datasetId && documentDetail) ? documentDetail.doc_form : DocForm.TEXT,
  )
  // Document language: taken from the existing document, else derived from the UI locale.
  // NOTE(review): LanguagesSupported[1] is presumably the Chinese locale — confirm.
  const [docLanguage, setDocLanguage] = useState<string>(
    (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese'),
  )
  const [QATipHide, setQATipHide] = useState(false)
  const [previewSwitched, setPreviewSwitched] = useState(false)
  const [showPreview, { setTrue: setShowPreview, setFalse: hidePreview }] = useBoolean()
  // Estimates are cached separately per segmentation mode.
  const [customFileIndexingEstimate, setCustomFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
  const [automaticFileIndexingEstimate, setAutomaticFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
  // Estimate that belongs to the currently selected segmentation mode.
  const fileIndexingEstimate = (() => {
    return segmentationType === SegmentType.AUTO ? automaticFileIndexingEstimate : customFileIndexingEstimate
  })()
  // Guards against submitting twice while a create request is in flight.
  const [isCreating, setIsCreating] = useState(false)
  // Scroll listener for the form column: flags whether it has left the top position.
  const scrollHandle = (e: Event) => {
    const container = e.target as HTMLDivElement
    setScrolled(container.scrollTop > 0)
  }
  // Scroll listener for the preview panel: flags whether it has left the top position.
  const previewScrollHandle = (e: Event) => {
    const container = e.target as HTMLDivElement
    setPreviewScrolled(container.scrollTop > 0)
  }
  /**
   * Returns `name` without its last extension.
   *
   * Names that have no extension (`README`) and dotfiles (`.gitignore`) are
   * returned unchanged; previously both produced an empty string.
   */
  const getFileName = (name: string) => {
    const lastDot = name.lastIndexOf('.')
    // lastDot === -1: no dot at all; lastDot === 0: leading dot only (dotfile).
    return lastDot > 0 ? name.slice(0, lastDot) : name
  }
  // Maps a pre-processing rule id to its translated label; unknown ids yield undefined.
  const getRuleName = (key: string) => {
    switch (key) {
      case 'remove_extra_spaces':
        return t('datasetCreation.stepTwo.removeExtraSpaces')
      case 'remove_urls_emails':
        return t('datasetCreation.stepTwo.removeUrlEmails')
      case 'remove_stopwords':
        return t('datasetCreation.stepTwo.removeStopwords')
      default:
        return undefined
    }
  }
  // Toggles the `enabled` flag of the rule with the given id; other rules are kept as they are.
  const ruleChangeHandle = (id: string) => {
    const toggled = rules.map(item =>
      item.id === id
        ? { id: item.id, enabled: !item.enabled }
        : item,
    )
    setRules(toggled)
  }
  // Restores separator, chunk length, overlap and pre-processing rules from the loaded defaults.
  const resetRules = () => {
    if (!defaultConfig)
      return
    const { segmentation, pre_processing_rules } = defaultConfig
    setSegmentIdentifier(segmentation.separator)
    setMax(segmentation.max_tokens)
    setOverlap(segmentation.chunk_overlap)
    setRules(pre_processing_rules)
  }
  // Requests an indexing estimate for the given doc form and stores it in the slot
  // that matches the segmentation mode active when the request was started.
  const fetchFileIndexingEstimate = async (docForm = DocForm.TEXT) => {
    // eslint-disable-next-line @typescript-eslint/no-use-before-define
    const estimateParams = getFileIndexingEstimateParams(docForm)!
    const estimate = await didFetchFileIndexingEstimate(estimateParams)
    const storeEstimate = segmentationType === SegmentType.CUSTOM
      ? setCustomFileIndexingEstimate
      : setAutomaticFileIndexingEstimate
    storeEstimate(estimate)
  }
  // "Preview" action of the custom form: opens the preview panel, drops the stale
  // custom estimate and requests a fresh one with the default doc form.
  const confirmChangeCustomConfig = () => {
    setShowPreview()
    setPreviewSwitched(false)
    setCustomFileIndexingEstimate(null)
    fetchFileIndexingEstimate()
  }
  // Effective index mode: the `indexingType` prop when given, else the local selection.
  const getIndexing_technique = () => {
    return indexingType ? indexingType : indexType
  }
  // Builds the `process_rule` payload. Only custom segmentation carries explicit rules;
  // otherwise an empty object is sent.
  const getProcessRule = () => {
    const isCustom = segmentationType === SegmentType.CUSTOM
    const processRule: ProcessRule = {
      rules: isCustom
        ? {
          pre_processing_rules: rules,
          segmentation: {
            // The input keeps the separator escaped; send the real characters.
            separator: unescape(segmentIdentifier),
            max_tokens: max,
            chunk_overlap: overlap,
          },
        }
        : {} as any, // api will check this. It will be removed after api refactored.
      mode: segmentationType,
    }
    return processRule
  }
  // Groups the selected Notion pages by workspace and keeps only the page fields sent to the API.
  const getNotionInfo = () => {
    const pagesByWorkspace = groupBy(notionPages, 'workspace_id')
    return Object.entries(pagesByWorkspace).map(([workspace_id, workspacePages]) => ({
      workspace_id,
      pages: workspacePages.map(({ page_id, page_name, page_icon, type }) => ({
        page_id,
        page_name,
        page_icon,
        type,
      })),
    })) as NotionInfo[]
  }
  // Builds the website info payload from the crawled pages and crawl options.
  const getWebsiteInfo = () => {
    const urls = websitePages.map(({ source_url }) => source_url)
    const only_main_content = crawlOptions?.only_main_content
    return {
      provider: 'firecrawl',
      job_id: fireCrawlJobId,
      urls,
      only_main_content,
    }
  }
  // Builds the estimate request for the active data source.
  // Returns undefined when the data source type is not file, Notion or web.
  const getFileIndexingEstimateParams = (docForm: DocForm): IndexingEstimateParams | undefined => {
    // Fields shared by every data source; only the info list differs.
    const withCommonFields = (info_list: IndexingEstimateParams['info_list']): IndexingEstimateParams => ({
      info_list,
      indexing_technique: getIndexing_technique() as string,
      process_rule: getProcessRule(),
      doc_form: docForm,
      doc_language: docLanguage,
      dataset_id: datasetId as string,
    })
    switch (dataSourceType) {
      case DataSourceType.FILE:
        return withCommonFields({
          data_source_type: dataSourceType,
          file_info_list: {
            file_ids: files.map(file => file.id) as string[],
          },
        })
      case DataSourceType.NOTION:
        return withCommonFields({
          data_source_type: dataSourceType,
          notion_info_list: getNotionInfo(),
        })
      case DataSourceType.WEB:
        return withCommonFields({
          data_source_type: dataSourceType,
          website_info_list: getWebsiteInfo(),
        })
      default:
        return undefined
    }
  }
  // Rerank model data used when validating the retrieval config before creation.
  const {
    modelList: rerankModelList,
    defaultModel: rerankDefaultModel,
    currentModel: isRerankDefaultModelValid,
  } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank)
  const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding)
  const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding)
  // Embedding model: the dataset's own model when it has one, else the default model.
  // NOTE(review): the initial value is computed once; if `defaultEmbeddingModel` arrives
  // after the first render this state keeps empty strings — confirm this is handled.
  const [embeddingModel, setEmbeddingModel] = useState<DefaultModel>(
    currentDataset?.embedding_model
      ? {
        provider: currentDataset.embedding_model_provider,
        model: currentDataset.embedding_model,
      }
      : {
        provider: defaultEmbeddingModel?.provider.provider || '',
        model: defaultEmbeddingModel?.model || '',
      },
  )
  // Builds the request body for creating a document (or for re-processing one when
  // `isSetting` is set). Shows an error toast and returns undefined when validation fails.
  const getCreationParams = () => {
    const overlapTooLarge = segmentationType === SegmentType.CUSTOM && overlap > max
    if (overlapTooLarge) {
      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
      return
    }

    // Settings mode: retrieval and embedding settings are passed through unchanged.
    if (isSetting) {
      return {
        original_document_id: documentDetail?.id,
        doc_form: docForm,
        doc_language: docLanguage,
        process_rule: getProcessRule(),
        // eslint-disable-next-line @typescript-eslint/no-use-before-define
        retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page.
        embedding_model: embeddingModel.model, // Readonly
        embedding_model_provider: embeddingModel.provider, // Readonly
      } as CreateDocumentReq
    }

    // Create mode: a rerank model has to be selected where the retrieval config needs one.
    const indexMethod = getIndexing_technique()
    const rerankSelected = isReRankModelSelected({
      rerankDefaultModel,
      isRerankDefaultModelValid: !!isRerankDefaultModelValid,
      rerankModelList,
      // eslint-disable-next-line @typescript-eslint/no-use-before-define
      retrievalConfig,
      indexMethod: indexMethod as string,
    })
    if (!rerankSelected) {
      Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })
      return
    }
    const postRetrievalConfig = ensureRerankModelSelected({
      rerankDefaultModel: rerankDefaultModel!,
      // eslint-disable-next-line @typescript-eslint/no-use-before-define
      retrievalConfig,
      indexMethod: indexMethod as string,
    })

    const creationParams = {
      data_source: {
        type: dataSourceType,
        info_list: {
          data_source_type: dataSourceType,
        },
      },
      indexing_technique: getIndexing_technique(),
      process_rule: getProcessRule(),
      doc_form: docForm,
      doc_language: docLanguage,
      retrieval_model: postRetrievalConfig,
      embedding_model: embeddingModel.model,
      embedding_model_provider: embeddingModel.provider,
    } as CreateDocumentReq

    // Attach the source-specific info list.
    switch (dataSourceType) {
      case DataSourceType.FILE:
        creationParams.data_source.info_list.file_info_list = {
          file_ids: files.map(file => file.id || '').filter(Boolean),
        }
        break
      case DataSourceType.NOTION:
        creationParams.data_source.info_list.notion_info_list = getNotionInfo()
        break
      case DataSourceType.WEB:
        creationParams.data_source.info_list.website_info_list = getWebsiteInfo()
        break
    }
    return creationParams
  }
  // Loads the default process rule and uses it both as the current settings and as
  // the reset baseline. Failures are only logged.
  const getRules = async () => {
    try {
      const { rules: defaultRules } = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
      const { segmentation } = defaultRules
      setSegmentIdentifier(segmentation.separator)
      setMax(segmentation.max_tokens)
      setOverlap(segmentation.chunk_overlap)
      setRules(defaultRules.pre_processing_rules)
      setDefaultConfig(defaultRules)
    }
    catch (err) {
      console.log(err)
    }
  }
  // Seeds the settings (and the reset baseline) from the existing document's process rule.
  const getRulesFromDetail = () => {
    if (!documentDetail)
      return
    const detailRules = documentDetail.dataset_process_rule.rules
    const { separator, max_tokens, chunk_overlap } = detailRules.segmentation
    setSegmentIdentifier(separator)
    setMax(max_tokens)
    setOverlap(chunk_overlap)
    setRules(detailRules.pre_processing_rules)
    setDefaultConfig(detailRules)
  }
  // Adopts the segmentation mode stored on the existing document, if there is one.
  const getDefaultMode = () => {
    if (!documentDetail)
      return
    setSegmentationType(documentDetail.dataset_process_rule.mode)
  }
  /**
   * Submits the form: creates the first document when there is no dataset yet,
   * otherwise adds a document to the existing dataset, then notifies the parent.
   *
   * Returns `false` when the parameters fail validation (a toast has already been
   * shown by getCreationParams) and `undefined` otherwise. Request errors are
   * reported through a toast; the creating flag is always cleared afterwards.
   */
  const createHandle = async () => {
    // Ignore repeated submits while a request is in flight.
    if (isCreating)
      return
    setIsCreating(true)
    try {
      const params = getCreationParams()
      if (!params)
        return false
      const res = datasetId
        ? await createDocument({
          datasetId,
          body: params as CreateDocumentReq,
        })
        : await createFirstDocument({
          body: params as CreateDocumentReq,
        })
      // Post-create bookkeeping is the same for both request kinds.
      updateIndexingTypeCache?.(indexType as string)
      updateResultCache?.(res)
      mutateDatasetRes?.()
      onStepChange?.(+1)
      if (isSetting)
        onSave?.()
    }
    catch (err) {
      Toast.notify({
        type: 'error',
        message: `${err}`,
      })
    }
    finally {
      setIsCreating(false)
    }
  }
  // Q&A switch handler: on selects the Q&A doc form, off selects plain text.
  const handleSwitch = (state: boolean) => {
    setDocForm(state ? DocForm.QA : DocForm.TEXT)
  }
  // Language selector handler: stores the chosen document language.
  const handleSelect = (language: string) => setDocLanguage(language)
  // Selects the economical index mode and forces the plain-text doc form.
  // Does nothing when the index mode is fixed by the `indexingType` prop.
  const changeToEconomicalType = () => {
    if (hasSetIndexType)
      return
    setIndexType(IndexingType.ECONOMICAL)
    setDocForm(DocForm.TEXT)
  }
  // Switches the preview to Q&A: clears the estimate of the active segmentation mode
  // and requests a new one for the Q&A doc form.
  const previewSwitch = async () => {
    setPreviewSwitched(true)
    const clearEstimate = segmentationType === SegmentType.AUTO
      ? setAutomaticFileIndexingEstimate
      : setCustomFileIndexingEstimate
    clearEstimate(null)
    await fetchFileIndexingEstimate(DocForm.QA)
  }
  // On mount: load rules either from the existing document (settings mode)
  // or from the default process rule.
  useEffect(() => {
    if (isSetting) {
      getRulesFromDetail()
      getDefaultMode()
      return
    }
    getRules()
  }, [])
  // Tracks scrolling of the form column for as long as the component is mounted.
  useEffect(() => {
    // Capture the node now: by the time the cleanup runs, `scrollRef.current` may
    // already have been reset, which would leave the listener attached.
    const scrollNode = scrollRef.current
    scrollNode?.addEventListener('scroll', scrollHandle)
    return () => {
      scrollNode?.removeEventListener('scroll', scrollHandle)
    }
  }, [])
  // Tracks scrolling of the preview panel while it is shown.
  useLayoutEffect(() => {
    if (showPreview) {
      // Capture the node now: by the time the cleanup runs, `previewScrollRef.current`
      // may already point elsewhere or be null, which would leave the listener attached.
      const previewNode = previewScrollRef.current
      previewNode?.addEventListener('scroll', previewScrollHandle)
      return () => {
        previewNode?.removeEventListener('scroll', previewScrollHandle)
      }
    }
  }, [showPreview])
  // When the index mode from props is economical, a Q&A doc form is switched back to plain text.
  useEffect(() => {
    const isEconomical = indexingType === IndexingType.ECONOMICAL
    if (isEconomical && docForm === DocForm.QA)
      setDocForm(DocForm.TEXT)
  }, [indexingType, docForm])
  // Keeps the local index type in sync with the props: an explicit `indexingType`
  // wins, otherwise the choice depends on whether an API key is set.
  useEffect(() => {
    if (indexingType) {
      setIndexType(indexingType as IndexingType)
      return
    }
    const fallbackType = isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL
    setIndexType(fallbackType)
  }, [isAPIKeySet, indexingType, datasetId])
  // Reacts to segmentation mode / index type changes:
  // - custom: hide the preview and drop the custom estimate until the user asks for one;
  // - automatic: refresh the automatic estimate and (except on mobile) open the preview.
  useEffect(() => {
    if (segmentationType !== SegmentType.AUTO) {
      hidePreview()
      setCustomFileIndexingEstimate(null)
      setPreviewSwitched(false)
      return
    }
    setAutomaticFileIndexingEstimate(null)
    if (!isMobile)
      setShowPreview()
    fetchFileIndexingEstimate()
    setPreviewSwitched(false)
  }, [segmentationType, indexType])
  // Retrieval settings: the dataset's stored config when available, else semantic
  // search defaults. Helpers declared above reference this binding before its
  // declaration, which is fine as long as they are not called during render.
  // NOTE(review): the initial value is computed once; if `rerankDefaultModel` is not yet
  // loaded on the first render the reranking model fields stay undefined — confirm.
  const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
    search_method: RETRIEVE_METHOD.semantic,
    reranking_enable: false,
    reranking_model: {
      reranking_provider_name: rerankDefaultModel?.provider.provider,
      reranking_model_name: rerankDefaultModel?.model,
    },
    top_k: 3,
    score_threshold_enabled: false,
    score_threshold: 0.5,
  } as RetrievalConfig)
  530. return (
  531. <div className='flex w-full h-full'>
  532. <div ref={scrollRef} className='relative h-full w-full overflow-y-scroll'>
  533. <div className={cn(s.pageHeader, scrolled && s.fixed, isMobile && '!px-6')}>
  534. <span>{t('datasetCreation.steps.two')}</span>
  535. {isMobile && (
  536. <Button
  537. className='border-[0.5px] !h-8 hover:outline hover:outline-[0.5px] hover:outline-gray-300 text-gray-700 font-medium bg-white shadow-[0px_1px_2px_0px_rgba(16,24,40,0.05)]'
  538. onClick={setShowPreview}
  539. >
  540. <Tooltip>
  541. <div className="flex flex-row items-center">
  542. <RocketLaunchIcon className="h-4 w-4 mr-1.5 stroke-[1.8px]" />
  543. <span className="text-[13px]">{t('datasetCreation.stepTwo.previewTitleButton')}</span>
  544. </div>
  545. </Tooltip>
  546. </Button>
  547. )}
  548. </div>
  549. <div className={cn(s.form, isMobile && '!px-4')}>
  550. <div className={s.label}>{t('datasetCreation.stepTwo.segmentation')}</div>
  551. <div className='max-w-[640px]'>
  552. <div
  553. className={cn(
  554. s.radioItem,
  555. s.segmentationItem,
  556. segmentationType === SegmentType.AUTO && s.active,
  557. )}
  558. onClick={() => setSegmentationType(SegmentType.AUTO)}
  559. >
  560. <span className={cn(s.typeIcon, s.auto)} />
  561. <span className={cn(s.radio)} />
  562. <div className={s.typeHeader}>
  563. <div className={s.title}>{t('datasetCreation.stepTwo.auto')}</div>
  564. <div className={s.tip}>{t('datasetCreation.stepTwo.autoDescription')}</div>
  565. </div>
  566. </div>
  567. <div
  568. className={cn(
  569. s.radioItem,
  570. s.segmentationItem,
  571. segmentationType === SegmentType.CUSTOM && s.active,
  572. segmentationType === SegmentType.CUSTOM && s.custom,
  573. )}
  574. onClick={() => setSegmentationType(SegmentType.CUSTOM)}
  575. >
  576. <span className={cn(s.typeIcon, s.customize)} />
  577. <span className={cn(s.radio)} />
  578. <div className={s.typeHeader}>
  579. <div className={s.title}>{t('datasetCreation.stepTwo.custom')}</div>
  580. <div className={s.tip}>{t('datasetCreation.stepTwo.customDescription')}</div>
  581. </div>
  582. {segmentationType === SegmentType.CUSTOM && (
  583. <div className={s.typeFormBody}>
  584. <div className={s.formRow}>
  585. <div className='w-full'>
  586. <div className={s.label}>
  587. {t('datasetCreation.stepTwo.separator')}
  588. <Tooltip
  589. popupContent={
  590. <div className='max-w-[200px]'>
  591. {t('datasetCreation.stepTwo.separatorTip')}
  592. </div>
  593. }
  594. />
  595. </div>
  596. <input
  597. type="text"
  598. className={s.input}
  599. placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''}
  600. value={segmentIdentifier}
  601. onChange={e => doSetSegmentIdentifier(e.target.value)}
  602. />
  603. </div>
  604. </div>
  605. <div className={s.formRow}>
  606. <div className='w-full'>
  607. <div className={s.label}>{t('datasetCreation.stepTwo.maxLength')}</div>
  608. <div className='relative w-full'>
  609. <input
  610. type="number"
  611. className={s.input}
  612. placeholder={t('datasetCreation.stepTwo.maxLength') || ''}
  613. value={max}
  614. min={1}
  615. onChange={e => setMax(parseInt(e.target.value.replace(/^0+/, ''), 10))}
  616. />
  617. <div className='absolute top-2.5 right-2.5 text-text-tertiary system-sm-regular'>Tokens</div>
  618. </div>
  619. </div>
  620. </div>
  621. <div className={s.formRow}>
  622. <div className='w-full'>
  623. <div className={s.label}>
  624. {t('datasetCreation.stepTwo.overlap')}
  625. <Tooltip
  626. popupContent={
  627. <div className='max-w-[200px]'>
  628. {t('datasetCreation.stepTwo.overlapTip')}
  629. </div>
  630. }
  631. />
  632. </div>
  633. <div className='relative w-full'>
  634. <input
  635. type="number"
  636. className={s.input}
  637. placeholder={t('datasetCreation.stepTwo.overlap') || ''}
  638. value={overlap}
  639. min={1}
  640. onChange={e => setOverlap(parseInt(e.target.value.replace(/^0+/, ''), 10))}
  641. />
  642. <div className='absolute top-2.5 right-2.5 text-text-tertiary system-sm-regular'>Tokens</div>
  643. </div>
  644. </div>
  645. </div>
  646. <div className={s.formRow}>
  647. <div className='w-full flex flex-col gap-1'>
  648. <div className={s.label}>{t('datasetCreation.stepTwo.rules')}</div>
  649. {rules.map(rule => (
  650. <div key={rule.id} className={s.ruleItem}>
  651. <input id={rule.id} type="checkbox" checked={rule.enabled} onChange={() => ruleChangeHandle(rule.id)} className="w-4 h-4 rounded border-gray-300 text-blue-700 focus:ring-blue-700" />
  652. <label htmlFor={rule.id} className="ml-2 text-sm font-normal cursor-pointer text-gray-800">{getRuleName(rule.id)}</label>
  653. </div>
  654. ))}
  655. </div>
  656. </div>
  657. <div className={s.formFooter}>
  658. <Button variant="primary" className={cn(s.button)} onClick={confirmChangeCustomConfig}>{t('datasetCreation.stepTwo.preview')}</Button>
  659. <Button className={cn(s.button, 'ml-2')} onClick={resetRules}>{t('datasetCreation.stepTwo.reset')}</Button>
  660. </div>
  661. </div>
  662. )}
  663. </div>
  664. </div>
  665. <div className={s.label}>{t('datasetCreation.stepTwo.indexMode')}</div>
  666. <div className='max-w-[640px]'>
  667. <div className='flex items-center gap-3 flex-wrap sm:flex-nowrap'>
  668. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && (
  669. <div
  670. className={cn(
  671. s.radioItem,
  672. s.indexItem,
  673. !isAPIKeySet && s.disabled,
  674. !hasSetIndexType && indexType === IndexingType.QUALIFIED && s.active,
  675. hasSetIndexType && s.disabled,
  676. hasSetIndexType && '!w-full !min-h-[96px]',
  677. )}
  678. onClick={() => {
  679. if (isAPIKeySet)
  680. setIndexType(IndexingType.QUALIFIED)
  681. }}
  682. >
  683. <span className={cn(s.typeIcon, s.qualified)} />
  684. {!hasSetIndexType && <span className={cn(s.radio)} />}
  685. <div className={s.typeHeader}>
  686. <div className={s.title}>
  687. {t('datasetCreation.stepTwo.qualified')}
  688. {!hasSetIndexType && <span className={s.recommendTag}>{t('datasetCreation.stepTwo.recommend')}</span>}
  689. </div>
  690. <div className={s.tip}>{t('datasetCreation.stepTwo.qualifiedTip')}</div>
  691. </div>
  692. {!isAPIKeySet && (
  693. <div className={s.warningTip}>
  694. <span>{t('datasetCreation.stepTwo.warning')}&nbsp;</span>
  695. <span className={s.click} onClick={onSetting}>{t('datasetCreation.stepTwo.click')}</span>
  696. </div>
  697. )}
  698. </div>
  699. )}
  700. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && (
  701. <div
  702. className={cn(
  703. s.radioItem,
  704. s.indexItem,
  705. !hasSetIndexType && indexType === IndexingType.ECONOMICAL && s.active,
  706. hasSetIndexType && s.disabled,
  707. hasSetIndexType && '!w-full !min-h-[96px]',
  708. )}
  709. onClick={changeToEconomicalType}
  710. >
  711. <span className={cn(s.typeIcon, s.economical)} />
  712. {!hasSetIndexType && <span className={cn(s.radio)} />}
  713. <div className={s.typeHeader}>
  714. <div className={s.title}>{t('datasetCreation.stepTwo.economical')}</div>
  715. <div className={s.tip}>{t('datasetCreation.stepTwo.economicalTip')}</div>
  716. </div>
  717. </div>
  718. )}
  719. </div>
  720. {hasSetIndexType && indexType === IndexingType.ECONOMICAL && (
  721. <div className='mt-2 text-xs text-gray-500 font-medium'>
  722. {t('datasetCreation.stepTwo.indexSettingTip')}
  723. <Link className='text-[#155EEF]' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  724. </div>
  725. )}
  726. {IS_CE_EDITION && indexType === IndexingType.QUALIFIED && (
  727. <div className='mt-3 rounded-xl bg-gray-50 border border-gray-100'>
  728. <div className='flex justify-between items-center px-5 py-4'>
  729. <div className='flex justify-center items-center w-8 h-8 rounded-lg bg-indigo-50'>
  730. <MessageChatSquare className='w-4 h-4' />
  731. </div>
  732. <div className='grow mx-3'>
  733. <div className='mb-[2px] text-md font-medium text-gray-900'>{t('datasetCreation.stepTwo.QATitle')}</div>
  734. <div className='inline-flex items-center text-[13px] leading-[18px] text-gray-500'>
  735. <span className='pr-1'>{t('datasetCreation.stepTwo.QALanguage')}</span>
  736. <LanguageSelect currentLanguage={docLanguage} onSelect={handleSelect} />
  737. </div>
  738. </div>
  739. <div className='shrink-0'>
  740. <Switch
  741. defaultValue={docForm === DocForm.QA}
  742. onChange={handleSwitch}
  743. size='md'
  744. />
  745. </div>
  746. </div>
  747. {docForm === DocForm.QA && !QATipHide && (
  748. <div className='flex justify-between items-center px-5 py-2 bg-orange-50 border-t border-amber-100 rounded-b-xl text-[13px] leading-[18px] text-medium text-amber-500'>
  749. {t('datasetCreation.stepTwo.QATip')}
  750. <RiCloseLine className='w-4 h-4 text-gray-500 cursor-pointer' onClick={() => setQATipHide(true)} />
  751. </div>
  752. )}
  753. </div>
  754. )}
  755. {/* Embedding model */}
  756. {indexType === IndexingType.QUALIFIED && (
  757. <div className='mb-2'>
  758. <div className={cn(s.label, datasetId && 'flex justify-between items-center')}>{t('datasetSettings.form.embeddingModel')}</div>
  759. <ModelSelector
  760. readonly={!!datasetId}
  761. defaultModel={embeddingModel}
  762. modelList={embeddingModelList}
  763. onSelect={(model: DefaultModel) => {
  764. setEmbeddingModel(model)
  765. }}
  766. />
  767. {!!datasetId && (
  768. <div className='mt-2 text-xs text-gray-500 font-medium'>
  769. {t('datasetCreation.stepTwo.indexSettingTip')}
  770. <Link className='text-[#155EEF]' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  771. </div>
  772. )}
  773. </div>
  774. )}
  775. {/* Retrieval Method Config */}
  776. <div>
  777. {!datasetId
  778. ? (
  779. <div className={s.label}>
  780. <div className='shrink-0 mr-4'>{t('datasetSettings.form.retrievalSetting.title')}</div>
  781. <div className='leading-[18px] text-xs font-normal text-gray-500'>
  782. <a target='_blank' rel='noopener noreferrer' href='https://docs.dify.ai/guides/knowledge-base/create-knowledge-and-upload-documents#id-4-retrieval-settings' className='text-[#155eef]'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
  783. {t('datasetSettings.form.retrievalSetting.longDescription')}
  784. </div>
  785. </div>
  786. )
  787. : (
  788. <div className={cn(s.label, 'flex justify-between items-center')}>
  789. <div>{t('datasetSettings.form.retrievalSetting.title')}</div>
  790. </div>
  791. )}
  792. <div className='max-w-[640px]'>
  793. {
  794. getIndexing_technique() === IndexingType.QUALIFIED
  795. ? (
  796. <RetrievalMethodConfig
  797. value={retrievalConfig}
  798. onChange={setRetrievalConfig}
  799. />
  800. )
  801. : (
  802. <EconomicalRetrievalMethodConfig
  803. value={retrievalConfig}
  804. onChange={setRetrievalConfig}
  805. />
  806. )
  807. }
  808. </div>
  809. </div>
  810. <div className={s.source}>
  811. <div className={s.sourceContent}>
  812. {dataSourceType === DataSourceType.FILE && (
  813. <>
  814. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.fileSource')}</div>
  815. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  816. <span className={cn(s.fileIcon, files.length && s[files[0].extension || ''])} />
  817. {getFileName(files[0].name || '')}
  818. {files.length > 1 && (
  819. <span className={s.sourceCount}>
  820. <span>{t('datasetCreation.stepTwo.other')}</span>
  821. <span>{files.length - 1}</span>
  822. <span>{t('datasetCreation.stepTwo.fileUnit')}</span>
  823. </span>
  824. )}
  825. </div>
  826. </>
  827. )}
  828. {dataSourceType === DataSourceType.NOTION && (
  829. <>
  830. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.notionSource')}</div>
  831. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  832. <NotionIcon
  833. className='shrink-0 mr-1'
  834. type='page'
  835. src={notionPages[0]?.page_icon}
  836. />
  837. {notionPages[0]?.page_name}
  838. {notionPages.length > 1 && (
  839. <span className={s.sourceCount}>
  840. <span>{t('datasetCreation.stepTwo.other')}</span>
  841. <span>{notionPages.length - 1}</span>
  842. <span>{t('datasetCreation.stepTwo.notionUnit')}</span>
  843. </span>
  844. )}
  845. </div>
  846. </>
  847. )}
  848. {dataSourceType === DataSourceType.WEB && (
  849. <>
  850. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.websiteSource')}</div>
  851. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  852. <Globe01 className='shrink-0 mr-1' />
  853. <span className='grow w-0 truncate'>{websitePages[0].source_url}</span>
  854. {websitePages.length > 1 && (
  855. <span className={s.sourceCount}>
  856. <span>{t('datasetCreation.stepTwo.other')}</span>
  857. <span>{websitePages.length - 1}</span>
  858. <span>{t('datasetCreation.stepTwo.webpageUnit')}</span>
  859. </span>
  860. )}
  861. </div>
  862. </>
  863. )}
  864. </div>
  865. <div className={s.divider} />
  866. <div className={s.segmentCount}>
  867. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.estimateSegment')}</div>
  868. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  869. {
  870. fileIndexingEstimate
  871. ? (
  872. <div className='text-xs font-medium text-gray-800'>{formatNumber(fileIndexingEstimate.total_segments)} </div>
  873. )
  874. : (
  875. <div className={s.calculating}>{t('datasetCreation.stepTwo.calculating')}</div>
  876. )
  877. }
  878. </div>
  879. </div>
  880. </div>
  881. {!isSetting
  882. ? (
  883. <div className='flex items-center mt-8 py-2'>
  884. <Button onClick={() => onStepChange && onStepChange(-1)}>{t('datasetCreation.stepTwo.previousStep')}</Button>
  885. <div className={s.divider} />
  886. <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.nextStep')}</Button>
  887. </div>
  888. )
  889. : (
  890. <div className='flex items-center mt-8 py-2'>
  891. <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>
  892. <Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>
  893. </div>
  894. )}
  895. </div>
  896. </div>
  897. </div>
  898. <FloatRightContainer isMobile={isMobile} isOpen={showPreview} onClose={hidePreview} footer={null}>
  899. {showPreview && <div ref={previewScrollRef} className={cn(s.previewWrap, isMobile && s.isMobile, 'relative h-full overflow-y-scroll border-l border-[#F2F4F7]')}>
  900. <div className={cn(s.previewHeader, previewScrolled && `${s.fixed} pb-3`)}>
  901. <div className='flex items-center justify-between px-8'>
  902. <div className='grow flex items-center'>
  903. <div>{t('datasetCreation.stepTwo.previewTitle')}</div>
  904. {docForm === DocForm.QA && !previewSwitched && (
  905. <Button className='ml-2' variant='secondary-accent' onClick={previewSwitch}>{t('datasetCreation.stepTwo.previewButton')}</Button>
  906. )}
  907. </div>
  908. <div className='flex items-center justify-center w-6 h-6 cursor-pointer' onClick={hidePreview}>
  909. <XMarkIcon className='h-4 w-4'></XMarkIcon>
  910. </div>
  911. </div>
  912. {docForm === DocForm.QA && !previewSwitched && (
  913. <div className='px-8 pr-12 text-xs text-gray-500'>
  914. <span>{t('datasetCreation.stepTwo.previewSwitchTipStart')}</span>
  915. <span className='text-amber-600'>{t('datasetCreation.stepTwo.previewSwitchTipEnd')}</span>
  916. </div>
  917. )}
  918. </div>
  919. <div className='my-4 px-8 space-y-4'>
  920. {previewSwitched && docForm === DocForm.QA && fileIndexingEstimate?.qa_preview && (
  921. <>
  922. {fileIndexingEstimate?.qa_preview.map((item, index) => (
  923. <PreviewItem type={PreviewType.QA} key={item.question} qa={item} index={index + 1} />
  924. ))}
  925. </>
  926. )}
  927. {(docForm === DocForm.TEXT || !previewSwitched) && fileIndexingEstimate?.preview && (
  928. <>
  929. {fileIndexingEstimate?.preview.map((item, index) => (
  930. <PreviewItem type={PreviewType.TEXT} key={item} content={item} index={index + 1} />
  931. ))}
  932. </>
  933. )}
  934. {previewSwitched && docForm === DocForm.QA && !fileIndexingEstimate?.qa_preview && (
  935. <div className='flex items-center justify-center h-[200px]'>
  936. <Loading type='area' />
  937. </div>
  938. )}
  939. {!previewSwitched && !fileIndexingEstimate?.preview && (
  940. <div className='flex items-center justify-center h-[200px]'>
  941. <Loading type='area' />
  942. </div>
  943. )}
  944. </div>
  945. </div>}
  946. {!showPreview && (
  947. <div className={cn(s.sideTip)}>
  948. <div className={s.tipCard}>
  949. <span className={s.icon} />
  950. <div className={s.title}>{t('datasetCreation.stepTwo.sideTipTitle')}</div>
  951. <div className={s.content}>
  952. <p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP1')}</p>
  953. <p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP2')}</p>
  954. <p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP3')}</p>
  955. <p>{t('datasetCreation.stepTwo.sideTipP4')}</p>
  956. </div>
  957. </div>
  958. </div>
  959. )}
  960. </FloatRightContainer>
  961. </div>
  962. )
  963. }
  // Expose StepTwo as this module's default export.
  964. export default StepTwo