index.tsx 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022
  1. 'use client'
  2. import React, { useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react'
  3. import { useTranslation } from 'react-i18next'
  4. import { useContext } from 'use-context-selector'
  5. import { useBoolean } from 'ahooks'
  6. import { XMarkIcon } from '@heroicons/react/20/solid'
  7. import { RocketLaunchIcon } from '@heroicons/react/24/outline'
  8. import {
  9. RiCloseLine,
  10. } from '@remixicon/react'
  11. import Link from 'next/link'
  12. import { groupBy } from 'lodash-es'
  13. import PreviewItem, { PreviewType } from './preview-item'
  14. import LanguageSelect from './language-select'
  15. import s from './index.module.css'
  16. import unescape from './unescape'
  17. import escape from './escape'
  18. import cn from '@/utils/classnames'
  19. import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
  20. import {
  21. createDocument,
  22. createFirstDocument,
  23. fetchFileIndexingEstimate as didFetchFileIndexingEstimate,
  24. fetchDefaultProcessRule,
  25. } from '@/service/datasets'
  26. import Button from '@/app/components/base/button'
  27. import Loading from '@/app/components/base/loading'
  28. import FloatRightContainer from '@/app/components/base/float-right-container'
  29. import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
  30. import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
  31. import { type RetrievalConfig } from '@/types/app'
  32. import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
  33. import Toast from '@/app/components/base/toast'
  34. import { formatNumber } from '@/utils/format'
  35. import type { NotionPage } from '@/models/common'
  36. import { DataSourceProvider } from '@/models/common'
  37. import { DataSourceType, DocForm } from '@/models/datasets'
  38. import NotionIcon from '@/app/components/base/notion-icon'
  39. import Switch from '@/app/components/base/switch'
  40. import { MessageChatSquare } from '@/app/components/base/icons/src/public/common'
  41. import { useDatasetDetailContext } from '@/context/dataset-detail'
  42. import I18n from '@/context/i18n'
  43. import { IS_CE_EDITION } from '@/config'
  44. import { RETRIEVE_METHOD } from '@/types/app'
  45. import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
  46. import Tooltip from '@/app/components/base/tooltip'
  47. import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
  48. import { LanguagesSupported } from '@/i18n/language'
  49. import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
  50. import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations'
  51. import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
  52. import { Globe01 } from '@/app/components/base/icons/src/vender/line/mapsAndTravel'
  /** Union of the value types of every property of `T`. */
  type ValueOf<T> = T[keyof T]

  /** Props accepted by the `StepTwo` component. */
  type StepTwoProps = {
    /** When truthy, rules are loaded from `documentDetail` and the request references the original document. */
    isSetting?: boolean
    /** Existing document whose form, language and process rule seed the local state. */
    documentDetail?: FullDocumentDetail
    /** Drives the default index type and whether the high-quality option is selectable. */
    isAPIKeySet: boolean
    /** Invoked from the "click" link shown when no API key is set. */
    onSetting: () => void
    /** Target dataset; when absent the first document (and dataset) is created. */
    datasetId?: string
    /** Index type fixed from outside; takes precedence over the local selection. */
    indexingType?: ValueOf<IndexingType>
    /** Data source type used while in the create flow. */
    dataSourceType: DataSourceType
    /** Uploaded files; their ids are sent as `file_ids`. */
    files: CustomFile[]
    /** Selected Notion pages, grouped by `workspace_id` before sending. */
    notionPages?: NotionPage[]
    /** Crawled pages; their `source_url` values are sent as `urls`. */
    websitePages?: CrawlResultItem[]
    /** Crawl options; `only_main_content` is forwarded. */
    crawlOptions?: CrawlOptions
    /** Crawl provider, sent as `provider`. */
    websiteCrawlProvider?: DataSourceProvider
    /** Crawl job id, sent as `job_id`. */
    websiteCrawlJobId?: string
    /** Called with `+1` after a successful create. */
    onStepChange?: (delta: number) => void
    /** Receives the selected index type after a successful create. */
    updateIndexingTypeCache?: (type: string) => void
    /** Receives the create response after a successful create. */
    updateResultCache?: (res: createDocumentResponse) => void
    /** Called after a successful save when `isSetting` is truthy. */
    onSave?: () => void
    onCancel?: () => void
  }
  /** Segmentation modes; the selected value is sent as `process_rule.mode`. */
  enum SegmentType {
    /** Automatic segmentation. */
    AUTO = 'automatic',
    /** User-defined separator, chunk length, overlap and pre-processing rules. */
    CUSTOM = 'custom',
  }
  /** Index modes offered by the form; the value is sent as `indexing_technique`. */
  enum IndexingType {
    /** High-quality mode; the embedding model selector is shown for it. */
    QUALIFIED = 'high_quality',
    /** Economical mode. */
    ECONOMICAL = 'economy',
  }
  // Default separator in its escaped (display) form: a literal backslash + "n", twice.
  // It is passed through `unescape` before being sent as `segmentation.separator`.
  const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
  83. const StepTwo = ({
  84. isSetting,
  85. documentDetail,
  86. isAPIKeySet,
  87. onSetting,
  88. datasetId,
  89. indexingType,
  90. dataSourceType: inCreatePageDataSourceType,
  91. files,
  92. notionPages = [],
  93. websitePages = [],
  94. crawlOptions,
  95. websiteCrawlProvider = DataSourceProvider.fireCrawl,
  96. websiteCrawlJobId = '',
  97. onStepChange,
  98. updateIndexingTypeCache,
  99. updateResultCache,
  100. onSave,
  101. onCancel,
  102. }: StepTwoProps) => {
  // ---- i18n / layout ----
  const { t } = useTranslation()
  const { locale } = useContext(I18n)
  const media = useBreakpoints()
  const isMobile = media === MediaType.mobile

  // ---- dataset context ----
  const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
  // Create flow: no dataset id yet, or the loaded dataset has no data source type.
  const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
  const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type

  // ---- scroll tracking for the form pane and the preview pane ----
  const scrollRef = useRef<HTMLDivElement>(null)
  const [scrolled, setScrolled] = useState(false)
  const previewScrollRef = useRef<HTMLDivElement>(null)
  const [previewScrolled, setPreviewScrolled] = useState(false)

  // ---- segmentation settings ----
  const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
  const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
  // Stores the escaped form of `value`; an empty value falls back to the default separator.
  const setSegmentIdentifier = useCallback((value: string) => {
    doSetSegmentIdentifier(value ? escape(value) : DEFAULT_SEGMENT_IDENTIFIER)
  }, [])
  const [max, setMax] = useState(4000) // default chunk length
  const [overlap, setOverlap] = useState(50)
  const [rules, setRules] = useState<PreProcessingRule[]>([])
  const [defaultConfig, setDefaultConfig] = useState<Rules>()

  // ---- index type ----
  const hasSetIndexType = !!indexingType
  // An explicit `indexingType` prop wins; otherwise choose by API-key availability.
  // The previous initial value `(indexingType || isAPIKeySet) ? QUALIFIED : ECONOMICAL`
  // produced QUALIFIED for an 'economy' prop until the prop-sync effect corrected it.
  const [indexType, setIndexType] = useState<ValueOf<IndexingType>>(
    indexingType || (isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL),
  )

  // ---- document form / language ----
  const [isLanguageSelectDisabled, setIsLanguageSelectDisabled] = useState(false)
  const [docForm, setDocForm] = useState<DocForm | string>(
    (datasetId && documentDetail) ? documentDetail.doc_form : DocForm.TEXT,
  )
  // Existing documents keep their stored language; otherwise 'Chinese' when the
  // locale equals LanguagesSupported[1], and 'English' for every other locale.
  const [docLanguage, setDocLanguage] = useState<string>(
    (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese'),
  )
  const [QATipHide, setQATipHide] = useState(false)

  // ---- preview ----
  const [previewSwitched, setPreviewSwitched] = useState(false)
  const [showPreview, { setTrue: setShowPreview, setFalse: hidePreview }] = useBoolean()
  const [customFileIndexingEstimate, setCustomFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
  const [automaticFileIndexingEstimate, setAutomaticFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
  // Estimate matching the currently selected segmentation mode.
  const fileIndexingEstimate = segmentationType === SegmentType.AUTO
    ? automaticFileIndexingEstimate
    : customFileIndexingEstimate

  // True while a create/save request is in flight.
  const [isCreating, setIsCreating] = useState(false)
  // Scroll listener for the form pane: `scrolled` is true whenever the pane is not at the very top.
  const scrollHandle = (e: Event) => {
    const { scrollTop } = e.target as HTMLDivElement
    setScrolled(scrollTop > 0)
  }
  // Scroll listener for the preview pane: `previewScrolled` is true whenever it is not at the very top.
  const previewScrollHandle = (e: Event) => {
    const { scrollTop } = e.target as HTMLDivElement
    setPreviewScrolled(scrollTop > 0)
  }
  /**
   * Strips the last extension from a file name.
   *
   * Names without an extension ("README") and dotfiles (".env") are returned
   * unchanged; previously both collapsed to an empty string.
   *
   * @param name - file name, e.g. "report.final.pdf"
   * @returns the name without its final ".ext" segment, e.g. "report.final"
   */
  const getFileName = (name: string) => {
    const dotIndex = name.lastIndexOf('.')
    // -1: no dot at all; 0: only a leading dot. Neither has an extension to strip.
    if (dotIndex <= 0)
      return name
    return name.slice(0, dotIndex)
  }
  // Maps a pre-processing rule id to its translated label; unknown ids yield undefined.
  const getRuleName = (key: string) => {
    switch (key) {
      case 'remove_extra_spaces':
        return t('datasetCreation.stepTwo.removeExtraSpaces')
      case 'remove_urls_emails':
        return t('datasetCreation.stepTwo.removeUrlEmails')
      case 'remove_stopwords':
        return t('datasetCreation.stepTwo.removeStopwords')
      default:
        return undefined
    }
  }
  // Toggles the `enabled` flag of the rule with the given id; other rules are kept as-is.
  const ruleChangeHandle = (id: string) => {
    const toggled = rules.map(rule => (
      rule.id === id
        ? { id: rule.id, enabled: !rule.enabled }
        : rule
    ))
    setRules(toggled)
  }
  // Restores separator, chunk length, overlap and rules from the stored default config, if any.
  const resetRules = () => {
    if (!defaultConfig)
      return
    const { segmentation, pre_processing_rules: defaultRules } = defaultConfig
    setSegmentIdentifier(segmentation.separator)
    setMax(segmentation.max_tokens)
    setOverlap(segmentation.chunk_overlap)
    setRules(defaultRules)
  }
  // Requests an indexing estimate and stores it in the slot that matches the
  // segmentation mode captured when this function was created.
  const fetchFileIndexingEstimate = async (docForm = DocForm.TEXT, language?: string) => {
    // eslint-disable-next-line @typescript-eslint/no-use-before-define
    const estimateParams = getFileIndexingEstimateParams(docForm, language)!
    const estimate = await didFetchFileIndexingEstimate(estimateParams)
    const storeEstimate = segmentationType === SegmentType.CUSTOM
      ? setCustomFileIndexingEstimate
      : setAutomaticFileIndexingEstimate
    storeEstimate(estimate)
  }
  // "Preview" button handler for custom segmentation: drop the stale estimate,
  // open the preview pane and request a fresh estimate with the default doc form.
  const confirmChangeCustomConfig = () => {
    // Clear first so the previous estimate is not shown while the request is pending.
    setCustomFileIndexingEstimate(null)
    setShowPreview()
    fetchFileIndexingEstimate()
    setPreviewSwitched(false)
  }
  // The `indexingType` prop, when provided, takes precedence over the locally selected index type.
  const getIndexing_technique = () => {
    return indexingType || indexType
  }
  // Builds the `process_rule` payload. Custom mode carries the user's rules and
  // segmentation settings; any other mode sends an empty `rules` object.
  const getProcessRule = () => {
    const isCustom = segmentationType === SegmentType.CUSTOM
    const processRule: ProcessRule = {
      rules: isCustom
        ? {
          pre_processing_rules: rules,
          segmentation: {
            // The separator is kept escaped in state; send the unescaped form.
            separator: unescape(segmentIdentifier),
            max_tokens: max,
            chunk_overlap: overlap,
          },
        }
        : {} as any, // api will check this. It will be removed after api refactored.
      mode: segmentationType,
    }
    return processRule
  }
  // Groups the selected Notion pages by workspace and keeps only the fields sent to the API.
  const getNotionInfo = () => {
    const pagesByWorkspace = groupBy(notionPages, 'workspace_id')
    return Object.entries(pagesByWorkspace).map(([workspaceId, pages]) => ({
      workspace_id: workspaceId,
      pages: pages.map(({ page_id, page_name, page_icon, type }) => ({
        page_id,
        page_name,
        page_icon,
        type,
      })),
    })) as NotionInfo[]
  }
  // Builds the website crawl payload from the crawl props.
  const getWebsiteInfo = () => {
    const urls = websitePages.map(({ source_url }) => source_url)
    return {
      provider: websiteCrawlProvider,
      job_id: websiteCrawlJobId,
      urls,
      only_main_content: crawlOptions?.only_main_content,
    }
  }
  // Builds the indexing-estimate request for the active data source type.
  // Returns undefined when the data source type is not file, Notion or web.
  const getFileIndexingEstimateParams = (docForm: DocForm, language?: string): IndexingEstimateParams | undefined => {
    // Fields shared by every data source; evaluated only once a source matched.
    const sharedFields = () => ({
      indexing_technique: getIndexing_technique() as string,
      process_rule: getProcessRule(),
      doc_form: docForm,
      doc_language: language || docLanguage,
      dataset_id: datasetId as string,
    })

    if (dataSourceType === DataSourceType.FILE) {
      return {
        info_list: {
          data_source_type: dataSourceType,
          file_info_list: {
            file_ids: files.map(file => file.id) as string[],
          },
        },
        ...sharedFields(),
      }
    }

    if (dataSourceType === DataSourceType.NOTION) {
      return {
        info_list: {
          data_source_type: dataSourceType,
          notion_info_list: getNotionInfo(),
        },
        ...sharedFields(),
      }
    }

    if (dataSourceType === DataSourceType.WEB) {
      return {
        info_list: {
          data_source_type: dataSourceType,
          website_info_list: getWebsiteInfo(),
        },
        ...sharedFields(),
      }
    }

    return undefined
  }
  // Rerank model data used to validate the retrieval config before creating.
  const {
    modelList: rerankModelList,
    defaultModel: rerankDefaultModel,
    currentModel: isRerankDefaultModelValid,
  } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank)
  const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding)
  const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding)

  // Seed from the dataset's embedding model when it has one, else from the default
  // embedding model (empty strings when that is not available).
  const initialEmbeddingModel: DefaultModel = currentDataset?.embedding_model
    ? {
      provider: currentDataset.embedding_model_provider,
      model: currentDataset.embedding_model,
    }
    : {
      provider: defaultEmbeddingModel?.provider.provider || '',
      model: defaultEmbeddingModel?.model || '',
    }
  const [embeddingModel, setEmbeddingModel] = useState<DefaultModel>(initialEmbeddingModel)
  // Builds the create/update request body. Returns undefined (after showing an
  // error toast) when validation fails: overlap larger than the chunk length in
  // custom mode, or a missing rerank model in the create flow.
  const getCreationParams = () => {
    const isOverlapTooLarge = segmentationType === SegmentType.CUSTOM && overlap > max
    if (isOverlapTooLarge) {
      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
      return
    }

    // Settings flow: re-process an existing document.
    if (isSetting) {
      return {
        original_document_id: documentDetail?.id,
        doc_form: docForm,
        doc_language: docLanguage,
        process_rule: getProcessRule(),
        // eslint-disable-next-line @typescript-eslint/no-use-before-define
        retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page.
        embedding_model: embeddingModel.model, // Readonly
        embedding_model_provider: embeddingModel.provider, // Readonly
      } as CreateDocumentReq
    }

    // Create flow.
    const indexMethod = getIndexing_technique()
    const hasRerankModel = isReRankModelSelected({
      rerankDefaultModel,
      isRerankDefaultModelValid: !!isRerankDefaultModelValid,
      rerankModelList,
      // eslint-disable-next-line @typescript-eslint/no-use-before-define
      retrievalConfig,
      indexMethod: indexMethod as string,
    })
    if (!hasRerankModel) {
      Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })
      return
    }

    const postRetrievalConfig = ensureRerankModelSelected({
      rerankDefaultModel: rerankDefaultModel!,
      // eslint-disable-next-line @typescript-eslint/no-use-before-define
      retrievalConfig,
      indexMethod: indexMethod as string,
    })

    const params = {
      data_source: {
        type: dataSourceType,
        info_list: {
          data_source_type: dataSourceType,
        },
      },
      indexing_technique: indexMethod,
      process_rule: getProcessRule(),
      doc_form: docForm,
      doc_language: docLanguage,
      retrieval_model: postRetrievalConfig,
      embedding_model: embeddingModel.model,
      embedding_model_provider: embeddingModel.provider,
    } as CreateDocumentReq

    // Attach the source-specific list; the three types are mutually exclusive.
    if (dataSourceType === DataSourceType.FILE) {
      params.data_source.info_list.file_info_list = {
        file_ids: files.map(file => file.id || '').filter(Boolean),
      }
    }
    else if (dataSourceType === DataSourceType.NOTION) {
      params.data_source.info_list.notion_info_list = getNotionInfo()
    }
    else if (dataSourceType === DataSourceType.WEB) {
      params.data_source.info_list.website_info_list = getWebsiteInfo()
    }

    return params
  }
  // Loads the default process rule and seeds the form with it. It is also stored
  // as `defaultConfig`, which `resetRules` restores from. Failures are only logged.
  const getRules = async () => {
    try {
      const { rules: fetchedRules } = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
      const { separator, max_tokens: maxTokens, chunk_overlap: chunkOverlap } = fetchedRules.segmentation
      setSegmentIdentifier(separator)
      setMax(maxTokens)
      setOverlap(chunkOverlap)
      setRules(fetchedRules.pre_processing_rules)
      setDefaultConfig(fetchedRules)
    }
    catch (err) {
      console.log(err)
    }
  }
  // Seeds the form (and `defaultConfig`) from the process rule of the document being edited.
  const getRulesFromDetail = () => {
    if (!documentDetail)
      return
    const detailRules = documentDetail.dataset_process_rule.rules
    const { separator, max_tokens: maxTokens, chunk_overlap: chunkOverlap } = detailRules.segmentation
    setSegmentIdentifier(separator)
    setMax(maxTokens)
    setOverlap(chunkOverlap)
    setRules(detailRules.pre_processing_rules)
    setDefaultConfig(detailRules)
  }
  // Adopts the segmentation mode stored on the document being edited, if there is one.
  const getDefaultMode = () => {
    if (!documentDetail)
      return
    setSegmentationType(documentDetail.dataset_process_rule.mode)
  }
  /**
   * Submits the form: creates the first document when there is no `datasetId`,
   * otherwise adds a document to the existing dataset.
   *
   * Does nothing while a previous submission is still in flight. Returns `false`
   * when parameter validation fails (a toast has already been shown by
   * `getCreationParams`). Request errors are reported via an error toast.
   */
  const createHandle = async () => {
    if (isCreating)
      return
    // Set once; the former second call after validation was redundant.
    setIsCreating(true)
    try {
      const params = getCreationParams()
      if (!params)
        return false

      const res = datasetId
        ? await createDocument({ datasetId, body: params as CreateDocumentReq })
        : await createFirstDocument({ body: params as CreateDocumentReq })

      // Identical follow-up for both request kinds (previously duplicated per branch).
      updateIndexingTypeCache?.(indexType as string)
      updateResultCache?.(res)
      mutateDatasetRes?.()
      onStepChange?.(+1)
      if (isSetting)
        onSave?.()
    }
    catch (err) {
      Toast.notify({
        type: 'error',
        message: `${err}`,
      })
    }
    finally {
      // Runs on success, on failure and on the early `return false` above.
      setIsCreating(false)
    }
  }
  // Q&A switch handler: on selects the QA doc form, off selects the text doc form.
  const handleSwitch = (state: boolean) => {
    setDocForm(state ? DocForm.QA : DocForm.TEXT)
  }
  // Re-requests the estimate in QA form. The language select stays disabled
  // until the request settles, whether it succeeds or fails.
  const previewSwitch = async (language?: string) => {
    setPreviewSwitched(true)
    setIsLanguageSelectDisabled(true)
    // Drop the estimate of the active segmentation mode before refetching.
    const clearActiveEstimate = segmentationType === SegmentType.AUTO
      ? setAutomaticFileIndexingEstimate
      : setCustomFileIndexingEstimate
    clearActiveEstimate(null)
    try {
      await fetchFileIndexingEstimate(DocForm.QA, language)
    }
    finally {
      setIsLanguageSelectDisabled(false)
    }
  }
  // Language select handler. When the QA preview has already been switched on,
  // the preview is regenerated for the newly chosen language.
  const handleSelect = (language: string) => {
    setDocLanguage(language)
    const needsRecut = docForm === DocForm.QA && previewSwitched
    if (needsRecut)
      previewSwitch(language)
  }
  // Selects the economical index type and resets the doc form to text.
  // Ignored when the index type is fixed by the `indexingType` prop.
  const changeToEconomicalType = () => {
    if (hasSetIndexType)
      return
    setIndexType(IndexingType.ECONOMICAL)
    setDocForm(DocForm.TEXT)
  }
  // On mount: seed rules from the document in the settings flow, otherwise fetch the defaults.
  useEffect(() => {
    if (isSetting) {
      getRulesFromDetail()
      getDefaultMode()
      return
    }
    // fetch rules
    getRules()
  }, [])
  // Track scrolling of the form pane for the lifetime of the component.
  useEffect(() => {
    // Capture the node now: `scrollRef.current` may already be null or point
    // elsewhere when the cleanup runs, which would skip removeEventListener.
    const scrollNode = scrollRef.current
    scrollNode?.addEventListener('scroll', scrollHandle)
    return () => {
      scrollNode?.removeEventListener('scroll', scrollHandle)
    }
  }, [])
  // Track scrolling of the preview pane only while the preview is shown.
  useLayoutEffect(() => {
    if (!showPreview)
      return
    // Same reasoning as above: detach from the node the listener was attached to.
    const previewNode = previewScrollRef.current
    previewNode?.addEventListener('scroll', previewScrollHandle)
    return () => {
      previewNode?.removeEventListener('scroll', previewScrollHandle)
    }
  }, [showPreview])
  // When the index type prop is economical, a QA doc form is reset to text.
  useEffect(() => {
    const isEconomicalWithQA = indexingType === IndexingType.ECONOMICAL && docForm === DocForm.QA
    if (isEconomicalWithQA)
      setDocForm(DocForm.TEXT)
  }, [indexingType, docForm])
  // get indexing type by props
  useEffect(() => {
    const fallbackType = isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL
    setIndexType(indexingType ? (indexingType as IndexingType) : fallbackType)
  }, [isAPIKeySet, indexingType, datasetId])
  // Refresh or hide the preview whenever the segmentation mode or index type changes.
  useEffect(() => {
    if (segmentationType !== SegmentType.AUTO) {
      // Custom mode: the preview is only produced on demand via the preview button.
      hidePreview()
      setCustomFileIndexingEstimate(null)
      setPreviewSwitched(false)
      return
    }
    setAutomaticFileIndexingEstimate(null)
    // On mobile the preview is opened explicitly from the header button.
    if (!isMobile)
      setShowPreview()
    fetchFileIndexingEstimate()
    setPreviewSwitched(false)
  }, [segmentationType, indexType])
  // Fallback retrieval settings, used when the dataset has no stored retrieval model.
  const fallbackRetrievalConfig = {
    search_method: RETRIEVE_METHOD.semantic,
    reranking_enable: false,
    reranking_model: {
      reranking_provider_name: rerankDefaultModel?.provider.provider,
      reranking_model_name: rerankDefaultModel?.model,
    },
    top_k: 3,
    score_threshold_enabled: false,
    score_threshold: 0.5,
  } as RetrievalConfig
  const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || fallbackRetrievalConfig)
  543. return (
  544. <div className='flex w-full h-full'>
  545. <div ref={scrollRef} className='relative h-full w-full overflow-y-scroll'>
  546. <div className={cn(s.pageHeader, scrolled && s.fixed, isMobile && '!px-6')}>
  547. <span>{t('datasetCreation.steps.two')}</span>
  548. {isMobile && (
  549. <Button
  550. className='border-[0.5px] !h-8 hover:outline hover:outline-[0.5px] hover:outline-gray-300 text-gray-700 font-medium bg-white shadow-[0px_1px_2px_0px_rgba(16,24,40,0.05)]'
  551. onClick={setShowPreview}
  552. >
  553. <Tooltip>
  554. <div className="flex flex-row items-center">
  555. <RocketLaunchIcon className="h-4 w-4 mr-1.5 stroke-[1.8px]" />
  556. <span className="text-[13px]">{t('datasetCreation.stepTwo.previewTitleButton')}</span>
  557. </div>
  558. </Tooltip>
  559. </Button>
  560. )}
  561. </div>
  562. <div className={cn(s.form, isMobile && '!px-4')}>
  563. <div className={s.label}>{t('datasetCreation.stepTwo.segmentation')}</div>
  564. <div className='max-w-[640px]'>
  565. <div
  566. className={cn(
  567. s.radioItem,
  568. s.segmentationItem,
  569. segmentationType === SegmentType.AUTO && s.active,
  570. )}
  571. onClick={() => setSegmentationType(SegmentType.AUTO)}
  572. >
  573. <span className={cn(s.typeIcon, s.auto)} />
  574. <span className={cn(s.radio)} />
  575. <div className={s.typeHeader}>
  576. <div className={s.title}>{t('datasetCreation.stepTwo.auto')}</div>
  577. <div className={s.tip}>{t('datasetCreation.stepTwo.autoDescription')}</div>
  578. </div>
  579. </div>
  580. <div
  581. className={cn(
  582. s.radioItem,
  583. s.segmentationItem,
  584. segmentationType === SegmentType.CUSTOM && s.active,
  585. segmentationType === SegmentType.CUSTOM && s.custom,
  586. )}
  587. onClick={() => setSegmentationType(SegmentType.CUSTOM)}
  588. >
  589. <span className={cn(s.typeIcon, s.customize)} />
  590. <span className={cn(s.radio)} />
  591. <div className={s.typeHeader}>
  592. <div className={s.title}>{t('datasetCreation.stepTwo.custom')}</div>
  593. <div className={s.tip}>{t('datasetCreation.stepTwo.customDescription')}</div>
  594. </div>
  595. {segmentationType === SegmentType.CUSTOM && (
  596. <div className={s.typeFormBody}>
  597. <div className={s.formRow}>
  598. <div className='w-full'>
  599. <div className={s.label}>
  600. {t('datasetCreation.stepTwo.separator')}
  601. <Tooltip
  602. popupContent={
  603. <div className='max-w-[200px]'>
  604. {t('datasetCreation.stepTwo.separatorTip')}
  605. </div>
  606. }
  607. />
  608. </div>
  609. <input
  610. type="text"
  611. className={s.input}
  612. placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''}
  613. value={segmentIdentifier}
  614. onChange={e => doSetSegmentIdentifier(e.target.value)}
  615. />
  616. </div>
  617. </div>
  618. <div className={s.formRow}>
  619. <div className='w-full'>
  620. <div className={s.label}>{t('datasetCreation.stepTwo.maxLength')}</div>
  621. <div className='relative w-full'>
  622. <input
  623. type="number"
  624. className={s.input}
  625. placeholder={t('datasetCreation.stepTwo.maxLength') || ''}
  626. value={max}
  627. min={1}
  628. onChange={e => setMax(parseInt(e.target.value.replace(/^0+/, ''), 10))}
  629. />
  630. <div className='absolute top-2.5 right-2.5 text-text-tertiary system-sm-regular'>Tokens</div>
  631. </div>
  632. </div>
  633. </div>
  634. <div className={s.formRow}>
  635. <div className='w-full'>
  636. <div className={s.label}>
  637. {t('datasetCreation.stepTwo.overlap')}
  638. <Tooltip
  639. popupContent={
  640. <div className='max-w-[200px]'>
  641. {t('datasetCreation.stepTwo.overlapTip')}
  642. </div>
  643. }
  644. />
  645. </div>
  646. <div className='relative w-full'>
  647. <input
  648. type="number"
  649. className={s.input}
  650. placeholder={t('datasetCreation.stepTwo.overlap') || ''}
  651. value={overlap}
  652. min={1}
  653. onChange={e => setOverlap(parseInt(e.target.value.replace(/^0+/, ''), 10))}
  654. />
  655. <div className='absolute top-2.5 right-2.5 text-text-tertiary system-sm-regular'>Tokens</div>
  656. </div>
  657. </div>
  658. </div>
  659. <div className={s.formRow}>
  660. <div className='w-full flex flex-col gap-1'>
  661. <div className={s.label}>{t('datasetCreation.stepTwo.rules')}</div>
  662. {rules.map(rule => (
  663. <div key={rule.id} className={s.ruleItem}>
  664. <input id={rule.id} type="checkbox" checked={rule.enabled} onChange={() => ruleChangeHandle(rule.id)} className="w-4 h-4 rounded border-gray-300 text-blue-700 focus:ring-blue-700" />
  665. <label htmlFor={rule.id} className="ml-2 text-sm font-normal cursor-pointer text-gray-800">{getRuleName(rule.id)}</label>
  666. </div>
  667. ))}
  668. </div>
  669. </div>
  670. <div className={s.formFooter}>
  671. <Button variant="primary" className={cn(s.button)} onClick={confirmChangeCustomConfig}>{t('datasetCreation.stepTwo.preview')}</Button>
  672. <Button className={cn(s.button, 'ml-2')} onClick={resetRules}>{t('datasetCreation.stepTwo.reset')}</Button>
  673. </div>
  674. </div>
  675. )}
  676. </div>
  677. </div>
  678. <div className={s.label}>{t('datasetCreation.stepTwo.indexMode')}</div>
  679. <div className='max-w-[640px]'>
  680. <div className='flex items-center gap-3 flex-wrap sm:flex-nowrap'>
  681. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && (
  682. <div
  683. className={cn(
  684. s.radioItem,
  685. s.indexItem,
  686. !isAPIKeySet && s.disabled,
  687. !hasSetIndexType && indexType === IndexingType.QUALIFIED && s.active,
  688. hasSetIndexType && s.disabled,
  689. hasSetIndexType && '!w-full !min-h-[96px]',
  690. )}
  691. onClick={() => {
  692. if (isAPIKeySet)
  693. setIndexType(IndexingType.QUALIFIED)
  694. }}
  695. >
  696. <span className={cn(s.typeIcon, s.qualified)} />
  697. {!hasSetIndexType && <span className={cn(s.radio)} />}
  698. <div className={s.typeHeader}>
  699. <div className={s.title}>
  700. {t('datasetCreation.stepTwo.qualified')}
  701. {!hasSetIndexType && <span className={s.recommendTag}>{t('datasetCreation.stepTwo.recommend')}</span>}
  702. </div>
  703. <div className={s.tip}>{t('datasetCreation.stepTwo.qualifiedTip')}</div>
  704. </div>
  705. {!isAPIKeySet && (
  706. <div className={s.warningTip}>
  707. <span>{t('datasetCreation.stepTwo.warning')}&nbsp;</span>
  708. <span className={s.click} onClick={onSetting}>{t('datasetCreation.stepTwo.click')}</span>
  709. </div>
  710. )}
  711. </div>
  712. )}
  713. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && (
  714. <div
  715. className={cn(
  716. s.radioItem,
  717. s.indexItem,
  718. !hasSetIndexType && indexType === IndexingType.ECONOMICAL && s.active,
  719. hasSetIndexType && s.disabled,
  720. hasSetIndexType && '!w-full !min-h-[96px]',
  721. )}
  722. onClick={changeToEconomicalType}
  723. >
  724. <span className={cn(s.typeIcon, s.economical)} />
  725. {!hasSetIndexType && <span className={cn(s.radio)} />}
  726. <div className={s.typeHeader}>
  727. <div className={s.title}>{t('datasetCreation.stepTwo.economical')}</div>
  728. <div className={s.tip}>{t('datasetCreation.stepTwo.economicalTip')}</div>
  729. </div>
  730. </div>
  731. )}
  732. </div>
  733. {hasSetIndexType && indexType === IndexingType.ECONOMICAL && (
  734. <div className='mt-2 text-xs text-gray-500 font-medium'>
  735. {t('datasetCreation.stepTwo.indexSettingTip')}
  736. <Link className='text-[#155EEF]' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  737. </div>
  738. )}
  739. {IS_CE_EDITION && indexType === IndexingType.QUALIFIED && (
  740. <div className='mt-3 rounded-xl bg-gray-50 border border-gray-100'>
  741. <div className='flex justify-between items-center px-5 py-4'>
  742. <div className='flex justify-center items-center w-8 h-8 rounded-lg bg-indigo-50'>
  743. <MessageChatSquare className='w-4 h-4' />
  744. </div>
  745. <div className='grow mx-3'>
  746. <div className='mb-[2px] text-md font-medium text-gray-900'>{t('datasetCreation.stepTwo.QATitle')}</div>
  747. <div className='inline-flex items-center text-[13px] leading-[18px] text-gray-500'>
  748. <span className='pr-1'>{t('datasetCreation.stepTwo.QALanguage')}</span>
  749. <LanguageSelect currentLanguage={docLanguage} onSelect={handleSelect} disabled={isLanguageSelectDisabled} />
  750. </div>
  751. </div>
  752. <div className='shrink-0'>
  753. <Switch
  754. defaultValue={docForm === DocForm.QA}
  755. onChange={handleSwitch}
  756. size='md'
  757. />
  758. </div>
  759. </div>
  760. {docForm === DocForm.QA && !QATipHide && (
  761. <div className='flex justify-between items-center px-5 py-2 bg-orange-50 border-t border-amber-100 rounded-b-xl text-[13px] leading-[18px] text-medium text-amber-500'>
  762. {t('datasetCreation.stepTwo.QATip')}
  763. <RiCloseLine className='w-4 h-4 text-gray-500 cursor-pointer' onClick={() => setQATipHide(true)} />
  764. </div>
  765. )}
  766. </div>
  767. )}
  768. {/* Embedding model */}
  769. {indexType === IndexingType.QUALIFIED && (
  770. <div className='mb-2'>
  771. <div className={cn(s.label, datasetId && 'flex justify-between items-center')}>{t('datasetSettings.form.embeddingModel')}</div>
  772. <ModelSelector
  773. readonly={!!datasetId}
  774. defaultModel={embeddingModel}
  775. modelList={embeddingModelList}
  776. onSelect={(model: DefaultModel) => {
  777. setEmbeddingModel(model)
  778. }}
  779. />
  780. {!!datasetId && (
  781. <div className='mt-2 text-xs text-gray-500 font-medium'>
  782. {t('datasetCreation.stepTwo.indexSettingTip')}
  783. <Link className='text-[#155EEF]' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  784. </div>
  785. )}
  786. </div>
  787. )}
  788. {/* Retrieval Method Config */}
  789. <div>
  790. {!datasetId
  791. ? (
  792. <div className={s.label}>
  793. <div className='shrink-0 mr-4'>{t('datasetSettings.form.retrievalSetting.title')}</div>
  794. <div className='leading-[18px] text-xs font-normal text-gray-500'>
  795. <a target='_blank' rel='noopener noreferrer' href='https://docs.dify.ai/guides/knowledge-base/create-knowledge-and-upload-documents#id-4-retrieval-settings' className='text-[#155eef]'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
  796. {t('datasetSettings.form.retrievalSetting.longDescription')}
  797. </div>
  798. </div>
  799. )
  800. : (
  801. <div className={cn(s.label, 'flex justify-between items-center')}>
  802. <div>{t('datasetSettings.form.retrievalSetting.title')}</div>
  803. </div>
  804. )}
  805. <div className='max-w-[640px]'>
  806. {
  807. getIndexing_technique() === IndexingType.QUALIFIED
  808. ? (
  809. <RetrievalMethodConfig
  810. value={retrievalConfig}
  811. onChange={setRetrievalConfig}
  812. />
  813. )
  814. : (
  815. <EconomicalRetrievalMethodConfig
  816. value={retrievalConfig}
  817. onChange={setRetrievalConfig}
  818. />
  819. )
  820. }
  821. </div>
  822. </div>
  823. <div className={s.source}>
  824. <div className={s.sourceContent}>
  825. {dataSourceType === DataSourceType.FILE && (
  826. <>
  827. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.fileSource')}</div>
  828. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  829. <span className={cn(s.fileIcon, files.length && s[files[0].extension || ''])} />
  830. {getFileName(files[0].name || '')}
  831. {files.length > 1 && (
  832. <span className={s.sourceCount}>
  833. <span>{t('datasetCreation.stepTwo.other')}</span>
  834. <span>{files.length - 1}</span>
  835. <span>{t('datasetCreation.stepTwo.fileUnit')}</span>
  836. </span>
  837. )}
  838. </div>
  839. </>
  840. )}
  841. {dataSourceType === DataSourceType.NOTION && (
  842. <>
  843. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.notionSource')}</div>
  844. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  845. <NotionIcon
  846. className='shrink-0 mr-1'
  847. type='page'
  848. src={notionPages[0]?.page_icon}
  849. />
  850. {notionPages[0]?.page_name}
  851. {notionPages.length > 1 && (
  852. <span className={s.sourceCount}>
  853. <span>{t('datasetCreation.stepTwo.other')}</span>
  854. <span>{notionPages.length - 1}</span>
  855. <span>{t('datasetCreation.stepTwo.notionUnit')}</span>
  856. </span>
  857. )}
  858. </div>
  859. </>
  860. )}
  861. {dataSourceType === DataSourceType.WEB && (
  862. <>
  863. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.websiteSource')}</div>
  864. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  865. <Globe01 className='shrink-0 mr-1' />
  866. <span className='grow w-0 truncate'>{websitePages[0].source_url}</span>
  867. {websitePages.length > 1 && (
  868. <span className={s.sourceCount}>
  869. <span>{t('datasetCreation.stepTwo.other')}</span>
  870. <span>{websitePages.length - 1}</span>
  871. <span>{t('datasetCreation.stepTwo.webpageUnit')}</span>
  872. </span>
  873. )}
  874. </div>
  875. </>
  876. )}
  877. </div>
  878. <div className={s.divider} />
  879. <div className={s.segmentCount}>
  880. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.estimateSegment')}</div>
  881. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  882. {
  883. fileIndexingEstimate
  884. ? (
  885. <div className='text-xs font-medium text-gray-800'>{formatNumber(fileIndexingEstimate.total_segments)} </div>
  886. )
  887. : (
  888. <div className={s.calculating}>{t('datasetCreation.stepTwo.calculating')}</div>
  889. )
  890. }
  891. </div>
  892. </div>
  893. </div>
  894. {!isSetting
  895. ? (
  896. <div className='flex items-center mt-8 py-2'>
  897. <Button onClick={() => onStepChange && onStepChange(-1)}>{t('datasetCreation.stepTwo.previousStep')}</Button>
  898. <div className={s.divider} />
  899. <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.nextStep')}</Button>
  900. </div>
  901. )
  902. : (
  903. <div className='flex items-center mt-8 py-2'>
  904. <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>
  905. <Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>
  906. </div>
  907. )}
  908. </div>
  909. </div>
  910. </div>
  911. <FloatRightContainer isMobile={isMobile} isOpen={showPreview} onClose={hidePreview} footer={null}>
  912. {showPreview && <div ref={previewScrollRef} className={cn(s.previewWrap, isMobile && s.isMobile, 'relative h-full overflow-y-scroll border-l border-[#F2F4F7]')}>
  913. <div className={cn(s.previewHeader, previewScrolled && `${s.fixed} pb-3`)}>
  914. <div className='flex items-center justify-between px-8'>
  915. <div className='grow flex items-center'>
  916. <div>{t('datasetCreation.stepTwo.previewTitle')}</div>
  917. {docForm === DocForm.QA && !previewSwitched && (
  918. <Button className='ml-2' variant='secondary-accent' onClick={() => previewSwitch()}>{t('datasetCreation.stepTwo.previewButton')}</Button>
  919. )}
  920. </div>
  921. <div className='flex items-center justify-center w-6 h-6 cursor-pointer' onClick={hidePreview}>
  922. <XMarkIcon className='h-4 w-4'></XMarkIcon>
  923. </div>
  924. </div>
  925. {docForm === DocForm.QA && !previewSwitched && (
  926. <div className='px-8 pr-12 text-xs text-gray-500'>
  927. <span>{t('datasetCreation.stepTwo.previewSwitchTipStart')}</span>
  928. <span className='text-amber-600'>{t('datasetCreation.stepTwo.previewSwitchTipEnd')}</span>
  929. </div>
  930. )}
  931. </div>
  932. <div className='my-4 px-8 space-y-4'>
  933. {previewSwitched && docForm === DocForm.QA && fileIndexingEstimate?.qa_preview && (
  934. <>
  935. {fileIndexingEstimate?.qa_preview.map((item, index) => (
  936. <PreviewItem type={PreviewType.QA} key={item.question} qa={item} index={index + 1} />
  937. ))}
  938. </>
  939. )}
  940. {(docForm === DocForm.TEXT || !previewSwitched) && fileIndexingEstimate?.preview && (
  941. <>
  942. {fileIndexingEstimate?.preview.map((item, index) => (
  943. <PreviewItem type={PreviewType.TEXT} key={item} content={item} index={index + 1} />
  944. ))}
  945. </>
  946. )}
  947. {previewSwitched && docForm === DocForm.QA && !fileIndexingEstimate?.qa_preview && (
  948. <div className='flex items-center justify-center h-[200px]'>
  949. <Loading type='area' />
  950. </div>
  951. )}
  952. {!previewSwitched && !fileIndexingEstimate?.preview && (
  953. <div className='flex items-center justify-center h-[200px]'>
  954. <Loading type='area' />
  955. </div>
  956. )}
  957. </div>
  958. </div>}
  959. {!showPreview && (
  960. <div className={cn(s.sideTip)}>
  961. <div className={s.tipCard}>
  962. <span className={s.icon} />
  963. <div className={s.title}>{t('datasetCreation.stepTwo.sideTipTitle')}</div>
  964. <div className={s.content}>
  965. <p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP1')}</p>
  966. <p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP2')}</p>
  967. <p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP3')}</p>
  968. <p>{t('datasetCreation.stepTwo.sideTipP4')}</p>
  969. </div>
  970. </div>
  971. </div>
  972. )}
  973. </FloatRightContainer>
  974. </div>
  975. )
  976. }
  977. export default StepTwo