datasets.ts 14 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704
  1. import type { DataSourceNotionPage, DataSourceProvider } from './common'
  2. import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
  3. import type { Tag } from '@/app/components/base/tag-management/constant'
  4. import type { IndexingType } from '@/app/components/datasets/create/step-two'
  5. import type { MetadataFilteringVariableType } from '@/app/components/workflow/nodes/knowledge-retrieval/types'
  6. import type { MetadataItemWithValue } from '@/app/components/datasets/metadata/types'
  /**
   * String enum of data source kinds. Each member maps to a snake_case
   * string value.
   */
  export enum DataSourceType {
    FILE = 'upload_file',
    NOTION = 'notion_import',
    WEB = 'website_crawl',
  }
  /**
   * String enum of dataset permission levels. Member names are camelCase,
   * values are snake_case strings.
   */
  export enum DatasetPermission {
    onlyMe = 'only_me',
    allTeamMembers = 'all_team_members',
    partialMembers = 'partial_members',
  }
  17. export enum ChunkingMode {
  18. text = 'text_model', // General text
  19. qa = 'qa_model', // General QA
  20. parentChild = 'hierarchical_model', // Parent-Child
  21. }
  /**
   * A single metadata entry: `id`, `name`, a string `value`, and a `type`
   * drawn from `MetadataFilteringVariableType`.
   */
  export type MetadataInDoc = {
    value: string
    id: string
    type: MetadataFilteringVariableType
    name: string
  }
  /**
   * Shape of a dataset record: identity and icon fields, permission,
   * data-source / indexing settings, counters, embedding and retrieval
   * configuration, tags, and external-knowledge details.
   *
   * NOTE(review): `categories` is typed `any`, so it is not type-checked;
   * its shape is not visible in this file. Both `retrieval_model_dict` and
   * `retrieval_model` are declared with the same `RetrievalConfig` type.
   */
  export type DataSet = {
    categories: any
    id: string
    name: string
    icon: string
    icon_background: string
    description: string
    permission: DatasetPermission
    data_source_type: DataSourceType
    indexing_technique: IndexingType
    created_by: string
    updated_by: string
    updated_at: number
    app_count: number
    doc_form: ChunkingMode
    document_count: number
    word_count: number
    provider: string
    embedding_model: string
    embedding_model_provider: string
    embedding_available: boolean
    retrieval_model_dict: RetrievalConfig
    retrieval_model: RetrievalConfig
    tags: Tag[]
    partial_member_list?: string[]
    external_knowledge_info: {
      external_knowledge_id: string
      external_knowledge_api_id: string
      external_knowledge_api_name: string
      external_knowledge_api_endpoint: string
    }
    external_retrieval_model: {
      top_k: number
      score_threshold: number
      score_threshold_enabled: boolean
    }
    built_in_field_enabled: boolean
    doc_metadata?: MetadataInDoc[]
  }
  /**
   * Shape of an external API entry: identity, tenant, display fields, a
   * `settings` object holding `endpoint` and `api_key`, and the list of
   * datasets bound to it (`dataset_bindings`).
   */
  export type ExternalAPIItem = {
    id: string
    tenant_id: string
    name: string
    description: string
    settings: {
      endpoint: string
      api_key: string
    }
    dataset_bindings: Array<{ id: string; name: string }>
    created_by: string
    created_at: string
  }
  /**
   * Shape of an external knowledge entry. `provider` is fixed to the literal
   * `'external'`, and `data_source_type` / `indexing_technique` are typed as
   * `null`. Unlike `DataSet`, the `*_at` timestamps here are strings.
   */
  export type ExternalKnowledgeItem = {
    id: string
    name: string
    description: string | null
    provider: 'external'
    permission: DatasetPermission
    data_source_type: null
    indexing_technique: null
    app_count: number
    document_count: number
    word_count: number
    created_by: string
    created_at: string
    updated_by: string
    updated_at: string
    tags: Tag[]
  }
  /** Result object whose `result` is either `'success'` or `'error'`. */
  export type ExternalAPIDeleteResponse = {
    result: 'success' | 'error'
  }
  /** Usage summary: an `is_using` flag plus a numeric `count`. */
  export type ExternalAPIUsage = {
    is_using: boolean
    count: number
  }
  /**
   * The built-in `File` type extended with optional extra fields
   * (`id`, `extension`, `mime_type`, `created_by`, `created_at`).
   */
  export type CustomFile = File & {
    id?: string
    extension?: string
    mime_type?: string
    created_by?: string
    created_at?: number
  }
  /** Minimal document descriptor: `id`, `name` and file `extension`. */
  export type DocumentItem = {
    id: string
    name: string
    extension: string
  }
  /**
   * Options object for a website crawl. `limit` and `max_depth` accept either
   * a number or a string; `includes` / `excludes` are plain strings.
   */
  export type CrawlOptions = {
    crawl_sub_pages: boolean
    only_main_content: boolean
    includes: string
    excludes: string
    limit: number | string
    max_depth: number | string
    use_sitemap: boolean
  }
  /** One crawl result: `title`, `markdown`, `description` and `source_url`. */
  export type CrawlResultItem = {
    title: string
    markdown: string
    description: string
    source_url: string
  }
  /** Pairs a `CustomFile` with a `fileID` and a numeric `progress` value. */
  export type FileItem = {
    fileID: string
    file: CustomFile
    progress: number
  }
  /**
   * Arguments for fetching a dataset list: the request `url` plus a `params`
   * object. Only `params.page` is required; all other filters are optional.
   *
   * Note that `creatorDept` and `authType` are camelCase while the remaining
   * keys are snake_case; the names are kept exactly as declared.
   */
  export type FetchDatasetsParams = {
    url: string
    params: {
      page: number
      ids?: string[]
      tag_ids?: string[]
      limit?: number
      include_all?: boolean
      keyword?: string
      category_ids?: string[]
      type?: string
      creatorDept?: string
      authType?: string
    }
  }
  /**
   * Paginated list of `DataSet` items: a `data` array plus the `has_more`,
   * `limit`, `page` and `total` fields.
   */
  export type DataSetListResponse = {
    data: DataSet[]
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /**
   * Paginated list of `ExternalAPIItem` entries: a `data` array plus the
   * `has_more`, `limit`, `page` and `total` fields.
   */
  export type ExternalAPIListResponse = {
    data: ExternalAPIItem[]
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /** A question/answer pair, both plain strings. */
  export type QA = {
    question: string
    answer: string
  }
  /**
   * Indexing estimate: token count, price and currency, total segment count,
   * and a `preview` list in which every entry carries `content` plus its
   * `child_chunks`. `qa_preview` is optional.
   */
  export type IndexingEstimateResponse = {
    tokens: number
    total_price: number
    currency: string
    total_segments: number
    preview: { content: string; child_chunks: string[] }[]
    qa_preview?: QA[]
  }
  /** `IndexingEstimateResponse` extended with a numeric `total_nodes` field. */
  export type FileIndexingEstimateResponse = {
    total_nodes: number
  } & IndexingEstimateResponse
  /**
   * Indexing progress for one item: its `indexing_status`, per-stage
   * timestamps, and completed/total segment counters.
   *
   * NOTE(review): `completed_at`, `paused_at`, `error` and `stopped_at` are
   * typed `any` and therefore unchecked. `FullDocumentDetail` in this file
   * declares the same-named timestamp fields as `number`; confirm whether
   * these can be tightened.
   */
  export type IndexingStatusResponse = {
    id: string
    indexing_status: DocumentIndexingStatus
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    completed_at: any
    paused_at: any
    error: any
    stopped_at: any
    completed_segments: number
    total_segments: number
  }
  /** Wrapper holding an array of `IndexingStatusResponse` under `data`. */
  export type IndexingStatusBatchResponse = {
    data: IndexingStatusResponse[]
  }
  /**
   * String enum of process-rule modes. Note that the `general` member maps
   * to the string `'custom'`.
   */
  export enum ProcessMode {
    general = 'custom',
    parentChild = 'hierarchical',
  }
  /** Literal union of the two parent modes (type of `Rules.parent_mode`). */
  export type ParentMode = 'full-doc' | 'paragraph'
  /** A process rule (`mode` + `rules`) together with its `limits`. */
  export type ProcessRuleResponse = {
    mode: ProcessMode
    rules: Rules
    limits: Limits
  }
  /**
   * Rule set: the pre-processing rules, a `segmentation` setting, the
   * `parent_mode`, and a second `Segmentation` for sub-chunks
   * (`subchunk_segmentation`).
   */
  export type Rules = {
    pre_processing_rules: PreProcessingRule[]
    segmentation: Segmentation
    parent_mode: ParentMode
    subchunk_segmentation: Segmentation
  }
  /** Limits object with a single numeric field. */
  export type Limits = {
    indexing_max_segmentation_tokens_length: number
  }
  /** A pre-processing rule identified by `id` with an `enabled` toggle. */
  export type PreProcessingRule = {
    id: string
    enabled: boolean
  }
  /**
   * Segmentation settings: a `separator` string, `max_tokens`, and an
   * optional `chunk_overlap`.
   */
  export type Segmentation = {
    separator: string
    max_tokens: number
    chunk_overlap?: number
  }
  /**
   * Readonly tuple of the document indexing status strings. Declared
   * `as const` so the element type below is a literal union.
   */
  export const DocumentIndexingStatusList = [
    'waiting',
    'parsing',
    'cleaning',
    'splitting',
    'indexing',
    'paused',
    'error',
    'completed',
  ] as const

  /** Union of the literals contained in `DocumentIndexingStatusList`. */
  export type DocumentIndexingStatus = (typeof DocumentIndexingStatusList)[number]
  /**
   * Readonly tuple of the document display status strings. Declared
   * `as const` so the element type below is a literal union.
   */
  export const DisplayStatusList = [
    'queuing',
    'indexing',
    'paused',
    'error',
    'available',
    'enabled',
    'disabled',
    'archived',
  ] as const

  /** Union of the literals contained in `DisplayStatusList`. */
  export type DocumentDisplayStatus = (typeof DisplayStatusList)[number]
  /**
   * Source details attached to a document. This single shape carries an
   * `upload_file` object, optional `notion_*` fields, an optional `provider`,
   * and `job_id` / `url` strings.
   *
   * NOTE(review): `upload_file`, `job_id` and `url` are all declared as
   * required regardless of source kind; confirm whether every source
   * actually populates them.
   */
  export type DataSourceInfo = {
    upload_file: {
      id: string
      name: string
      size: number
      mime_type: string
      created_at: number
      created_by: string
      extension: string
    }
    notion_page_icon?: string
    notion_workspace_id?: string
    notion_page_id?: string
    provider?: DataSourceProvider
    job_id: string
    url: string
  }
  /**
   * Base document shape that `SimpleDocumentDetail` (and, through it,
   * `FullDocumentDetail`) extends: identity, batch/position, data source,
   * creation info, indexing and display status, and chunking form/language.
   */
  export type InitialDocumentDetail = {
    id: string
    batch: string
    position: number
    dataset_id: string
    data_source_type: DataSourceType
    data_source_info: DataSourceInfo
    dataset_process_rule_id: string
    name: string
    created_from: 'api' | 'web'
    created_by: string
    created_at: number
    indexing_status: DocumentIndexingStatus
    display_status: DocumentDisplayStatus
    completed_segments?: number
    total_segments?: number
    doc_form: ChunkingMode
    doc_language: string
  }
  /**
   * `InitialDocumentDetail` extended with list-level fields: enable/archive
   * flags, counters, optional error text, metadata, and the `check_*` /
   * `enable_application` fields.
   *
   * `dataset_process_rule_id` is declared optional here but required in
   * `InitialDocumentDetail`; because the two are intersected, the property
   * stays required on the resulting type.
   */
  export type SimpleDocumentDetail = InitialDocumentDetail & {
    enabled: boolean
    word_count: number
    is_qa: boolean // TODO waiting for backend to add this field
    error?: string | null
    archived: boolean
    updated_at: number
    hit_count: number
    dataset_process_rule_id?: string
    data_source_detail_dict?: {
      upload_file: {
        name: string
        extension: string
      }
    }
    doc_metadata?: MetadataItemWithValue[]
    check_status: number
    check_by: string
    enable_application: string
  }
  /**
   * Paginated list of `SimpleDocumentDetail` items: a `data` array plus the
   * `has_more`, `total`, `page` and `limit` fields.
   */
  export type DocumentListResponse = {
    data: SimpleDocumentDetail[]
    has_more: boolean
    total: number
    page: number
    limit: number
  }
  /**
   * Fields shared by `CreateDocumentReq` and `IndexingEstimateParams`:
   * chunking form, language and process rule, plus the optional
   * `original_document_id` and `indexing_technique`.
   */
  export type DocumentReq = {
    original_document_id?: string
    indexing_technique?: string
    doc_form: ChunkingMode
    doc_language: string
    process_rule: ProcessRule
  }
  /**
   * `DocumentReq` extended with the data source, the retrieval
   * configuration, and the embedding model and its provider.
   */
  export type CreateDocumentReq = DocumentReq & {
    data_source: DataSource
    retrieval_model: RetrievalConfig
    embedding_model: string
    embedding_model_provider: string
  }
  /**
   * `DocumentReq` combined with an all-optional copy of `DataSource`
   * (`Partial<DataSource>`) and a required `dataset_id`.
   */
  export type IndexingEstimateParams = DocumentReq & Partial<DataSource> & {
    dataset_id: string
  }
  /**
   * Data source descriptor. `type` and `info_list.data_source_type` are both
   * typed `DataSourceType`; `info_list` then carries one optional payload per
   * source kind (Notion, file, website).
   */
  export type DataSource = {
    type: DataSourceType
    info_list: {
      data_source_type: DataSourceType
      notion_info_list?: NotionInfo[]
      file_info_list?: {
        file_ids: string[]
      }
      website_info_list?: {
        provider: string
        job_id: string
        urls: string[]
      }
    }
  }
  /** A `workspace_id` together with its `DataSourceNotionPage` entries. */
  export type NotionInfo = {
    workspace_id: string
    pages: DataSourceNotionPage[]
  }
  /** Page reference made of a `page_id` and a `type` string. */
  export type NotionPage = {
    page_id: string
    type: string
  }
  /** A process rule: a `ProcessMode` plus the `Rules` that apply to it. */
  export type ProcessRule = {
    mode: ProcessMode
    rules: Rules
  }
  /**
   * Result of creating documents: the `batch` id, the created `documents`,
   * and optionally the `dataset`.
   *
   * The type name starts with a lower-case letter, unlike the other types in
   * this file; it is kept as-is so existing imports keep working.
   */
  export type createDocumentResponse = {
    dataset?: DataSet
    batch: string
    documents: InitialDocumentDetail[]
  }
  /**
   * Structurally identical to `ProcessRule` (same `mode` and `rules`
   * members). The name looks like a misspelling of "ProcessRule"; it is kept
   * because `FullDocumentDetail.dataset_process_rule` refers to it.
   */
  export type PrecessRule = {
    mode: ProcessMode
    rules: Rules
  }
  /**
   * `SimpleDocumentDetail` extended with detail-level fields: processing
   * timestamps, token/latency figures, pause/stop/disable/archive info,
   * document type and metadata, and both process rules.
   *
   * NOTE(review):
   * - `batch` and `indexing_status` repeat members already present on the
   *   base type. `indexing_status: string` intersects with the base's
   *   `DocumentIndexingStatus`, so the effective type remains the narrower
   *   union.
   * - `doc_metadata` is declared here as `DocMetadata | null`, while
   *   `SimpleDocumentDetail` declares it as `MetadataItemWithValue[]`. The
   *   intersection of the two is unlikely to be what callers expect; confirm
   *   the intended shape.
   * - The `[key: string]: any` index signature allows arbitrary extra
   *   properties and so weakens checking of misspelled keys.
   */
  export type FullDocumentDetail = SimpleDocumentDetail & {
    batch: string
    created_api_request_id: string
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    tokens: number
    indexing_latency: number
    completed_at: number
    paused_by: string
    paused_at: number
    stopped_at: number
    indexing_status: string
    disabled_at: number
    disabled_by: string
    archived_reason: 'rule_modified' | 're_upload'
    archived_by: string
    archived_at: number
    doc_type?: DocType | null | 'others'
    doc_metadata?: DocMetadata | null
    segment_count: number
    dataset_process_rule: PrecessRule
    document_process_rule: ProcessRule
    [key: string]: any
  }
  /**
   * String-valued document metadata. Seven keys are named explicitly; the
   * index signature additionally permits any other string key with a string
   * value.
   */
  export type DocMetadata = {
    title: string
    language: string
    author: string
    publisher: string
    publicationDate: string
    ISBN: string
    category: string
    [key: string]: string
  }
  /** Readonly tuple of the document types listed as customizable. */
  export const CUSTOMIZABLE_DOC_TYPES = [
    'book',
    'web_page',
    'paper',
    'social_media_post',
    'personal_document',
    'business_document',
    'im_chat_log',
  ] as const

  /** Readonly tuple of the document types listed as fixed. */
  export const FIXED_DOC_TYPES = ['synced_from_github', 'synced_from_notion', 'wikipedia_entry'] as const

  /** Union of the literals in `CUSTOMIZABLE_DOC_TYPES`. */
  export type CustomizableDocType = (typeof CUSTOMIZABLE_DOC_TYPES)[number]

  /** Union of the literals in `FIXED_DOC_TYPES`. */
  export type FixedDocType = (typeof FIXED_DOC_TYPES)[number]

  /** Any document type, customizable or fixed. */
  export type DocType = CustomizableDocType | FixedDocType
  /** Alias of `FullDocumentDetail`. */
  export type DocumentDetailResponse = FullDocumentDetail
  /**
   * Readonly tuple of the segment status strings.
   *
   * Declared `as const`, like the other status/doc-type lists in this file.
   * Without the const assertion the array is inferred as `string[]`, which
   * made `SegmentStatus` collapse to plain `string` and let any string pass
   * as a status.
   */
  export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing'] as const

  /** Union of the literals contained in `SEGMENT_STATUS_LIST`. */
  export type SegmentStatus = (typeof SEGMENT_STATUS_LIST)[number]
  /**
   * Query object for listing segments. Only `limit` is required. `page` is a
   * string here, and `enabled` accepts a boolean or the literal `'all'`.
   */
  export type SegmentsQuery = {
    page?: string
    limit: number
    // status?: SegmentStatus
    hit_count_gte?: number
    keyword?: string
    enabled?: boolean | 'all'
  }
  /**
   * Full shape of a segment: identity and position, content and signed
   * content, word/token counts, keywords, index node identifiers, hit count,
   * enable/disable info, status and lifecycle timestamps. `answer` and
   * `child_chunks` are optional.
   */
  export type SegmentDetailModel = {
    id: string
    position: number
    document_id: string
    content: string
    sign_content: string
    word_count: number
    tokens: number
    keywords: string[]
    index_node_id: string
    index_node_hash: string
    hit_count: number
    enabled: boolean
    disabled_at: number
    disabled_by: string
    status: SegmentStatus
    created_by: string
    created_at: number
    indexing_at: number
    completed_at: number
    error: string | null
    stopped_at: number
    answer?: string
    child_chunks?: ChildChunkDetail[]
    updated_at: number
  }
  /**
   * Paginated list of `SegmentDetailModel` items. Besides `has_more`,
   * `limit`, `total` and `page`, it also carries `total_pages`.
   */
  export type SegmentsResponse = {
    data: SegmentDetailModel[]
    has_more: boolean
    limit: number
    total: number
    total_pages: number
    page: number
  }
  /**
   * A hit-testing record: its content, the `source` it came from
   * (`'app'`, `'hit_testing'` or `'plugin'`), the source app id, and creator
   * information.
   */
  export type HitTestingRecord = {
    id: string
    content: string
    source: 'app' | 'hit_testing' | 'plugin'
    source_app_id: string
    created_by_role: 'account' | 'end_user'
    created_by: string
    created_at: number
  }
  /** A child chunk in a hit-testing result: id, content, position, score. */
  export type HitTestingChildChunk = {
    id: string
    content: string
    position: number
    score: number
  }
  /**
   * One hit-testing result entry with its `score` and `tsne_position`.
   *
   * Both `segment` and `content` are typed as `Segment`.
   * NOTE(review): confirm which of the two consumers should read.
   * `child_chunks` may be absent, `null`, or a list.
   */
  export type HitTesting = {
    segment: Segment
    content: Segment
    score: number
    tsne_position: TsnePosition
    child_chunks?: HitTestingChildChunk[] | null
  }
  /**
   * One hit-testing result entry for an external knowledge base: content,
   * title, score, and a `metadata` object with two `x-amz-bedrock-kb-*`
   * string keys.
   */
  export type ExternalKnowledgeBaseHitTesting = {
    content: string
    title: string
    score: number
    metadata: {
      'x-amz-bedrock-kb-source-uri': string
      'x-amz-bedrock-kb-data-source-id': string
    }
  }
  /**
   * Segment as embedded in a `HitTesting` entry: content fields, counters,
   * keywords, and the owning `document`.
   *
   * `Document` here is the type declared in this module, not the DOM global
   * of the same name.
   */
  export type Segment = {
    id: string
    document: Document
    content: string
    sign_content: string
    position: number
    word_count: number
    tokens: number
    keywords: string[]
    hit_count: number
    index_node_hash: string
  }
  /**
   * Compact document descriptor referenced by `Segment.document`.
   *
   * Inside this module the name shadows the DOM global `Document`. Here
   * `data_source_type` is a plain string rather than `DataSourceType`.
   */
  export type Document = {
    id: string
    data_source_type: string
    name: string
    doc_type: DocType
  }
  /**
   * Paginated list of `HitTestingRecord` items: a `data` array plus the
   * `has_more`, `limit`, `total` and `page` fields.
   */
  export type HitTestingRecordsResponse = {
    data: HitTestingRecord[]
    has_more: boolean
    limit: number
    total: number
    page: number
  }
  /** A 2-D point with numeric `x` and `y` coordinates. */
  export type TsnePosition = {
    x: number
    y: number
  }
  /**
   * Hit-testing result: the `query` (its content and t-SNE position) and the
   * `HitTesting` records found for it.
   */
  export type HitTestingResponse = {
    query: {
      content: string
      tsne_position: TsnePosition
    }
    records: HitTesting[]
  }
  /**
   * Hit-testing result for an external knowledge base: the `query` content
   * and the matching `ExternalKnowledgeBaseHitTesting` records.
   */
  export type ExternalKnowledgeBaseHitTestingResponse = {
    query: {
      content: string
    }
    records: ExternalKnowledgeBaseHitTesting[]
  }
  /**
   * App descriptor: id, name, `mode`, and icon fields. `icon_type` may be
   * `null`.
   */
  export type RelatedApp = {
    id: string
    name: string
    mode: AppMode
    icon_type: AppIconType | null
    icon: string
    icon_background: string
    icon_url: string
  }
  /** List of `RelatedApp` entries together with a `total` count. */
  export type RelatedAppResponse = {
    data: RelatedApp[]
    total: number
  }
  /**
   * Payload for updating a segment: required `content`, plus optional
   * `answer`, `keywords` and `regenerate_child_chunks`.
   */
  export type SegmentUpdater = {
    content: string
    answer?: string
    keywords?: string[]
    regenerate_child_chunks?: boolean
  }
  /** List of `IndexingStatusResponse` entries together with a `total` count. */
  export type ErrorDocsResponse = {
    data: IndexingStatusResponse[]
    total: number
  }
  /**
   * Set of boolean flags describing a selection of datasets (all/mixed
   * quality, internal vs external, embedding-model consistency).
   */
  export type SelectedDatasetsMode = {
    allHighQuality: boolean
    allHighQualityVectorSearch: boolean
    allHighQualityFullTextSearch: boolean
    allEconomic: boolean
    mixtureHighQualityAndEconomic: boolean
    allInternal: boolean
    allExternal: boolean
    mixtureInternalAndExternal: boolean
    inconsistentEmbeddingModel: boolean
  }
  /** String enum of weighted-score presets. */
  export enum WeightedScoreEnum {
    SemanticFirst = 'semantic_first',
    KeywordFirst = 'keyword_first',
    Customized = 'customized',
  }
  /** String enum of reranking modes. */
  export enum RerankingModeEnum {
    RerankingModel = 'reranking_model',
    WeightedScore = 'weighted_score',
  }
  /**
   * Default semantic/keyword weight pairs. Two keys share their names with
   * flags on `SelectedDatasetsMode` (`allHighQualityVectorSearch`,
   * `allHighQualityFullTextSearch`); `other` holds a third pair.
   */
  export const DEFAULT_WEIGHTED_SCORE = {
    allHighQualityVectorSearch: {
      semantic: 1.0,
      keyword: 0,
    },
    allHighQualityFullTextSearch: {
      semantic: 0,
      keyword: 1.0,
    },
    other: {
      semantic: 0.7,
      keyword: 0.3,
    },
  }
  /** Literal union of the two child-chunk kinds. */
  export type ChildChunkType = 'automatic' | 'customized'

  /**
   * Shape of a child chunk: identity, position, the `segment_id` it refers
   * to, content, word count, timestamps, and its `ChildChunkType`.
   */
  export type ChildChunkDetail = {
    id: string
    position: number
    segment_id: string
    content: string
    word_count: number
    created_at: number
    updated_at: number
    type: ChildChunkType
  }
  /**
   * Paginated list of `ChildChunkDetail` items: a `data` array plus the
   * `total`, `total_pages`, `page` and `limit` fields.
   */
  export type ChildSegmentsResponse = {
    data: ChildChunkDetail[]
    total: number
    total_pages: number
    page: number
    limit: number
  }
  /** Identifies one document by `datasetId` and `documentId`. */
  export type UpdateDocumentParams = {
    datasetId: string
    documentId: string
  }
  /**
   * String enum of document actions.
   *
   * Used in api url, so the string values must not be changed casually.
   * Note `unArchive` maps to `'un_archive'`.
   */
  export enum DocumentActionType {
    enable = 'enable',
    disable = 'disable',
    archive = 'archive',
    unArchive = 'un_archive',
    delete = 'delete',
    check_fail = 'check_fail',
  }
  /**
   * Identifies one or several documents in a dataset. `documentId` is
   * optional, and `documentIds` may be a string array or a single string.
   */
  export type UpdateDocumentBatchParams = {
    datasetId: string
    documentId?: string
    documentIds?: string[] | string
  }
  /** Job handle for a batch import: `job_id` and `job_status` strings. */
  export type BatchImportResponse = {
    job_id: string
    job_status: string
  }