datasets.ts 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696
  1. import type { DataSourceNotionPage, DataSourceProvider } from './common'
  2. import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
  3. import type { Tag } from '@/app/components/base/tag-management/constant'
  4. import type { IndexingType } from '@/app/components/datasets/create/step-two'
  5. import type { MetadataFilteringVariableType } from '@/app/components/workflow/nodes/knowledge-retrieval/types'
  6. import type { MetadataItemWithValue } from '@/app/components/datasets/metadata/types'
  /** Kinds of data source a document can come from, keyed to their string identifiers. */
  export enum DataSourceType {
    FILE = 'upload_file',
    NOTION = 'notion_import',
    WEB = 'website_crawl',
  }
  /** Visibility levels a dataset can be assigned, keyed to their string identifiers. */
  export enum DatasetPermission {
    onlyMe = 'only_me',
    allTeamMembers = 'all_team_members',
    partialMembers = 'partial_members',
  }
  /** Document chunking strategies (used as the `doc_form` value on datasets and documents). */
  export enum ChunkingMode {
    /** General text */
    text = 'text_model',
    /** General QA */
    qa = 'qa_model',
    /** Parent-Child */
    parentChild = 'hierarchical_model',
  }
  /** One metadata entry as listed in `DataSet.doc_metadata`: identity, name, variable type and value. */
  export type MetadataInDoc = {
    id: string
    name: string
    type: MetadataFilteringVariableType
    value: string
  }
  /**
   * Shape of a dataset (knowledge base) record.
   * NOTE(review): presumably mirrors the backend dataset payload — confirm against the API.
   */
  export type DataSet = {
    // Identity and presentation
    id: string
    name: string
    icon: string
    icon_background: string
    description: string
    type: string
    tags: Tag[]

    // Access control
    permission: DatasetPermission
    partial_member_list?: string[]

    // Source, chunking and indexing configuration
    data_source_type: DataSourceType
    indexing_technique: IndexingType
    doc_form: ChunkingMode
    provider: string
    embedding_model: string
    embedding_model_provider: string
    embedding_available: boolean
    retrieval_model_dict: RetrievalConfig
    retrieval_model: RetrievalConfig

    // Audit fields and counters
    created_by: string
    updated_by: string
    updated_at: number
    app_count: number
    document_count: number
    word_count: number

    // External knowledge binding
    external_knowledge_info: {
      external_knowledge_id: string
      external_knowledge_api_id: string
      external_knowledge_api_name: string
      external_knowledge_api_endpoint: string
    }
    external_retrieval_model: {
      top_k: number
      score_threshold: number
      score_threshold_enabled: boolean
    }

    // Metadata
    built_in_field_enabled: boolean
    doc_metadata?: MetadataInDoc[]
  }
  /** An external knowledge API entry, including its connection settings and the datasets bound to it. */
  export type ExternalAPIItem = {
    id: string
    tenant_id: string
    name: string
    description: string
    /** Connection settings: endpoint plus API key. */
    settings: {
      endpoint: string
      api_key: string
    }
    /** Id/name pairs of the datasets bound to this API. */
    dataset_bindings: Array<{ id: string; name: string }>
    created_by: string
    created_at: string
  }
  /**
   * An externally provided knowledge item.
   * `provider` is fixed to `'external'`, and `data_source_type` / `indexing_technique` are always `null`.
   */
  export type ExternalKnowledgeItem = {
    id: string
    name: string
    description: string | null
    provider: 'external'
    permission: DatasetPermission
    tags: Tag[]

    // Always null for this shape.
    data_source_type: null
    indexing_technique: null

    // Counters
    app_count: number
    document_count: number
    word_count: number

    // Audit fields (string timestamps here)
    created_by: string
    created_at: string
    updated_by: string
    updated_at: string
  }
  /** Outcome of deleting an external API: `result` is either `'success'` or `'error'`. */
  export type ExternalAPIDeleteResponse = {
    result: 'success' | 'error'
  }
  /** Usage summary for an external API: an in-use flag and a count. */
  export type ExternalAPIUsage = {
    is_using: boolean
    count: number
  }
  /** A `File` extended with optional extra fields (id, extension, MIME type, creator, creation time). */
  export type CustomFile = File & {
    id?: string
    extension?: string
    mime_type?: string
    created_by?: string
    created_at?: number
  }
  /** Minimal document descriptor: id, name and file extension. */
  export type DocumentItem = {
    id: string
    name: string
    extension: string
  }
  /**
   * Options for a website crawl.
   * `limit` and `max_depth` accept either a number or a string.
   */
  export type CrawlOptions = {
    // Toggles
    crawl_sub_pages: boolean
    only_main_content: boolean
    use_sitemap: boolean

    // Include / exclude patterns
    includes: string
    excludes: string

    // Bounds
    limit: number | string
    max_depth: number | string
  }
  /** One crawl result: title, description, markdown content and source URL. */
  export type CrawlResultItem = {
    title: string
    description: string
    markdown: string
    source_url: string
  }
  /** A file entry pairing a `CustomFile` with its id and a numeric progress value. */
  export type FileItem = {
    fileID: string
    file: CustomFile
    progress: number
  }
  /** Arguments for fetching a dataset list: the request URL plus its query parameters. */
  export type FetchDatasetsParams = {
    url: string
    params: {
      /** The only required query parameter. */
      page: number
      limit?: number
      ids?: string[]
      tag_ids?: string[]
      keyword?: string
      include_all?: boolean
    }
  }
  /** One page of `DataSet` records together with paging fields. */
  export type DataSetListResponse = {
    data: Array<DataSet>
    page: number
    limit: number
    total: number
    has_more: boolean
  }
  /** One page of `ExternalAPIItem` records together with paging fields. */
  export type ExternalAPIListResponse = {
    data: Array<ExternalAPIItem>
    page: number
    limit: number
    total: number
    has_more: boolean
  }
  /** A question/answer pair. */
  export type QA = {
    question: string
    answer: string
  }
  /** Indexing estimate: token, price and segment totals, plus preview chunks and optional QA previews. */
  export type IndexingEstimateResponse = {
    // Totals
    tokens: number
    total_price: number
    currency: string
    total_segments: number

    // Previews
    preview: { content: string; child_chunks: string[] }[]
    qa_preview?: QA[]
  }
  /** `IndexingEstimateResponse` extended with a `total_nodes` count. */
  export type FileIndexingEstimateResponse = {
    total_nodes: number
  } & IndexingEstimateResponse
  /**
   * Indexing progress for one document: current status, per-stage timestamps and segment counters.
   * NOTE(review): `completed_at`, `paused_at`, `stopped_at` and `error` are typed `any`;
   * tightening them needs confirmation of the actual payload.
   */
  export type IndexingStatusResponse = {
    id: string
    indexing_status: DocumentIndexingStatus

    // Stage timestamps
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number

    // Loosely typed terminal / interruption fields
    completed_at: any
    paused_at: any
    stopped_at: any
    error: any

    // Segment progress
    completed_segments: number
    total_segments: number
  }
  /** Wrapper holding a list of `IndexingStatusResponse` entries under `data`. */
  export type IndexingStatusBatchResponse = {
    data: Array<IndexingStatusResponse>
  }
  /** Processing modes used by process rules, keyed to their string identifiers. */
  export enum ProcessMode {
    general = 'custom',
    parentChild = 'hierarchical',
  }
  /** Parent mode of a rule set: `'full-doc'` or `'paragraph'`. */
  export type ParentMode =
    | 'full-doc'
    | 'paragraph'
  /** A process rule (mode + rules) returned together with its limits. */
  export type ProcessRuleResponse = {
    mode: ProcessMode
    rules: Rules
    limits: Limits
  }
  /** Rule set: pre-processing rules, segmentation, parent mode and sub-chunk segmentation. */
  export type Rules = {
    pre_processing_rules: Array<PreProcessingRule>
    segmentation: Segmentation
    parent_mode: ParentMode
    subchunk_segmentation: Segmentation
  }
  /** Limits object carrying `indexing_max_segmentation_tokens_length`. */
  export type Limits = {
    indexing_max_segmentation_tokens_length: number
  }
  /** A pre-processing rule identified by `id`, with an on/off flag. */
  export type PreProcessingRule = {
    id: string
    enabled: boolean
  }
  /** Segmentation settings: separator, maximum tokens and an optional chunk overlap. */
  export type Segmentation = {
    separator: string
    max_tokens: number
    chunk_overlap?: number
  }
  /** Ordered list of the indexing status literals; `as const` keeps each entry a literal type. */
  export const DocumentIndexingStatusList = [
    'waiting',
    'parsing',
    'cleaning',
    'splitting',
    'indexing',
    'paused',
    'error',
    'completed',
  ] as const

  /** Union of the literals in `DocumentIndexingStatusList`. */
  export type DocumentIndexingStatus = (typeof DocumentIndexingStatusList)[number]
  /** Ordered list of the display status literals; `as const` keeps each entry a literal type. */
  export const DisplayStatusList = [
    'queuing',
    'indexing',
    'paused',
    'error',
    'available',
    'enabled',
    'disabled',
    'archived',
  ] as const

  /** Union of the literals in `DisplayStatusList`. */
  export type DocumentDisplayStatus = (typeof DisplayStatusList)[number]
  /**
   * Source details attached to a document: an uploaded-file descriptor, optional Notion
   * fields, an optional provider, and a job id / URL pair.
   */
  export type DataSourceInfo = {
    /** Descriptor of the uploaded file. */
    upload_file: {
      id: string
      name: string
      extension: string
      mime_type: string
      size: number
      created_at: number
      created_by: string
    }

    // Optional Notion fields
    notion_workspace_id?: string
    notion_page_id?: string
    notion_page_icon?: string

    provider?: DataSourceProvider
    job_id: string
    url: string
  }
  /** Base document fields; `SimpleDocumentDetail` builds on this shape. */
  export type InitialDocumentDetail = {
    // Identity and placement
    id: string
    name: string
    batch: string
    position: number
    dataset_id: string
    dataset_process_rule_id: string

    // Source
    data_source_type: DataSourceType
    data_source_info: DataSourceInfo

    // Creation
    created_from: 'api' | 'web'
    created_by: string
    created_at: number

    // Status and progress
    indexing_status: DocumentIndexingStatus
    display_status: DocumentDisplayStatus
    completed_segments?: number
    total_segments?: number

    // Form and language
    doc_form: ChunkingMode
    doc_language: string
  }
  /**
   * `InitialDocumentDetail` plus list-level fields (flags, counters, error, metadata).
   * `dataset_process_rule_id` is re-declared as optional here while the base type declares it required.
   */
  export type SimpleDocumentDetail = InitialDocumentDetail & {
    // Flags
    enabled: boolean
    archived: boolean
    is_qa: boolean // TODO waiting for backend to add this field

    // Counters and timestamps
    word_count: number
    hit_count: number
    updated_at: number

    error?: string | null
    dataset_process_rule_id?: string
    data_source_detail_dict?: {
      upload_file: {
        name: string
        extension: string
      }
    }
    doc_metadata?: MetadataItemWithValue[]
  }
  /** One page of `SimpleDocumentDetail` records together with paging fields. */
  export type DocumentListResponse = {
    data: Array<SimpleDocumentDetail>
    page: number
    limit: number
    total: number
    has_more: boolean
  }
  /** Fields shared by document requests; extended by `CreateDocumentReq` and `IndexingEstimateParams`. */
  export type DocumentReq = {
    original_document_id?: string
    indexing_technique?: string
    doc_form: ChunkingMode
    doc_language: string
    process_rule: ProcessRule
  }
  /** `DocumentReq` plus the data source, retrieval config and embedding model fields. */
  export type CreateDocumentReq = DocumentReq & {
    data_source: DataSource
    retrieval_model: RetrievalConfig
    embedding_model: string
    embedding_model_provider: string
  }
  /** `DocumentReq` combined with optional `DataSource` fields and a required `dataset_id`. */
  export type IndexingEstimateParams =
    & DocumentReq
    & Partial<DataSource>
    & { dataset_id: string }
  /**
   * Data source descriptor: a type plus an `info_list` whose optional sub-lists
   * cover Notion pages, uploaded files and website crawls.
   */
  export type DataSource = {
    type: DataSourceType
    info_list: {
      data_source_type: DataSourceType
      notion_info_list?: Array<NotionInfo>
      file_info_list?: {
        file_ids: string[]
      }
      website_info_list?: {
        provider: string
        job_id: string
        urls: string[]
      }
    }
  }
  /** A Notion workspace id with its list of pages. */
  export type NotionInfo = {
    workspace_id: string
    pages: Array<DataSourceNotionPage>
  }
  /** A Notion page reference: page id and a type string. */
  export type NotionPage = {
    page_id: string
    type: string
  }
  /** A processing rule: the mode and the rule set it applies. */
  export type ProcessRule = {
    mode: ProcessMode
    rules: Rules
  }
  /**
   * Result of creating documents: the batch id, the created documents and optionally the dataset.
   * NOTE(review): the type name is lowerCamelCase, unlike the other types here; it is kept
   * as-is because importers reference it by this name.
   */
  export type createDocumentResponse = {
    batch: string
    documents: Array<InitialDocumentDetail>
    dataset?: DataSet
  }
  /**
   * Same members as `ProcessRule` (mode + rules).
   * NOTE(review): the name looks like a misspelling of `ProcessRule`; kept because it is
   * exported and used by `FullDocumentDetail.dataset_process_rule`.
   */
  export type PrecessRule = {
    mode: ProcessMode
    rules: Rules
  }
  /**
   * `SimpleDocumentDetail` plus full processing, pause/stop, disable and archive details.
   *
   * NOTE(review): `doc_metadata` and `indexing_status` are also declared on the base types
   * with different types, so the intersection combines both declarations — confirm that is intended.
   * NOTE(review): the trailing `[key: string]: any` index signature lets any extra key type-check.
   */
  export type FullDocumentDetail = SimpleDocumentDetail & {
    batch: string
    created_api_request_id: string

    // Processing timeline and cost
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    completed_at: number
    tokens: number
    indexing_latency: number
    indexing_status: string
    segment_count: number

    // Pause / stop
    paused_by: string
    paused_at: number
    stopped_at: number

    // Disable
    disabled_at: number
    disabled_by: string

    // Archive
    archived_reason: 'rule_modified' | 're_upload'
    archived_by: string
    archived_at: number

    // Document type and metadata
    doc_type?: DocType | null | 'others'
    doc_metadata?: DocMetadata | null

    // Process rules
    dataset_process_rule: PrecessRule
    document_process_rule: ProcessRule

    [key: string]: any
  }
  /** String-valued document metadata: a set of named fields plus any additional string keys. */
  export type DocMetadata = {
    title: string
    author: string
    language: string
    publisher: string
    publicationDate: string
    ISBN: string
    category: string
    /** Any further key maps to a string as well. */
    [key: string]: string
  }
  /** Document type literals grouped as "customizable". */
  export const CUSTOMIZABLE_DOC_TYPES = [
    'book',
    'web_page',
    'paper',
    'social_media_post',
    'personal_document',
    'business_document',
    'im_chat_log',
  ] as const

  /** Document type literals grouped as "fixed". */
  export const FIXED_DOC_TYPES = [
    'synced_from_github',
    'synced_from_notion',
    'wikipedia_entry',
  ] as const

  /** Union of the literals in `CUSTOMIZABLE_DOC_TYPES`. */
  export type CustomizableDocType = (typeof CUSTOMIZABLE_DOC_TYPES)[number]
  /** Union of the literals in `FIXED_DOC_TYPES`. */
  export type FixedDocType = (typeof FIXED_DOC_TYPES)[number]
  /** Any document type literal from either list. */
  export type DocType = CustomizableDocType | FixedDocType
  /** Alias of `FullDocumentDetail`. */
  export type DocumentDetailResponse = FullDocumentDetail
  /**
   * Status literals a segment can carry.
   *
   * Declared `as const`, like the other status lists in this file, so the derived
   * `SegmentStatus` is the literal union `'waiting' | 'completed' | 'error' | 'indexing'`.
   * Without the const assertion the array is inferred as `string[]` and `SegmentStatus`
   * degrades to plain `string`, which disables checking on `SegmentDetailModel.status`.
   */
  export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing'] as const

  /** Union of the literals in `SEGMENT_STATUS_LIST`. */
  export type SegmentStatus = (typeof SEGMENT_STATUS_LIST)[number]
  /**
   * Query parameters for listing segments.
   * `page` is a string here, and `enabled` accepts a boolean or the literal `'all'`.
   */
  export type SegmentsQuery = {
    limit: number
    page?: string
    // status?: SegmentStatus
    keyword?: string
    hit_count_gte?: number
    enabled?: boolean | 'all'
  }
  /** Full record for one segment: content, index node references, status, timestamps and optional child chunks. */
  export type SegmentDetailModel = {
    // Identity and placement
    id: string
    document_id: string
    position: number

    // Content
    content: string
    sign_content: string
    answer?: string
    keywords: string[]
    word_count: number
    tokens: number
    child_chunks?: Array<ChildChunkDetail>

    // Index node references
    index_node_id: string
    index_node_hash: string

    // State
    status: SegmentStatus
    enabled: boolean
    hit_count: number
    error: string | null

    // Audit fields and timestamps
    created_by: string
    created_at: number
    updated_at: number
    indexing_at: number
    completed_at: number
    stopped_at: number
    disabled_at: number
    disabled_by: string
  }
  /** One page of `SegmentDetailModel` records together with paging fields (including `total_pages`). */
  export type SegmentsResponse = {
    data: Array<SegmentDetailModel>
    page: number
    limit: number
    total: number
    total_pages: number
    has_more: boolean
  }
  /** A stored hit-testing query record, with where it came from and who created it. */
  export type HitTestingRecord = {
    id: string
    content: string

    // Origin
    source: 'app' | 'hit_testing' | 'plugin'
    source_app_id: string

    // Creator
    created_by_role: 'account' | 'end_user'
    created_by: string
    created_at: number
  }
  /** A child chunk in a hit-testing result: id, content, position and score. */
  export type HitTestingChildChunk = {
    id: string
    position: number
    content: string
    score: number
  }
  /**
   * One hit-testing result: the matched segment, its score and t-SNE position, and optional child chunks.
   * Both `segment` and `content` are typed as `Segment`.
   */
  export type HitTesting = {
    segment: Segment
    content: Segment
    score: number
    tsne_position: TsnePosition
    child_chunks?: Array<HitTestingChildChunk> | null
  }
  /** One hit-testing result from an external knowledge base, with its `x-amz-bedrock-kb-*` metadata keys. */
  export type ExternalKnowledgeBaseHitTesting = {
    title: string
    content: string
    score: number
    metadata: {
      'x-amz-bedrock-kb-source-uri': string
      'x-amz-bedrock-kb-data-source-id': string
    }
  }
  /** A segment as embedded in hit-testing results, including the document it belongs to. */
  export type Segment = {
    id: string
    document: Document
    position: number

    // Content
    content: string
    sign_content: string
    keywords: string[]

    // Counters and index hash
    word_count: number
    tokens: number
    hit_count: number
    index_node_hash: string
  }
  /**
   * Brief document descriptor referenced by `Segment.document`.
   * NOTE(review): shares its name with the DOM global `Document`; files that use it must import it explicitly.
   */
  export type Document = {
    id: string
    name: string
    data_source_type: string
    doc_type: DocType
  }
  /** One page of `HitTestingRecord` entries together with paging fields. */
  export type HitTestingRecordsResponse = {
    data: Array<HitTestingRecord>
    page: number
    limit: number
    total: number
    has_more: boolean
  }
  /** A 2D coordinate pair (`x`, `y`). */
  export type TsnePosition = {
    x: number
    y: number
  }
  /** Hit-testing response: the query (content + t-SNE position) and the matching records. */
  export type HitTestingResponse = {
    query: {
      content: string
      tsne_position: TsnePosition
    }
    records: HitTesting[]
  }
  /** External knowledge base hit-testing response: the query content and the matching records. */
  export type ExternalKnowledgeBaseHitTestingResponse = {
    query: {
      content: string
    }
    records: ExternalKnowledgeBaseHitTesting[]
  }
  /** An app entry as listed in `RelatedAppResponse`: id, name, mode and icon fields. */
  export type RelatedApp = {
    id: string
    name: string
    mode: AppMode

    // Icon
    icon_type: AppIconType | null
    icon: string
    icon_background: string
    icon_url: string
  }
  /** List of `RelatedApp` entries with a total count. */
  export type RelatedAppResponse = {
    data: RelatedApp[]
    total: number
  }
  /** Payload for updating a segment: `content` is required, the remaining fields are optional. */
  export type SegmentUpdater = {
    content: string
    answer?: string
    keywords?: string[]
    regenerate_child_chunks?: boolean
  }
  /** List of `IndexingStatusResponse` entries with a total count. */
  export type ErrorDocsResponse = {
    data: Array<IndexingStatusResponse>
    total: number
  }
  /** Boolean flags summarizing a selection of datasets. */
  export type SelectedDatasetsMode = {
    // Indexing quality
    allHighQuality: boolean
    allHighQualityVectorSearch: boolean
    allHighQualityFullTextSearch: boolean
    allEconomic: boolean
    mixtureHighQualityAndEconomic: boolean

    // Internal vs. external
    allInternal: boolean
    allExternal: boolean
    mixtureInternalAndExternal: boolean

    // Embedding model
    inconsistentEmbeddingModel: boolean
  }
  /** Weighted-score presets, keyed to their string identifiers. */
  export enum WeightedScoreEnum {
    SemanticFirst = 'semantic_first',
    KeywordFirst = 'keyword_first',
    Customized = 'customized',
  }
  /** Reranking modes, keyed to their string identifiers. */
  export enum RerankingModeEnum {
    RerankingModel = 'reranking_model',
    WeightedScore = 'weighted_score',
  }
  /**
   * Default semantic/keyword weights, keyed by scenario.
   * Two keys match flag names on `SelectedDatasetsMode`; `other` holds the remaining default.
   */
  export const DEFAULT_WEIGHTED_SCORE = {
    allHighQualityVectorSearch: { semantic: 1.0, keyword: 0 },
    allHighQualityFullTextSearch: { semantic: 0, keyword: 1.0 },
    other: { semantic: 0.7, keyword: 0.3 },
  }
  /** Kind of a child chunk: `'automatic'` or `'customized'`. */
  export type ChildChunkType =
    | 'automatic'
    | 'customized'
  /** Record for one child chunk belonging to a segment (`segment_id`). */
  export type ChildChunkDetail = {
    id: string
    segment_id: string
    position: number
    type: ChildChunkType
    content: string
    word_count: number
    created_at: number
    updated_at: number
  }
  /** One page of `ChildChunkDetail` records together with paging fields. */
  export type ChildSegmentsResponse = {
    data: Array<ChildChunkDetail>
    page: number
    limit: number
    total: number
    total_pages: number
  }
  /** Identifies a single document within a dataset. */
  export type UpdateDocumentParams = {
    datasetId: string
    documentId: string
  }
  /**
   * Actions that can be applied to documents.
   * Used in api url, so the string values must stay exactly as written.
   */
  export enum DocumentActionType {
    enable = 'enable',
    disable = 'disable',
    archive = 'archive',
    unArchive = 'un_archive',
    delete = 'delete',
  }
  /**
   * Identifies one or more documents within a dataset.
   * `documentIds` accepts either an array of ids or a single string.
   */
  export type UpdateDocumentBatchParams = {
    datasetId: string
    documentId?: string
    documentIds?: string | string[]
  }
  /** Batch import result: the job id and its status string. */
  export type BatchImportResponse = {
    job_id: string
    job_status: string
  }