// datasets.ts
  import type { DataSourceNotionPage, DataSourceProvider } from './common'
  import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
  import type { Tag } from '@/app/components/base/tag-management/constant'
  import type { IndexingType } from '@/app/components/datasets/create/step-two'
  import type { MetadataFilteringVariableType } from '@/app/components/workflow/nodes/knowledge-retrieval/types'
  import type { MetadataItemWithValue } from '@/app/components/datasets/metadata/types'
  /** String identifiers for the supported data source kinds. */
  export enum DataSourceType {
    FILE = 'upload_file',
    NOTION = 'notion_import',
    WEB = 'website_crawl',
  }
  /** Permission values that can be attached to a dataset. */
  export enum DatasetPermission {
    onlyMe = 'only_me',
    allTeamMembers = 'all_team_members',
    partialMembers = 'partial_members',
  }
  /** Chunking modes, used as the `doc_form` value of datasets and documents. */
  export enum ChunkingMode {
    text = 'text_model', // General text
    qa = 'qa_model', // General QA
    parentChild = 'hierarchical_model', // Parent-Child
  }
  /** A metadata entry (id, name, typed value) as listed on a dataset's `doc_metadata`. */
  export type MetadataInDoc = {
    value: string
    id: string
    type: MetadataFilteringVariableType
    name: string
  }
  /**
   * Shape of a dataset (knowledge base) record.
   *
   * Both `retrieval_model_dict` and `retrieval_model` are declared with the
   * same `RetrievalConfig` type.
   */
  export type DataSet = {
    // NOTE(review): left as `any` so existing callers keep compiling; narrow
    // once the actual shape of `categories` is confirmed.
    categories: any
    id: string
    name: string
    icon: string
    icon_background: string
    description: string
    permission: DatasetPermission
    data_source_type: DataSourceType
    indexing_technique: IndexingType
    created_by: string
    updated_by: string
    updated_at: number
    app_count: number
    doc_form: ChunkingMode
    document_count: number
    word_count: number
    provider: string
    embedding_model: string
    embedding_model_provider: string
    embedding_available: boolean
    retrieval_model_dict: RetrievalConfig
    retrieval_model: RetrievalConfig
    tags: Tag[]
    partial_member_list?: string[]
    external_knowledge_info: {
      external_knowledge_id: string
      external_knowledge_api_id: string
      external_knowledge_api_name: string
      external_knowledge_api_endpoint: string
    }
    external_retrieval_model: {
      top_k: number
      score_threshold: number
      score_threshold_enabled: boolean
    }
    built_in_field_enabled: boolean
    doc_metadata?: MetadataInDoc[]
  }
  /** An external knowledge API entry, with its connection settings and bound datasets. */
  export type ExternalAPIItem = {
    id: string
    tenant_id: string
    name: string
    description: string
    settings: {
      endpoint: string
      api_key: string
    }
    dataset_bindings: { id: string; name: string }[]
    created_by: string
    created_at: string
  }
  /**
   * A knowledge item whose `provider` is the literal `'external'`.
   * `data_source_type` and `indexing_technique` are always `null` on this shape.
   */
  export type ExternalKnowledgeItem = {
    id: string
    name: string
    description: string | null
    provider: 'external'
    permission: DatasetPermission
    data_source_type: null
    indexing_technique: null
    app_count: number
    document_count: number
    word_count: number
    created_by: string
    created_at: string
    updated_by: string
    updated_at: string
    tags: Tag[]
  }
  /** Result payload of deleting an external API: either `'success'` or `'error'`. */
  export type ExternalAPIDeleteResponse = {
    result: 'success' | 'error'
  }
  /** Usage information for an external API: an in-use flag plus a count. */
  export type ExternalAPIUsage = {
    is_using: boolean
    count: number
  }
  /** The built-in `File` type extended with optional extra fields. */
  export type CustomFile = File & {
    id?: string
    extension?: string
    mime_type?: string
    created_by?: string
    created_at?: number
  }
  /** Minimal document reference: id, name and file extension. */
  export type DocumentItem = {
    id: string
    name: string
    extension: string
  }
  /**
   * Options for a website crawl.
   * `limit` and `max_depth` accept either a number or a string.
   */
  export type CrawlOptions = {
    crawl_sub_pages: boolean
    only_main_content: boolean
    includes: string
    excludes: string
    limit: number | string
    max_depth: number | string
    use_sitemap: boolean
  }
  /** One crawled page: title, markdown body, description and source URL. */
  export type CrawlResultItem = {
    title: string
    markdown: string
    description: string
    source_url: string
  }
  /** A file paired with an id and a numeric progress value. */
  export type FileItem = {
    fileID: string
    file: CustomFile
    progress: number
  }
  /** Request descriptor for listing datasets: target URL plus query parameters. */
  export type FetchDatasetsParams = {
    url: string
    params: {
      page: number
      ids?: string[]
      tag_ids?: string[]
      limit?: number
      include_all?: boolean
      keyword?: string
      category_ids?: string[]
    }
  }
  /** Paginated list of datasets. */
  export type DataSetListResponse = {
    data: DataSet[]
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /** Paginated list of external API entries. */
  export type ExternalAPIListResponse = {
    data: ExternalAPIItem[]
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /** A question/answer pair. */
  export type QA = {
    question: string
    answer: string
  }
  /**
   * Indexing estimate: token/price/segment totals plus preview content.
   * `qa_preview` is optional.
   */
  export type IndexingEstimateResponse = {
    tokens: number
    total_price: number
    currency: string
    total_segments: number
    preview: Array<{ content: string; child_chunks: string[] }>
    qa_preview?: QA[]
  }
  /** `IndexingEstimateResponse` with an additional `total_nodes` count. */
  export type FileIndexingEstimateResponse = {
    total_nodes: number
  } & IndexingEstimateResponse
  /**
   * Indexing progress of a single document.
   *
   * NOTE(review): `completed_at`, `paused_at`, `error` and `stopped_at` are
   * typed `any`; kept as-is for compatibility. Confirm the real payload
   * before narrowing them.
   */
  export type IndexingStatusResponse = {
    id: string
    indexing_status: DocumentIndexingStatus
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    completed_at: any
    paused_at: any
    error: any
    stopped_at: any
    completed_segments: number
    total_segments: number
  }
  /** Wrapper around a list of per-document indexing statuses. */
  export type IndexingStatusBatchResponse = {
    data: IndexingStatusResponse[]
  }
  /**
   * Processing modes. Note that the member names (`general`, `parentChild`)
   * differ from their string values (`'custom'`, `'hierarchical'`).
   */
  export enum ProcessMode {
    general = 'custom',
    parentChild = 'hierarchical',
  }
  /** Allowed values for the `parent_mode` field of `Rules`. */
  export type ParentMode = 'full-doc' | 'paragraph'
  /** A process rule (mode + rules) together with its limits. */
  export type ProcessRuleResponse = {
    mode: ProcessMode
    rules: Rules
    limits: Limits
  }
  /**
   * Processing rules: pre-processing toggles, the main segmentation settings,
   * the parent mode and a second segmentation block for sub-chunks.
   */
  export type Rules = {
    pre_processing_rules: PreProcessingRule[]
    segmentation: Segmentation
    parent_mode: ParentMode
    subchunk_segmentation: Segmentation
  }
  /** Limits attached to a process rule response. */
  export type Limits = {
    indexing_max_segmentation_tokens_length: number
  }
  /** A pre-processing rule identified by `id`, with an on/off flag. */
  export type PreProcessingRule = {
    id: string
    enabled: boolean
  }
  /** Segmentation settings: separator, token cap and an optional overlap. */
  export type Segmentation = {
    separator: string
    max_tokens: number
    chunk_overlap?: number
  }
  /** All document indexing status values, as a readonly tuple of literals. */
  export const DocumentIndexingStatusList = [
    'waiting',
    'parsing',
    'cleaning',
    'splitting',
    'indexing',
    'paused',
    'error',
    'completed',
  ] as const
  /** Union of the literals in `DocumentIndexingStatusList`. */
  export type DocumentIndexingStatus = typeof DocumentIndexingStatusList[number]
  /** All document display status values, as a readonly tuple of literals. */
  export const DisplayStatusList = [
    'queuing',
    'indexing',
    'paused',
    'error',
    'available',
    'enabled',
    'disabled',
    'archived',
  ] as const
  /** Union of the literals in `DisplayStatusList`. */
  export type DocumentDisplayStatus = typeof DisplayStatusList[number]
  /**
   * Source details of a document. `upload_file`, `job_id` and `url` are
   * required by this type; the Notion fields and `provider` are optional.
   */
  export type DataSourceInfo = {
    upload_file: {
      id: string
      name: string
      size: number
      mime_type: string
      created_at: number
      created_by: string
      extension: string
    }
    notion_page_icon?: string
    notion_workspace_id?: string
    notion_page_id?: string
    provider?: DataSourceProvider
    job_id: string
    url: string
  }
  /** Base document fields; extended by `SimpleDocumentDetail`. */
  export type InitialDocumentDetail = {
    id: string
    batch: string
    position: number
    dataset_id: string
    data_source_type: DataSourceType
    data_source_info: DataSourceInfo
    dataset_process_rule_id: string
    name: string
    created_from: 'api' | 'web'
    created_by: string
    created_at: number
    indexing_status: DocumentIndexingStatus
    display_status: DocumentDisplayStatus
    completed_segments?: number
    total_segments?: number
    doc_form: ChunkingMode
    doc_language: string
  }
  /**
   * `InitialDocumentDetail` plus list-level fields (enabled/archived flags,
   * counts, optional error and metadata).
   *
   * `dataset_process_rule_id` is redeclared here as optional, while
   * `InitialDocumentDetail` declares it as required.
   */
  export type SimpleDocumentDetail = InitialDocumentDetail & {
    enabled: boolean
    word_count: number
    is_qa: boolean // TODO waiting for backend to add this field
    error?: string | null
    archived: boolean
    updated_at: number
    hit_count: number
    dataset_process_rule_id?: string
    data_source_detail_dict?: {
      upload_file: {
        name: string
        extension: string
      }
    }
    doc_metadata?: MetadataItemWithValue[]
  }
  /** Paginated list of documents. */
  export type DocumentListResponse = {
    data: SimpleDocumentDetail[]
    has_more: boolean
    total: number
    page: number
    limit: number
  }
  /** Fields shared by document creation and indexing-estimate requests. */
  export type DocumentReq = {
    original_document_id?: string
    indexing_technique?: string
    doc_form: ChunkingMode
    doc_language: string
    process_rule: ProcessRule
  }
  /** `DocumentReq` plus the data source, retrieval config and embedding model fields. */
  export type CreateDocumentReq = DocumentReq & {
    data_source: DataSource
    retrieval_model: RetrievalConfig
    embedding_model: string
    embedding_model_provider: string
  }
  /** `DocumentReq` combined with an all-optional `DataSource` and a required `dataset_id`. */
  export type IndexingEstimateParams = DocumentReq & Partial<DataSource> & {
    dataset_id: string
  }
  /**
   * Data source descriptor. `info_list` carries its own `data_source_type`
   * and optional per-kind lists (Notion, file, website).
   */
  export type DataSource = {
    type: DataSourceType
    info_list: {
      data_source_type: DataSourceType
      notion_info_list?: NotionInfo[]
      file_info_list?: {
        file_ids: string[]
      }
      website_info_list?: {
        provider: string
        job_id: string
        urls: string[]
      }
    }
  }
  /** Notion pages grouped under a workspace id. */
  export type NotionInfo = {
    workspace_id: string
    pages: DataSourceNotionPage[]
  }
  /** Reference to a Notion page: page id and a `type` string. */
  export type NotionPage = {
    page_id: string
    type: string
  }
  /** A processing mode paired with its rules. */
  export type ProcessRule = {
    mode: ProcessMode
    rules: Rules
  }
  /**
   * Result of creating documents: the batch id, the created documents and
   * optionally the dataset.
   *
   * NOTE(review): the type name is not PascalCase; it is kept unchanged so
   * existing imports keep working.
   */
  export type createDocumentResponse = {
    dataset?: DataSet
    batch: string
    documents: InitialDocumentDetail[]
  }
  /**
   * A processing mode paired with its rules.
   *
   * NOTE(review): the name looks like a misspelling of `ProcessRule`, which
   * this file also exports with the same two fields. The name is kept so
   * existing references keep compiling.
   */
  export type PrecessRule = {
    mode: ProcessMode
    rules: Rules
  }
  /**
   * `SimpleDocumentDetail` plus processing timestamps, pause/disable/archive
   * details and both process rules.
   *
   * Things to be aware of when reading this type:
   * - `batch`, `indexing_status` and `doc_metadata` are also declared on the
   *   `SimpleDocumentDetail` side of the intersection. `indexing_status` is
   *   declared as plain `string` here, and `doc_metadata` is declared with a
   *   different type here (`DocMetadata | null`).
   *   NOTE(review): confirm which `doc_metadata` shape is intended.
   * - The trailing index signature allows any additional key with an `any`
   *   value.
   */
  export type FullDocumentDetail = SimpleDocumentDetail & {
    batch: string
    created_api_request_id: string
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    tokens: number
    indexing_latency: number
    completed_at: number
    paused_by: string
    paused_at: number
    stopped_at: number
    indexing_status: string
    disabled_at: number
    disabled_by: string
    archived_reason: 'rule_modified' | 're_upload'
    archived_by: string
    archived_at: number
    doc_type?: DocType | null | 'others'
    doc_metadata?: DocMetadata | null
    segment_count: number
    dataset_process_rule: PrecessRule
    document_process_rule: ProcessRule
    [key: string]: any
  }
  /**
   * Document metadata with a fixed set of named string fields; the index
   * signature permits further string-valued keys.
   */
  export type DocMetadata = {
    title: string
    language: string
    author: string
    publisher: string
    publicationDate: string
    ISBN: string
    category: string
    [key: string]: string
  }
  /** Document type values in the "customizable" group. */
  export const CUSTOMIZABLE_DOC_TYPES = [
    'book',
    'web_page',
    'paper',
    'social_media_post',
    'personal_document',
    'business_document',
    'im_chat_log',
  ] as const
  /** Document type values in the "fixed" group. */
  export const FIXED_DOC_TYPES = ['synced_from_github', 'synced_from_notion', 'wikipedia_entry'] as const
  /** Union of the literals in `CUSTOMIZABLE_DOC_TYPES`. */
  export type CustomizableDocType = typeof CUSTOMIZABLE_DOC_TYPES[number]
  /** Union of the literals in `FIXED_DOC_TYPES`. */
  export type FixedDocType = typeof FIXED_DOC_TYPES[number]
  /** Any document type, customizable or fixed. */
  export type DocType = CustomizableDocType | FixedDocType
  /** Alias of `FullDocumentDetail`. */
  export type DocumentDetailResponse = FullDocumentDetail
  /**
   * All segment status values.
   *
   * Declared `as const` so that `SegmentStatus` below is the union of these
   * four literals; without it the array is inferred as `string[]` and the
   * derived type is just `string`.
   */
  export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing'] as const
  /** Union of the literals in `SEGMENT_STATUS_LIST`. */
  export type SegmentStatus = typeof SEGMENT_STATUS_LIST[number]
  /**
   * Query parameters for listing segments. `page` is typed as a string,
   * and `enabled` accepts a boolean or the literal `'all'`.
   */
  export type SegmentsQuery = {
    page?: string
    limit: number
    // status?: SegmentStatus
    hit_count_gte?: number
    keyword?: string
    enabled?: boolean | 'all'
  }
  /**
   * Full record of a segment, including index node identifiers, status,
   * timestamps and optional `answer` / `child_chunks`.
   */
  export type SegmentDetailModel = {
    id: string
    position: number
    document_id: string
    content: string
    sign_content: string
    word_count: number
    tokens: number
    keywords: string[]
    index_node_id: string
    index_node_hash: string
    hit_count: number
    enabled: boolean
    disabled_at: number
    disabled_by: string
    status: SegmentStatus
    created_by: string
    created_at: number
    indexing_at: number
    completed_at: number
    error: string | null
    stopped_at: number
    answer?: string
    child_chunks?: ChildChunkDetail[]
    updated_at: number
  }
  /** Paginated list of segments, including a `total_pages` count. */
  export type SegmentsResponse = {
    data: SegmentDetailModel[]
    has_more: boolean
    limit: number
    total: number
    total_pages: number
    page: number
  }
  /** A recorded hit-testing query, with its source and creator. */
  export type HitTestingRecord = {
    id: string
    content: string
    source: 'app' | 'hit_testing' | 'plugin'
    source_app_id: string
    created_by_role: 'account' | 'end_user'
    created_by: string
    created_at: number
  }
  /** A child chunk returned in a hit-testing result, with its score. */
  export type HitTestingChildChunk = {
    id: string
    content: string
    position: number
    score: number
  }
  /**
   * One hit-testing result. Both `segment` and `content` are typed as
   * `Segment`; `child_chunks` may be absent or `null`.
   */
  export type HitTesting = {
    segment: Segment
    content: Segment
    score: number
    tsne_position: TsnePosition
    child_chunks?: HitTestingChildChunk[] | null
  }
  /**
   * One hit-testing result from an external knowledge base. `metadata`
   * carries two `x-amz-bedrock-kb-*` string keys.
   */
  export type ExternalKnowledgeBaseHitTesting = {
    content: string
    title: string
    score: number
    metadata: {
      'x-amz-bedrock-kb-source-uri': string
      'x-amz-bedrock-kb-data-source-id': string
    }
  }
  /** A segment as embedded in hit-testing results, with its parent document. */
  export type Segment = {
    id: string
    document: Document
    content: string
    sign_content: string
    position: number
    word_count: number
    tokens: number
    keywords: string[]
    hit_count: number
    index_node_hash: string
  }
  /**
   * Brief document descriptor referenced by `Segment`.
   * Here `data_source_type` is a plain string rather than `DataSourceType`.
   */
  export type Document = {
    id: string
    data_source_type: string
    name: string
    doc_type: DocType
  }
  /** Paginated list of hit-testing records. */
  export type HitTestingRecordsResponse = {
    data: HitTestingRecord[]
    has_more: boolean
    limit: number
    total: number
    page: number
  }
  /** A 2D coordinate pair. */
  export type TsnePosition = {
    x: number
    y: number
  }
  /** Hit-testing response: the query (with its position) and the matching records. */
  export type HitTestingResponse = {
    query: {
      content: string
      tsne_position: TsnePosition
    }
    records: Array<HitTesting>
  }
  /** Hit-testing response for an external knowledge base: the query and its records. */
  export type ExternalKnowledgeBaseHitTestingResponse = {
    query: {
      content: string
    }
    records: Array<ExternalKnowledgeBaseHitTesting>
  }
  /** An app listed as related to a dataset; `icon_type` may be `null`. */
  export type RelatedApp = {
    id: string
    name: string
    mode: AppMode
    icon_type: AppIconType | null
    icon: string
    icon_background: string
    icon_url: string
  }
  /** List of related apps with a total count. */
  export type RelatedAppResponse = {
    data: Array<RelatedApp>
    total: number
  }
  /** Payload for updating a segment; only `content` is required. */
  export type SegmentUpdater = {
    content: string
    answer?: string
    keywords?: string[]
    regenerate_child_chunks?: boolean
  }
  /** List of indexing statuses with a total count. */
  export type ErrorDocsResponse = {
    data: IndexingStatusResponse[]
    total: number
  }
  /** Boolean flags summarising a selection of datasets. */
  export type SelectedDatasetsMode = {
    allHighQuality: boolean
    allHighQualityVectorSearch: boolean
    allHighQualityFullTextSearch: boolean
    allEconomic: boolean
    mixtureHighQualityAndEconomic: boolean
    allInternal: boolean
    allExternal: boolean
    mixtureInternalAndExternal: boolean
    inconsistentEmbeddingModel: boolean
  }
  /** Weighted-score presets. */
  export enum WeightedScoreEnum {
    SemanticFirst = 'semantic_first',
    KeywordFirst = 'keyword_first',
    Customized = 'customized',
  }
  /** Reranking modes: a reranking model or a weighted score. */
  export enum RerankingModeEnum {
    RerankingModel = 'reranking_model',
    WeightedScore = 'weighted_score',
  }
  /**
   * Default semantic/keyword weights, keyed by selection mode. The two
   * weights in every entry add up to 1.
   */
  export const DEFAULT_WEIGHTED_SCORE = {
    allHighQualityVectorSearch: {
      semantic: 1.0,
      keyword: 0,
    },
    allHighQualityFullTextSearch: {
      semantic: 0,
      keyword: 1.0,
    },
    other: {
      semantic: 0.7,
      keyword: 0.3,
    },
  }
  /** Allowed values for the `type` field of `ChildChunkDetail`. */
  export type ChildChunkType = 'automatic' | 'customized'
  /** A child chunk belonging to the segment identified by `segment_id`. */
  export type ChildChunkDetail = {
    id: string
    position: number
    segment_id: string
    content: string
    word_count: number
    created_at: number
    updated_at: number
    type: ChildChunkType
  }
  /** Paginated list of child chunks, including a `total_pages` count. */
  export type ChildSegmentsResponse = {
    data: ChildChunkDetail[]
    total: number
    total_pages: number
    page: number
    limit: number
  }
  /** Identifies a single document within a dataset. */
  export type UpdateDocumentParams = {
    datasetId: string
    documentId: string
  }
  /**
   * Document actions. Used in api url.
   * Note that `unArchive` maps to the string `'un_archive'`.
   */
  export enum DocumentActionType {
    enable = 'enable',
    disable = 'disable',
    archive = 'archive',
    unArchive = 'un_archive',
    delete = 'delete',
  }
  /**
   * Identifies one or more documents within a dataset. `documentIds`
   * accepts either an array of ids or a single string.
   */
  export type UpdateDocumentBatchParams = {
    datasetId: string
    documentId?: string
    documentIds?: string[] | string
  }
  /** Batch import job reference: job id and a status string. */
  export type BatchImportResponse = {
    job_id: string
    job_status: string
  }