// datasets.ts
  1. import type { DataSourceNotionPage, DataSourceProvider } from './common'
  2. import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
  3. import type { Tag } from '@/app/components/base/tag-management/constant'
  4. import type { IndexingType } from '@/app/components/datasets/create/step-two'
  5. import type { MetadataFilteringVariableType } from '@/app/components/workflow/nodes/knowledge-retrieval/types'
  6. import type { MetadataItemWithValue } from '@/app/components/datasets/metadata/types'
  /** String identifiers for the supported dataset data sources. */
  export enum DataSourceType {
    FILE = 'upload_file',
    NOTION = 'notion_import',
    WEB = 'website_crawl',
  }
  /** String identifiers for the permission levels a dataset can carry. */
  export enum DatasetPermission {
    onlyMe = 'only_me',
    allTeamMembers = 'all_team_members',
    partialMembers = 'partial_members',
  }
  /** String identifiers for the chunking modes (used as the `doc_form` field type in this file). */
  export enum ChunkingMode {
    text = 'text_model', // General text
    qa = 'qa_model', // General QA
    parentChild = 'hierarchical_model', // Parent-Child
  }
  /** A metadata entry (id, name, type and value) as listed in `DataSet.doc_metadata`. */
  export type MetadataInDoc = {
    value: string
    id: string
    type: MetadataFilteringVariableType
    name: string
  }
  /** Shape of a dataset record, including its retrieval and external-knowledge settings. */
  export type DataSet = {
    // NOTE(review): untyped; replace `any` once the payload shape is confirmed.
    categories: any
    id: string
    name: string
    icon: string
    icon_background: string
    description: string
    permission: DatasetPermission
    data_source_type: DataSourceType
    indexing_technique: IndexingType
    created_by: string
    updated_by: string
    updated_at: number
    app_count: number
    doc_form: ChunkingMode
    document_count: number
    word_count: number
    provider: string
    embedding_model: string
    embedding_model_provider: string
    embedding_available: boolean
    retrieval_model_dict: RetrievalConfig
    retrieval_model: RetrievalConfig
    tags: Tag[]
    partial_member_list?: string[]
    external_knowledge_info: {
      external_knowledge_id: string
      external_knowledge_api_id: string
      external_knowledge_api_name: string
      external_knowledge_api_endpoint: string
    }
    external_retrieval_model: {
      top_k: number
      score_threshold: number
      score_threshold_enabled: boolean
    }
    built_in_field_enabled: boolean
    doc_metadata?: MetadataInDoc[]
  }
  /** An external knowledge API entry together with its settings and dataset bindings. */
  export type ExternalAPIItem = {
    id: string
    tenant_id: string
    name: string
    description: string
    settings: {
      endpoint: string
      api_key: string
    }
    dataset_bindings: { id: string; name: string }[]
    created_by: string
    created_at: string
  }
  /**
   * A knowledge item whose `provider` is always `'external'`;
   * `data_source_type` and `indexing_technique` are typed as `null` for it.
   */
  export type ExternalKnowledgeItem = {
    id: string
    name: string
    description: string | null
    provider: 'external'
    permission: DatasetPermission
    data_source_type: null
    indexing_technique: null
    app_count: number
    document_count: number
    word_count: number
    created_by: string
    created_at: string
    updated_by: string
    updated_at: string
    tags: Tag[]
  }
  /** Result payload of an external API delete call. */
  export type ExternalAPIDeleteResponse = {
    result: 'success' | 'error'
  }
  /** Usage flag and count reported for an external API. */
  export type ExternalAPIUsage = {
    is_using: boolean
    count: number
  }
  /** A `File` extended with optional id, extension, MIME type and creation fields. */
  export type CustomFile = File & {
    id?: string
    extension?: string
    mime_type?: string
    created_by?: string
    created_at?: number
  }
  /** Minimal document reference: id, name and file extension. */
  export type DocumentItem = {
    id: string
    name: string
    extension: string
  }
  /** Options for a website crawl; `limit` and `max_depth` accept either a number or a string. */
  export type CrawlOptions = {
    crawl_sub_pages: boolean
    only_main_content: boolean
    includes: string
    excludes: string
    limit: number | string
    max_depth: number | string
    use_sitemap: boolean
  }
  /** One crawled page: title, markdown body, description and source URL. */
  export type CrawlResultItem = {
    title: string
    markdown: string
    description: string
    source_url: string
  }
  /** A file entry pairing a `CustomFile` with an id and a numeric progress value. */
  export type FileItem = {
    fileID: string
    file: CustomFile
    progress: number
  }
  /** Request descriptor for listing datasets: the target URL plus paging and filter params. */
  export type FetchDatasetsParams = {
    url: string
    params: {
      page: number
      ids?: string[]
      tag_ids?: string[]
      limit?: number
      include_all?: boolean
      keyword?: string
      category_ids?: string[]
      type?: string
      dept?: string
    }
  }
  /** Paginated list of `DataSet` records. */
  export type DataSetListResponse = {
    data: DataSet[]
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /** Paginated list of `ExternalAPIItem` records. */
  export type ExternalAPIListResponse = {
    data: ExternalAPIItem[]
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /** A question/answer pair. */
  export type QA = {
    question: string
    answer: string
  }
  /** Indexing estimate: token/price totals, segment count and preview chunks. */
  export type IndexingEstimateResponse = {
    tokens: number
    total_price: number
    currency: string
    total_segments: number
    preview: Array<{ content: string; child_chunks: string[] }>
    qa_preview?: QA[]
  }
  /** `IndexingEstimateResponse` extended with a `total_nodes` count. */
  export type FileIndexingEstimateResponse = {
    total_nodes: number
  } & IndexingEstimateResponse
  /** Indexing progress of one document: current status, per-stage fields and segment counters. */
  export type IndexingStatusResponse = {
    id: string
    indexing_status: DocumentIndexingStatus
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    // NOTE(review): the four fields below are untyped; narrow them once the payload shape is confirmed.
    completed_at: any
    paused_at: any
    error: any
    stopped_at: any
    completed_segments: number
    total_segments: number
  }
  /** Wrapper holding a list of `IndexingStatusResponse` entries. */
  export type IndexingStatusBatchResponse = {
    data: IndexingStatusResponse[]
  }
  /** String identifiers for the process-rule modes. */
  export enum ProcessMode {
    general = 'custom',
    parentChild = 'hierarchical',
  }
  /** Allowed values of `Rules.parent_mode`. */
  export type ParentMode = 'full-doc' | 'paragraph'
  /** A process rule (mode and rules) together with its limits. */
  export type ProcessRuleResponse = {
    mode: ProcessMode
    rules: Rules
    limits: Limits
  }
  /** Processing rules: pre-processing toggles, segmentation settings and the parent mode. */
  export type Rules = {
    pre_processing_rules: PreProcessingRule[]
    segmentation: Segmentation
    parent_mode: ParentMode
    subchunk_segmentation: Segmentation
  }
  /** Limits reported alongside a process rule. */
  export type Limits = {
    indexing_max_segmentation_tokens_length: number
  }
  /** A pre-processing rule identified by `id`, with an on/off flag. */
  export type PreProcessingRule = {
    id: string
    enabled: boolean
  }
  /** Segmentation settings: separator, max tokens and an optional chunk overlap. */
  export type Segmentation = {
    separator: string
    max_tokens: number
    chunk_overlap?: number
  }
  /** All document indexing status values, in declaration order. */
  export const DocumentIndexingStatusList = [
    'waiting',
    'parsing',
    'cleaning',
    'splitting',
    'indexing',
    'paused',
    'error',
    'completed',
  ] as const

  /** Union of the values in `DocumentIndexingStatusList`. */
  export type DocumentIndexingStatus = typeof DocumentIndexingStatusList[number]
  /** All document display status values, in declaration order. */
  export const DisplayStatusList = [
    'queuing',
    'indexing',
    'paused',
    'error',
    'available',
    'enabled',
    'disabled',
    'archived',
  ] as const

  /** Union of the values in `DisplayStatusList`. */
  export type DocumentDisplayStatus = typeof DisplayStatusList[number]
  /**
   * Source details attached to a document: an uploaded-file record, optional
   * Notion page fields and provider, plus `job_id` and `url`.
   */
  export type DataSourceInfo = {
    upload_file: {
      id: string
      name: string
      size: number
      mime_type: string
      created_at: number
      created_by: string
      extension: string
    }
    notion_page_icon?: string
    notion_workspace_id?: string
    notion_page_id?: string
    provider?: DataSourceProvider
    job_id: string
    url: string
  }
  /** Base document fields; `SimpleDocumentDetail` extends this type. */
  export type InitialDocumentDetail = {
    id: string
    batch: string
    position: number
    dataset_id: string
    data_source_type: DataSourceType
    data_source_info: DataSourceInfo
    dataset_process_rule_id: string
    name: string
    created_from: 'api' | 'web'
    created_by: string
    created_at: number
    indexing_status: DocumentIndexingStatus
    display_status: DocumentDisplayStatus
    completed_segments?: number
    total_segments?: number
    doc_form: ChunkingMode
    doc_language: string
  }
  /**
   * `InitialDocumentDetail` plus list-level fields (enabled/archived flags,
   * counters, metadata and check fields).
   */
  export type SimpleDocumentDetail = InitialDocumentDetail & {
    enabled: boolean
    word_count: number
    is_qa: boolean // TODO waiting for backend to add this field
    error?: string | null
    archived: boolean
    updated_at: number
    hit_count: number
    // Also declared (as required) on InitialDocumentDetail.
    dataset_process_rule_id?: string
    data_source_detail_dict?: {
      upload_file: {
        name: string
        extension: string
      }
    }
    doc_metadata?: MetadataItemWithValue[]
    check_status: number
    check_by: string
    enable_application: string
  }
  /** Paginated list of `SimpleDocumentDetail` records. */
  export type DocumentListResponse = {
    data: SimpleDocumentDetail[]
    has_more: boolean
    total: number
    page: number
    limit: number
  }
  /** Fields shared by `CreateDocumentReq` and `IndexingEstimateParams`. */
  export type DocumentReq = {
    original_document_id?: string
    indexing_technique?: string
    doc_form: ChunkingMode
    doc_language: string
    process_rule: ProcessRule
  }
  /** `DocumentReq` plus the data source, retrieval model and embedding model fields. */
  export type CreateDocumentReq = DocumentReq & {
    data_source: DataSource
    retrieval_model: RetrievalConfig
    embedding_model: string
    embedding_model_provider: string
  }
  /** `DocumentReq` plus optional `DataSource` fields and a required `dataset_id`. */
  export type IndexingEstimateParams = DocumentReq & Partial<DataSource> & {
    dataset_id: string
  }
  /** Data source descriptor: a type plus an info list with optional Notion, file and website entries. */
  export type DataSource = {
    type: DataSourceType
    info_list: {
      data_source_type: DataSourceType
      notion_info_list?: NotionInfo[]
      file_info_list?: {
        file_ids: string[]
      }
      website_info_list?: {
        provider: string
        job_id: string
        urls: string[]
      }
    }
  }
  /** A Notion workspace id with its list of pages. */
  export type NotionInfo = {
    workspace_id: string
    pages: DataSourceNotionPage[]
  }
  /** A Notion page reference: page id and type. */
  export type NotionPage = {
    page_id: string
    type: string
  }
  /** A process rule: mode plus rules. */
  export type ProcessRule = {
    mode: ProcessMode
    rules: Rules
  }
  /**
   * Result of a create-document call: an optional dataset, a batch id and the documents.
   * The lower-case type name is kept as is because it is part of the exported interface.
   */
  export type createDocumentResponse = {
    dataset?: DataSet
    batch: string
    documents: InitialDocumentDetail[]
  }
  /**
   * Same shape as `ProcessRule`.
   * NOTE(review): the name looks like a misspelling of `ProcessRule`; it is kept
   * because `FullDocumentDetail.dataset_process_rule` uses it and it is exported.
   */
  export type PrecessRule = {
    mode: ProcessMode
    rules: Rules
  }
  /**
   * `SimpleDocumentDetail` plus processing timestamps, pause/disable/archive
   * fields, document type and process rules.
   *
   * NOTE(review): `batch`, `indexing_status` and `doc_metadata` are also declared
   * on the base types with the same or different types; the intersection combines
   * both declarations. Confirm the intended types before tightening.
   */
  export type FullDocumentDetail = SimpleDocumentDetail & {
    batch: string
    created_api_request_id: string
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    tokens: number
    indexing_latency: number
    completed_at: number
    paused_by: string
    paused_at: number
    stopped_at: number
    indexing_status: string
    disabled_at: number
    disabled_by: string
    archived_reason: 'rule_modified' | 're_upload'
    archived_by: string
    archived_at: number
    doc_type?: DocType | null | 'others'
    doc_metadata?: DocMetadata | null
    segment_count: number
    dataset_process_rule: PrecessRule
    document_process_rule: ProcessRule
    // Open index signature: any additional key is accepted and typed as `any`.
    [key: string]: any
  }
  /** Document metadata with a fixed set of string fields; any further string-valued key is allowed. */
  export type DocMetadata = {
    title: string
    language: string
    author: string
    publisher: string
    publicationDate: string
    ISBN: string
    category: string
    [key: string]: string
  }
  /** Document types in the "customizable" group. */
  export const CUSTOMIZABLE_DOC_TYPES = [
    'book',
    'web_page',
    'paper',
    'social_media_post',
    'personal_document',
    'business_document',
    'im_chat_log',
  ] as const

  /** Document types in the "fixed" group. */
  export const FIXED_DOC_TYPES = ['synced_from_github', 'synced_from_notion', 'wikipedia_entry'] as const

  /** Union of the values in `CUSTOMIZABLE_DOC_TYPES`. */
  export type CustomizableDocType = typeof CUSTOMIZABLE_DOC_TYPES[number]

  /** Union of the values in `FIXED_DOC_TYPES`. */
  export type FixedDocType = typeof FIXED_DOC_TYPES[number]

  /** Any document type from either group. */
  export type DocType = CustomizableDocType | FixedDocType
  /** Alias of `FullDocumentDetail`. */
  export type DocumentDetailResponse = FullDocumentDetail
  /**
   * All segment status values, in declaration order.
   * Declared `as const` (like the other status lists in this file) so that
   * `SegmentStatus` is the union of these literals rather than plain `string`.
   */
  export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing'] as const

  /** Union of the values in `SEGMENT_STATUS_LIST`. */
  export type SegmentStatus = typeof SEGMENT_STATUS_LIST[number]
  /** Query parameters for listing segments; `enabled` accepts a boolean or the literal `'all'`. */
  export type SegmentsQuery = {
    page?: string
    limit: number
    // status?: SegmentStatus
    hit_count_gte?: number
    keyword?: string
    enabled?: boolean | 'all'
  }
  /** Full segment record, including status fields, an optional answer and optional child chunks. */
  export type SegmentDetailModel = {
    id: string
    position: number
    document_id: string
    content: string
    sign_content: string
    word_count: number
    tokens: number
    keywords: string[]
    index_node_id: string
    index_node_hash: string
    hit_count: number
    enabled: boolean
    disabled_at: number
    disabled_by: string
    status: SegmentStatus
    created_by: string
    created_at: number
    indexing_at: number
    completed_at: number
    error: string | null
    stopped_at: number
    answer?: string
    child_chunks?: ChildChunkDetail[]
    updated_at: number
  }
  /** Paginated list of `SegmentDetailModel` records. */
  export type SegmentsResponse = {
    data: SegmentDetailModel[]
    has_more: boolean
    limit: number
    total: number
    total_pages: number
    page: number
  }
  /** One hit-testing record, with its source and creator information. */
  export type HitTestingRecord = {
    id: string
    content: string
    source: 'app' | 'hit_testing' | 'plugin'
    source_app_id: string
    created_by_role: 'account' | 'end_user'
    created_by: string
    created_at: number
  }
  /** A child chunk entry in a hit-testing result: id, content, position and score. */
  export type HitTestingChildChunk = {
    id: string
    content: string
    position: number
    score: number
  }
  /** One hit-testing result entry: segment data, score, t-SNE position and optional child chunks. */
  export type HitTesting = {
    segment: Segment
    content: Segment
    score: number
    tsne_position: TsnePosition
    child_chunks?: HitTestingChildChunk[] | null
  }
  /** One hit-testing result entry from an external knowledge base, with its source metadata keys. */
  export type ExternalKnowledgeBaseHitTesting = {
    content: string
    title: string
    score: number
    metadata: {
      'x-amz-bedrock-kb-source-uri': string
      'x-amz-bedrock-kb-data-source-id': string
    }
  }
  /** Segment data as embedded in `HitTesting`, including its parent `Document`. */
  export type Segment = {
    id: string
    document: Document
    content: string
    sign_content: string
    position: number
    word_count: number
    tokens: number
    keywords: string[]
    hit_count: number
    index_node_hash: string
  }
  /**
   * Document summary referenced by `Segment.document`.
   * Within this module the name shadows the global DOM `Document` type.
   */
  export type Document = {
    id: string
    data_source_type: string
    name: string
    doc_type: DocType
  }
  /** Paginated list of `HitTestingRecord` entries. */
  export type HitTestingRecordsResponse = {
    data: HitTestingRecord[]
    has_more: boolean
    limit: number
    total: number
    page: number
  }
  /** A 2D position with `x` and `y` coordinates. */
  export type TsnePosition = {
    x: number
    y: number
  }
  /** Hit-testing response: the query (content and position) and the matching records. */
  export type HitTestingResponse = {
    query: {
      content: string
      tsne_position: TsnePosition
    }
    records: Array<HitTesting>
  }
  /** Hit-testing response for an external knowledge base: the query content and the matching records. */
  export type ExternalKnowledgeBaseHitTestingResponse = {
    query: {
      content: string
    }
    records: Array<ExternalKnowledgeBaseHitTesting>
  }
  /** An app entry (id, name, mode and icon fields) as listed in `RelatedAppResponse`. */
  export type RelatedApp = {
    id: string
    name: string
    mode: AppMode
    icon_type: AppIconType | null
    icon: string
    icon_background: string
    icon_url: string
  }
  /** List of `RelatedApp` entries with a total count. */
  export type RelatedAppResponse = {
    data: Array<RelatedApp>
    total: number
  }
  /** Payload for updating a segment; only `content` is required. */
  export type SegmentUpdater = {
    content: string
    answer?: string
    keywords?: string[]
    regenerate_child_chunks?: boolean
  }
  /** List of `IndexingStatusResponse` entries with a total count. */
  export type ErrorDocsResponse = {
    data: IndexingStatusResponse[]
    total: number
  }
  /** Boolean flags summarising a selection of datasets (quality, internal/external mix, embedding consistency). */
  export type SelectedDatasetsMode = {
    allHighQuality: boolean
    allHighQualityVectorSearch: boolean
    allHighQualityFullTextSearch: boolean
    allEconomic: boolean
    mixtureHighQualityAndEconomic: boolean
    allInternal: boolean
    allExternal: boolean
    mixtureInternalAndExternal: boolean
    inconsistentEmbeddingModel: boolean
  }
  /** String identifiers for the weighted-score presets. */
  export enum WeightedScoreEnum {
    SemanticFirst = 'semantic_first',
    KeywordFirst = 'keyword_first',
    Customized = 'customized',
  }
  /** String identifiers for the reranking modes. */
  export enum RerankingModeEnum {
    RerankingModel = 'reranking_model',
    WeightedScore = 'weighted_score',
  }
  /**
   * Default semantic/keyword weights, keyed by selection mode.
   * `other` is the entry for anything that is not one of the two named modes.
   */
  export const DEFAULT_WEIGHTED_SCORE = {
    allHighQualityVectorSearch: {
      semantic: 1.0,
      keyword: 0,
    },
    allHighQualityFullTextSearch: {
      semantic: 0,
      keyword: 1.0,
    },
    other: {
      semantic: 0.7,
      keyword: 0.3,
    },
  }
  /** Allowed values of `ChildChunkDetail.type`. */
  export type ChildChunkType = 'automatic' | 'customized'
  /** A child chunk record belonging to the segment given by `segment_id`. */
  export type ChildChunkDetail = {
    id: string
    position: number
    segment_id: string
    content: string
    word_count: number
    created_at: number
    updated_at: number
    type: ChildChunkType
  }
  /** Paginated list of `ChildChunkDetail` records. */
  export type ChildSegmentsResponse = {
    data: ChildChunkDetail[]
    total: number
    total_pages: number
    page: number
    limit: number
  }
  /** Identifies one document within a dataset. */
  export type UpdateDocumentParams = {
    datasetId: string
    documentId: string
  }
  /** Document actions. Used in api url. */
  export enum DocumentActionType {
    enable = 'enable',
    disable = 'disable',
    archive = 'archive',
    unArchive = 'un_archive',
    delete = 'delete',
    check_fail = 'check_fail',
  }
  /** Identifies one or several documents within a dataset; `documentIds` accepts an array or a single string. */
  export type UpdateDocumentBatchParams = {
    datasetId: string
    documentId?: string
    documentIds?: string[] | string
  }
  /** Job id and job status returned for a batch import. */
  export type BatchImportResponse = {
    job_id: string
    job_status: string
  }