// datasets.ts
  1. import type { DataSourceNotionPage, DataSourceProvider } from './common'
  2. import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
  3. import type { Tag } from '@/app/components/base/tag-management/constant'
  4. import type { IndexingType } from '@/app/components/datasets/create/step-two'
  5. import type { MetadataFilteringVariableType } from '@/app/components/workflow/nodes/knowledge-retrieval/types'
  6. import type { MetadataItemWithValue } from '@/app/components/datasets/metadata/types'
  /**
   * String enum of data source kinds.
   * Each member maps to the string literal shown on its line.
   */
  export enum DataSourceType {
    FILE = 'upload_file',
    NOTION = 'notion_import',
    WEB = 'website_crawl',
  }
  /**
   * String enum of dataset permission levels.
   * Member names are camelCase; the values are the snake_case literals below.
   */
  export enum DatasetPermission {
    onlyMe = 'only_me',
    allTeamMembers = 'all_team_members',
    partialMembers = 'partial_members',
  }
  /** String enum of chunking modes (used as the `doc_form` type in this file). */
  export enum ChunkingMode {
    /** General text */
    text = 'text_model',
    /** General QA */
    qa = 'qa_model',
    /** Parent-Child */
    parentChild = 'hierarchical_model',
  }
  /**
   * One metadata entry: an id/name pair, its variable type, and a string value.
   * Used as the element type of `DataSet.doc_metadata`.
   */
  export type MetadataInDoc = {
    id: string
    name: string
    type: MetadataFilteringVariableType
    value: string
  }
  /**
   * Dataset record, the element type of `DataSetListResponse.data`.
   *
   * Field names are snake_case. Only `partial_member_list` and `doc_metadata`
   * are optional; everything else is required.
   */
  export type DataSet = {
    // NOTE(review): untyped in the original; the real shape is not visible in this file.
    categories: any

    // Identity and presentation
    id: string
    name: string
    icon: string
    icon_background: string
    description: string

    // Access, source and indexing
    permission: DatasetPermission
    data_source_type: DataSourceType
    indexing_technique: IndexingType

    // Audit
    created_by: string
    updated_by: string
    updated_at: number

    // Counters and chunking
    app_count: number
    doc_form: ChunkingMode
    document_count: number
    word_count: number

    // Provider and embedding
    provider: string
    embedding_model: string
    embedding_model_provider: string
    embedding_available: boolean

    // Retrieval configuration (both fields share the RetrievalConfig type)
    retrieval_model_dict: RetrievalConfig
    retrieval_model: RetrievalConfig

    tags: Tag[]
    partial_member_list?: string[]

    // External knowledge binding
    external_knowledge_info: {
      external_knowledge_id: string
      external_knowledge_api_id: string
      external_knowledge_api_name: string
      external_knowledge_api_endpoint: string
    }
    external_retrieval_model: {
      top_k: number
      score_threshold: number
      score_threshold_enabled: boolean
    }

    // Metadata
    built_in_field_enabled: boolean
    doc_metadata?: MetadataInDoc[]

    // Edit rights and department
    has_edit_permission: boolean
    dept_id: string
    edit_auth: number
  }
  /**
   * External API record: identity, endpoint settings, bound datasets and
   * creation info. Element type of `ExternalAPIListResponse.data`.
   */
  export type ExternalAPIItem = {
    id: string
    tenant_id: string
    name: string
    description: string
    settings: {
      endpoint: string
      api_key: string
    }
    dataset_bindings: Array<{ id: string; name: string }>
    created_by: string
    created_at: string
  }
  /**
   * Knowledge item whose `provider` is fixed to the literal `'external'`.
   * `data_source_type` and `indexing_technique` are typed as `null` here, and
   * both timestamps are strings.
   */
  export type ExternalKnowledgeItem = {
    id: string
    name: string
    description: string | null
    provider: 'external'
    permission: DatasetPermission

    // Always null for this shape
    data_source_type: null
    indexing_technique: null

    app_count: number
    document_count: number
    word_count: number

    created_by: string
    created_at: string
    updated_by: string
    updated_at: string

    tags: Tag[]
  }
  /** Delete response carrying a single `result` flag: `'success'` or `'error'`. */
  export type ExternalAPIDeleteResponse = {
    result: 'success' | 'error'
  }
  /** Usage info: an in-use flag plus a numeric count. */
  export type ExternalAPIUsage = {
    is_using: boolean
    count: number
  }
  /**
   * A `File` extended with optional extra fields.
   * All added fields are optional, so a plain `File` is assignable to this type.
   */
  export type CustomFile = File & {
    id?: string
    extension?: string
    mime_type?: string
    created_by?: string
    created_at?: number
  }
  /** Minimal document reference: id, name and file extension. */
  export type DocumentItem = {
    id: string
    name: string
    extension: string
  }
  /**
   * Crawl options.
   * `limit` and `max_depth` accept either a number or a string.
   */
  export type CrawlOptions = {
    // Boolean switches
    crawl_sub_pages: boolean
    only_main_content: boolean
    use_sitemap: boolean

    // Include / exclude patterns, each a single string
    includes: string
    excludes: string

    // Bounds
    limit: number | string
    max_depth: number | string
  }
  /** One crawl result: title, description, markdown body and source URL. */
  export type CrawlResultItem = {
    title: string
    description: string
    markdown: string
    source_url: string
  }
  /** Pairs a `CustomFile` with a `fileID` string and a numeric `progress`. */
  export type FileItem = {
    fileID: string
    file: CustomFile
    progress: number
  }
  /**
   * Arguments for fetching datasets: a `url` plus a `params` object.
   * Within `params` only `page` is required.
   */
  export type FetchDatasetsParams = {
    url: string
    params: {
      // Paging
      page: number
      limit?: number

      // Filters
      ids?: string[]
      tag_ids?: string[]
      category_ids?: string[]
      include_all?: boolean
      keyword?: string
      type?: string

      // NOTE(review): these two keys are camelCase while the rest are snake_case.
      creatorDept?: string
      authType?: string
    }
  }
  /** Paged list of `DataSet` records with `page`, `limit`, `total` and `has_more`. */
  export type DataSetListResponse = {
    data: Array<DataSet>
    page: number
    limit: number
    total: number
    has_more: boolean
  }
  /** Paged list of `ExternalAPIItem` records with `page`, `limit`, `total` and `has_more`. */
  export type ExternalAPIListResponse = {
    data: Array<ExternalAPIItem>
    page: number
    limit: number
    total: number
    has_more: boolean
  }
  /** A question/answer pair of strings. */
  export type QA = {
    question: string
    answer: string
  }
  /**
   * Indexing estimate: token count, price and currency, segment count, and a
   * preview list. `qa_preview` is optional.
   */
  export type IndexingEstimateResponse = {
    tokens: number
    total_price: number
    currency: string
    total_segments: number
    preview: { content: string; child_chunks: string[] }[]
    qa_preview?: QA[]
  }
  /** `IndexingEstimateResponse` with an additional `total_nodes` count. */
  export type FileIndexingEstimateResponse = IndexingEstimateResponse & {
    total_nodes: number
  }
  /**
   * Indexing status for one item: current status, stage timestamps and segment
   * progress counters.
   *
   * NOTE(review): `completed_at`, `paused_at`, `error` and `stopped_at` are
   * `any` in the original. Their real types are not visible in this file, so
   * they are left as-is.
   */
  export type IndexingStatusResponse = {
    id: string
    indexing_status: DocumentIndexingStatus

    // Stage timestamps typed as numbers
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number

    // Loosely typed fields
    completed_at: any
    paused_at: any
    error: any
    stopped_at: any

    // Progress
    completed_segments: number
    total_segments: number
  }
  /** Wrapper holding a list of `IndexingStatusResponse` entries under `data`. */
  export type IndexingStatusBatchResponse = {
    data: Array<IndexingStatusResponse>
  }
  /**
   * String enum of process-rule modes.
   * Note the `general` member maps to the literal `'custom'`.
   */
  export enum ProcessMode {
    general = 'custom',
    parentChild = 'hierarchical',
  }
  /** Parent mode literal: `'full-doc'` or `'paragraph'`. */
  export type ParentMode =
    | 'full-doc'
    | 'paragraph'
  /** Process-rule response: a mode, its `Rules`, and the applicable `Limits`. */
  export type ProcessRuleResponse = {
    mode: ProcessMode
    rules: Rules
    limits: Limits
  }
  /**
   * Processing rules: pre-processing rule list, a segmentation config, a parent
   * mode and a second segmentation config for sub-chunks.
   */
  export type Rules = {
    pre_processing_rules: Array<PreProcessingRule>
    segmentation: Segmentation
    parent_mode: ParentMode
    subchunk_segmentation: Segmentation
  }
  /** Limits object with a single numeric field. */
  export type Limits = {
    indexing_max_segmentation_tokens_length: number
  }
  /** One pre-processing rule: its id and whether it is enabled. */
  export type PreProcessingRule = {
    id: string
    enabled: boolean
  }
  /** Segmentation config: separator and max tokens, with optional chunk overlap. */
  export type Segmentation = {
    separator: string
    max_tokens: number
    chunk_overlap?: number
  }
  /**
   * Readonly tuple of document indexing status strings.
   * `as const` keeps each entry as a literal so the union type below can be
   * derived from it.
   */
  export const DocumentIndexingStatusList = [
    'waiting',
    'parsing',
    'cleaning',
    'splitting',
    'indexing',
    'paused',
    'error',
    'completed',
  ] as const

  /** Union of the literals in `DocumentIndexingStatusList`. */
  export type DocumentIndexingStatus = (typeof DocumentIndexingStatusList)[number]
  /**
   * Readonly tuple of document display status strings.
   * `as const` keeps each entry as a literal so the union type below can be
   * derived from it.
   */
  export const DisplayStatusList = [
    'queuing',
    'indexing',
    'paused',
    'error',
    'available',
    'enabled',
    'disabled',
    'archived',
  ] as const

  /** Union of the literals in `DisplayStatusList`. */
  export type DocumentDisplayStatus = (typeof DisplayStatusList)[number]
  /**
   * Source details attached to a document (type of
   * `InitialDocumentDetail.data_source_info`).
   *
   * `upload_file`, `job_id` and `url` are required; the `notion_*` fields and
   * `provider` are optional.
   */
  export type DataSourceInfo = {
    upload_file: {
      id: string
      name: string
      size: number
      mime_type: string
      created_at: number
      created_by: string
      extension: string
    }

    // Optional Notion fields
    notion_page_icon?: string
    notion_workspace_id?: string
    notion_page_id?: string

    provider?: DataSourceProvider
    job_id: string
    url: string
  }
  /**
   * Base document shape, extended by `SimpleDocumentDetail`.
   * Only `completed_segments` and `total_segments` are optional.
   */
  export type InitialDocumentDetail = {
    // Identity and placement
    id: string
    batch: string
    position: number
    dataset_id: string

    // Source
    data_source_type: DataSourceType
    data_source_info: DataSourceInfo
    dataset_process_rule_id: string
    name: string

    // Creation
    created_from: 'api' | 'web'
    created_by: string
    created_at: number

    // Status and progress
    indexing_status: DocumentIndexingStatus
    display_status: DocumentDisplayStatus
    completed_segments?: number
    total_segments?: number

    // Form and language
    doc_form: ChunkingMode
    doc_language: string
  }
  /**
   * `InitialDocumentDetail` plus list-level fields. Element type of
   * `DocumentListResponse.data`.
   *
   * NOTE(review): `dataset_process_rule_id` is re-declared as optional here but
   * is required on `InitialDocumentDetail`, so it stays required after the
   * intersection.
   */
  export type SimpleDocumentDetail = InitialDocumentDetail & {
    enabled: boolean
    word_count: number
    is_qa: boolean // TODO waiting for backend to add this field
    error?: string | null
    archived: boolean
    updated_at: number
    hit_count: number
    dataset_process_rule_id?: string
    data_source_detail_dict?: {
      upload_file: {
        name: string
        extension: string
      }
    }
    doc_metadata?: Array<MetadataItemWithValue>

    // Check and application fields
    check_status: number
    check_by: string
    enable_application: string
  }
  /** Paged list of `SimpleDocumentDetail` records with `page`, `limit`, `total` and `has_more`. */
  export type DocumentListResponse = {
    data: Array<SimpleDocumentDetail>
    page: number
    limit: number
    total: number
    has_more: boolean
  }
  /**
   * Fields shared by `CreateDocumentReq` and `IndexingEstimateParams`.
   * `original_document_id` and `indexing_technique` are optional.
   */
  export type DocumentReq = {
    original_document_id?: string
    indexing_technique?: string
    doc_form: ChunkingMode
    doc_language: string
    process_rule: ProcessRule
  }
  /**
   * `DocumentReq` extended with the data source, retrieval config and
   * embedding model selection.
   */
  export type CreateDocumentReq = DocumentReq & {
    data_source: DataSource
    retrieval_model: RetrievalConfig
    embedding_model: string
    embedding_model_provider: string
  }
  /**
   * `DocumentReq` plus a required `dataset_id`. Every `DataSource` field is
   * optional because of the `Partial<DataSource>` operand.
   */
  export type IndexingEstimateParams =
    & DocumentReq
    & Partial<DataSource>
    & { dataset_id: string }
  /**
   * Data source descriptor: a `type` plus an `info_list` that repeats the source
   * type and holds one optional sub-list each for Notion, files and websites.
   */
  export type DataSource = {
    type: DataSourceType
    info_list: {
      data_source_type: DataSourceType
      notion_info_list?: Array<NotionInfo>
      file_info_list?: {
        file_ids: string[]
      }
      website_info_list?: {
        provider: string
        job_id: string
        urls: string[]
      }
    }
  }
  /** A workspace id together with its list of `DataSourceNotionPage` entries. */
  export type NotionInfo = {
    workspace_id: string
    pages: Array<DataSourceNotionPage>
  }
  /** Page reference: a page id and a type string. */
  export type NotionPage = {
    page_id: string
    type: string
  }
  /**
   * A processing mode paired with its `Rules`.
   * Same shape as `ProcessRuleResponse` without the `limits` field.
   */
  export type ProcessRule = {
    mode: ProcessMode
    rules: Rules
  }
  /**
   * Create-document response: the batch id, the created documents and,
   * optionally, the dataset.
   *
   * This is the PascalCase name, matching every other type in this file.
   */
  export type CreateDocumentResponse = {
    dataset?: DataSet
    batch: string
    documents: InitialDocumentDetail[]
  }

  /**
   * @deprecated Use `CreateDocumentResponse`. The lowercase name is kept as an
   * alias so existing imports keep compiling.
   */
  export type createDocumentResponse = CreateDocumentResponse
  /**
   * @deprecated Misspelling of `ProcessRule`. The original declared an
   * identical `{ mode: ProcessMode; rules: Rules }` shape a second time, so it
   * is now an alias of `ProcessRule` and the two cannot drift apart. The name
   * is kept so existing references (e.g. `FullDocumentDetail.dataset_process_rule`)
   * keep compiling.
   */
  export type PrecessRule = ProcessRule
  /**
   * `SimpleDocumentDetail` plus detail-level fields. Aliased by
   * `DocumentDetailResponse`.
   *
   * NOTE(review): several members re-declare fields already present on the
   * base type. `batch` is redundant, `indexing_status` is widened to `string`
   * here, and `doc_metadata` uses `DocMetadata | null` where the base uses
   * `MetadataItemWithValue[]`. The trailing index signature also lets any
   * other string key through as `any`. Kept as-is to preserve the type callers
   * currently see.
   */
  export type FullDocumentDetail = SimpleDocumentDetail & {
    batch: string
    created_api_request_id: string

    // Stage timestamps and metrics
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    tokens: number
    indexing_latency: number
    completed_at: number

    // Pause / stop
    paused_by: string
    paused_at: number
    stopped_at: number
    indexing_status: string

    // Disable / archive
    disabled_at: number
    disabled_by: string
    archived_reason: 'rule_modified' | 're_upload'
    archived_by: string
    archived_at: number

    // Document typing and metadata
    doc_type?: DocType | null | 'others'
    doc_metadata?: DocMetadata | null
    segment_count: number

    // Process rules
    dataset_process_rule: PrecessRule
    document_process_rule: ProcessRule

    [key: string]: any
  }
  /**
   * Document metadata as string fields.
   * The named keys are required; the index signature allows any further
   * string-valued key.
   */
  export type DocMetadata = {
    title: string
    author: string
    language: string
    publisher: string
    publicationDate: string
    ISBN: string
    category: string
    [key: string]: string
  }
  /** Readonly tuple of the doc type strings in the "customizable" group. */
  export const CUSTOMIZABLE_DOC_TYPES = [
    'book',
    'web_page',
    'paper',
    'social_media_post',
    'personal_document',
    'business_document',
    'im_chat_log',
  ] as const

  /** Readonly tuple of the doc type strings in the "fixed" group. */
  export const FIXED_DOC_TYPES = [
    'synced_from_github',
    'synced_from_notion',
    'wikipedia_entry',
  ] as const

  /** Union of the literals in `CUSTOMIZABLE_DOC_TYPES`. */
  export type CustomizableDocType = (typeof CUSTOMIZABLE_DOC_TYPES)[number]

  /** Union of the literals in `FIXED_DOC_TYPES`. */
  export type FixedDocType = (typeof FIXED_DOC_TYPES)[number]

  /** Any doc type from either list. */
  export type DocType = CustomizableDocType | FixedDocType
  /** Alias of `FullDocumentDetail`; adds no fields. */
  export type DocumentDetailResponse = FullDocumentDetail
  /**
   * Runtime list of the segment status strings.
   *
   * Declared `as const`, like `DocumentIndexingStatusList` and
   * `DisplayStatusList` in this file. Without the assertion the array is
   * inferred as `string[]` and `SegmentStatus` collapses to plain `string`,
   * which lets any string pass as a status.
   */
  export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing'] as const

  /** Union of the literals in `SEGMENT_STATUS_LIST`. */
  export type SegmentStatus = (typeof SEGMENT_STATUS_LIST)[number]
  /**
   * Query parameters for listing segments.
   * Only `limit` is required. Note that `page` is typed as a string while
   * `limit` is a number.
   */
  export type SegmentsQuery = {
    limit: number
    page?: string
    // status?: SegmentStatus
    keyword?: string
    hit_count_gte?: number
    enabled?: boolean | 'all'
  }
  /**
   * Full segment record. Element type of `SegmentsResponse.data`.
   * `answer` and `child_chunks` are optional; `error` may be `null`.
   */
  export type SegmentDetailModel = {
    // Identity and placement
    id: string
    position: number
    document_id: string

    // Content
    content: string
    sign_content: string
    word_count: number
    tokens: number
    keywords: string[]

    // Index node
    index_node_id: string
    index_node_hash: string

    // Usage and enablement
    hit_count: number
    enabled: boolean
    disabled_at: number
    disabled_by: string

    // Lifecycle
    status: SegmentStatus
    created_by: string
    created_at: number
    indexing_at: number
    completed_at: number
    error: string | null
    stopped_at: number

    // Optional extras
    answer?: string
    child_chunks?: Array<ChildChunkDetail>

    updated_at: number
  }
  /**
   * Paged list of `SegmentDetailModel` records.
   * Carries `total_pages` in addition to the usual paging fields.
   */
  export type SegmentsResponse = {
    data: Array<SegmentDetailModel>
    page: number
    limit: number
    total: number
    total_pages: number
    has_more: boolean
  }
  /**
   * One hit-testing record. Element type of `HitTestingRecordsResponse.data`.
   * `source` and `created_by_role` are restricted to the literals listed.
   */
  export type HitTestingRecord = {
    id: string
    content: string
    source:
      | 'app'
      | 'hit_testing'
      | 'plugin'
    source_app_id: string
    created_by_role:
      | 'account'
      | 'end_user'
    created_by: string
    created_at: number
  }
  /** Child chunk in a hit-testing result: id, content, position and score. */
  export type HitTestingChildChunk = {
    id: string
    position: number
    content: string
    score: number
  }
  /**
   * One hit-testing result. Element type of `HitTestingResponse.records`.
   *
   * NOTE(review): both `segment` and `content` are typed as `Segment` in the
   * original; kept unchanged.
   */
  export type HitTesting = {
    segment: Segment
    content: Segment
    score: number
    tsne_position: TsnePosition
    child_chunks?: Array<HitTestingChildChunk> | null
  }
  /**
   * One external-knowledge-base hit-testing result.
   * `metadata` has two required string keys, both prefixed `x-amz-bedrock-kb-`.
   */
  export type ExternalKnowledgeBaseHitTesting = {
    title: string
    content: string
    score: number
    metadata: {
      'x-amz-bedrock-kb-source-uri': string
      'x-amz-bedrock-kb-data-source-id': string
    }
  }
  /**
   * Segment shape used by `HitTesting`.
   * `document` refers to the `Document` type exported from this file.
   */
  export type Segment = {
    id: string
    document: Document

    // Content
    content: string
    sign_content: string

    // Position and counters
    position: number
    word_count: number
    tokens: number
    keywords: string[]
    hit_count: number

    index_node_hash: string
  }
  /**
   * Document summary referenced by `Segment.document`.
   * NOTE(review): this name is the same as the DOM global `Document` type and
   * shadows it inside this module.
   */
  export type Document = {
    id: string
    name: string
    data_source_type: string
    doc_type: DocType
  }
  /** Paged list of `HitTestingRecord` entries with `page`, `limit`, `total` and `has_more`. */
  export type HitTestingRecordsResponse = {
    data: Array<HitTestingRecord>
    page: number
    limit: number
    total: number
    has_more: boolean
  }
  /** A 2-D point given as numeric `x` and `y`. */
  export type TsnePosition = {
    x: number
    y: number
  }
  /**
   * Hit-testing response: the query (its content and t-SNE position) and the
   * list of `HitTesting` records.
   */
  export type HitTestingResponse = {
    query: {
      content: string
      tsne_position: TsnePosition
    }
    records: HitTesting[]
  }
  /**
   * External-knowledge-base hit-testing response: the query content and the
   * list of `ExternalKnowledgeBaseHitTesting` records.
   */
  export type ExternalKnowledgeBaseHitTestingResponse = {
    query: {
      content: string
    }
    records: ExternalKnowledgeBaseHitTesting[]
  }
  /**
   * App summary. Element type of `RelatedAppResponse.data`.
   * `icon_type` may be `null`.
   */
  export type RelatedApp = {
    id: string
    name: string
    mode: AppMode

    // Icon
    icon_type: AppIconType | null
    icon: string
    icon_background: string
    icon_url: string
  }
  /** List of `RelatedApp` entries together with a total count. */
  export type RelatedAppResponse = {
    data: RelatedApp[]
    total: number
  }
  /**
   * Payload for updating a segment.
   * Only `content` is required; the other three fields are optional.
   */
  export type SegmentUpdater = {
    content: string
    answer?: string
    keywords?: Array<string>
    regenerate_child_chunks?: boolean
  }
  /** List of `IndexingStatusResponse` entries together with a total count. */
  export type ErrorDocsResponse = {
    data: Array<IndexingStatusResponse>
    total: number
  }
  /**
   * Nine boolean flags describing a selection of datasets.
   * NOTE(review): grouping is by field name only; how the flags are computed
   * is not visible in this file.
   */
  export type SelectedDatasetsMode = {
    // Quality / search flags
    allHighQuality: boolean
    allHighQualityVectorSearch: boolean
    allHighQualityFullTextSearch: boolean
    allEconomic: boolean
    mixtureHighQualityAndEconomic: boolean

    // Internal / external flags
    allInternal: boolean
    allExternal: boolean
    mixtureInternalAndExternal: boolean

    inconsistentEmbeddingModel: boolean
  }
  /**
   * String enum of weighted-score presets.
   * Member names are PascalCase; the values are the snake_case literals below.
   */
  export enum WeightedScoreEnum {
    SemanticFirst = 'semantic_first',
    KeywordFirst = 'keyword_first',
    Customized = 'customized',
  }
  /** String enum of reranking modes. */
  export enum RerankingModeEnum {
    RerankingModel = 'reranking_model',
    WeightedScore = 'weighted_score',
  }
  /**
   * Default semantic/keyword weight pairs.
   * Two keys share their names with flags on `SelectedDatasetsMode`; `other`
   * is the third entry. In every pair the two weights sum to 1.
   */
  export const DEFAULT_WEIGHTED_SCORE = {
    allHighQualityVectorSearch: { semantic: 1.0, keyword: 0 },
    allHighQualityFullTextSearch: { semantic: 0, keyword: 1.0 },
    other: { semantic: 0.7, keyword: 0.3 },
  }
  /** Child chunk type literal: `'automatic'` or `'customized'`. */
  export type ChildChunkType =
    | 'automatic'
    | 'customized'
  /**
   * Child chunk record. Element type of `ChildSegmentsResponse.data` and of
   * `SegmentDetailModel.child_chunks`.
   */
  export type ChildChunkDetail = {
    id: string
    segment_id: string
    position: number
    type: ChildChunkType

    content: string
    word_count: number

    created_at: number
    updated_at: number
  }
  /**
   * Paged list of `ChildChunkDetail` records.
   * Has `total_pages` but no `has_more` field.
   */
  export type ChildSegmentsResponse = {
    data: Array<ChildChunkDetail>
    page: number
    limit: number
    total: number
    total_pages: number
  }
  /** A dataset id paired with a single document id, both required. */
  export type UpdateDocumentParams = {
    datasetId: string
    documentId: string
  }
  /**
   * String enum of document actions.
   * Used in api url.
   *
   * NOTE(review): `check_fail` is the only snake_case member name; the others
   * are camelCase.
   */
  export enum DocumentActionType {
    enable = 'enable',
    disable = 'disable',
    archive = 'archive',
    unArchive = 'un_archive',
    delete = 'delete',
    check_fail = 'check_fail',
  }
  /**
   * A required dataset id with an optional single `documentId` and/or
   * `documentIds`, which may be an array of strings or a single string.
   */
  export type UpdateDocumentBatchParams = {
    datasetId: string
    documentId?: string
    documentIds?: string | string[]
  }
  /** Batch import response: a job id and a job status string. */
  export type BatchImportResponse = {
    job_id: string
    job_status: string
  }