// datasets.ts — type, enum and constant declarations for datasets, documents, segments and hit testing.
  1. import type { DataSourceNotionPage } from './common'
  2. import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
  3. import type { Tag } from '@/app/components/base/tag-management/constant'
  /**
   * String enum naming the supported data source kinds.
   * The member values are the literal strings used wherever a
   * `data_source_type` field is typed with this enum.
   */
  export enum DataSourceType {
    FILE = 'upload_file',
    NOTION = 'notion_import',
    WEB = 'website_crawl',
  }
  /** Union of the permission values a dataset's `permission` field may hold. */
  export type DatasetPermission = 'only_me' | 'all_team_members' | 'partial_members'
  /**
   * Shape of a dataset record.
   *
   * NOTE(review): field meanings are inferred from their names; confirm
   * against the backend API before relying on them.
   */
  export type DataSet = {
    id: string
    name: string
    icon: string
    icon_background: string
    description: string
    permission: DatasetPermission
    data_source_type: DataSourceType
    indexing_technique: 'high_quality' | 'economy'
    created_by: string
    updated_by: string
    updated_at: number
    app_count: number
    document_count: number
    word_count: number
    embedding_model: string
    embedding_model_provider: string
    embedding_available: boolean
    // Both retrieval fields share the same type; how they differ is not visible in this file.
    retrieval_model_dict: RetrievalConfig
    retrieval_model: RetrievalConfig
    tags: Tag[]
    // Element type is untyped (`any`); presumably relevant when permission is 'partial_members' — confirm.
    partial_member_list?: any[]
  }
  /**
   * A standard `File` extended with optional metadata fields.
   * All added fields are optional, so a plain `File` is assignable to this type.
   */
  export type CustomFile = File & {
    id?: string
    extension?: string
    mime_type?: string
    created_by?: string
    created_at?: number
  }
  /**
   * Options for a website crawl.
   * `limit` and `max_depth` accept either a number or a string.
   */
  export type CrawlOptions = {
    crawl_sub_pages: boolean
    only_main_content: boolean
    includes: string
    excludes: string
    limit: number | string
    max_depth: number | string
  }
  /** One crawl result entry: four string fields (title, markdown, description, source URL). */
  export type CrawlResultItem = {
    title: string
    markdown: string
    description: string
    source_url: string
  }
  /**
   * Pairs a `CustomFile` with an identifier and a numeric progress value.
   * NOTE(review): the range/units of `progress` are not defined in this file.
   */
  export type FileItem = {
    fileID: string
    file: CustomFile
    progress: number
  }
  /** Paginated list of `DataSet` records with paging fields. */
  export type DataSetListResponse = {
    data: DataSet[]
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /** A question/answer pair, both plain strings. */
  export type QA = {
    question: string
    answer: string
  }
  /**
   * Shape of an indexing estimate result.
   * `preview` is a list of strings; `qa_preview` is an optional list of `QA` pairs.
   */
  export type IndexingEstimateResponse = {
    tokens: number
    total_price: number
    currency: string
    total_segments: number
    preview: string[]
    qa_preview?: QA[]
  }
  /** `IndexingEstimateResponse` extended with a required `total_nodes` count. */
  export type FileIndexingEstimateResponse = {
    total_nodes: number
  } & IndexingEstimateResponse
  /**
   * Indexing progress record for one item, identified by `id`.
   * Carries the current `DocumentIndexingStatus`, per-stage numeric fields
   * and segment counters.
   */
  export type IndexingStatusResponse = {
    id: string
    indexing_status: DocumentIndexingStatus
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    // The next four fields are untyped (`any`); callers get no type checking on them.
    // NOTE(review): presumably nullable timestamps / error text — confirm and tighten.
    completed_at: any
    paused_at: any
    error: any
    stopped_at: any
    completed_segments: number
    total_segments: number
  }
  /** Wrapper holding a list of `IndexingStatusResponse` entries under `data`. */
  export type IndexingStatusBatchResponse = {
    data: IndexingStatusResponse[]
  }
  /** Union of the two processing modes: `'automatic'` or `'custom'`. */
  export type ProcessMode = 'automatic' | 'custom'
  /** A process rule with a `ProcessMode`-typed `mode` and its `Rules`. */
  export type ProcessRuleResponse = {
    mode: ProcessMode
    rules: Rules
  }
  /** Rule set made of a list of pre-processing rules and one segmentation config. */
  export type Rules = {
    pre_processing_rules: PreProcessingRule[]
    segmentation: Segmentation
  }
  /** A single pre-processing rule: an identifier and an on/off flag. */
  export type PreProcessingRule = {
    id: string
    enabled: boolean
  }
  /**
   * Segmentation settings: a separator string plus two numeric limits.
   * NOTE(review): units of `max_tokens` / `chunk_overlap` are not stated in this file.
   */
  export type Segmentation = {
    separator: string
    max_tokens: number
    chunk_overlap: number
  }
  /**
   * Readonly tuple of every document indexing status literal.
   * Declared `as const` so the element type stays a union of literals.
   */
  export const DocumentIndexingStatusList = [
    'waiting',
    'parsing',
    'cleaning',
    'splitting',
    'indexing',
    'paused',
    'error',
    'completed',
  ] as const

  /** Union of the literals in `DocumentIndexingStatusList`. */
  export type DocumentIndexingStatus = typeof DocumentIndexingStatusList[number]
  /**
   * Readonly tuple of every document display status literal.
   * Declared `as const` so the element type stays a union of literals.
   */
  export const DisplayStatusList = [
    'queuing',
    'indexing',
    'paused',
    'error',
    'available',
    'enabled',
    'disabled',
    'archived',
  ] as const

  /** Union of the literals in `DisplayStatusList`. */
  export type DocumentDisplayStatus = typeof DisplayStatusList[number]
  /**
   * Source details attached to a document.
   * `upload_file`, `job_id` and `url` are all required by this type;
   * only `notion_page_icon` is optional.
   * NOTE(review): presumably only some fields are populated per source type — confirm.
   */
  export type DataSourceInfo = {
    upload_file: {
      id: string
      name: string
      size: number
      mime_type: string
      created_at: number
      created_by: string
      extension: string
    }
    notion_page_icon?: string
    job_id: string
    url: string
  }
  /**
   * Base shape of a document record; extended by `SimpleDocumentDetail`.
   * The segment counters are optional; everything else is required.
   */
  export type InitialDocumentDetail = {
    id: string
    batch: string
    position: number
    dataset_id: string
    data_source_type: DataSourceType
    data_source_info: DataSourceInfo
    dataset_process_rule_id: string
    name: string
    created_from: 'api' | 'web'
    created_by: string
    created_at: number
    indexing_status: DocumentIndexingStatus
    display_status: DocumentDisplayStatus
    completed_segments?: number
    total_segments?: number
    // Same literal values as the `DocForm` enum declared in this file.
    doc_form: 'text_model' | 'qa_model'
    doc_language: string
  }
  /**
   * `InitialDocumentDetail` extended with state flags, counters and an
   * optional error.
   */
  export type SimpleDocumentDetail = InitialDocumentDetail & {
    enabled: boolean
    word_count: number
    error?: string | null
    archived: boolean
    updated_at: number
    hit_count: number
    // Optional here but required in InitialDocumentDetail, so the intersection still requires it.
    dataset_process_rule_id?: string
    data_source_detail_dict?: {
      upload_file: {
        name: string
        extension: string
      }
    }
  }
  /** Paginated list of `SimpleDocumentDetail` records with paging fields. */
  export type DocumentListResponse = {
    data: SimpleDocumentDetail[]
    has_more: boolean
    total: number
    page: number
    limit: number
  }
  /**
   * Common request fields shared by `CreateDocumentReq` and
   * `IndexingEstimateParams`.
   */
  export type DocumentReq = {
    original_document_id?: string
    // Plain string here, whereas DataSet.indexing_technique is 'high_quality' | 'economy'.
    indexing_technique?: string
    doc_form: 'text_model' | 'qa_model'
    doc_language: string
    process_rule: ProcessRule
  }
  /** `DocumentReq` plus the data source, retrieval config and embedding model fields. */
  export type CreateDocumentReq = DocumentReq & {
    data_source: DataSource
    retrieval_model: RetrievalConfig
    embedding_model: string
    embedding_model_provider: string
  }
  /**
   * `DocumentReq` combined with an all-optional `DataSource` (via `Partial`)
   * and a required `dataset_id`.
   */
  export type IndexingEstimateParams = DocumentReq & Partial<DataSource> & {
    dataset_id: string
  }
  /**
   * Describes a data source: its type plus an `info_list` whose three
   * per-source sub-lists are all optional.
   * NOTE(review): presumably exactly one sub-list matches `data_source_type` — confirm.
   */
  export type DataSource = {
    type: DataSourceType
    info_list: {
      data_source_type: DataSourceType
      notion_info_list?: NotionInfo[]
      file_info_list?: {
        file_ids: string[]
      }
      website_info_list?: {
        provider: string
        job_id: string
        urls: string[]
      }
    }
  }
  /** A workspace id together with a list of `DataSourceNotionPage` entries. */
  export type NotionInfo = {
    workspace_id: string
    pages: DataSourceNotionPage[]
  }
  /** Minimal page reference: a page id and a type string. */
  export type NotionPage = {
    page_id: string
    type: string
  }
  /**
   * Process rule used in requests.
   * `mode` is a plain string here, unlike `ProcessRuleResponse.mode`,
   * which is typed as `ProcessMode`.
   */
  export type ProcessRule = {
    mode: string
    rules: Rules
  }
  /**
   * Result shape for document creation: an optional dataset, a batch string
   * and the created documents.
   * The lower-case type name is kept as-is so existing imports keep working.
   */
  export type createDocumentResponse = {
    dataset?: DataSet
    batch: string
    documents: InitialDocumentDetail[]
  }
  /**
   * `SimpleDocumentDetail` extended with processing, pause/disable/archive
   * and metadata fields.
   */
  export type FullDocumentDetail = SimpleDocumentDetail & {
    // Re-declares `batch`, already required by InitialDocumentDetail with the same type.
    batch: string
    created_api_request_id: string
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    tokens: number
    indexing_latency: number
    completed_at: number
    paused_by: string
    paused_at: number
    stopped_at: number
    // Declared as `string`, but intersects with the DocumentIndexingStatus union from the base type.
    indexing_status: string
    disabled_at: number
    disabled_by: string
    archived_reason: 'rule_modified' | 're_upload'
    archived_by: string
    archived_at: number
    doc_type?: DocType | null | 'others'
    doc_metadata?: DocMetadata | null
    segment_count: number
    // Index signature: any other string key is accepted with type `any`.
    [key: string]: any
  }
  /**
   * Document metadata: seven named string fields plus an index signature
   * that admits any additional string-valued key.
   */
  export type DocMetadata = {
    title: string
    language: string
    author: string
    publisher: string
    publicationDate: string
    ISBN: string
    category: string
    [key: string]: string
  }
  /** Readonly tuple of the doc type literals grouped as "customizable". */
  export const CUSTOMIZABLE_DOC_TYPES = [
    'book',
    'web_page',
    'paper',
    'social_media_post',
    'personal_document',
    'business_document',
    'im_chat_log',
  ] as const

  /** Readonly tuple of the doc type literals grouped as "fixed". */
  export const FIXED_DOC_TYPES = ['synced_from_github', 'synced_from_notion', 'wikipedia_entry'] as const

  /** Union of the literals in `CUSTOMIZABLE_DOC_TYPES`. */
  export type CustomizableDocType = typeof CUSTOMIZABLE_DOC_TYPES[number]

  /** Union of the literals in `FIXED_DOC_TYPES`. */
  export type FixedDocType = typeof FIXED_DOC_TYPES[number]

  /** Any doc type: customizable or fixed. */
  export type DocType = CustomizableDocType | FixedDocType
  /** Alias of `FullDocumentDetail`. */
  export type DocumentDetailResponse = FullDocumentDetail
  /**
   * List of segment status strings.
   * Unlike the other status lists in this file it is not declared `as const`,
   * so it is a mutable `string[]`.
   */
  export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing']

  /**
   * Element type of `SEGMENT_STATUS_LIST`.
   * Because the list is a plain `string[]`, this resolves to `string`
   * rather than a union of the four literals.
   */
  export type SegmentStatus = typeof SEGMENT_STATUS_LIST[number]
  /**
   * Query parameters for listing segments. Only `limit` is required.
   */
  export type SegmentsQuery = {
    last_id?: string
    limit: number
    // status?: SegmentStatus
    hit_count_gte?: number
    keyword?: string
    enabled?: boolean
  }
  /**
   * Full record for one segment of a document.
   * `answer` is optional; `error` may be `null`.
   */
  export type SegmentDetailModel = {
    id: string
    position: number
    document_id: string
    content: string
    word_count: number
    tokens: number
    keywords: string[]
    index_node_id: string
    index_node_hash: string
    hit_count: number
    enabled: boolean
    disabled_at: number
    disabled_by: string
    status: SegmentStatus
    created_by: string
    created_at: number
    indexing_at: number
    completed_at: number
    error: string | null
    stopped_at: number
    answer?: string
  }
  /** List of `SegmentDetailModel` records with paging fields (no `page` field). */
  export type SegmentsResponse = {
    data: SegmentDetailModel[]
    has_more: boolean
    limit: number
    total: number
  }
  /**
   * One hit-testing history record.
   * `source` and `created_by_role` are restricted to the listed literals.
   */
  export type HitTestingRecord = {
    id: string
    content: string
    source: 'app' | 'hit_testing' | 'plugin'
    source_app_id: string
    created_by_role: 'account' | 'end_user'
    created_by: string
    created_at: number
  }
  /** One hit-testing result: a `Segment`, a numeric score and a `TsnePosition`. */
  export type HitTesting = {
    segment: Segment
    score: number
    tsne_position: TsnePosition
  }
  /**
   * Segment as embedded in a `HitTesting` result, including a reference to
   * its `Document`.
   */
  export type Segment = {
    id: string
    // Resolves to the `Document` type exported by this module, which shadows the DOM global of the same name.
    document: Document
    content: string
    position: number
    word_count: number
    tokens: number
    keywords: string[]
    hit_count: number
    index_node_hash: string
  }
  /**
   * Short document reference used by `Segment`.
   * Note that the name shadows the DOM global `Document` inside this module.
   */
  export type Document = {
    id: string
    // Plain string here, not the DataSourceType enum.
    data_source_type: string
    name: string
    doc_type: DocType
  }
  /** Paginated list of `HitTestingRecord` entries with paging fields. */
  export type HitTestingRecordsResponse = {
    data: HitTestingRecord[]
    has_more: boolean
    limit: number
    total: number
    page: number
  }
  /** A 2-D point with numeric `x` and `y` coordinates. */
  export type TsnePosition = {
    x: number
    y: number
  }
  /**
   * Result of a hit test: the query (content and position) and the list of
   * `HitTesting` records.
   */
  export type HitTestingResponse = {
    query: {
      content: string
      tsne_position: TsnePosition
    }
    records: Array<HitTesting>
  }
  /**
   * Summary of an app listed in `RelatedAppResponse`.
   * `icon_type` may be `null`.
   */
  export type RelatedApp = {
    id: string
    name: string
    mode: AppMode
    icon_type: AppIconType | null
    icon: string
    icon_background: string
    icon_url: string
  }
  /** List of `RelatedApp` entries with a total count. */
  export type RelatedAppResponse = {
    data: Array<RelatedApp>
    total: number
  }
  /** Payload for updating a segment: required content, optional answer and keywords. */
  export type SegmentUpdater = {
    content: string
    answer?: string
    keywords?: string[]
  }
  /**
   * String enum for the document form.
   * The values match the `'text_model' | 'qa_model'` literals used by the
   * `doc_form` fields in this file.
   */
  export enum DocForm {
    TEXT = 'text_model',
    QA = 'qa_model',
  }
  /** List of `IndexingStatusResponse` entries with a total count. */
  export type ErrorDocsResponse = {
    data: IndexingStatusResponse[]
    total: number
  }
  /**
   * Set of boolean flags describing a selection of datasets.
   * NOTE(review): how the flags are computed, and whether they are mutually
   * exclusive, is not visible in this file.
   */
  export type SelectedDatasetsMode = {
    allHighQuality: boolean
    allHighQualityVectorSearch: boolean
    allHighQualityFullTextSearch: boolean
    allEconomic: boolean
    mixtureHighQualityAndEconomic: boolean
    inconsistentEmbeddingModel: boolean
  }
  /** String enum naming the weighted-score presets. */
  export enum WeightedScoreEnum {
    SemanticFirst = 'semantic_first',
    KeywordFirst = 'keyword_first',
    Customized = 'customized',
  }
  /** String enum naming the two reranking modes. */
  export enum RerankingModeEnum {
    RerankingModel = 'reranking_model',
    WeightedScore = 'weighted_score',
  }
  /**
   * Default semantic/keyword weight pairs, keyed by scenario.
   * Each pair sums to 1. `semanticFirst` and `other` hold the same weights.
   */
  export const DEFAULT_WEIGHTED_SCORE = {
    allHighQualityVectorSearch: {
      semantic: 1.0,
      keyword: 0,
    },
    allHighQualityFullTextSearch: {
      semantic: 0,
      keyword: 1.0,
    },
    semanticFirst: {
      semantic: 0.7,
      keyword: 0.3,
    },
    keywordFirst: {
      semantic: 0.3,
      keyword: 0.7,
    },
    other: {
      semantic: 0.7,
      keyword: 0.3,
    },
  }