datasets.ts 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449
  1. import type { DataSourceNotionPage } from './common'
  2. import type { AppMode, RetrievalConfig } from '@/types/app'
  3. import type { Tag } from '@/app/components/base/tag-management/constant'
  /**
   * String identifiers for the kinds of data source declared in this module:
   * uploaded file, Notion import and website crawl.
   */
  export enum DataSourceType {
    FILE = 'upload_file',
    NOTION = 'notion_import',
    WEB = 'website_crawl',
  }
  /**
   * Shape of a dataset object: identity and display fields, permission,
   * data source type, indexing technique, audit fields, counters,
   * embedding model information, retrieval configuration and tags.
   *
   * Both `retrieval_model_dict` and `retrieval_model` are typed as
   * `RetrievalConfig`.
   * NOTE(review): the unit of `updated_at` is not visible here; confirm
   * against the API before formatting it.
   */
  export type DataSet = {
    id: string
    name: string
    icon: string
    icon_background: string
    description: string
    permission: 'only_me' | 'all_team_members'
    data_source_type: DataSourceType
    indexing_technique: 'high_quality' | 'economy'
    created_by: string
    updated_by: string
    updated_at: number
    app_count: number
    document_count: number
    word_count: number
    embedding_model: string
    embedding_model_provider: string
    embedding_available: boolean
    retrieval_model_dict: RetrievalConfig
    retrieval_model: RetrievalConfig
    tags: Tag[]
  }
  /**
   * A `File` extended with optional metadata fields
   * (`id`, `extension`, `mime_type`, `created_by`, `created_at`).
   * All added fields are optional, so a plain `File` is assignable.
   */
  export type CustomFile = File & {
    id?: string
    extension?: string
    mime_type?: string
    created_by?: string
    created_at?: number
  }
  /**
   * Options for a website crawl.
   *
   * `limit` and `max_depth` accept either a number or a string.
   * NOTE(review): the string form presumably comes from raw form input;
   * confirm with the consumer.
   */
  export type CrawlOptions = {
    crawl_sub_pages: boolean
    only_main_content: boolean
    includes: string
    excludes: string
    limit: number | string
    max_depth: number | string
  }
  /**
   * One crawl result entry: title, markdown body, description and the
   * source URL.
   */
  export type CrawlResultItem = {
    title: string
    markdown: string
    description: string
    source_url: string
  }
  /**
   * Pairs a `CustomFile` with a string `fileID` and a numeric `progress`.
   * NOTE(review): the range and sentinel values of `progress` are not
   * visible in this file; confirm with the consumer.
   */
  export type FileItem = {
    fileID: string
    file: CustomFile
    progress: number
  }
  /**
   * Paginated list of `DataSet` items together with `has_more`, `limit`,
   * `page` and `total`.
   */
  export type DataSetListResponse = {
    data: DataSet[]
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /** A question/answer pair. */
  export type QA = {
    question: string
    answer: string
  }
  /**
   * Result of an indexing estimate: token count, total price with its
   * currency, segment count and a list of preview strings.
   * `qa_preview` is optional and holds `QA` pairs when present.
   */
  export type IndexingEstimateResponse = {
    tokens: number
    total_price: number
    currency: string
    total_segments: number
    preview: string[]
    qa_preview?: QA[]
  }
  /**
   * `IndexingEstimateResponse` extended with a `total_nodes` count.
   */
  export type FileIndexingEstimateResponse = {
    total_nodes: number
  } & IndexingEstimateResponse
  /**
   * Indexing progress for one item: current `indexing_status`, per-stage
   * numeric markers and segment counters.
   *
   * NOTE(review): `completed_at`, `paused_at`, `error` and `stopped_at` are
   * typed `any`; the types are kept as-is so existing callers still
   * compile. Narrow them once the payload shape is confirmed.
   */
  export type IndexingStatusResponse = {
    id: string
    indexing_status: DocumentIndexingStatus
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    completed_at: any
    paused_at: any
    error: any
    stopped_at: any
    completed_segments: number
    total_segments: number
  }
  /** Wrapper holding a list of `IndexingStatusResponse` entries under `data`. */
  export type IndexingStatusBatchResponse = {
    data: IndexingStatusResponse[]
  }
  /** Processing mode: either `'automatic'` or `'custom'`. */
  export type ProcessMode = 'automatic' | 'custom'
  /** A process rule as returned to the client: a `ProcessMode` plus its `Rules`. */
  export type ProcessRuleResponse = {
    mode: ProcessMode
    rules: Rules
  }
  /**
   * Processing rules: a list of `PreProcessingRule` entries and one
   * `Segmentation` setting.
   */
  export type Rules = {
    pre_processing_rules: PreProcessingRule[]
    segmentation: Segmentation
  }
  /** A single pre-processing rule, identified by `id`, with an on/off flag. */
  export type PreProcessingRule = {
    id: string
    enabled: boolean
  }
  /**
   * Segmentation settings: a separator string, a `max_tokens` number and a
   * `chunk_overlap` number.
   */
  export type Segmentation = {
    separator: string
    max_tokens: number
    chunk_overlap: number
  }
  /**
   * Every indexing status value, declared `as const` so that the derived
   * `DocumentIndexingStatus` type is a union of these literals.
   */
  export const DocumentIndexingStatusList = [
    'waiting',
    'parsing',
    'cleaning',
    'splitting',
    'indexing',
    'paused',
    'error',
    'completed',
  ] as const

  /** Union of the literals in `DocumentIndexingStatusList`. */
  export type DocumentIndexingStatus = typeof DocumentIndexingStatusList[number]
  /**
   * Every display status value, declared `as const` so that the derived
   * `DocumentDisplayStatus` type is a union of these literals.
   */
  export const DisplayStatusList = [
    'queuing',
    'indexing',
    'paused',
    'error',
    'available',
    'enabled',
    'disabled',
    'archived',
  ] as const

  /** Union of the literals in `DisplayStatusList`. */
  export type DocumentDisplayStatus = typeof DisplayStatusList[number]
  /**
   * Source details attached to a document: an `upload_file` descriptor,
   * an optional `notion_page_icon`, a `job_id` and a `url`.
   *
   * NOTE(review): `upload_file`, `job_id` and `url` are all declared
   * required here regardless of source type; confirm whether the backend
   * really sends all of them for every document.
   */
  export type DataSourceInfo = {
    upload_file: {
      id: string
      name: string
      size: number
      mime_type: string
      created_at: number
      created_by: string
      extension: string
    }
    notion_page_icon?: string
    job_id: string
    url: string
  }
  /**
   * Base document fields: identity, batch and position, owning dataset,
   * data source type and info, process rule id, name, creation metadata,
   * indexing and display status, optional segment counters and the
   * document form.
   */
  export type InitialDocumentDetail = {
    id: string
    batch: string
    position: number
    dataset_id: string
    data_source_type: DataSourceType
    data_source_info: DataSourceInfo
    dataset_process_rule_id: string
    name: string
    created_from: 'api' | 'web'
    created_by: string
    created_at: number
    indexing_status: DocumentIndexingStatus
    display_status: DocumentDisplayStatus
    completed_segments?: number
    total_segments?: number
    doc_form: 'text_model' | 'qa_model'
  }
  /**
   * `InitialDocumentDetail` extended with enabled/archived flags, word and
   * hit counters, an optional `error`, `updated_at` and an optional
   * `data_source_detail_dict`.
   *
   * `dataset_process_rule_id` is re-declared as optional here, but
   * `InitialDocumentDetail` declares it required, so the intersection
   * still requires it.
   */
  export type SimpleDocumentDetail = InitialDocumentDetail & {
    enabled: boolean
    word_count: number
    error?: string | null
    archived: boolean
    updated_at: number
    hit_count: number
    dataset_process_rule_id?: string
    data_source_detail_dict?: {
      upload_file: {
        name: string
        extension: string
      }
    }
  }
  /**
   * Paginated list of `SimpleDocumentDetail` items together with
   * `has_more`, `total`, `page` and `limit`.
   */
  export type DocumentListResponse = {
    data: SimpleDocumentDetail[]
    has_more: boolean
    total: number
    page: number
    limit: number
  }
  /**
   * Common document request fields: optional `original_document_id` and
   * `indexing_technique`, the document form, its language and a
   * `ProcessRule`.
   *
   * NOTE(review): `indexing_technique` is a plain `string` here while
   * `DataSet.indexing_technique` is `'high_quality' | 'economy'`; kept
   * wide so existing callers still compile.
   */
  export type DocumentReq = {
    original_document_id?: string
    indexing_technique?: string
    doc_form: 'text_model' | 'qa_model'
    doc_language: string
    process_rule: ProcessRule
  }
  /**
   * `DocumentReq` extended with a `DataSource` and a `RetrievalConfig`.
   */
  export type CreateDocumentReq = DocumentReq & {
    data_source: DataSource
    retrieval_model: RetrievalConfig
  }
  /**
   * `DocumentReq` combined with every `DataSource` field made optional,
   * plus a required `dataset_id`.
   */
  export type IndexingEstimateParams = DocumentReq & Partial<DataSource> & {
    dataset_id: string
  }
  /**
   * Describes a data source: its `type` and an `info_list` that repeats
   * the source type and carries one optional list per source kind
   * (Notion, file, website).
   *
   * NOTE(review): the type does not force the populated list to match
   * `data_source_type`; callers presumably keep them in sync. Confirm.
   */
  export type DataSource = {
    type: DataSourceType
    info_list: {
      data_source_type: DataSourceType
      notion_info_list?: NotionInfo[]
      file_info_list?: {
        file_ids: string[]
      }
      website_info_list?: {
        provider: string
        job_id: string
        urls: string[]
      }
    }
  }
  /** A Notion workspace id together with its `DataSourceNotionPage` entries. */
  export type NotionInfo = {
    workspace_id: string
    pages: DataSourceNotionPage[]
  }
  /** Minimal Notion page reference: a `page_id` and a `type` string. */
  export type NotionPage = {
    page_id: string
    type: string
  }
  /**
   * A process rule as sent in requests: a `mode` string plus its `Rules`.
   *
   * NOTE(review): `mode` is a plain `string` here, whereas
   * `ProcessRuleResponse.mode` uses `ProcessMode`; kept wide so existing
   * callers still compile.
   */
  export type ProcessRule = {
    mode: string
    rules: Rules
  }
  /**
   * Result of creating documents: an optional `DataSet`, the `batch`
   * string and the created `InitialDocumentDetail` entries.
   *
   * The lower-case type name is kept because callers import it by this
   * name.
   */
  export type createDocumentResponse = {
    dataset?: DataSet
    batch: string
    documents: InitialDocumentDetail[]
  }
  /**
   * `SimpleDocumentDetail` extended with processing markers, token and
   * latency numbers, pause/stop/disable/archive bookkeeping, optional
   * document type and metadata, and a segment count.
   *
   * Notes on overlaps with the base types:
   * - `batch` is already declared by `InitialDocumentDetail`; repeating
   *   it here does not change the resulting type.
   * - `indexing_status` is `string` here but `DocumentIndexingStatus` in
   *   `InitialDocumentDetail`, so the intersection resolves to the
   *   narrower union.
   * - The `[key: string]: any` index signature lets any additional key be
   *   read or written without a type error. It is kept for compatibility.
   *   NOTE(review): consider removing it once callers are audited.
   */
  export type FullDocumentDetail = SimpleDocumentDetail & {
    batch: string
    created_api_request_id: string
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    tokens: number
    indexing_latency: number
    completed_at: number
    paused_by: string
    paused_at: number
    stopped_at: number
    indexing_status: string
    disabled_at: number
    disabled_by: string
    archived_reason: 'rule_modified' | 're_upload'
    archived_by: string
    archived_at: number
    doc_type?: DocType | null | 'others'
    doc_metadata?: DocMetadata | null
    segment_count: number
    [key: string]: any
  }
  /**
   * Document metadata: a fixed set of named string fields plus an index
   * signature that allows any additional string-valued key.
   */
  export type DocMetadata = {
    title: string
    language: string
    author: string
    publisher: string
    publicationDate: string
    ISBN: string
    category: string
    [key: string]: string
  }
  /** Document type literals in the customizable group. */
  export const CUSTOMIZABLE_DOC_TYPES = [
    'book',
    'web_page',
    'paper',
    'social_media_post',
    'personal_document',
    'business_document',
    'im_chat_log',
  ] as const

  /** Document type literals in the fixed group. */
  export const FIXED_DOC_TYPES = ['synced_from_github', 'synced_from_notion', 'wikipedia_entry'] as const

  /** Union of the literals in `CUSTOMIZABLE_DOC_TYPES`. */
  export type CustomizableDocType = typeof CUSTOMIZABLE_DOC_TYPES[number]

  /** Union of the literals in `FIXED_DOC_TYPES`. */
  export type FixedDocType = typeof FIXED_DOC_TYPES[number]

  /** Any document type, customizable or fixed. */
  export type DocType = CustomizableDocType | FixedDocType
  /** Alias of `FullDocumentDetail`. */
  export type DocumentDetailResponse = FullDocumentDetail
  /**
   * Every segment status value.
   *
   * Declared `as const`, matching the other status lists in this file.
   * Without it the array is inferred as `string[]` and the derived
   * `SegmentStatus` type degrades to plain `string`.
   */
  export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing'] as const

  /** Union of the literals in `SEGMENT_STATUS_LIST`. */
  export type SegmentStatus = typeof SEGMENT_STATUS_LIST[number]
  /**
   * Query parameters for listing segments. Only `limit` is required.
   * The `status` filter is currently commented out.
   */
  export type SegmentsQuery = {
    last_id?: string
    limit: number
    // status?: SegmentStatus
    hit_count_gte?: number
    keyword?: string
    enabled?: boolean
  }
  /**
   * Full segment record: identity and position, owning document, content
   * with word/token counts and keywords, index node id and hash, hit
   * counter, enable/disable bookkeeping, `SegmentStatus`, creation and
   * processing markers, a nullable `error` and an optional `answer`.
   */
  export type SegmentDetailModel = {
    id: string
    position: number
    document_id: string
    content: string
    word_count: number
    tokens: number
    keywords: string[]
    index_node_id: string
    index_node_hash: string
    hit_count: number
    enabled: boolean
    disabled_at: number
    disabled_by: string
    status: SegmentStatus
    created_by: string
    created_at: number
    indexing_at: number
    completed_at: number
    error: string | null
    stopped_at: number
    answer?: string
  }
  /**
   * List of `SegmentDetailModel` items together with `has_more`, `limit`
   * and `total`.
   */
  export type SegmentsResponse = {
    data: SegmentDetailModel[]
    has_more: boolean
    limit: number
    total: number
  }
  /**
   * One hit-testing record: its content, the `source` it came from
   * (`'app'`, `'hit_testing'` or `'plugin'`), the source app id and
   * creator information.
   */
  export type HitTestingRecord = {
    id: string
    content: string
    source: 'app' | 'hit_testing' | 'plugin'
    source_app_id: string
    created_by_role: 'account' | 'end_user'
    created_by: string
    created_at: number
  }
  /**
   * One hit-testing match: the matched `Segment`, a numeric `score` and
   * a `TsnePosition`.
   */
  export type HitTesting = {
    segment: Segment
    score: number
    tsne_position: TsnePosition
  }
  /**
   * Segment as embedded in a hit-testing match: identity, the owning
   * `Document` (the type declared in this module), content, position,
   * word/token counts, keywords, hit counter and index node hash.
   */
  export type Segment = {
    id: string
    document: Document
    content: string
    position: number
    word_count: number
    tokens: number
    keywords: string[]
    hit_count: number
    index_node_hash: string
  }
  /**
   * Compact document reference: id, data source type (as a plain string),
   * name and `DocType`.
   *
   * Within this module the name shadows the global DOM `Document` type.
   */
  export type Document = {
    id: string
    data_source_type: string
    name: string
    doc_type: DocType
  }
  /**
   * Paginated list of `HitTestingRecord` items together with `has_more`,
   * `limit`, `total` and `page`.
   */
  export type HitTestingRecordsResponse = {
    data: HitTestingRecord[]
    has_more: boolean
    limit: number
    total: number
    page: number
  }
  /** A two-dimensional position given as numeric `x` and `y`. */
  export type TsnePosition = {
    x: number
    y: number
  }
  /**
   * Result of a hit test: the query (its content and `TsnePosition`) and
   * the list of `HitTesting` matches.
   */
  export type HitTestingResponse = {
    query: {
      content: string
      tsne_position: TsnePosition
    }
    records: HitTesting[]
  }
  /**
   * An app entry described by id, name, `AppMode` and its icon with
   * background.
   */
  export type RelatedApp = {
    id: string
    name: string
    mode: AppMode
    icon: string
    icon_background: string
  }
  /** List of `RelatedApp` entries together with a `total` count. */
  export type RelatedAppResponse = {
    data: RelatedApp[]
    total: number
  }
  /**
   * Fields used to update a segment: required `content`, optional
   * `answer` and optional `keywords`.
   *
   * The spelling "Updator" is kept because callers import the type by
   * this name.
   */
  export type SegmentUpdator = {
    content: string
    answer?: string
    keywords?: string[]
  }
  /**
   * Document form identifiers. The values are the same string literals
   * used by the `doc_form` fields in this module.
   */
  export enum DocForm {
    TEXT = 'text_model',
    QA = 'qa_model',
  }
  /**
   * List of `IndexingStatusResponse` entries together with a `total`
   * count.
   */
  export type ErrorDocsResponse = {
    data: IndexingStatusResponse[]
    total: number
  }