-
Notifications
You must be signed in to change notification settings - Fork 3.5k
Expand file tree
/
Copy pathdocs-chunker.ts
More file actions
446 lines (373 loc) · 13.3 KB
/
docs-chunker.ts
File metadata and controls
446 lines (373 loc) · 13.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
import fs from 'fs/promises'
import path from 'path'
import { createLogger } from '@sim/logger'
import { TextChunker } from '@/lib/chunkers/text-chunker'
import type { DocChunk, DocsChunkerOptions } from '@/lib/chunkers/types'
import { generateEmbeddings } from '@/lib/knowledge/embeddings'
interface HeaderInfo {
level: number
text: string
slug?: string
anchor?: string
position?: number
}
interface Frontmatter {
title?: string
description?: string
[key: string]: unknown
}
const logger = createLogger('DocsChunker')
/**
* Docs-specific chunker that processes .mdx files and tracks header context
*/
export class DocsChunker {
private readonly textChunker: TextChunker
private readonly baseUrl: string
constructor(options: DocsChunkerOptions = {}) {
// Use the existing TextChunker for chunking logic
this.textChunker = new TextChunker({
chunkSize: options.chunkSize ?? 300, // Max 300 tokens per chunk
minCharactersPerChunk: options.minCharactersPerChunk ?? 1,
chunkOverlap: options.chunkOverlap ?? 50,
})
// Use localhost docs in development, production docs otherwise
this.baseUrl = options.baseUrl ?? 'https://docs.sim.ai'
}
/**
* Process all .mdx files in the docs directory
*/
async chunkAllDocs(docsPath: string): Promise<DocChunk[]> {
const allChunks: DocChunk[] = []
try {
const mdxFiles = await this.findMdxFiles(docsPath)
logger.info(`Found ${mdxFiles.length} .mdx files to process`)
for (const filePath of mdxFiles) {
try {
const chunks = await this.chunkMdxFile(filePath, docsPath)
allChunks.push(...chunks)
logger.info(`Processed ${filePath}: ${chunks.length} chunks`)
} catch (error) {
logger.error(`Error processing ${filePath}:`, error)
}
}
logger.info(`Total chunks generated: ${allChunks.length}`)
return allChunks
} catch (error) {
logger.error('Error processing docs:', error)
throw error
}
}
/**
* Process a single .mdx file
*/
async chunkMdxFile(filePath: string, basePath: string): Promise<DocChunk[]> {
const content = await fs.readFile(filePath, 'utf-8')
const relativePath = path.relative(basePath, filePath)
// Parse frontmatter and content
const { data: frontmatter, content: markdownContent } = this.parseFrontmatter(content)
// Extract headers from the content
const headers = this.extractHeaders(markdownContent)
// Generate document URL
const documentUrl = this.generateDocumentUrl(relativePath)
// Split content into chunks
const textChunks = await this.splitContent(markdownContent)
// Generate embeddings for all chunks at once (batch processing)
logger.info(`Generating embeddings for ${textChunks.length} chunks in ${relativePath}`)
const embeddings = textChunks.length > 0 ? await generateEmbeddings(textChunks) : []
const embeddingModel = 'text-embedding-3-small'
// Convert to DocChunk objects with header context and embeddings
const chunks: DocChunk[] = []
let currentPosition = 0
for (let i = 0; i < textChunks.length; i++) {
const chunkText = textChunks[i]
const chunkStart = currentPosition
const chunkEnd = currentPosition + chunkText.length
// Find the most relevant header for this chunk
const relevantHeader = this.findRelevantHeader(headers, chunkStart)
const chunk: DocChunk = {
text: chunkText,
tokenCount: Math.ceil(chunkText.length / 4), // Simple token estimation
sourceDocument: relativePath,
headerLink: relevantHeader ? `${documentUrl}#${relevantHeader.anchor}` : documentUrl,
headerText: relevantHeader?.text || frontmatter.title || 'Document Root',
headerLevel: relevantHeader?.level || 1,
embedding: embeddings[i] || [],
embeddingModel,
metadata: {
startIndex: chunkStart,
endIndex: chunkEnd,
title: frontmatter.title,
},
}
chunks.push(chunk)
currentPosition = chunkEnd
}
return chunks
}
/**
* Find all .mdx files recursively
*/
private async findMdxFiles(dirPath: string): Promise<string[]> {
const files: string[] = []
const entries = await fs.readdir(dirPath, { withFileTypes: true })
for (const entry of entries) {
const fullPath = path.join(dirPath, entry.name)
if (entry.isDirectory()) {
const subFiles = await this.findMdxFiles(fullPath)
files.push(...subFiles)
} else if (entry.isFile() && entry.name.endsWith('.mdx')) {
files.push(fullPath)
}
}
return files
}
/**
* Extract headers and their positions from markdown content
*/
private extractHeaders(content: string): HeaderInfo[] {
const headers: HeaderInfo[] = []
const headerRegex = /^(#{1,6})\s+(.+)$/gm
let match
while ((match = headerRegex.exec(content)) !== null) {
const level = match[1].length
const text = match[2].trim()
const anchor = this.generateAnchor(text)
headers.push({
text,
level,
anchor,
position: match.index,
})
}
return headers
}
/**
* Generate URL-safe anchor from header text
*/
private generateAnchor(headerText: string): string {
return headerText
.toLowerCase()
.replace(/[^\w\s-]/g, '') // Remove special characters except hyphens
.replace(/\s+/g, '-') // Replace spaces with hyphens
.replace(/-+/g, '-') // Replace multiple hyphens with single
.replace(/^-|-$/g, '') // Remove leading/trailing hyphens
}
/**
* Generate document URL from relative path
*/
private generateDocumentUrl(relativePath: string): string {
// Convert file path to URL path
// e.g., "tools/knowledge.mdx" -> "/tools/knowledge"
const urlPath = relativePath.replace(/\.mdx$/, '').replace(/\\/g, '/') // Handle Windows paths
return `${this.baseUrl}/${urlPath}`
}
/**
* Find the most relevant header for a given position
*/
private findRelevantHeader(headers: HeaderInfo[], position: number): HeaderInfo | null {
if (headers.length === 0) return null
// Find the last header that comes before this position
let relevantHeader: HeaderInfo | null = null
for (const header of headers) {
if (header.position !== undefined && header.position <= position) {
relevantHeader = header
} else {
break
}
}
return relevantHeader
}
/**
* Split content into chunks using the existing TextChunker with table awareness
*/
private async splitContent(content: string): Promise<string[]> {
// Clean the content first
const cleanedContent = this.cleanContent(content)
// Detect table boundaries to avoid splitting them
const tableBoundaries = this.detectTableBoundaries(cleanedContent)
// Use the existing TextChunker
const chunks = await this.textChunker.chunk(cleanedContent)
// Post-process chunks to ensure tables aren't split
const processedChunks = this.mergeTableChunks(
chunks.map((chunk) => chunk.text),
tableBoundaries,
cleanedContent
)
// Ensure no chunk exceeds 300 tokens
const finalChunks = this.enforceSizeLimit(processedChunks)
return finalChunks
}
/**
* Clean content by removing MDX-specific elements and excessive whitespace
*/
private cleanContent(content: string): string {
return (
content
// Remove import statements
.replace(/^import\s+.*$/gm, '')
// Remove JSX components and React-style comments
.replace(/<[^>]+>/g, ' ')
.replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
// Remove excessive whitespace
.replace(/\n{3,}/g, '\n\n')
.replace(/[ \t]{2,}/g, ' ')
.trim()
)
}
/**
* Parse frontmatter from MDX content
*/
private parseFrontmatter(content: string): { data: Frontmatter; content: string } {
const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/
const match = content.match(frontmatterRegex)
if (!match) {
return { data: {}, content }
}
const [, frontmatterText, markdownContent] = match
const data: Frontmatter = {}
// Simple YAML parsing for title and description
const lines = frontmatterText.split('\n')
for (const line of lines) {
const colonIndex = line.indexOf(':')
if (colonIndex > 0) {
const key = line.slice(0, colonIndex).trim()
const value = line
.slice(colonIndex + 1)
.trim()
.replace(/^['"]|['"]$/g, '')
data[key] = value
}
}
return { data, content: markdownContent }
}
/**
* Estimate token count (rough approximation)
*/
private estimateTokens(text: string): number {
// Rough approximation: 1 token ≈ 4 characters
return Math.ceil(text.length / 4)
}
/**
* Detect table boundaries in markdown content to avoid splitting them
*/
private detectTableBoundaries(content: string): { start: number; end: number }[] {
const tables: { start: number; end: number }[] = []
const lines = content.split('\n')
let inTable = false
let tableStart = -1
for (let i = 0; i < lines.length; i++) {
const line = lines[i].trim()
// Detect table start (markdown table row with pipes)
if (line.includes('|') && line.split('|').length >= 3 && !inTable) {
// Check if next line is table separator (contains dashes and pipes)
const nextLine = lines[i + 1]?.trim()
if (nextLine?.includes('|') && nextLine.includes('-')) {
inTable = true
tableStart = i
}
}
// Detect table end (empty line or non-table content)
else if (inTable && (!line.includes('|') || line === '' || line.startsWith('#'))) {
tables.push({
start: this.getCharacterPosition(lines, tableStart),
end: this.getCharacterPosition(lines, i - 1) + lines[i - 1]?.length || 0,
})
inTable = false
}
}
// Handle table at end of content
if (inTable && tableStart >= 0) {
tables.push({
start: this.getCharacterPosition(lines, tableStart),
end: content.length,
})
}
return tables
}
/**
* Get character position from line number
*/
private getCharacterPosition(lines: string[], lineIndex: number): number {
return lines.slice(0, lineIndex).reduce((acc, line) => acc + line.length + 1, 0)
}
/**
* Merge chunks that would split tables
*/
private mergeTableChunks(
chunks: string[],
tableBoundaries: { start: number; end: number }[],
originalContent: string
): string[] {
if (tableBoundaries.length === 0) {
return chunks
}
const mergedChunks: string[] = []
let currentPosition = 0
for (const chunk of chunks) {
const chunkStart = originalContent.indexOf(chunk, currentPosition)
const chunkEnd = chunkStart + chunk.length
// Check if this chunk intersects with any table
const intersectsTable = tableBoundaries.some(
(table) =>
(chunkStart >= table.start && chunkStart <= table.end) ||
(chunkEnd >= table.start && chunkEnd <= table.end) ||
(chunkStart <= table.start && chunkEnd >= table.end)
)
if (intersectsTable) {
// Find which table(s) this chunk intersects with
const affectedTables = tableBoundaries.filter(
(table) =>
(chunkStart >= table.start && chunkStart <= table.end) ||
(chunkEnd >= table.start && chunkEnd <= table.end) ||
(chunkStart <= table.start && chunkEnd >= table.end)
)
// Create a chunk that includes the complete table(s)
const minStart = Math.min(chunkStart, ...affectedTables.map((t) => t.start))
const maxEnd = Math.max(chunkEnd, ...affectedTables.map((t) => t.end))
const completeChunk = originalContent.slice(minStart, maxEnd)
// Only add if we haven't already included this content
if (!mergedChunks.some((existing) => existing.includes(completeChunk.trim()))) {
mergedChunks.push(completeChunk.trim())
}
} else {
mergedChunks.push(chunk)
}
currentPosition = chunkEnd
}
return mergedChunks.filter((chunk) => chunk.length > 50) // Filter out tiny chunks
}
/**
* Enforce 300 token size limit on chunks
*/
private enforceSizeLimit(chunks: string[]): string[] {
const finalChunks: string[] = []
for (const chunk of chunks) {
const tokens = this.estimateTokens(chunk)
if (tokens <= 300) {
// Chunk is within limit
finalChunks.push(chunk)
} else {
// Chunk is too large - split it
const lines = chunk.split('\n')
let currentChunk = ''
for (const line of lines) {
const testChunk = currentChunk ? `${currentChunk}\n${line}` : line
if (this.estimateTokens(testChunk) <= 300) {
currentChunk = testChunk
} else {
// Adding this line would exceed limit
if (currentChunk.trim()) {
finalChunks.push(currentChunk.trim())
}
currentChunk = line
}
}
// Add final chunk if it has content
if (currentChunk.trim()) {
finalChunks.push(currentChunk.trim())
}
}
}
return finalChunks.filter((chunk) => chunk.trim().length > 100)
}
}