@@ -190,7 +190,7 @@ function extractDocstring(
190190 * - `file (copy).ts` → `file__copy__ts`
191191 * - `c++/main.cpp` → `c___main_cpp`
192192 */
193- function generateChunkId ( filePath : string , startLine : number ) : string {
193+ export function generateChunkId ( filePath : string , startLine : number ) : string {
194194 const normalized = filePath
195195 . replace ( / [ \\ / ] / g, '_' ) // path separators
196196 . replace ( / \. / g, '_' ) // dots
@@ -253,256 +253,177 @@ function splitLargeContent(
253253}
254254
/**
 * Build the CodeChunk object(s) for a single semantic AST node.
 *
 * - Returns an empty array when `isTooSmall` rejects the node's text.
 * - Returns exactly one chunk when the text is not `isTooLarge`.
 * - Otherwise returns one chunk per piece produced by `splitLargeContent`;
 *   each part's ID gets a `_p<index>` suffix, its name gets a
 *   `(part <n>)` suffix, and only the first part carries the signature
 *   and docstring.
 *
 * @param node - Semantic AST node to convert
 * @param filePath - File path stored on each chunk and used to generate IDs
 * @param sourceCode - Source text handed to the signature/docstring extractors
 * @param config - Language configuration handed to the name/docstring extractors
 * @param lang - Detected language key; `tsx` is reported as `typescript`
 *   and `jsx` as `javascript`, anything else is passed through unchanged
 * @returns The chunks for this node (possibly empty)
 */
function buildChunksFromNode(
  node: Parser.SyntaxNode,
  filePath: string,
  sourceCode: string,
  config: LanguageConfig,
  lang: string,
): CodeChunk[] {
  const content = node.text;

  // Discard undersized nodes before doing any extraction work: none of the
  // values computed below are needed for a node that produces no chunk.
  if (isTooSmall(content)) return [];

  const startLine = node.startPosition.row + 1; // 1-indexed
  const endLine = node.endPosition.row + 1;
  const name = extractName(node, config);
  const signature = extractSignature(node, sourceCode);
  const docstring = extractDocstring(node, sourceCode, config);

  // Normalize language name for output (tsx -> typescript, jsx -> javascript)
  const outputLang = lang === 'tsx' ? 'typescript' : lang === 'jsx' ? 'javascript' : lang;

  if (!isTooLarge(content)) {
    return [
      {
        id: generateChunkId(filePath, startLine),
        filePath,
        content,
        startLine,
        endLine,
        name,
        nodeType: node.type,
        signature,
        docstring,
        language: outputLang,
      },
    ];
  }

  // Oversized node: emit one chunk per split piece. The index `i` is the
  // position in the split result, so part numbering is unaffected if an
  // entry is skipped.
  const parts: CodeChunk[] = [];
  splitLargeContent(content, startLine).forEach((sub, i) => {
    if (!sub) return;
    parts.push({
      id: generateChunkId(filePath, sub.startLine) + `_p${i}`,
      filePath,
      content: sub.content,
      startLine: sub.startLine,
      endLine: sub.endLine,
      name: name ? `${name} (part ${i + 1})` : null,
      nodeType: node.type,
      // Signature and docstring describe the node as a whole, so they are
      // attached to the first part only.
      signature: i === 0 ? signature : null,
      docstring: i === 0 ? docstring : null,
      language: outputLang,
    });
  });
  return parts;
}
382307
/**
 * Core chunking implementation shared by chunkCode and chunkCodeWithEdges.
 *
 * The source is passed through `stripBOM`, the language is looked up from
 * the file extension, and the text is parsed with the parser created from
 * the language's `wasmPath`. Every node gathered by `collectSemanticNodes`
 * is converted via `buildChunksFromNode`. The line-based
 * `fallbackChunking` result (with no edges) is returned instead when:
 * - the extension maps to no language, or the language has no config;
 * - no chunks were produced from the semantic nodes;
 * - anything inside the parse/collect/build sequence throws.
 *
 * @param sourceCode - Raw source code
 * @param filePath - File path for language detection and IDs
 * @param extractEdges - Whether to extract raw edges for the context graph
 * @returns ChunkResult with chunks and, when `extractEdges` is true, the
 *   raw edges; `rawEdges` is an empty array otherwise
 */
async function chunkCodeCore(
  sourceCode: string,
  filePath: string,
  extractEdges: boolean,
): Promise<ChunkResult> {
  const cleanedSource = stripBOM(sourceCode);

  // Single place that builds the fallback result, so every bail-out path
  // returns the same shape.
  const fallbackResult = (): ChunkResult => ({
    chunks: fallbackChunking(cleanedSource, filePath),
    rawEdges: [],
  });

  const ext = path.extname(filePath);
  const lang = getLanguageByExtension(ext);

  if (!lang) {
    log.debug('Unsupported language, using fallback chunking', { filePath, ext });
    return fallbackResult();
  }

  const config = LANGUAGE_CONFIGS[lang];
  if (!config) {
    return fallbackResult();
  }

  // Declared outside the try so the finally block can release it.
  let tree: Parser.Tree | null = null;

  try {
    const parser = await createParser(config.wasmPath);
    tree = parser.parse(cleanedSource);
    const chunks: CodeChunk[] = [];
    const rawEdges: RawEdge[] = [];

    const semanticNodes: Parser.SyntaxNode[] = [];
    collectSemanticNodes(tree.rootNode, config.chunkNodeTypes, semanticNodes, 0);

    for (const node of semanticNodes) {
      const nodeChunks = buildChunksFromNode(node, filePath, cleanedSource, config, lang);

      // Narrow the first chunk with a real check instead of a non-null
      // assertion; an empty result means the node was skipped.
      const [firstChunk] = nodeChunks;
      if (!firstChunk) continue;

      chunks.push(...nodeChunks);

      if (extractEdges) {
        // Edges from this node are attributed to its first chunk's ID.
        rawEdges.push(...extractEdgesFromNode(node, firstChunk.id, filePath, config));
      }
    }

    if (chunks.length === 0) {
      log.debug('No semantic nodes found, using fallback chunking', { filePath });
      return fallbackResult();
    }

    log.debug('Chunking complete', {
      filePath,
      chunkCount: chunks.length,
      // Only report an edge count when edges were actually requested.
      ...(extractEdges ? { edgeCount: rawEdges.length } : {}),
    });
    return { chunks, rawEdges };
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    log.warn('Tree-sitter parsing failed, using fallback', {
      filePath,
      error: errorMessage,
      errorType: error instanceof Error ? error.constructor.name : 'Unknown',
    });
    return fallbackResult();
  } finally {
    // Runs on every exit path from the try/catch, so a created tree is
    // always deleted.
    if (tree) tree.delete();
  }
}
505382
/**
 * Chunk a source file into semantic code chunks.
 *
 * Delegates to `chunkCodeCore` with edge extraction disabled and returns
 * only the `chunks` part of its result.
 *
 * @param sourceCode - The source code content to chunk
 * @param filePath - The file path (used for language detection and IDs)
 * @returns Array of code chunks with metadata
 *
 * @example
 * ```typescript
 * const chunks = await chunkCode(fileContent, '/project/src/auth.ts');
 * for (const chunk of chunks) {
 *   console.log(`${chunk.name}: ${chunk.startLine}-${chunk.endLine}`);
 * }
 * ```
 */
export async function chunkCode(
  sourceCode: string,
  filePath: string
): Promise<CodeChunk[]> {
  const { chunks } = await chunkCodeCore(sourceCode, filePath, false);
  return chunks;
}
409+
/**
 * Chunk a source file and also collect raw edges for the context graph.
 *
 * Same chunking as `chunkCode`, but delegates to `chunkCodeCore` with edge
 * extraction enabled, so the result additionally carries the structural
 * edges (calls, imports, extends/implements) taken from the AST.
 *
 * @param sourceCode - The source code content
 * @param filePath - The file path (for language detection and IDs)
 * @returns ChunkResult with chunks and raw edges
 */
export async function chunkCodeWithEdges(
  sourceCode: string,
  filePath: string
): Promise<ChunkResult> {
  const withEdges = true;
  return chunkCodeCore(sourceCode, filePath, withEdges);
}
426+
506427/**
507428 * Extract raw edges from a semantic AST node by traversing its children.
508429 *
0 commit comments