Skip to content

Commit 69d4c91

Browse files
committed
Code review fixes: ID validation, frontier bug, chunker dedup, test hardening
- Extract shared ID validation into src/utils/validation.ts - Add validateId/validateIds to GraphStore operations (upsertNodes, upsertEdges, getNode, getNeighbors) for defense-in-depth - Add ID format regex to Zod schemas in all graph tools - Fix visitNode frontier bug: remove node from frontier even when visited-nodes cap is reached, preventing infinite revisit loops - Refactor chunkCode/chunkCodeWithEdges to share core logic via chunkCodeCore(), eliminating ~150 lines of duplication - Export generateChunkId and use it in context-query.ts instead of duplicated filePathToChunkId() - Strengthen always-passing test assertions (toBeGreaterThanOrEqual(0)) with meaningful checks; fix test data to produce valid chunks - Rename mislabeled integration test to reflect actual scope - Remove unused getAllNodes prepared statement
1 parent 40f861f commit 69d4c91

10 files changed

Lines changed: 199 additions & 346 deletions

File tree

src/chunker/index.ts

Lines changed: 112 additions & 191 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ function extractDocstring(
190190
* - `file (copy).ts` → `file__copy__ts`
191191
* - `c++/main.cpp` → `c___main_cpp`
192192
*/
193-
function generateChunkId(filePath: string, startLine: number): string {
193+
export function generateChunkId(filePath: string, startLine: number): string {
194194
const normalized = filePath
195195
.replace(/[\\/]/g, '_') // path separators
196196
.replace(/\./g, '_') // dots
@@ -253,256 +253,177 @@ function splitLargeContent(
253253
}
254254

255255
/**
256-
* Main chunking function - processes source code into semantic chunks.
257-
*
258-
* Parses source code using tree-sitter and extracts semantic units
259-
* (functions, classes, methods) as individual chunks. Large chunks
260-
* are automatically split with overlap for better embedding quality.
261-
*
262-
* @param sourceCode - The source code content to chunk
263-
* @param filePath - The file path (used for language detection and IDs)
264-
* @returns Array of code chunks with metadata
265-
*
266-
* @example
267-
* ```typescript
268-
* const chunks = await chunkCode(fileContent, '/project/src/auth.ts');
269-
* for (const chunk of chunks) {
270-
* console.log(`${chunk.name}: ${chunk.startLine}-${chunk.endLine}`);
271-
* }
272-
* ```
256+
* Build CodeChunk objects from a semantic AST node, splitting if too large.
273257
*/
274-
export async function chunkCode(
258+
function buildChunksFromNode(
259+
node: Parser.SyntaxNode,
260+
filePath: string,
275261
sourceCode: string,
276-
filePath: string
277-
): Promise<CodeChunk[]> {
278-
// Strip BOM if present (common in files from Windows editors)
279-
const cleanedSource = stripBOM(sourceCode);
280-
281-
const ext = path.extname(filePath);
282-
const lang = getLanguageByExtension(ext);
283-
284-
if (!lang) {
285-
// Fall back to simple line-based chunking for unsupported languages
286-
log.debug('Unsupported language, using fallback chunking', { filePath, ext });
287-
return fallbackChunking(cleanedSource, filePath);
288-
}
289-
290-
const config = LANGUAGE_CONFIGS[lang];
291-
if (!config) {
292-
return fallbackChunking(cleanedSource, filePath);
262+
config: LanguageConfig,
263+
lang: string,
264+
): CodeChunk[] {
265+
const content = node.text;
266+
const startLine = node.startPosition.row + 1; // 1-indexed
267+
const endLine = node.endPosition.row + 1;
268+
const name = extractName(node, config);
269+
const signature = extractSignature(node, sourceCode);
270+
const docstring = extractDocstring(node, sourceCode, config);
271+
272+
if (isTooSmall(content)) return [];
273+
274+
const outputLang = lang === 'tsx' ? 'typescript' : lang === 'jsx' ? 'javascript' : lang;
275+
276+
if (isTooLarge(content)) {
277+
const subChunks = splitLargeContent(content, startLine);
278+
return subChunks
279+
.map((sub, i) => sub ? ({
280+
id: generateChunkId(filePath, sub.startLine) + `_p${i}`,
281+
filePath,
282+
content: sub.content,
283+
startLine: sub.startLine,
284+
endLine: sub.endLine,
285+
name: name ? `${name} (part ${i + 1})` : null,
286+
nodeType: node.type,
287+
signature: i === 0 ? signature : null,
288+
docstring: i === 0 ? docstring : null,
289+
language: outputLang,
290+
}) : null)
291+
.filter((c): c is CodeChunk => c !== null);
293292
}
294293

295-
let parser: Parser | null = null;
296-
let tree: Parser.Tree | null = null;
297-
298-
try {
299-
// Create parser with the language's WASM grammar
300-
parser = await createParser(config.wasmPath);
301-
tree = parser.parse(cleanedSource);
302-
const chunks: CodeChunk[] = [];
303-
304-
// Collect all semantic nodes with depth limiting
305-
const semanticNodes: Parser.SyntaxNode[] = [];
306-
collectSemanticNodes(tree.rootNode, config.chunkNodeTypes, semanticNodes, 0);
307-
308-
for (const node of semanticNodes) {
309-
const content = node.text;
310-
const startLine = node.startPosition.row + 1; // 1-indexed
311-
const endLine = node.endPosition.row + 1;
312-
const name = extractName(node, config);
313-
const signature = extractSignature(node, cleanedSource);
314-
const docstring = extractDocstring(node, cleanedSource, config);
315-
316-
// Skip if too small
317-
if (isTooSmall(content)) continue;
318-
319-
// Normalize language name for output (tsx -> typescript)
320-
const outputLang = lang === 'tsx' ? 'typescript' : lang === 'jsx' ? 'javascript' : lang;
321-
322-
// Split if too large
323-
if (isTooLarge(content)) {
324-
const subChunks = splitLargeContent(content, startLine);
325-
for (let i = 0; i < subChunks.length; i++) {
326-
const sub = subChunks[i];
327-
if (!sub) continue;
328-
chunks.push({
329-
id: generateChunkId(filePath, sub.startLine) + `_p${i}`,
330-
filePath,
331-
content: sub.content,
332-
startLine: sub.startLine,
333-
endLine: sub.endLine,
334-
name: name ? `${name} (part ${i + 1})` : null,
335-
nodeType: node.type,
336-
signature: i === 0 ? signature : null,
337-
docstring: i === 0 ? docstring : null,
338-
language: outputLang,
339-
});
340-
}
341-
} else {
342-
chunks.push({
343-
id: generateChunkId(filePath, startLine),
344-
filePath,
345-
content,
346-
startLine,
347-
endLine,
348-
name,
349-
nodeType: node.type,
350-
signature,
351-
docstring,
352-
language: outputLang,
353-
});
354-
}
355-
}
356-
357-
// If we didn't find any semantic nodes, fall back to simple chunking
358-
if (chunks.length === 0) {
359-
log.debug('No semantic nodes found, using fallback chunking', { filePath });
360-
return fallbackChunking(cleanedSource, filePath);
361-
}
362-
363-
log.debug('Chunking complete', { filePath, chunkCount: chunks.length });
364-
return chunks;
365-
} catch (error) {
366-
const errorMessage = error instanceof Error ? error.message : String(error);
367-
log.warn('Tree-sitter parsing failed, using fallback', {
368-
filePath,
369-
error: errorMessage,
370-
errorType: error instanceof Error ? error.constructor.name : 'Unknown',
371-
});
372-
return fallbackChunking(cleanedSource, filePath);
373-
} finally {
374-
// IMPORTANT: Free WASM memory by deleting the tree
375-
if (tree) {
376-
tree.delete();
377-
}
378-
// Note: Parser instances are lightweight and don't need explicit cleanup
379-
// as long as the tree is deleted
380-
}
294+
return [{
295+
id: generateChunkId(filePath, startLine),
296+
filePath,
297+
content,
298+
startLine,
299+
endLine,
300+
name,
301+
nodeType: node.type,
302+
signature,
303+
docstring,
304+
language: outputLang,
305+
}];
381306
}
382307

383308
/**
384-
* Chunk code and extract raw edges for the context graph.
385-
*
386-
* This extends `chunkCode` by additionally extracting structural edges
387-
* (calls, imports, extends/implements) from the AST. The existing
388-
* `chunkCode()` is unchanged for backward compatibility.
309+
* Core chunking implementation shared by chunkCode and chunkCodeWithEdges.
389310
*
390-
* @param sourceCode - The source code content
391-
* @param filePath - The file path (for language detection and IDs)
392-
* @returns ChunkResult with chunks and raw edges
311+
* @param sourceCode - Raw source code
312+
* @param filePath - File path for language detection and IDs
313+
* @param extractEdges - Whether to extract raw edges for the context graph
314+
* @returns ChunkResult with chunks and optionally raw edges
393315
*/
394-
export async function chunkCodeWithEdges(
316+
async function chunkCodeCore(
395317
sourceCode: string,
396-
filePath: string
318+
filePath: string,
319+
extractEdges: boolean,
397320
): Promise<ChunkResult> {
398321
const cleanedSource = stripBOM(sourceCode);
399322
const ext = path.extname(filePath);
400323
const lang = getLanguageByExtension(ext);
401324

402325
if (!lang) {
403-
const chunks = await fallbackChunking(cleanedSource, filePath);
404-
return { chunks, rawEdges: [] };
326+
log.debug('Unsupported language, using fallback chunking', { filePath, ext });
327+
return { chunks: fallbackChunking(cleanedSource, filePath), rawEdges: [] };
405328
}
406329

407330
const config = LANGUAGE_CONFIGS[lang];
408331
if (!config) {
409-
const chunks = await fallbackChunking(cleanedSource, filePath);
410-
return { chunks, rawEdges: [] };
332+
return { chunks: fallbackChunking(cleanedSource, filePath), rawEdges: [] };
411333
}
412334

413-
let parser: Parser | null = null;
414335
let tree: Parser.Tree | null = null;
415336

416337
try {
417-
parser = await createParser(config.wasmPath);
338+
const parser = await createParser(config.wasmPath);
418339
tree = parser.parse(cleanedSource);
419340
const chunks: CodeChunk[] = [];
420341
const rawEdges: RawEdge[] = [];
421342

422-
// Collect all semantic nodes
423343
const semanticNodes: Parser.SyntaxNode[] = [];
424344
collectSemanticNodes(tree.rootNode, config.chunkNodeTypes, semanticNodes, 0);
425345

426346
for (const node of semanticNodes) {
427-
const content = node.text;
428-
const startLine = node.startPosition.row + 1;
429-
const endLine = node.endPosition.row + 1;
430-
const name = extractName(node, config);
431-
const signature = extractSignature(node, cleanedSource);
432-
const docstring = extractDocstring(node, cleanedSource, config);
433-
434-
if (isTooSmall(content)) continue;
435-
436-
const outputLang = lang === 'tsx' ? 'typescript' : lang === 'jsx' ? 'javascript' : lang;
437-
438-
// Build chunk(s) for this node
439-
const nodeChunks: CodeChunk[] = [];
440-
if (isTooLarge(content)) {
441-
const subChunks = splitLargeContent(content, startLine);
442-
for (let i = 0; i < subChunks.length; i++) {
443-
const sub = subChunks[i];
444-
if (!sub) continue;
445-
nodeChunks.push({
446-
id: generateChunkId(filePath, sub.startLine) + `_p${i}`,
447-
filePath,
448-
content: sub.content,
449-
startLine: sub.startLine,
450-
endLine: sub.endLine,
451-
name: name ? `${name} (part ${i + 1})` : null,
452-
nodeType: node.type,
453-
signature: i === 0 ? signature : null,
454-
docstring: i === 0 ? docstring : null,
455-
language: outputLang,
456-
});
457-
}
458-
} else {
459-
nodeChunks.push({
460-
id: generateChunkId(filePath, startLine),
461-
filePath,
462-
content,
463-
startLine,
464-
endLine,
465-
name,
466-
nodeType: node.type,
467-
signature,
468-
docstring,
469-
language: outputLang,
470-
});
471-
}
347+
const nodeChunks = buildChunksFromNode(node, filePath, cleanedSource, config, lang);
348+
if (nodeChunks.length === 0) continue;
472349

473350
chunks.push(...nodeChunks);
474351

475-
// Extract edges from this semantic node's children
476-
// Use the first chunk's ID as the source
477-
const sourceChunkId = nodeChunks[0]?.id;
478-
if (sourceChunkId) {
352+
if (extractEdges) {
353+
const sourceChunkId = nodeChunks[0]!.id;
479354
const edges = extractEdgesFromNode(node, sourceChunkId, filePath, config);
480355
rawEdges.push(...edges);
481356
}
482357
}
483358

484359
if (chunks.length === 0) {
360+
log.debug('No semantic nodes found, using fallback chunking', { filePath });
485361
return { chunks: fallbackChunking(cleanedSource, filePath), rawEdges: [] };
486362
}
487363

488-
log.debug('Chunking with edges complete', {
364+
log.debug('Chunking complete', {
489365
filePath,
490366
chunkCount: chunks.length,
491-
edgeCount: rawEdges.length,
367+
...(extractEdges ? { edgeCount: rawEdges.length } : {}),
492368
});
493369
return { chunks, rawEdges };
494370
} catch (error) {
495371
const errorMessage = error instanceof Error ? error.message : String(error);
496-
log.warn('Tree-sitter parsing failed for edge extraction, using fallback', {
372+
log.warn('Tree-sitter parsing failed, using fallback', {
497373
filePath,
498374
error: errorMessage,
375+
errorType: error instanceof Error ? error.constructor.name : 'Unknown',
499376
});
500377
return { chunks: fallbackChunking(cleanedSource, filePath), rawEdges: [] };
501378
} finally {
502379
if (tree) tree.delete();
503380
}
504381
}
505382

383+
/**
384+
* Main chunking function - processes source code into semantic chunks.
385+
*
386+
* Parses source code using tree-sitter and extracts semantic units
387+
* (functions, classes, methods) as individual chunks. Large chunks
388+
* are automatically split with overlap for better embedding quality.
389+
*
390+
* @param sourceCode - The source code content to chunk
391+
* @param filePath - The file path (used for language detection and IDs)
392+
* @returns Array of code chunks with metadata
393+
*
394+
* @example
395+
* ```typescript
396+
* const chunks = await chunkCode(fileContent, '/project/src/auth.ts');
397+
* for (const chunk of chunks) {
398+
* console.log(`${chunk.name}: ${chunk.startLine}-${chunk.endLine}`);
399+
* }
400+
* ```
401+
*/
402+
export async function chunkCode(
403+
sourceCode: string,
404+
filePath: string
405+
): Promise<CodeChunk[]> {
406+
const result = await chunkCodeCore(sourceCode, filePath, false);
407+
return result.chunks;
408+
}
409+
410+
/**
411+
* Chunk code and extract raw edges for the context graph.
412+
*
413+
* This extends `chunkCode` by additionally extracting structural edges
414+
* (calls, imports, extends/implements) from the AST.
415+
*
416+
* @param sourceCode - The source code content
417+
* @param filePath - The file path (for language detection and IDs)
418+
* @returns ChunkResult with chunks and raw edges
419+
*/
420+
export async function chunkCodeWithEdges(
421+
sourceCode: string,
422+
filePath: string
423+
): Promise<ChunkResult> {
424+
return chunkCodeCore(sourceCode, filePath, true);
425+
}
426+
506427
/**
507428
* Extract raw edges from a semantic AST node by traversing its children.
508429
*

0 commit comments

Comments (0)