Skip to content

Commit 92f16f8

Browse files
committed
Fix invalid ID format error for paths with special characters
Chunk ID generation now sanitizes all non-alphanumeric characters (except `-` and `_`) to prevent InvalidIdError when indexing files with: - @ symbols (scoped packages like @scope/package) - Spaces in paths - Parentheses (e.g., file (copy).ts) - Plus signs (e.g., c++/main.cpp) - Colons (Windows paths like C:\Users\...) The fix adds a catch-all regex `.replace(/[^a-zA-Z0-9_-]/g, '_')` to generateChunkId(), sanitizePathPattern(), and sanitizeGlobPattern().
1 parent 905b736 commit 92f16f8

5 files changed

Lines changed: 163 additions & 44 deletions

File tree

src/chunker/index.ts

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -172,10 +172,28 @@ function extractDocstring(
172172
}
173173

174174
/**
175-
* Generate chunk ID from file path and position
175+
* Generate chunk ID from file path and position.
176+
*
177+
* Normalizes file paths to create IDs that match the allowed pattern
178+
* `/^[a-zA-Z0-9_-]+$/` used by the store's ID validation.
179+
*
180+
* Characters replaced:
181+
* - `/` and `\` (path separators) → `_`
182+
* - `.` (dots) → `_`
183+
* - `@`, spaces, parentheses, `+`, `:`, and any other non-alphanumeric
184+
* characters (except `-` and `_`) → `_`
185+
*
186+
* This handles paths like:
187+
* - `@scope/package/file.ts` → `_scope_package_file_ts`
188+
* - `path with spaces/file.ts` → `path_with_spaces_file_ts`
189+
* - `file (copy).ts` → `file__copy__ts`
190+
* - `c++/main.cpp` → `c___main_cpp`
176191
*/
177192
function generateChunkId(filePath: string, startLine: number): string {
178-
const normalized = filePath.replace(/[\\/]/g, '_').replace(/\./g, '_');
193+
const normalized = filePath
194+
.replace(/[\\/]/g, '_') // path separators
195+
.replace(/\./g, '_') // dots
196+
.replace(/[^a-zA-Z0-9_-]/g, '_'); // any remaining unsafe chars
179197
return `${normalized}_L${startLine}`;
180198
}
181199

src/search/filter-builder.ts

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -108,12 +108,14 @@ export function validateFilterPattern(pattern: string): boolean {
108108
* ```
109109
*/
110110
export function sanitizePathPattern(path: string): string {
111-
// Replace path separators and dots with underscores
111+
// Replace path separators, dots, and any other non-safe characters
112+
// This must match the normalization in generateChunkId (chunker/index.ts)
112113
const sanitized = path
113-
.replace(/[\\/]/g, '_')
114-
.replace(/\./g, '_');
114+
.replace(/[\\/]/g, '_') // path separators
115+
.replace(/\./g, '_') // dots
116+
.replace(/[^a-zA-Z0-9_-]/g, '_'); // any remaining unsafe chars
115117

116-
// Validate the result
118+
// Validate the result (should always pass now, but kept for defense-in-depth)
117119
if (!validateFilterPattern(sanitized)) {
118120
throw new InvalidFilterError(
119121
`Invalid path pattern: contains disallowed characters`
@@ -147,14 +149,16 @@ export function sanitizePathPattern(path: string): string {
147149
*/
148150
export function sanitizeGlobPattern(pattern: string): string {
149151
// Convert glob patterns to SQL LIKE patterns
152+
// This must match the normalization in generateChunkId (chunker/index.ts)
150153
const sanitized = pattern
151-
.replace(/\*\*/g, '%') // ** -> %
152-
.replace(/\*/g, '%') // * -> %
153-
.replace(/\?/g, '_') // ? -> _
154-
.replace(/[\\/]/g, '_') // path separators -> _
155-
.replace(/\./g, '_'); // . -> _
154+
.replace(/\*\*/g, '%') // ** -> %
155+
.replace(/\*/g, '%') // * -> %
156+
.replace(/\?/g, '_') // ? -> _
157+
.replace(/[\\/]/g, '_') // path separators -> _
158+
.replace(/\./g, '_') // . -> _
159+
.replace(/[^a-zA-Z0-9_%-]/g, '_'); // any remaining unsafe chars (keep % for LIKE)
156160

157-
// Validate the result
161+
// Validate the result (should always pass now, but kept for defense-in-depth)
158162
if (!validateFilterPattern(sanitized)) {
159163
throw new InvalidFilterError(
160164
`Invalid file pattern: contains disallowed characters`

tests/chunker/chunker.test.ts

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,58 @@ function c() { return 3; }
394394

395395
expect(chunks[0]?.id).toContain('path_to_file_ts');
396396
});
397+
398+
it('should handle @ symbols in paths (scoped packages)', async () => {
399+
const code = `function test() { return true; }`;
400+
const chunks = await chunkCode(code, '/node_modules/@scope/package/index.ts');
401+
402+
// @ should be replaced with underscore
403+
expect(chunks[0]?.id).toContain('_scope_package_index_ts');
404+
// ID should only contain safe characters
405+
expect(chunks[0]?.id).toMatch(/^[a-zA-Z0-9_-]+$/);
406+
});
407+
408+
it('should handle spaces in paths', async () => {
409+
const code = `function test() { return true; }`;
410+
const chunks = await chunkCode(code, '/path with spaces/my file.ts');
411+
412+
// Spaces should be replaced with underscores
413+
expect(chunks[0]?.id).toContain('path_with_spaces_my_file_ts');
414+
expect(chunks[0]?.id).toMatch(/^[a-zA-Z0-9_-]+$/);
415+
});
416+
417+
it('should handle parentheses in paths', async () => {
418+
const code = `function test() { return true; }`;
419+
const chunks = await chunkCode(code, '/test/file (copy).ts');
420+
421+
// Parentheses should be replaced with underscores
422+
expect(chunks[0]?.id).toContain('file__copy__ts');
423+
expect(chunks[0]?.id).toMatch(/^[a-zA-Z0-9_-]+$/);
424+
});
425+
426+
it('should handle plus signs in paths (c++ directories)', async () => {
427+
const code = `
428+
#include <iostream>
429+
int main() {
430+
std::cout << "Hello";
431+
return 0;
432+
}
433+
`;
434+
const chunks = await chunkCode(code, '/projects/c++/main.cpp');
435+
436+
// Plus signs should be replaced with underscores
437+
expect(chunks[0]?.id).toContain('c___main_cpp');
438+
expect(chunks[0]?.id).toMatch(/^[a-zA-Z0-9_-]+$/);
439+
});
440+
441+
it('should handle colons in Windows-style paths', async () => {
442+
const code = `function test() { return true; }`;
443+
const chunks = await chunkCode(code, 'C:\\Users\\test\\file.ts');
444+
445+
// Colons and backslashes should be replaced with underscores
446+
expect(chunks[0]?.id).toContain('C__Users_test_file_ts');
447+
expect(chunks[0]?.id).toMatch(/^[a-zA-Z0-9_-]+$/);
448+
});
397449
});
398450

399451
describe('Fallback chunking', () => {

tests/search/hybrid-search.test.ts

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -60,16 +60,21 @@ describe('Hybrid Search', () => {
6060
expect(filter).toContain("language = 'typescript'");
6161
});
6262

63-
it('should throw on SQL injection in path', () => {
64-
expect(() => buildSafeFilter({ path: "'; DROP TABLE--" })).toThrow(
65-
InvalidFilterError
66-
);
63+
it('should sanitize SQL injection in path', () => {
64+
// SQL injection attempts are now sanitized (unsafe chars replaced with _)
65+
const result = buildSafeFilter({ path: "'; DROP TABLE--" });
66+
expect(result).toBe("id LIKE '___DROP_TABLE--%'");
67+
// The inner value should only contain safe characters
68+
const innerValue = result?.match(/id LIKE '([^']+)'/)?.[1];
69+
expect(innerValue).toMatch(/^[a-zA-Z0-9_%-]+$/);
6770
});
6871

69-
it('should throw on SQL injection in file pattern', () => {
70-
expect(() =>
71-
buildSafeFilter({ filePattern: "**/*'; DROP TABLE--" })
72-
).toThrow(InvalidFilterError);
72+
it('should sanitize SQL injection in file pattern', () => {
73+
// SQL injection attempts are now sanitized (unsafe chars replaced with _)
74+
const result = buildSafeFilter({ filePattern: "**/*'; DROP TABLE--" });
75+
// ** -> %, / -> _, * -> %, ' -> _, ; -> _, space -> _
76+
// buildFilePatternCondition adds leading % for suffix matching
77+
expect(result).toBe("id LIKE '%%_%___DROP_TABLE--'");
7378
});
7479
});
7580

tests/security/sql-injection.test.ts

Lines changed: 64 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -75,13 +75,35 @@ describe('SQL Injection Prevention', () => {
7575
expect(sanitizePathPattern('src.test.file')).toBe('src_test_file');
7676
});
7777

78-
it('should throw on SQL injection attempts', () => {
79-
expect(() => sanitizePathPattern("src'; DROP TABLE--/")).toThrow(
80-
InvalidFilterError
81-
);
82-
expect(() => sanitizePathPattern("test' OR '1'='1")).toThrow(
83-
InvalidFilterError
84-
);
78+
it('should handle @ symbols in scoped package paths', () => {
79+
expect(sanitizePathPattern('@scope/package/file')).toBe('_scope_package_file');
80+
expect(sanitizePathPattern('node_modules/@types/node')).toBe('node_modules__types_node');
81+
});
82+
83+
it('should handle spaces in paths', () => {
84+
expect(sanitizePathPattern('path with spaces/file')).toBe('path_with_spaces_file');
85+
expect(sanitizePathPattern('My Documents/project')).toBe('My_Documents_project');
86+
});
87+
88+
it('should handle parentheses in paths', () => {
89+
expect(sanitizePathPattern('file (copy)')).toBe('file__copy_');
90+
expect(sanitizePathPattern('Component(HOC).tsx')).toBe('Component_HOC__tsx');
91+
});
92+
93+
it('should handle plus signs in paths', () => {
94+
expect(sanitizePathPattern('c++/main')).toBe('c___main');
95+
});
96+
97+
it('should handle colons in Windows paths', () => {
98+
expect(sanitizePathPattern('C:\\Users\\test')).toBe('C__Users_test');
99+
});
100+
101+
it('should sanitize SQL injection attempts instead of throwing', () => {
102+
// These now get sanitized rather than throwing, since the catch-all
103+
// replaces all unsafe characters with underscores
104+
const result = sanitizePathPattern("src'; DROP TABLE--/");
105+
expect(result).toBe('src___DROP_TABLE--_');
106+
expect(result).toMatch(/^[a-zA-Z0-9_\-%]+$/);
85107
});
86108
});
87109

@@ -93,10 +115,12 @@ describe('SQL Injection Prevention', () => {
93115
expect(sanitizeGlobPattern('src/??.ts')).toBe('src____ts');
94116
});
95117

96-
it('should throw on SQL injection in glob patterns', () => {
97-
expect(() => sanitizeGlobPattern("*'; DROP TABLE--")).toThrow(
98-
InvalidFilterError
99-
);
118+
it('should sanitize SQL injection in glob patterns', () => {
119+
// SQL injection attempts are now sanitized (unsafe chars replaced with _)
120+
const result = sanitizeGlobPattern("*'; DROP TABLE--");
121+
// * -> %, ' -> _, ; -> _, space -> _
122+
expect(result).toBe('%___DROP_TABLE--');
123+
expect(result).toMatch(/^[a-zA-Z0-9_%-]+$/);
100124
});
101125
});
102126

@@ -164,16 +188,21 @@ describe('SQL Injection Prevention', () => {
164188
expect(result).toBe("id LIKE 'src%' AND language = 'typescript'");
165189
});
166190

167-
it('should throw on SQL injection in path', () => {
168-
expect(() => buildSafeFilter({ path: "'; DROP TABLE--" })).toThrow(
169-
InvalidFilterError
170-
);
191+
it('should sanitize SQL injection in path', () => {
192+
// SQL injection attempts are now sanitized (unsafe chars replaced with _)
193+
const result = buildSafeFilter({ path: "'; DROP TABLE--" });
194+
expect(result).toBe("id LIKE '___DROP_TABLE--%'");
195+
// The sanitized inner value contains only safe characters
196+
// (the outer quotes are SQL string delimiters, not part of the user input)
197+
const innerValue = result?.match(/id LIKE '([^']+)'/)?.[1];
198+
expect(innerValue).toMatch(/^[a-zA-Z0-9_%-]+$/);
171199
});
172200

173-
it('should throw on SQL injection in file pattern', () => {
174-
expect(() => buildSafeFilter({ filePattern: "*.ts'; DROP TABLE--" })).toThrow(
175-
InvalidFilterError
176-
);
201+
it('should sanitize SQL injection in file pattern', () => {
202+
// SQL injection attempts are now sanitized (unsafe chars replaced with _)
203+
const result = buildSafeFilter({ filePattern: "*.ts'; DROP TABLE--" });
204+
// * -> %, . -> _, ' -> _, ; -> _, space -> _, etc.
205+
expect(result).toBe("id LIKE '%%_ts___DROP_TABLE--'");
177206
});
178207
});
179208

@@ -192,23 +221,34 @@ describe('SQL Injection Prevention', () => {
192221
];
193222

194223
it.each(sqlInjectionPayloads)(
195-
'should reject injection payload: %s',
224+
'should reject injection payload via validateFilterPattern: %s',
196225
(payload) => {
197226
expect(validateFilterPattern(payload)).toBe(false);
198227
}
199228
);
200229

201230
it.each(sqlInjectionPayloads)(
202-
'should throw on sanitizePathPattern with: %s',
231+
'should sanitize injection payload and produce safe result: %s',
203232
(payload) => {
204-
expect(() => sanitizePathPattern(payload)).toThrow(InvalidFilterError);
233+
// sanitizePathPattern now sanitizes rather than throws
234+
const result = sanitizePathPattern(payload);
235+
// The result should only contain safe characters
236+
expect(validateFilterPattern(result)).toBe(true);
237+
// Should not contain any SQL-dangerous characters
238+
expect(result).not.toMatch(/['";\(\)=<>]/);
205239
}
206240
);
207241

208242
it.each(sqlInjectionPayloads)(
209-
'should throw on buildSafeFilter path with: %s',
243+
'should produce safe filter for buildSafeFilter path: %s',
210244
(payload) => {
211-
expect(() => buildSafeFilter({ path: payload })).toThrow(InvalidFilterError);
245+
// buildSafeFilter now sanitizes rather than throws
246+
const result = buildSafeFilter({ path: payload });
247+
// Should be a valid LIKE condition
248+
expect(result).toMatch(/^id LIKE '[a-zA-Z0-9_%-]+'$/);
249+
// Should not contain unescaped quotes (the one at start/end are the SQL string delimiters)
250+
const innerContent = result?.slice(9, -1); // Extract content between "id LIKE '" and "'"
251+
expect(innerContent).not.toContain("'");
212252
}
213253
);
214254
});

0 commit comments

Comments
 (0)