Skip to content

Commit 62726ee

Browse files
committed
fix: handle multi-byte characters
Ticket: DX-2800 This commit handles characters of arbitrary byte size. SWC uses byte offsets, while JS uses character offsets, which results in an index drift. #1109 had an initial fix, but after more tests were added, that fix turned out to be incomplete. This commit should pass the additional tests.
1 parent 3cb64a2 commit 62726ee

2 files changed

Lines changed: 430 additions & 70 deletions

File tree

packages/openapi-generator/src/comments.ts

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,27 @@ import { parse as parseComment, Block } from 'comment-parser';
22
import { Schema } from './ir';
33

44
/**
5-
* Compute the difference between byte length and character length for a string.
6-
* This accounts for multibyte UTF-8 characters.
5+
* Convert a UTF-8 byte offset to a JavaScript string character offset.
6+
* SWC (written in Rust) uses byte offsets, but JavaScript strings use
7+
* UTF-16 code unit offsets. This function handles the conversion by
8+
* iterating through the string and accumulating byte lengths.
9+
*
10+
* @param str The source string
11+
* @param byteOffset The byte offset to convert
12+
* @returns The corresponding character offset
713
*/
8-
function computeByteLengthDiff(str: string): number {
9-
return Buffer.byteLength(str, 'utf8') - str.length;
14+
function byteOffsetToCharOffset(str: string, byteOffset: number): number {
  // Index into `str` measured in UTF-16 code units, i.e. the unit that
  // `String.prototype.slice` and `str.length` use.
  let charOffset = 0;
  // Number of UTF-8 bytes consumed so far.
  let byteCount = 0;

  // `for...of` walks the string one code point at a time, so a surrogate
  // pair (e.g. an emoji) arrives as a single two-unit `char`.
  for (const char of str) {
    const charBytes = Buffer.byteLength(char, 'utf8');
    // Stop before any character that would cross the requested byte offset;
    // an offset that lands inside a character rounds down to its start.
    if (byteCount + charBytes > byteOffset) break;
    byteCount += charBytes;
    // Advance by the UTF-16 length of the code point (1 for BMP characters,
    // 2 for astral ones). Counting code points here would leave the result
    // one short for every astral character that precedes the offset.
    charOffset += char.length;
  }

  return charOffset;
}
1127

1228
export function leadingComment(
@@ -18,20 +34,13 @@ export function leadingComment(
1834
// SWC uses byte offsets, but JavaScript strings use character offsets.
1935
// When there are multibyte UTF-8 characters, we need to adjust.
2036
// Convert the start and end byte offsets of our slice into character offsets.
21-
const prefixLength = Math.min(start - srcSpanStart, src.length);
22-
const prefix = src.slice(0, prefixLength);
23-
const byteDiff = computeByteLengthDiff(prefix);
24-
25-
// Adjust the slice offsets by the byte difference
26-
const adjustedStart = start - srcSpanStart - byteDiff;
27-
const adjustedEnd =
28-
end -
29-
srcSpanStart -
30-
computeByteLengthDiff(src.slice(0, Math.min(end - srcSpanStart, src.length)));
31-
32-
let commentString = src
33-
.slice(Math.max(0, adjustedStart), Math.max(0, adjustedEnd))
34-
.trim();
37+
const startByteOffset = start - srcSpanStart;
38+
const endByteOffset = end - srcSpanStart;
39+
40+
const startCharOffset = byteOffsetToCharOffset(src, startByteOffset);
41+
const endCharOffset = byteOffsetToCharOffset(src, endByteOffset);
42+
43+
let commentString = src.slice(startCharOffset, endCharOffset).trim();
3544

3645
if (commentString.includes(' * ') && !/\/\*\*([\s\S]*?)\*\//.test(commentString)) {
3746
// The comment block seems to be JSDoc but was sliced incorrectly

0 commit comments

Comments
 (0)