Skip to content

Commit 0fdae38

Browse files
committed
refactor(strings): enhance string parsing and escaping logic
- Updated the parsing logic to utilize a new `parse_quoted_utf8` function for improved handling of quoted strings, preserving backslashes and non-ASCII characters accurately.
- Simplified the `escape_strings_token` function to better manage escape sequences and backslashes, ensuring correct preservation of string literals.
- Added a new test case to verify the parsing of strings with trailing spaces and non-ASCII characters, enhancing overall test coverage.
1 parent bf4b41e commit 0fdae38

1 file changed

Lines changed: 69 additions & 46 deletions

File tree

langcodec/src/formats/strings.rs

Lines changed: 69 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ impl Parser for Format {
3535
// Read entire input into a string (UTF-8 expected here; UTF-16 handled in read_from)
3636
let mut reader = reader;
3737
let mut bytes = Vec::new();
38-
reader.read_to_end(&mut bytes).map_err(Error::Io)?;
38+
std::io::Read::read_to_end(&mut reader, &mut bytes).map_err(Error::Io)?;
3939
let content = String::from_utf8(bytes)
4040
.map_err(|_| Error::InvalidResource("Invalid UTF-8 in .strings file".to_string()))?;
4141

@@ -198,15 +198,15 @@ fn parse_strings_content(content: &str) -> (Vec<Pair>, Vec<String>) {
198198
}
199199

200200
// Key-Value pair: "key" = "value";
201-
if let Some((j, key)) = parse_quoted(bytes, i) {
201+
if let Some((j, key)) = parse_quoted_utf8(content, bytes, i) {
202202
i = j;
203203
let (ni2, _) = skip_inline_ws(bytes, i);
204204
i = ni2;
205205
if i < len && bytes[i] == b'=' {
206206
i += 1; // consume '='
207207
let (ni3, _) = skip_inline_ws(bytes, i);
208208
i = ni3;
209-
if let Some((jv, value_raw)) = parse_quoted(bytes, i) {
209+
if let Some((jv, value_raw)) = parse_quoted_utf8(content, bytes, i) {
210210
i = jv;
211211
// seek semicolon, ignoring spaces and tabs only
212212
let (ni4, _) = skip_inline_ws(bytes, i);
@@ -302,37 +302,35 @@ fn parse_block_comment(bytes: &[u8], i: usize) -> (usize, String) {
302302
(j, comment)
303303
}
304304

305-
// Parses a quoted string beginning at position i (which must point to '"').
306-
// Returns (next_index_after_closing_quote, raw_string_content)
307-
fn parse_quoted(bytes: &[u8], mut i: usize) -> Option<(usize, String)> {
305+
// Parses a quoted string starting at byte index i (which must point to '"').
306+
// Returns (byte_index_after_closing_quote, substring content as UTF-8) without the surrounding quotes,
307+
// preserving backslashes and non-ASCII characters exactly as in the source.
308+
fn parse_quoted_utf8(source: &str, bytes: &[u8], i: usize) -> Option<(usize, String)> {
308309
if i >= bytes.len() || bytes[i] != b'"' {
309310
return None;
310311
}
311-
i += 1; // skip opening quote
312-
let mut out = String::new();
313-
while i < bytes.len() {
314-
match bytes[i] {
315-
b'\\' => {
316-
// keep escapes verbatim so we don't change meaning
317-
if i + 1 < bytes.len() {
318-
out.push('\u{005C}'); // backslash
319-
out.push(bytes[i + 1] as char);
320-
i += 2;
321-
} else {
322-
// trailing backslash, keep it
323-
out.push('\u{005C}');
324-
i += 1;
325-
}
326-
}
327-
b'"' => {
328-
i += 1; // consume closing quote
329-
return Some((i, out));
330-
}
331-
ch => {
332-
out.push(ch as char);
333-
i += 1;
312+
let start = i + 1; // start of content inside quotes
313+
let mut j = start;
314+
let mut consecutive_backslashes = 0usize;
315+
while j < bytes.len() {
316+
let b = bytes[j];
317+
if b == b'\\' {
318+
consecutive_backslashes += 1;
319+
j += 1;
320+
continue;
321+
}
322+
if b == b'"' {
323+
// If number of preceding backslashes is even, the quote terminates the string
324+
if consecutive_backslashes % 2 == 0 {
325+
let end = j;
326+
let s = &source[start..end];
327+
return Some((j + 1, s.to_string()));
334328
}
329+
// else, it's an escaped quote, continue scanning
335330
}
331+
// reset backslash count on any non-backslash byte
332+
consecutive_backslashes = 0;
333+
j += 1;
336334
}
337335
None
338336
}
@@ -357,7 +355,7 @@ fn normalize_value_newlines(raw: &str) -> String {
357355
}
358356

359357
fn escape_strings_token(s: &str) -> String {
360-
// Escape quotes and newlines. Backslashes are escaped unless immediately followed by an apostrophe.
358+
// Escape quotes and literal newlines. Preserve recognized escape sequences (\n, \t, \r, \" , \' , \\) as-is.
361359
let mut out = String::new();
362360
let chars: Vec<char> = s.chars().collect();
363361
let mut i = 0usize;
@@ -375,29 +373,52 @@ fn escape_strings_token(s: &str) -> String {
375373
i += 1;
376374
}
377375
'\\' => {
378-
// Count run of backslashes
376+
// Handle runs of backslashes with lookahead
379377
let mut j = i;
380378
while j < chars.len() && chars[j] == '\\' {
381379
j += 1;
382380
}
383-
let next = if j < chars.len() {
381+
let next_char = if j < chars.len() {
384382
Some(chars[j])
385383
} else {
386384
None
387385
};
388-
if next == Some('\'') {
389-
// Preserve run as-is when followed by apostrophe
390-
for _ in i..j {
391-
out.push('\\');
386+
387+
match next_char {
388+
Some('\'') => {
389+
// Preserve run when followed by apostrophe
390+
for _ in i..j {
391+
out.push('\\');
392+
}
393+
out.push('\'');
394+
i = j + 1;
392395
}
393-
} else {
394-
// Double each backslash in the run
395-
for _ in i..j {
396-
out.push('\\');
397-
out.push('\\');
396+
Some('n') | Some('t') | Some('r') | Some('"') | Some('\\') => {
397+
// Recognized escape sequence: preserve exactly one run of backslashes and the escape char as-is
398+
for _ in i..j {
399+
out.push('\\');
400+
}
401+
out.push(next_char.unwrap());
402+
i = j + 1;
403+
}
404+
Some(other) => {
405+
// Unrecognized escape: double each backslash to preserve literal backslashes, then the next char
406+
for _ in i..j {
407+
out.push('\\');
408+
out.push('\\');
409+
}
410+
out.push(other);
411+
i = j + 1;
412+
}
413+
None => {
414+
// Trailing backslashes at end of string: double them
415+
for _ in i..j {
416+
out.push('\\');
417+
out.push('\\');
418+
}
419+
i = j;
398420
}
399421
}
400-
i = j;
401422
}
402423
_ => {
403424
out.push(ch);
@@ -412,9 +433,8 @@ impl TryFrom<Entry> for Pair {
412433
type Error = Error;
413434

414435
fn try_from(entry: Entry) -> Result<Self, Self::Error> {
415-
// Strings format only supports singular translations
416-
// with plain text values.
417-
match Translation::plain_translation(entry.value) {
436+
// Strings format only supports singular translations. Preserve the value verbatim.
437+
match entry.value {
418438
Translation::Singular(value) => Ok(Pair {
419439
key: entry.id,
420440
value,
@@ -601,17 +621,20 @@ mod tests {
601621
"key1" = "Value with trailing space ";
602622
"key2" = "Another value with trailing spaces ";
603623
"key3" = "No trailing spaces";
624+
"key4" = "过去一天 ";
604625
"#;
605626
let parsed = Format::from_str(content).unwrap();
606-
assert_eq!(parsed.pairs.len(), 3);
627+
assert_eq!(parsed.pairs.len(), 4);
607628

608629
let pair1 = &parsed.pairs[0];
609630
let pair2 = &parsed.pairs[1];
610631
let pair3 = &parsed.pairs[2];
632+
let pair4 = &parsed.pairs[3];
611633

612634
assert_eq!(pair1.value, "Value with trailing space ");
613635
assert_eq!(pair2.value, "Another value with trailing spaces ");
614636
assert_eq!(pair3.value, "No trailing spaces");
637+
assert_eq!(pair4.value, "过去一天 ");
615638
}
616639

617640
#[test]

0 commit comments

Comments
 (0)