Skip to content

Commit 884cbbc

Browse files
committed
refactor(strings): improve header language extraction and parsing logic
- Enhanced the `Parser` implementation to extract the language from the header of .strings files, ensuring accurate language representation. - Introduced a new function `try_skip_langcodec_header` to detect and skip the auto-generated header at the start of the file, improving parsing efficiency. - Updated the `parse_strings_content` function to handle the detection of the header and manage the state of seen pairs during parsing.
1 parent 0fdae38 commit 884cbbc

1 file changed

Lines changed: 65 additions & 1 deletion

File tree

langcodec/src/formats/strings.rs

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,10 @@ impl Parser for Format {
4040
.map_err(|_| Error::InvalidResource("Invalid UTF-8 in .strings file".to_string()))?;
4141

4242
// Parse content
43+
let header_language = extract_header_language(&content).unwrap_or_default();
4344
let (pairs, _warnings) = parse_strings_content(&content);
4445
Ok(Format {
45-
language: String::new(),
46+
language: header_language,
4647
pairs,
4748
})
4849
}
@@ -175,6 +176,7 @@ fn parse_strings_content(content: &str) -> (Vec<Pair>, Vec<String>) {
175176
let mut pairs: Vec<Pair> = Vec::new();
176177
let warnings: Vec<String> = Vec::new();
177178
let mut pending_comment: Option<String> = None;
179+
let mut have_seen_pair = false;
178180

179181
while i < len {
180182
let (ni, _saw_newline) = skip_whitespace(bytes, i);
@@ -183,6 +185,13 @@ fn parse_strings_content(content: &str) -> (Vec<Pair>, Vec<String>) {
183185
break;
184186
}
185187

188+
// If we're at the top of the file (no pairs yet), detect and skip the auto-generated header
189+
if !have_seen_pair && let Some(next_i) = try_skip_langcodec_header(bytes, i) {
190+
i = next_i;
191+
pending_comment = None;
192+
continue;
193+
}
194+
186195
// Comments
187196
if starts_with(bytes, i, b"//") {
188197
let (nj, comment) = parse_line_comment(bytes, i);
@@ -231,6 +240,7 @@ fn parse_strings_content(content: &str) -> (Vec<Pair>, Vec<String>) {
231240
comment: pending_comment.take(),
232241
};
233242
pairs.push(pair);
243+
have_seen_pair = true;
234244
continue;
235245
}
236246
}
@@ -302,6 +312,60 @@ fn parse_block_comment(bytes: &[u8], i: usize) -> (usize, String) {
302312
(j, comment)
303313
}
304314

315+
// Detect and skip the standard langcodec header block at the start of the file.
316+
// Returns Some(new_index) if a header was skipped, or None otherwise.
317+
fn try_skip_langcodec_header(bytes: &[u8], mut i: usize) -> Option<usize> {
318+
let start = i;
319+
let mut saw_header_marker = false;
320+
// We look for consecutive comment lines starting with // and possibly a block containing
321+
// a line beginning with //: Language:
322+
while i < bytes.len() {
323+
// Allow blank lines within header
324+
let (ni, _nl) = skip_whitespace(bytes, i);
325+
i = ni;
326+
if i >= bytes.len() {
327+
break;
328+
}
329+
if starts_with(bytes, i, b"//:") || starts_with(bytes, i, b"//") {
330+
if starts_with(bytes, i, b"//:") {
331+
saw_header_marker = true;
332+
}
333+
// consume to end of line
334+
while i < bytes.len() && bytes[i] != b'\n' {
335+
i += 1;
336+
}
337+
continue;
338+
}
339+
break;
340+
}
341+
if saw_header_marker && i > start {
342+
Some(i)
343+
} else {
344+
None
345+
}
346+
}
347+
348+
// Extracts the language declared in a langcodec-style header comment.
//
// Scans at most the first `MAX_HEADER_LINES` lines of `content` for a line that, after
// leading whitespace, has the form `//: Language: xx` or `// : Language: xx`, and returns
// the trimmed text that follows `Language:`. Lines whose value is empty are ignored and the
// scan continues with the next line.
//
// A leading UTF-8 byte-order mark is stripped first, so a header on the very first line of a
// BOM-prefixed file is still recognised.
//
// Returns `None` when no line within the window carries a non-empty language value.
fn extract_header_language(content: &str) -> Option<String> {
    const MAX_HEADER_LINES: usize = 50;

    // U+FEFF is not whitespace as far as `trim_start` is concerned, so remove it explicitly.
    let content = content.strip_prefix('\u{feff}').unwrap_or(content);

    content.lines().take(MAX_HEADER_LINES).find_map(|line| {
        let line = line.trim_start();
        let directive = line
            .strip_prefix("//:")
            .or_else(|| line.strip_prefix("// :"))?;
        let lang = directive.trim_start().strip_prefix("Language:")?.trim();
        (!lang.is_empty()).then(|| lang.to_string())
    })
}
368+
305369
// Parses a quoted string starting at byte index i (which must point to '"').
306370
// Returns (byte_index_after_closing_quote, substring content as UTF-8) without the surrounding quotes,
307371
// preserving backslashes and non-ASCII characters exactly as in the source.

0 commit comments

Comments
 (0)