Skip to content

Commit 8e315a6

Browse files
committed
Add placeholder validation and normalization utilities
Introduces a new placeholder module for parsing, normalizing, and validating placeholders across languages and plural forms. Adds `validate_placeholders` to `Codec` to ensure consistency of placeholder signatures, with tests for both matching and mismatched cases. Updates exports in lib.rs to include the new utilities.
1 parent d6fbdde commit 8e315a6

3 files changed

Lines changed: 287 additions & 0 deletions

File tree

langcodec/src/codec.rs

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -619,6 +619,85 @@ impl Codec {
619619
.retain(|resource| !resource.entries.is_empty());
620620
}
621621

622+
/// Validate placeholder consistency across languages for each key.
623+
///
624+
/// Rules (initial version):
625+
/// - For each key, each language must have the same placeholder signature.
626+
/// - For plural entries, all forms within a language must share the same signature.
627+
/// - iOS vs Android differences like `%@`/`%1$@` vs `%s`/`%1$s` are normalized.
628+
pub fn validate_placeholders(&self, strict: bool) -> Result<(), Error> {
629+
use std::collections::HashMap;
630+
use crate::placeholder::signature;
631+
use crate::types::Translation;
632+
633+
// key -> lang -> Vec<signatures per form or single>
634+
let mut map: HashMap<String, HashMap<String, Vec<Vec<String>>>> = HashMap::new();
635+
636+
for res in &self.resources {
637+
for entry in &res.entries {
638+
let sigs: Vec<Vec<String>> = match &entry.value {
639+
Translation::Singular(v) => vec![signature(v)],
640+
Translation::Plural(p) => {
641+
let mut all = Vec::new();
642+
for (_cat, v) in &p.forms {
643+
all.push(signature(v));
644+
}
645+
all
646+
}
647+
};
648+
map.entry(entry.id.clone())
649+
.or_default()
650+
.entry(res.metadata.language.clone())
651+
.or_default()
652+
.push(sigs.into_iter().flatten().collect());
653+
}
654+
}
655+
656+
let mut problems = Vec::new();
657+
658+
for (key, langs) in map {
659+
// Per-language: ensure all collected signatures for this entry are identical
660+
let mut per_lang_sig: HashMap<String, Vec<String>> = HashMap::new();
661+
for (lang, sig_lists) in langs {
662+
if let Some(first) = sig_lists.first() {
663+
if sig_lists.iter().any(|s| s != first) {
664+
problems.push(format!(
665+
"Key '{}' in '{}': inconsistent placeholders across forms: {:?}",
666+
key, lang, sig_lists
667+
));
668+
}
669+
per_lang_sig.insert(lang, first.clone());
670+
}
671+
}
672+
673+
// Across languages, pick one baseline and compare
674+
if let Some((base_lang, base_sig)) = per_lang_sig.iter().next() {
675+
for (lang, sig) in &per_lang_sig {
676+
if sig != base_sig {
677+
problems.push(format!(
678+
"Key '{}' mismatch: {} {:?} vs {} {:?}",
679+
key, base_lang, base_sig, lang, sig
680+
));
681+
}
682+
}
683+
}
684+
}
685+
686+
if problems.is_empty() || !strict {
687+
if strict && !problems.is_empty() {
688+
// unreachable given condition, retained for clarity
689+
}
690+
if problems.is_empty() {
691+
Ok(())
692+
} else {
693+
// Non-strict mode: return Ok but could be logged by caller. For now, include in error.
694+
Err(Error::validation_error(format!("Placeholder issues: {}", problems.join(" | "))))
695+
}
696+
} else {
697+
Err(Error::validation_error(format!("Placeholder issues: {}", problems.join(" | "))))
698+
}
699+
}
700+
622701
/// Merge resources with the same language by the given strategy.
623702
///
624703
/// This method groups resources by language and merges multiple resources
@@ -1584,4 +1663,57 @@ mod tests {
15841663
assert_eq!(merged.resources[0].metadata.language, "en");
15851664
assert_eq!(merged.resources[0].entries.len(), 2);
15861665
}
1666+
1667+
#[test]
fn test_validate_placeholders_across_languages() {
    // Builds a resource for `language` holding one translated "greet" entry.
    let greeting_in = |language: &'static str, text: &'static str| Resource {
        metadata: Metadata {
            language: language.into(),
            domain: "d".into(),
            custom: HashMap::new(),
        },
        entries: vec![Entry {
            id: "greet".into(),
            value: Translation::Singular(text.into()),
            comment: None,
            status: EntryStatus::Translated,
            custom: HashMap::new(),
        }],
    };

    let mut codec = Codec::new();
    // The iOS-style `%1$@` and the Android-style `%1$s` normalize to the same
    // signature, so strict validation must succeed.
    codec.add_resource(greeting_in("en", "Hello %1$@"));
    codec.add_resource(greeting_in("fr", "Bonjour %1$s"));

    assert!(codec.validate_placeholders(true).is_ok());
}
1693+
1694+
#[test]
fn test_validate_placeholders_mismatch() {
    // Builds a resource for `language` holding one translated "count" entry.
    let count_in = |language: &'static str, text: &'static str| Resource {
        metadata: Metadata {
            language: language.into(),
            domain: "d".into(),
            custom: HashMap::new(),
        },
        entries: vec![Entry {
            id: "count".into(),
            value: Translation::Singular(text.into()),
            comment: None,
            status: EntryStatus::Translated,
            custom: HashMap::new(),
        }],
    };

    let mut codec = Codec::new();
    // `%d` (integer) and `%s` (string) are different kinds, so strict
    // validation must report the key.
    codec.add_resource(count_in("en", "%d files"));
    codec.add_resource(count_in("fr", "%s fichiers"));

    assert!(codec.validate_placeholders(true).is_err());
}
15871719
}

langcodec/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ pub mod error;
144144
pub mod formats;
145145
pub mod traits;
146146
pub mod types;
147+
pub mod placeholder;
147148

148149
// Re-export most used types for easy consumption
149150
pub use crate::{
@@ -155,6 +156,7 @@ pub use crate::{
155156
},
156157
error::Error,
157158
formats::FormatType,
159+
placeholder::{extract_placeholders, normalize_placeholders, signature},
158160
types::{
159161
ConflictStrategy, Entry, EntryStatus, Metadata, Plural, PluralCategory, Resource,
160162
Translation,

langcodec/src/placeholder.rs

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
//! Placeholder parsing, normalization and validation utilities.
2+
//!
3+
//! Goals:
4+
//! - Normalize common iOS vs Android placeholder variants to a canonical form.
5+
//! - Extract a placeholder "signature" for comparison across languages.
6+
//! - Validate placeholder consistency per entry (across all languages and plural forms).
7+
8+
9+
/// A single printf-style placeholder found in a localized string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PlaceholderToken {
    /// Positional index taken from an `N$` prefix (e.g. `1` in `%1$s`), if present.
    pub index: Option<usize>,
    /// Canonical kind: s, d, f, etc.
    pub kind: char,
}

impl PlaceholderToken {
    /// Renders the token for signature comparison: `"1$s"` for a positional
    /// token, or just `"s"` when there is no index.
    pub fn to_signature(&self) -> String {
        match self.index {
            Some(position) => format!("{}${}", position, self.kind),
            None => self.kind.to_string(),
        }
    }
}

/// Byte offsets describing one conversion specification inside a string.
///
/// Every offset points at (or just past) an ASCII byte, so each one is a valid
/// `str` slice boundary.
struct Spec {
    /// Positional index from an `N$` prefix, if one was present and parseable.
    index: Option<usize>,
    /// Offset just past the `N$` / width / precision prefix, i.e. where the
    /// optional `l`/`ll` length modifier (or the type character) begins.
    modifier_start: usize,
    /// Offset of the conversion type character (`d`, `s`, `@`, ...).
    kind_at: usize,
}

/// Returns the offset of the first non-digit byte at or after `from`.
fn skip_digits(bytes: &[u8], from: usize) -> usize {
    let run = bytes
        .get(from..)
        .map_or(0, |rest| rest.iter().take_while(|b| b.is_ascii_digit()).count());
    from + run
}

/// Parses the specification that follows a `%` located at `start - 1`.
///
/// Grammar: `[index$][width][.precision][l|ll]type`, where `type` is an ASCII
/// letter or `@`. Returns `None` when the text is not a recognized placeholder.
fn parse_spec(bytes: &[u8], start: usize) -> Option<Spec> {
    let mut cursor = start;
    let mut index = None;

    // Optional positional index: digits followed by '$'.
    let digits_end = skip_digits(bytes, cursor);
    if digits_end > cursor && bytes.get(digits_end) == Some(&b'$') {
        index = std::str::from_utf8(&bytes[cursor..digits_end])
            .ok()
            .and_then(|digits| digits.parse::<usize>().ok());
        cursor = digits_end + 1;
    }

    // Optional width (also covers the zero-padding flag, as in `%02d`).
    cursor = skip_digits(bytes, cursor);

    // Optional precision. At least one digit is required so that ordinary
    // punctuation after a percent sign ("50%.Then") is not taken for one.
    if bytes.get(cursor) == Some(&b'.') {
        let precision_end = skip_digits(bytes, cursor + 1);
        if precision_end > cursor + 1 {
            cursor = precision_end;
        }
    }

    // Optional length modifier (l/ll).
    let modifier_start = cursor;
    if bytes.get(cursor) == Some(&b'l') {
        cursor += 1;
        if bytes.get(cursor) == Some(&b'l') {
            cursor += 1;
        }
    }

    // Mandatory type character.
    let kind = *bytes.get(cursor)?;
    if kind.is_ascii_alphabetic() || kind == b'@' {
        Some(Spec {
            index,
            modifier_start,
            kind_at: cursor,
        })
    } else {
        None
    }
}

/// Extracts placeholder tokens from a string and returns them in occurrence order.
///
/// Handles iOS and Android variants, including positional indices (`%1$s`),
/// width and precision (`%02d`, `%.2f`) and `l`/`ll` length modifiers, and
/// ignores escaped percent `%%`. A `%` that does not start a recognized
/// placeholder is skipped.
pub fn extract_placeholders(input: &str) -> Vec<PlaceholderToken> {
    let bytes = input.as_bytes();
    let mut tokens = Vec::new();
    let mut pos = 0;

    while pos < bytes.len() {
        if bytes[pos] != b'%' {
            pos += 1;
            continue;
        }
        // Escaped percent: consume both characters.
        if bytes.get(pos + 1) == Some(&b'%') {
            pos += 2;
            continue;
        }
        match parse_spec(bytes, pos + 1) {
            Some(spec) => {
                tokens.push(PlaceholderToken {
                    index: spec.index,
                    kind: canonical_kind_char(bytes[spec.kind_at] as char),
                });
                pos = spec.kind_at + 1;
            }
            // Not a recognized placeholder; skip this '%'.
            None => pos += 1,
        }
    }

    tokens
}

/// Normalize a string by converting iOS-specific tokens to canonical ones.
/// - %@ -> %s
/// - %1$@ -> %1$s (any positional index)
/// - %ld, %lu -> %d / %u (also `ll`, and with a positional index or width)
///
/// Escaped percents (`%%`) and all other text are copied through unchanged.
pub fn normalize_placeholders(input: &str) -> String {
    let bytes = input.as_bytes();
    let mut out = String::with_capacity(input.len());
    // Everything before this offset has already been written to `out`.
    let mut copied = 0;
    let mut pos = 0;

    while pos < bytes.len() {
        if bytes[pos] != b'%' {
            pos += 1;
            continue;
        }
        // Escaped percent: leave it (and whatever follows) as literal text.
        if bytes.get(pos + 1) == Some(&b'%') {
            pos += 2;
            continue;
        }
        let spec = match parse_spec(bytes, pos + 1) {
            Some(spec) => spec,
            None => {
                pos += 1;
                continue;
            }
        };

        let kind = bytes[spec.kind_at];
        let has_modifier = spec.kind_at > spec.modifier_start;
        let canonical = match kind {
            // iOS object -> string.
            b'@' if !has_modifier => Some('s'),
            // Long ints -> canonical (drop the length modifier).
            b'd' | b'u' if has_modifier => Some(kind as char),
            _ => None,
        };
        if let Some(kind_char) = canonical {
            // Keep "%", index, width and precision; replace modifier + type.
            out.push_str(&input[copied..spec.modifier_start]);
            out.push(kind_char);
            copied = spec.kind_at + 1;
        }
        pos = spec.kind_at + 1;
    }

    out.push_str(&input[copied..]);
    out
}

/// Build a normalized signature (sequence of tokens) for comparison.
pub fn signature(input: &str) -> Vec<String> {
    extract_placeholders(&normalize_placeholders(input))
        .into_iter()
        .map(|token| token.to_signature())
        .collect()
}

/// Maps a raw type character to its canonical kind: `@` becomes `s`, and
/// letters are lowercased so that e.g. `%D` and `%d` compare equal.
fn canonical_kind_char(ch: char) -> char {
    match ch {
        '@' => 's',
        other => other.to_ascii_lowercase(),
    }
}
126+
127+
#[cfg(test)]
mod tests {
    use super::*;

    /// A positional iOS object, a positional integer and a plain string token
    /// are reported in occurrence order using canonical kinds.
    #[test]
    fn test_extract_android_and_ios() {
        let actual = signature("Hello %1$@, you have %2$d items and %s extra");
        assert_eq!(actual, vec!["1$s", "2$d", "s"]);
    }

    /// `%@` and `%ld` are rewritten to `%s` and `%d`, and the signature
    /// reflects the rewritten tokens.
    #[test]
    fn test_normalize_ios_simple() {
        let raw = "Value: %@ and number %ld";
        let normalized = normalize_placeholders(raw);
        assert!(normalized.contains("%s"));
        assert!(normalized.contains("%d"));
        assert_eq!(signature(raw), vec!["s", "d"]);
    }

    /// An escaped percent (`%%`) is not a placeholder.
    #[test]
    fn test_ignore_escaped_percent() {
        let actual = signature("Discount: 50%% and value %d");
        assert_eq!(actual, vec!["d"]);
    }
}

0 commit comments

Comments
 (0)