fix(lapis): allow non-ASCII characters in advanced queries (#1628)

fengelniederhammer · web-flow · commit 9cb694921292 · 2026-04-16T09:45:57.000+02:00
resolves #1603

## Problem

Non-ASCII characters (umlauts, accented letters, Cyrillic, CJK, etc.) in unquoted advanced query values were silently dropped by the ANTLR lexer, producing wrong results with no error. For example:

- `division=Zürich` was parsed as `division=Zrich` → 0 results
- `division.regex=Graubünden` was parsed as `division.regex=Graubnden` → 0 results

Quoted values like `division='Zürich'` already worked correctly, since the `QUOTED_STRING` lexer rule accepts any character.

## Fix

Added a `UNICODE_LETTER` lexer rule (`[\p{Letter}\p{Mark}]`) and included it in the `charOrNumber` parser rule. This makes unquoted values behave consistently with quoted ones for any Unicode letter or combining mark, so both precomposed (NFC) and decomposed (NFD) spellings of a value are preserved. ASCII letters continue to be matched by the existing `A`–`Z` lexer rules (which take priority by rule order), so all existing parsing — nucleotide/amino acid symbols, keywords (`NOT`, `MAYBE`, `ISNULL`, etc.) — is unaffected.

Non-ASCII characters are also now valid in field name and gene/segment name positions, where they will produce a meaningful "field/gene not found" error rather than a silent wrong result or a confusing syntax error.

(also see antlr/antlr4#1688 for some background info)

## PR Checklist

- [x] All necessary documentation has been adapted.
- [x] All necessary changes are explained in the `llms.txt`.
- [x] The implemented feature is covered by an appropriate test.
diff --git a/lapis/src/main/antlr/org/genspectrum/lapis/model/advancedqueryparser/AdvancedQuery.g4 b/lapis/src/main/antlr/org/genspectrum/lapis/model/advancedqueryparser/AdvancedQuery.g4
@@ -84,7 +84,7 @@ value: name | QUOTED_STRING;
 dateOrNumber: digit+;
 digit: NUMBER | MINUS | DOT;
 name: charOrNumber+;
-charOrNumber: A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | X | Y | Z | NUMBER | MINUS | UNDERSCORE | DOT | ASTERISK;
+charOrNumber: A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | X | Y | Z | NUMBER | MINUS | UNDERSCORE | DOT | ASTERISK | UNICODE_LETTER;
 
 isNullQuery: isnull_ '(' name ')';
 isnull_: I S N U L L ;
@@ -122,6 +122,7 @@ UNDERSCORE: '_';
 DOT: '.';
 ASTERISK: '*';
 QUOTED_STRING: '\'' ( '\\' . | ~['\\] )* '\'';  // matches all strings with quotes, supports backslash escaping (e.g. \' for a literal single quote, \\ for a literal backslash)
+UNICODE_LETTER: [\p{Letter}\p{Mark}] ; // matches Unicode letters and combining marks (e.g. precomposed and NFD-decomposed umlauts, Devanagari matras)
 AND: ' ' A N D ' '; // space is important here, otherwise metadataNames with 'AND' in them would be misinterpreted
 OR: ' ' O R ' ';
 NOT: N O T ' ';
diff --git a/lapis/src/test/kotlin/org/genspectrum/lapis/model/AdvancedQueryFacadeTest.kt b/lapis/src/test/kotlin/org/genspectrum/lapis/model/AdvancedQueryFacadeTest.kt
@@ -341,6 +341,11 @@ class AdvancedQueryFacadeTest {
                     query = "some_metadata.regex='it\\'s'",
                     expected = StringSearch("some_metadata", "it's"),
                 ),
+                ValidTestCase(
+                    description = "unquoted regex with non-ASCII characters",
+                    query = "some_metadata.regex=Graubünden",
+                    expected = StringSearch("some_metadata", "Graubünden"),
+                ),
             ),
             invalid = listOf(
                 InvalidTestCase(
@@ -625,6 +630,78 @@ class AdvancedQueryFacadeTest {
                     query = "some_metadata='Côte d\\'Ivoire'",
                     expected = StringEquals("some_metadata", "Côte d'Ivoire"),
                 ),
+                ValidTestCase(
+                    description = "string equals with unquoted umlaut (ü)",
+                    query = "some_metadata=Zürich",
+                    expected = StringEquals("some_metadata", "Zürich"),
+                ),
+                // The tests below cover NFD (decomposed) Unicode forms, where diacritics are represented as
+                // separate combining mark codepoints (Unicode category M) rather than precomposed single
+                // codepoints. UNICODE_LETTER matches \p{Letter} as well as \p{Mark}, so combining marks are
+                // kept in the unquoted value instead of being dropped or causing a token recognition error.
+                ValidTestCase(
+                    // "Zürich" in NFD: ü = u (U+0075) + combining diaeresis (U+0308, Mn)
+                    description = "string equals with unquoted umlaut in NFD form (u + combining diaeresis U+0308)",
+                    query = "some_metadata=Zu\u0308rich",
+                    expected = StringEquals("some_metadata", "Zu\u0308rich"),
+                ),
+                ValidTestCase(
+                    // "Bogotá" in NFD: á = a (U+0061) + combining acute accent (U+0301, Mn)
+                    description = "string equals with NFD acute accent (Bogota\u0301)",
+                    query = "some_metadata=Bogota\u0301",
+                    expected = StringEquals("some_metadata", "Bogota\u0301"),
+                ),
+                ValidTestCase(
+                    // "Genève" in NFD: è = e (U+0065) + combining grave accent (U+0300, Mn)
+                    description = "string equals with NFD grave accent (Gene\u0300ve)",
+                    query = "some_metadata=Gene\u0300ve",
+                    expected = StringEquals("some_metadata", "Gene\u0300ve"),
+                ),
+                ValidTestCase(
+                    // "Hà Nội" simplified to "HaNoi" with NFD: à = a + U+0300, ộ = o + U+0323 + U+0302
+                    // Tests stacked combining marks (two Mn on the same base vowel) as in Vietnamese
+                    description = "string equals with NFD stacked combining marks (Vietnamese Ha\u0300No\u0323\u0302i)",
+                    query = "some_metadata=Ha\u0300No\u0323\u0302i",
+                    expected = StringEquals("some_metadata", "Ha\u0300No\u0323\u0302i"),
+                ),
+                ValidTestCase(
+                    // Devanagari: "दिल्ली" (Delhi) — contains matra ि (U+093F, Mc spacing mark) and
+                    // virama ् (U+094D, Mn non-spacing mark) even in NFC. These are combining marks
+                    // that are NOT results of NFD decomposition — they exist in NFC already.
+                    description = "string equals with Devanagari combining marks (Delhi in Hindi)",
+                    query = "some_metadata=\u0926\u093F\u0932\u094D\u0932\u0940",
+                    expected = StringEquals("some_metadata", "\u0926\u093F\u0932\u094D\u0932\u0940"),
+                ),
+                ValidTestCase(
+                    description = "string equals with unquoted accented character (â)",
+                    query = "some_metadata=Neuchâtel",
+                    expected = StringEquals("some_metadata", "Neuchâtel"),
+                ),
+                ValidTestCase(
+                    description = "string equals with unquoted cedilla (ç)",
+                    query = "some_metadata=Français",
+                    expected = StringEquals("some_metadata", "Français"),
+                ),
+                ValidTestCase(
+                    description = "string equals with unquoted tilde-n (ñ)",
+                    query = "some_metadata=España",
+                    expected = StringEquals("some_metadata", "España"),
+                ),
+                ValidTestCase(
+                    description = "string equals with unquoted Cyrillic characters",
+                    query = "some_metadata=Москва",
+                    expected = StringEquals("some_metadata", "Москва"),
+                ),
+                ValidTestCase(
+                    description = "string equals with unquoted Chinese characters",
+                    query = "some_metadata=北京",
+                    expected = StringEquals("some_metadata", "北京"),
+                ),
+                ValidTestCase(
+                    description = "string equals with unquoted mixed ASCII and non-ASCII",
+                    query = "some_metadata=Graubünden",
+                    expected = StringEquals("some_metadata", "Graubünden"),
+                ),
                 ValidTestCase(
                     description = "string equals with escaped backslash in value",
                     query = "some_metadata='back\\\\slash'",
@@ -726,6 +803,16 @@ class AdvancedQueryFacadeTest {
                     "floatField=notAFloat",
                     "'notAFloat' is not a valid float",
                 ),
+                InvalidTestCase(
+                    description = "non-ASCII field name",
+                    query = "divïsion=Bern",
+                    expected = "Metadata field divïsion does not exist",
+                ),
+                InvalidTestCase(
+                    description = "non-ASCII field name with regex suffix",
+                    query = "divïsion.regex=Bern",
+                    expected = "Metadata field divïsion does not exist",
+                ),
             ),
         )
 
@@ -797,6 +884,11 @@ class AdvancedQueryFacadeTest {
                     query = "invalidGene:501Y",
                     expected = "invalidGene is not a known segment or gene",
                 ),
+                InvalidTestCase(
+                    description = "named mutation with non-ASCII gene/segment name",
+                    query = "Ñ:123A",
+                    expected = "Ñ is not a known segment or gene",
+                ),
                 InvalidTestCase(
                     description = "'-' in nucleotide 'from' position is invalid",
                     query = "-300A",