Skip to content

Commit 9cb6949

Browse files
fix(lapis): allow non-ASCII characters in advanced queries (#1628)
resolves #1603 ## Problem Non-ASCII characters (umlauts, accented letters, Cyrillic, CJK, etc.) in unquoted advanced query values were silently dropped by the ANTLR lexer, producing wrong results with no error. For example: - `division=Zürich` was parsed as `division=Zrich` → 0 results - `division.regex=Graubünden` was parsed as `division.regex=Graubnden` → 0 results Quoted values like `division='Zürich'` already worked correctly, since the `QUOTED_STRING` lexer rule accepts any character. ## Fix Added a `UNICODE_LETTER` lexer rule (`[\p{Letter}\p{Mark}]`) and included it in the `charOrNumber` parser rule. This makes unquoted values behave consistently with quoted ones for any Unicode letter or combining mark, so both precomposed (NFC) and decomposed (NFD) spellings are accepted. ASCII letters continue to be matched by the existing `A`–`Z` lexer rules (which take priority by rule order), so all existing parsing — nucleotide/amino acid symbols, keywords (`NOT`, `MAYBE`, `ISNULL`, etc.) — is unaffected. Non-ASCII characters are also now valid in field name and gene/segment name positions, where they will produce a meaningful "field/gene not found" error rather than a silent wrong result or a confusing syntax error. (also see antlr/antlr4#1688 for some background info) ## PR Checklist - [x] All necessary documentation has been adapted. - [x] All necessary changes are explained in the `llms.txt`. - [x] The implemented feature is covered by an appropriate test.
1 parent 6fcb130 commit 9cb6949

2 files changed

Lines changed: 94 additions & 1 deletion

File tree

lapis/src/main/antlr/org/genspectrum/lapis/model/advancedqueryparser/AdvancedQuery.g4

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ value: name | QUOTED_STRING;
8484
dateOrNumber: digit+;
8585
digit: NUMBER | MINUS | DOT;
8686
name: charOrNumber+;
87-
charOrNumber: A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | X | Y | Z | NUMBER | MINUS | UNDERSCORE | DOT | ASTERISK;
87+
charOrNumber: A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | X | Y | Z | NUMBER | MINUS | UNDERSCORE | DOT | ASTERISK | UNICODE_LETTER;
8888

8989
isNullQuery: isnull_ '(' name ')';
9090
isnull_: I S N U L L ;
@@ -122,6 +122,7 @@ UNDERSCORE: '_';
122122
DOT: '.';
123123
ASTERISK: '*';
124124
QUOTED_STRING: '\'' ( '\\' . | ~['\\] )* '\''; // matches all strings with quotes, supports backslash escaping (e.g. \' for a literal single quote, \\ for a literal backslash)
125+
UNICODE_LETTER: [\p{Letter}\p{Mark}] ; // matches Unicode letters and combining marks (e.g. precomposed and NFD-decomposed umlauts, Devanagari matras)
125126
AND: ' ' A N D ' '; // space is important here, otherwise metadataNames with 'AND' in them would be misinterpreted
126127
OR: ' ' O R ' ';
127128
NOT: N O T ' ';

lapis/src/test/kotlin/org/genspectrum/lapis/model/AdvancedQueryFacadeTest.kt

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,11 @@ class AdvancedQueryFacadeTest {
341341
query = "some_metadata.regex='it\\'s'",
342342
expected = StringSearch("some_metadata", "it's"),
343343
),
344+
ValidTestCase(
345+
description = "unquoted regex with non-ASCII characters",
346+
query = "some_metadata.regex=Graubünden",
347+
expected = StringSearch("some_metadata", "Graubünden"),
348+
),
344349
),
345350
invalid = listOf(
346351
InvalidTestCase(
@@ -625,6 +630,78 @@ class AdvancedQueryFacadeTest {
625630
query = "some_metadata='Côte d\\'Ivoire'",
626631
expected = StringEquals("some_metadata", "Côte d'Ivoire"),
627632
),
633+
ValidTestCase(
634+
description = "string equals with unquoted umlaut (ü)",
635+
query = "some_metadata=Zürich",
636+
expected = StringEquals("some_metadata", "Zürich"),
637+
),
638+
// The tests below cover NFD (decomposed) Unicode forms, where diacritics are represented as
639+
// separate combining mark codepoints (Unicode category M) rather than precomposed single
640+
// codepoints. UNICODE_LETTER matches \p{Mark} in addition to \p{Letter}, so combining marks
641+
// are kept in the unquoted value instead of being silently dropped or rejected by the lexer.
642+
ValidTestCase(
643+
// "Zürich" in NFD: ü = u (U+0075) + combining diaeresis (U+0308, Mn)
644+
description = "string equals with unquoted umlaut in NFD form (u + combining diaeresis U+0308)",
645+
query = "some_metadata=Zu\u0308rich",
646+
expected = StringEquals("some_metadata", "Zu\u0308rich"),
647+
),
648+
ValidTestCase(
649+
// "Bogotá" in NFD: á = a (U+0061) + combining acute accent (U+0301, Mn)
650+
description = "string equals with NFD acute accent (Bogota\u0301)",
651+
query = "some_metadata=Bogota\u0301",
652+
expected = StringEquals("some_metadata", "Bogota\u0301"),
653+
),
654+
ValidTestCase(
655+
// "Genève" in NFD: è = e (U+0065) + combining grave accent (U+0300, Mn)
656+
description = "string equals with NFD grave accent (Gene\u0300ve)",
657+
query = "some_metadata=Gene\u0300ve",
658+
expected = StringEquals("some_metadata", "Gene\u0300ve"),
659+
),
660+
ValidTestCase(
661+
// "Hà Nội" simplified to "HaNoi" with NFD: à = a + U+0300, ộ = o + U+0323 + U+0302
662+
// Tests stacked combining marks (two Mn marks on a single vowel, in canonical order) as in Vietnamese
663+
description = "string equals with NFD stacked combining marks (Vietnamese Ha\u0300No\u0323\u0302i)",
664+
query = "some_metadata=Ha\u0300No\u0323\u0302i",
665+
expected = StringEquals("some_metadata", "Ha\u0300No\u0323\u0302i"),
666+
),
667+
ValidTestCase(
668+
// Devanagari: "दिल्ली" (Delhi) — contains matra ि (U+093F, Mc spacing mark) and
669+
// virama ् (U+094D, Mn non-spacing mark) even in NFC. These are combining marks
670+
// that are NOT results of NFD decomposition — they exist in NFC already.
671+
description = "string equals with Devanagari combining marks (Delhi in Hindi)",
672+
query = "some_metadata=\u0926\u093F\u0932\u094D\u0932\u0940",
673+
expected = StringEquals("some_metadata", "\u0926\u093F\u0932\u094D\u0932\u0940"),
674+
),
675+
ValidTestCase(
676+
description = "string equals with unquoted accented character (â)",
677+
query = "some_metadata=Neuchâtel",
678+
expected = StringEquals("some_metadata", "Neuchâtel"),
679+
),
680+
ValidTestCase(
681+
description = "string equals with unquoted cedilla (ç)",
682+
query = "some_metadata=Français",
683+
expected = StringEquals("some_metadata", "Français"),
684+
),
685+
ValidTestCase(
686+
description = "string equals with unquoted tilde-n (ñ)",
687+
query = "some_metadata=España",
688+
expected = StringEquals("some_metadata", "España"),
689+
),
690+
ValidTestCase(
691+
description = "string equals with unquoted Cyrillic characters",
692+
query = "some_metadata=Москва",
693+
expected = StringEquals("some_metadata", "Москва"),
694+
),
695+
ValidTestCase(
696+
description = "string equals with unquoted Chinese characters",
697+
query = "some_metadata=北京",
698+
expected = StringEquals("some_metadata", "北京"),
699+
),
700+
ValidTestCase(
701+
description = "string equals with unquoted mixed ASCII and non-ASCII",
702+
query = "some_metadata=Graubünden",
703+
expected = StringEquals("some_metadata", "Graubünden"),
704+
),
628705
ValidTestCase(
629706
description = "string equals with escaped backslash in value",
630707
query = "some_metadata='back\\\\slash'",
@@ -726,6 +803,16 @@ class AdvancedQueryFacadeTest {
726803
"floatField=notAFloat",
727804
"'notAFloat' is not a valid float",
728805
),
806+
InvalidTestCase(
807+
description = "non-ASCII field name",
808+
query = "divïsion=Bern",
809+
expected = "Metadata field divïsion does not exist",
810+
),
811+
InvalidTestCase(
812+
description = "non-ASCII field name with regex suffix",
813+
query = "divïsion.regex=Bern",
814+
expected = "Metadata field divïsion does not exist",
815+
),
729816
),
730817
)
731818

@@ -797,6 +884,11 @@ class AdvancedQueryFacadeTest {
797884
query = "invalidGene:501Y",
798885
expected = "invalidGene is not a known segment or gene",
799886
),
887+
InvalidTestCase(
888+
description = "named mutation with non-ASCII gene/segment name",
889+
query = "Ñ:123A",
890+
expected = "Ñ is not a known segment or gene",
891+
),
800892
InvalidTestCase(
801893
description = "'-' in nucleotide 'from' position is invalid",
802894
query = "-300A",

0 commit comments

Comments
 (0)