Skip to content

Commit 379ccb8

Browse files
committed
implement new UCSUR characters, closes #90
1 parent 61f6911 commit 379ccb8

4 files changed

Lines changed: 60 additions & 24 deletions

File tree

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ NOTE: Before publishing:
1616
The latest on-development version can be accessed by building the source code.
1717
On this on-development version, things can be broken.
1818

19+
- Implement nasin sitelen kalama pi linja lili a.k.a. cartouche tally marks.
20+
- Implement a bunch of words from the new UCSUR specification. This doesn't mean
21+
these will be recognized; only Linku is used for inclusion criteria. However,
22+
you can add these to the custom dictionary, then you can use UCSUR characters for
23+
these words.
1924
- Remove X ala X partial parsing setting. It is now always turned on.
2025
- Warn user when X ala X is used. It is still not implemented.
2126
- Link related projects.

src/parser/lexer.ts

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import {
2626
} from "./punctuation.ts";
2727
import { Token } from "./token.ts";
2828
import {
29+
COMBINING_TALLY_MARK,
2930
END_OF_CARTOUCHE,
3031
END_OF_LONG_GLYPH,
3132
END_OF_REVERSE_LONG_GLYPH,
@@ -100,6 +101,25 @@ const punctuation = choiceOnlyOne(
100101
)
101102
.map((punctuation): Token => ({ type: "punctuation", punctuation }));
102103
const cartoucheElement = choiceOnlyOne(
104+
sequence(
105+
singleUcsurWord,
106+
count(
107+
allAtLeastOnce(
108+
matchString(
109+
COMBINING_TALLY_MARK,
110+
SPECIAL_UCSUR_DESCRIPTIONS[COMBINING_TALLY_MARK],
111+
)
112+
.skip(spaces),
113+
),
114+
),
115+
)
116+
.map(([word, tallyMarks]) => {
117+
if (tallyMarks <= word.length) {
118+
return word.slice(0, tallyMarks);
119+
} else {
120+
throw new UnrecognizedError("excess dots");
121+
}
122+
}),
103123
singleUcsurWord
104124
.skip(match(NSK_COLON, "full width colon").skip(spaces)),
105125
sequence(
@@ -112,7 +132,7 @@ const cartoucheElement = choiceOnlyOne(
112132
)
113133
.map(([word, dots]) => {
114134
const count = /^[aeiou]/.test(word) ? dots + 1 : dots;
115-
const morae = word.match(/[jklmnpstw]?[aeiou]|n/g)!;
135+
const morae = word.match(/[jklmnpstwy]?[aeiou]|n/g)!;
116136
if (count <= morae.length) {
117137
return morae.slice(0, count).join("");
118138
} else {

src/parser/test.ts

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
// this code is Deno only
22

33
import { assertEquals } from "@std/assert/equals";
4-
import { assertLess } from "@std/assert/less";
54
import { assertNotEquals } from "@std/assert/not-equals";
65
import { assertThrows } from "@std/assert/throws";
76
import { parser } from "./parser.ts";
@@ -15,7 +14,6 @@ import {
1514
matchString,
1615
sequence,
1716
} from "./parser_lib.ts";
18-
import { KU_LILI_WORDS, KU_SULI_WORDS, PU_WORDS } from "./ucsur.ts";
1917

2018
Deno.test("AST all distinct", () => {
2119
// examples gathered from https://github.com/kilipan/nasin-toki
@@ -238,24 +236,6 @@ Deno.test("parser all error", () => {
238236
assertThrows(() => parser.parse(sentence).collect());
239237
}
240238
});
241-
Deno.test("ucsur have proper length", () => {
242-
assertEquals(PU_WORDS.length, 120);
243-
assertEquals(KU_SULI_WORDS.length, 17);
244-
assertEquals(KU_LILI_WORDS.length, 4);
245-
});
246-
Deno.test("ucsur ordered", () => {
247-
for (const [i, word] of PU_WORDS.entries()) {
248-
if (i < PU_WORDS.length - 1) {
249-
const other = PU_WORDS[i + 1];
250-
assertLess(word, PU_WORDS[i + 1], `error between ${word} and ${other}`);
251-
}
252-
}
253-
});
254-
Deno.test("no ali", () => {
255-
for (const word of PU_WORDS) {
256-
assertNotEquals(word, "ali");
257-
}
258-
});
259239
Deno.test("small parser", () => {
260240
const space = match(/\s*/, "space");
261241
const parser = sequence(

src/parser/ucsur.ts

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ export const START_OF_REVERSE_LONG_GLYPH = "\u{F199A}";
1414
export const END_OF_REVERSE_LONG_GLYPH = "\u{F199B}";
1515
export const UCSUR_MIDDLE_DOT = "\u{F199C}";
1616
export const UCSUR_COLON = "\u{F199D}";
17+
export const COMBINING_TALLY_MARK = "\u{F199E}";
1718

1819
export const SPECIAL_UCSUR_DESCRIPTIONS = {
1920
[START_OF_CARTOUCHE]: "start of cartouche",
@@ -30,6 +31,7 @@ export const SPECIAL_UCSUR_DESCRIPTIONS = {
3031
[END_OF_REVERSE_LONG_GLYPH]: "end of reverse long glyph",
3132
[UCSUR_MIDDLE_DOT]: "middle dot",
3233
[UCSUR_COLON]: "colon",
34+
[COMBINING_TALLY_MARK]: "combining tally mark",
3335
};
3436

3537
export type SpecialUcsur = keyof typeof SPECIAL_UCSUR_DESCRIPTIONS;
@@ -175,16 +177,45 @@ export const KU_SULI_WORDS = [
175177
"misikeke",
176178
"ku",
177179
];
178-
export const KU_LILI_WORDS = [
180+
export const VARIANTS = [
181+
"ni",
182+
"ni",
183+
"ni",
184+
"sewi",
185+
];
186+
export const OTHER_WORDS = [
179187
"pake",
180188
"apeja",
181189
"majuna",
182190
"powe",
191+
"linluwi",
192+
"kiki",
193+
"su",
194+
"isipin",
195+
"kamalawala",
196+
"kapesi",
197+
"melome",
198+
"mijomi",
199+
"misa",
200+
"nimisin",
201+
"nja",
202+
"oke",
203+
"omekapo",
204+
"puwa",
205+
"san",
206+
"taki",
207+
"te",
208+
"to",
209+
"unu",
210+
"usawi",
211+
"wa",
212+
"wuwojiti",
213+
"yupekosi",
183214
];
184215
export const UCSUR_TO_LATIN = new Map(
185216
[
186-
{ start: 0xF1900, words: [...PU_WORDS, ...KU_SULI_WORDS] },
187-
{ start: 0xF19A0, words: KU_LILI_WORDS },
217+
{ start: 0xF1900, words: [...PU_WORDS, ...KU_SULI_WORDS, ...VARIANTS] },
218+
{ start: 0xF19A0, words: OTHER_WORDS },
188219
]
189220
.flatMap(({ start, words }) =>
190221
words.map((latin, i) => [String.fromCodePoint(start + i), latin] as const)

0 commit comments

Comments
 (0)