Skip to content

Commit 379ccb8

Browse files
committed
implement new UCSUR characters, closes #90
1 parent 61f6911 commit 379ccb8

4 files changed

Lines changed: 60 additions & 24 deletions

File tree

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ NOTE: Before publishing:
1616
The latest on-development version can be accessed by building the source code.
1717
On this on-development version, things can be broken.
1818

19+
- Implement nasin sitelen kalama pi linja lili a.k.a. cartouche tally marks.
20+
- Implement a bunch of words from the new UCSUR specification. This doesn't mean
21+
these will be recognized; only Linku is used for inclusion criteria. However,
22+
you can add these to the custom dictionary, then you can use UCSUR characters for
23+
these words.
1924
- Remove X ala X partial parsing setting. It is now always turned on.
2025
- Warn user when X ala X is used. It is still not implemented.
2126
- Link related projects.

src/parser/lexer.ts

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import {
2626
} from "./punctuation.ts";
2727
import { Token } from "./token.ts";
2828
import {
29+
COMBINING_TALLY_MARK,
2930
END_OF_CARTOUCHE,
3031
END_OF_LONG_GLYPH,
3132
END_OF_REVERSE_LONG_GLYPH,
@@ -100,6 +101,25 @@ const punctuation = choiceOnlyOne(
100101
)
101102
.map((punctuation): Token => ({ type: "punctuation", punctuation }));
102103
const cartoucheElement = choiceOnlyOne(
104+
sequence(
105+
singleUcsurWord,
106+
count(
107+
allAtLeastOnce(
108+
matchString(
109+
COMBINING_TALLY_MARK,
110+
SPECIAL_UCSUR_DESCRIPTIONS[COMBINING_TALLY_MARK],
111+
)
112+
.skip(spaces),
113+
),
114+
),
115+
)
116+
.map(([word, tallyMarks]) => {
117+
if (tallyMarks <= word.length) {
118+
return word.slice(0, tallyMarks);
119+
} else {
120+
throw new UnrecognizedError("excess dots");
121+
}
122+
}),
103123
singleUcsurWord
104124
.skip(match(NSK_COLON, "full width colon").skip(spaces)),
105125
sequence(
@@ -112,7 +132,7 @@ const cartoucheElement = choiceOnlyOne(
112132
)
113133
.map(([word, dots]) => {
114134
const count = /^[aeiou]/.test(word) ? dots + 1 : dots;
115-
const morae = word.match(/[jklmnpstw]?[aeiou]|n/g)!;
135+
const morae = word.match(/[jklmnpstwy]?[aeiou]|n/g)!;
116136
if (count <= morae.length) {
117137
return morae.slice(0, count).join("");
118138
} else {

src/parser/test.ts

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
// this code is Deno only
22

33
import { assertEquals } from "@std/assert/equals";
4-
import { assertLess } from "@std/assert/less";
54
import { assertNotEquals } from "@std/assert/not-equals";
65
import { assertThrows } from "@std/assert/throws";
76
import { parser } from "./parser.ts";
@@ -15,7 +14,6 @@ import {
1514
matchString,
1615
sequence,
1716
} from "./parser_lib.ts";
18-
import { KU_LILI_WORDS, KU_SULI_WORDS, PU_WORDS } from "./ucsur.ts";
1917

2018
Deno.test("AST all distinct", () => {
2119
// examples gathered from https://github.com/kilipan/nasin-toki
@@ -238,24 +236,6 @@ Deno.test("parser all error", () => {
238236
assertThrows(() => parser.parse(sentence).collect());
239237
}
240238
});
241-
Deno.test("ucsur have proper length", () => {
242-
assertEquals(PU_WORDS.length, 120);
243-
assertEquals(KU_SULI_WORDS.length, 17);
244-
assertEquals(KU_LILI_WORDS.length, 4);
245-
});
246-
Deno.test("ucsur ordered", () => {
247-
for (const [i, word] of PU_WORDS.entries()) {
248-
if (i < PU_WORDS.length - 1) {
249-
const other = PU_WORDS[i + 1];
250-
assertLess(word, PU_WORDS[i + 1], `error between ${word} and ${other}`);
251-
}
252-
}
253-
});
254-
Deno.test("no ali", () => {
255-
for (const word of PU_WORDS) {
256-
assertNotEquals(word, "ali");
257-
}
258-
});
259239
Deno.test("small parser", () => {
260240
const space = match(/\s*/, "space");
261241
const parser = sequence(

src/parser/ucsur.ts

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ export const START_OF_REVERSE_LONG_GLYPH = "\u{F199A}";
1414
export const END_OF_REVERSE_LONG_GLYPH = "\u{F199B}";
1515
export const UCSUR_MIDDLE_DOT = "\u{F199C}";
1616
export const UCSUR_COLON = "\u{F199D}";
17+
export const COMBINING_TALLY_MARK = "\u{F199E}";
1718

1819
export const SPECIAL_UCSUR_DESCRIPTIONS = {
1920
[START_OF_CARTOUCHE]: "start of cartouche",
@@ -30,6 +31,7 @@ export const SPECIAL_UCSUR_DESCRIPTIONS = {
3031
[END_OF_REVERSE_LONG_GLYPH]: "end of reverse long glyph",
3132
[UCSUR_MIDDLE_DOT]: "middle dot",
3233
[UCSUR_COLON]: "colon",
34+
[COMBINING_TALLY_MARK]: "combining tally mark",
3335
};
3436

3537
export type SpecialUcsur = keyof typeof SPECIAL_UCSUR_DESCRIPTIONS;
@@ -175,16 +177,45 @@ export const KU_SULI_WORDS = [
175177
"misikeke",
176178
"ku",
177179
];
178-
export const KU_LILI_WORDS = [
180+
export const VARIANTS = [
181+
"ni",
182+
"ni",
183+
"ni",
184+
"sewi",
185+
];
186+
export const OTHER_WORDS = [
179187
"pake",
180188
"apeja",
181189
"majuna",
182190
"powe",
191+
"linluwi",
192+
"kiki",
193+
"su",
194+
"isipin",
195+
"kamalawala",
196+
"kapesi",
197+
"melome",
198+
"mijomi",
199+
"misa",
200+
"nimisin",
201+
"nja",
202+
"oke",
203+
"omekapo",
204+
"puwa",
205+
"san",
206+
"taki",
207+
"te",
208+
"to",
209+
"unu",
210+
"usawi",
211+
"wa",
212+
"wuwojiti",
213+
"yupekosi",
183214
];
184215
export const UCSUR_TO_LATIN = new Map(
185216
[
186-
{ start: 0xF1900, words: [...PU_WORDS, ...KU_SULI_WORDS] },
187-
{ start: 0xF19A0, words: KU_LILI_WORDS },
217+
{ start: 0xF1900, words: [...PU_WORDS, ...KU_SULI_WORDS, ...VARIANTS] },
218+
{ start: 0xF19A0, words: OTHER_WORDS },
188219
]
189220
.flatMap(({ start, words }) =>
190221
words.map((latin, i) => [String.fromCodePoint(start + i), latin] as const)

0 commit comments

Comments
 (0)