Skip to content

Commit 008538a

Browse files
authored
refactor: 識別子の処理に関するリファクタリング (#970)
* CharStreamでサロゲートペアを1文字として扱う
* JSON5の識別子を使えるように
* columnをUTF-16コード単位に変更
* 使用中、使用予定の予約語のエラーを共通に
* エスケープされた予約語をキーに持つオブジェクトリテラルのテスト
* isIdentifierStart, isIdentifierPart関数を削除
* 文字種の追加を取り消し
* Unicodeエスケープシーケンスの検証処理を削除
1 parent ed72d59 commit 008538a

10 files changed

Lines changed: 784 additions & 223 deletions

File tree

src/parser/plugins/validate-keyword.ts

Lines changed: 60 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,34 @@ import { visitNode } from '../visit.js';
33
import type * as Ast from '../../node.js';
44

55
// 予約語となっている識別子があるかを確認する。
6-
// - キーワードは字句解析の段階でそれぞれのKeywordトークンとなるため除外
6+
// - キーワードは字句解析の段階でそれぞれのKeywordトークンとなるが、エスケープシーケンスを含む場合はIdentifierトークンとなるので検証を行う。
77
// - 文脈キーワードは識別子に利用できるため除外
88

99
const reservedWord = [
10+
// 使用中の語
11+
'null',
12+
'true',
13+
'false',
14+
'each',
15+
'for',
16+
'loop',
17+
'do',
18+
'while',
19+
'break',
20+
'continue',
21+
'match',
22+
'case',
23+
'default',
24+
'if',
25+
'elif',
26+
'else',
27+
'return',
28+
'eval',
29+
'var',
30+
'let',
31+
'exists',
32+
33+
// 使用予定の語
1034
'as',
1135
'async',
1236
'attr',
@@ -52,25 +76,36 @@ const reservedWord = [
5276
'new',
5377
];
5478

55-
function throwReservedWordError(name: string, loc: Ast.Loc): void {
56-
throw new AiScriptSyntaxError(`Reserved word "${name}" cannot be used as variable name.`, loc.start);
79+
/**
 * Rejects `name` when it is listed in `reservedWord`.
 *
 * @param name - Identifier text to check.
 * @param pos - Source position forwarded to the error.
 * @throws Whatever `throwReservedWordError` throws when `name` is reserved.
 */
function validateName(name: string, pos: Ast.Pos): void {
	const isReserved = reservedWord.some(word => word === name);
	if (!isReserved) {
		return;
	}
	throwReservedWordError(name, pos);
}
84+
85+
/**
 * Reserved-word check for type names.
 *
 * The name `null` is exempt from the check; every other name is handed to
 * `validateName` unchanged.
 *
 * @param name - Type name to check.
 * @param pos - Source position forwarded to `validateName`.
 */
function validateTypeName(name: string, pos: Ast.Pos): void {
	if (name !== 'null') {
		validateName(name, pos);
	}
}
91+
92+
/**
 * Raises the syntax error reported when a reserved word is used as a name.
 *
 * @param name - The offending reserved word, embedded in the message.
 * @param pos - Source position attached to the error.
 * @throws AiScriptSyntaxError always; this function never returns.
 */
function throwReservedWordError(name: string, pos: Ast.Pos): never {
	const message = `Reserved word "${name}" cannot be used as variable name.`;
	throw new AiScriptSyntaxError(message, pos);
}
5895

5996
function validateDest(node: Ast.Node): Ast.Node {
6097
return visitNode(node, node => {
6198
switch (node.type) {
6299
case 'null': {
63-
throwReservedWordError(node.type, node.loc);
100+
throwReservedWordError(node.type, node.loc.start);
64101
break;
65102
}
66103
case 'bool': {
67-
throwReservedWordError(`${node.value}`, node.loc);
104+
throwReservedWordError(`${node.value}`, node.loc.start);
68105
break;
69106
}
70107
case 'identifier': {
71-
if (reservedWord.includes(node.name)) {
72-
throwReservedWordError(node.name, node.loc);
73-
}
108+
validateName(node.name, node.loc.start);
74109
break;
75110
}
76111
}
@@ -81,9 +116,7 @@ function validateDest(node: Ast.Node): Ast.Node {
81116

82117
function validateTypeParams(node: Ast.Fn | Ast.FnTypeSource): void {
83118
for (const typeParam of node.typeParams) {
84-
if (reservedWord.includes(typeParam.name)) {
85-
throwReservedWordError(typeParam.name, node.loc);
86-
}
119+
validateTypeName(typeParam.name, node.loc.start);
87120
}
88121
}
89122

@@ -97,48 +130,46 @@ function validateNode(node: Ast.Node): Ast.Node {
97130
case 'attr':
98131
case 'identifier':
99132
case 'prop': {
100-
if (reservedWord.includes(node.name)) {
101-
throwReservedWordError(node.name, node.loc);
102-
}
133+
validateName(node.name, node.loc.start);
103134
break;
104135
}
105136
case 'meta': {
106-
if (node.name != null && reservedWord.includes(node.name)) {
107-
throwReservedWordError(node.name, node.loc);
137+
if (node.name != null) {
138+
validateName(node.name, node.loc.start);
108139
}
109140
break;
110141
}
111142
case 'each': {
112-
if (node.label != null && reservedWord.includes(node.label)) {
113-
throwReservedWordError(node.label, node.loc);
143+
if (node.label != null) {
144+
validateName(node.label, node.loc.start);
114145
}
115146
validateDest(node.var);
116147
break;
117148
}
118149
case 'for': {
119-
if (node.label != null && reservedWord.includes(node.label)) {
120-
throwReservedWordError(node.label, node.loc);
150+
if (node.label != null) {
151+
validateName(node.label, node.loc.start);
121152
}
122-
if (node.var != null && reservedWord.includes(node.var)) {
123-
throwReservedWordError(node.var, node.loc);
153+
if (node.var != null) {
154+
validateName(node.var, node.loc.start);
124155
}
125156
break;
126157
}
127158
case 'loop': {
128-
if (node.label != null && reservedWord.includes(node.label)) {
129-
throwReservedWordError(node.label, node.loc);
159+
if (node.label != null) {
160+
validateName(node.label, node.loc.start);
130161
}
131162
break;
132163
}
133164
case 'break': {
134-
if (node.label != null && reservedWord.includes(node.label)) {
135-
throwReservedWordError(node.label, node.loc);
165+
if (node.label != null) {
166+
validateName(node.label, node.loc.start);
136167
}
137168
break;
138169
}
139170
case 'continue': {
140-
if (node.label != null && reservedWord.includes(node.label)) {
141-
throwReservedWordError(node.label, node.loc);
171+
if (node.label != null) {
172+
validateName(node.label, node.loc.start);
142173
}
143174
break;
144175
}
@@ -150,9 +181,7 @@ function validateNode(node: Ast.Node): Ast.Node {
150181
break;
151182
}
152183
case 'namedTypeSource': {
153-
if (reservedWord.includes(node.name)) {
154-
throwReservedWordError(node.name, node.loc);
155-
}
184+
validateTypeName(node.name, node.loc.start);
156185
break;
157186
}
158187
case 'fnTypeSource': {

src/parser/scanner.ts

Lines changed: 77 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { AiScriptSyntaxError, AiScriptUnexpectedEOFError } from '../error.js';
2+
import { decodeUnicodeEscapeSequence } from '../utils/characters.js';
23
import { CharStream } from './streams/char-stream.js';
34
import { TOKEN, TokenKind } from './token.js';
45
import { unexpectedTokenError } from './utils.js';
@@ -9,7 +10,9 @@ import type { Token, TokenPosition } from './token.js';
910
const spaceChars = [' ', '\t'];
1011
const lineBreakChars = ['\r', '\n'];
1112
const digit = /^[0-9]$/;
12-
const wordChar = /^[A-Za-z0-9_]$/;
13+
const identifierStart = /^[A-Za-z_]$/u;
14+
const identifierPart = /^[A-Za-z0-9_]$/u;
15+
const hexDigit = /^[0-9a-fA-F]$/;
1316
const exponentIndicatorPattern = /^[eE]$/;
1417

1518
/**
@@ -282,6 +285,11 @@ export class Scanner implements ITokenStream {
282285
}
283286
case '\\': {
284287
this.stream.next();
288+
if (!this.stream.eof && (this.stream.char as string) === 'u') {
289+
this.stream.prev();
290+
const wordToken = this.tryReadWord(hasLeftSpacing);
291+
if (wordToken) return wordToken;
292+
}
285293
return TOKEN(TokenKind.BackSlash, pos, { hasLeftSpacing });
286294
}
287295
case ']': {
@@ -332,17 +340,29 @@ export class Scanner implements ITokenStream {
332340

333341
private tryReadWord(hasLeftSpacing: boolean): Token | undefined {
334342
// read a word
335-
let value = '';
343+
if (this.stream.eof) {
344+
return;
345+
}
336346

337347
const pos = this.stream.getPos();
338348

339-
while (!this.stream.eof && wordChar.test(this.stream.char)) {
340-
value += this.stream.char;
341-
this.stream.next();
342-
}
343-
if (value.length === 0) {
349+
let rawValue = this.tryReadIdentifierStart();
350+
if (rawValue === undefined) {
344351
return;
345352
}
353+
while (!(this.stream.eof as boolean)) {
354+
const matchedIdentifierPart = this.tryReadIdentifierPart();
355+
if (matchedIdentifierPart === undefined) {
356+
break;
357+
}
358+
rawValue += matchedIdentifierPart;
359+
}
360+
361+
const value = decodeUnicodeEscapeSequence(rawValue);
362+
if (value !== rawValue) {
363+
throw new AiScriptSyntaxError(`Invalid identifier: "${rawValue}"`, pos);
364+
}
365+
346366
// check word kind
347367
switch (value) {
348368
case 'null': {
@@ -414,6 +434,56 @@ export class Scanner implements ITokenStream {
414434
}
415435
}
416436

437+
/**
 * Tries to consume one identifier-start element at the current stream position.
 *
 * An element is either a single character matching `identifierStart`, or a
 * backslash followed by a Unicode escape sequence. The escape is returned in
 * its raw, undecoded form (`\uXXXX`).
 *
 * @returns The consumed raw text, or `undefined` (nothing consumed) when the
 *   stream is at EOF or the current character starts neither form.
 * @throws AiScriptSyntaxError from `readUnicodeEscapeSequence` when a backslash
 *   is not followed by a well-formed escape.
 */
private tryReadIdentifierStart(): string | undefined {
	if (this.stream.eof) {
		return undefined;
	}

	const current = this.stream.char;

	// A backslash can never match `identifierStart`, so the two branches are
	// mutually exclusive and may be tested in either order.
	if (current === '\\') {
		this.stream.next();
		return '\\' + this.readUnicodeEscapeSequence();
	}

	if (!identifierStart.test(current)) {
		return undefined;
	}

	this.stream.next();
	return current;
}
452+
453+
/**
 * Tries to consume one identifier-part element at the current stream position.
 *
 * Anything accepted by `tryReadIdentifierStart` is accepted here as well;
 * otherwise a single character matching `identifierPart` is consumed.
 *
 * @returns The consumed raw text, or `undefined` (nothing consumed) when the
 *   stream is at EOF or the current character cannot continue an identifier.
 * @throws AiScriptSyntaxError propagated from `tryReadIdentifierStart` for a
 *   malformed escape sequence.
 */
private tryReadIdentifierPart(): string | undefined {
	if (this.stream.eof) {
		return undefined;
	}

	const startElement = this.tryReadIdentifierStart();
	if (startElement !== undefined) {
		return startElement;
	}

	const current = this.stream.char;
	if (!identifierPart.test(current)) {
		return undefined;
	}

	this.stream.next();
	return current;
}
468+
469+
/**
 * Consumes the `uXXXX` portion of a Unicode escape sequence.
 *
 * The stream must be positioned on the `u`; the caller has already consumed
 * the leading backslash. Exactly four hexadecimal digits are read after it.
 *
 * @returns The raw text `u` followed by the four digits as written.
 * @throws AiScriptSyntaxError when the stream is at EOF or the current
 *   character is not `u`, or when any of the four following characters is
 *   missing or not a hexadecimal digit.
 */
private readUnicodeEscapeSequence(): `u${string}` {
	const startsWithU = !this.stream.eof && (this.stream.char as string) === 'u';
	if (!startsWithU) {
		throw new AiScriptSyntaxError('character "u" expected', this.stream.getPos());
	}
	this.stream.next();

	const digits: string[] = [];
	while (digits.length < 4) {
		const isHex = !this.stream.eof && hexDigit.test(this.stream.char);
		if (!isHex) {
			throw new AiScriptSyntaxError('hexadecimal digit expected', this.stream.getPos());
		}
		digits.push(this.stream.char);
		this.stream.next();
	}

	return `u${digits.join('')}`;
}
486+
417487
private tryReadDigits(hasLeftSpacing: boolean): Token | undefined {
418488
let wholeNumber = '';
419489
let fractional = '';

0 commit comments

Comments
 (0)