Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 49 additions & 2 deletions lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,11 @@ fn parse_escape<const REGEX: bool>(
) -> Result<usize> {
let mut count = 2;
let is_oct = |x: char| ('0'..'8').contains(&x);
let is_hex = |i: usize| {
slice
.get(i)
.is_some_and(|&x| (x as char).is_ascii_hexdigit())
};
let is_slice_oct = |i| slice.get(i).map(|&x| x as char).is_some_and(is_oct);
let Some(to_escape) = slice.get(1).map(|x| *x as char) else {
return Err(LexingError::UnexpectedEof);
Expand Down Expand Up @@ -417,15 +422,57 @@ fn parse_escape<const REGEX: bool>(
.take(count - 1)
.fold(0, |acc, digit| acc * 8 + digit - b'0') as char
}
'x' if !posix_strict => todo!(),
'u' => todo!(),
'x' if !posix_strict => {
let num_digits = is_hex(2) as usize + (is_hex(2) && is_hex(3)) as usize;

if num_digits == 0 {
'x'
} else {
count += num_digits;

let value = parse_hex_digits(&slice[2..2 + num_digits]) as u8;
value as char
}
}
'u' if !posix_strict => {
let num_digits = (2..=9).take_while(|&i| is_hex(i)).count();

if num_digits == 0 {
'u'
} else {
count += num_digits;

let codepoint = parse_hex_digits(&slice[2..2 + num_digits]);

// FIXME: assumes UTF-8 locale; replacement character and encoding may differ
// for non-UTF-8 locales.
let c = char::from_u32(codepoint).unwrap_or('\u{FFFD}');
let mut buf = [0u8; 4];
let encoded = c.encode_utf8(&mut buf);
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This assumes the system locale is UTF-8. That is fine for now; handling non-UTF-8 locales can be done in a subsequent PR, given how broad that change would be. Note, however, that the code already makes quite a few assumptions about UTF-8.

out.extend_from_slice_copy(encoded.as_bytes());

return Ok(count);
}
}
// Unspecified by POSIX; we ditto GNU.
c => c, // TODO: Output warning
};
out.push(escaped as u8);
Ok(count)
}

/// Folds a run of ASCII hexadecimal digits into a single `u32`.
///
/// Digits are read most-significant first; both `a`-`f` and `A`-`F` are
/// accepted. An empty slice yields `0`.
///
/// # Panics
///
/// Hits `unreachable!()` if `slice` contains a byte that is not an ASCII
/// hexadecimal digit.
fn parse_hex_digits(slice: &[u8]) -> u32 {
    let mut value = 0u32;
    for &byte in slice {
        // Shift the running total first so the evaluation order matches a
        // plain `acc * 16 + digit` expression.
        let shifted = value * 16;
        let nibble = match char::from(byte).to_digit(16) {
            Some(n) => n,
            None => unreachable!(),
        };
        value = shifted + nibble;
    }
    value
}

fn parse_ident<'a>(lex: &mut Lexer<'a>, index: impl SliceIndex<[u8], Output = [u8]>) -> &'a str {
accept_operator(lex);
// SAFETY: The regex matching ensures it is ASCII.
Expand Down
117 changes: 117 additions & 0 deletions lexer/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -259,3 +259,120 @@ fn lexer_test_regex_ambiguity() {
]
);
}

/// A two-digit `\x41` escape inside a string literal lexes to the byte `A`.
#[test]
fn lexer_test_hex_escape() {
    let arena = Bump::new();
    let tokens = lex(b"\"\\x41\"", &arena, false, false);
    let expected = [Token::String(b"A".into())];
    assert_eq!(&tokens, &expected);
}

/// An uppercase hex digit in `\x4F` is accepted and lexes to the byte `O`.
#[test]
fn lexer_test_hex_escape_uppercase() {
    let arena = Bump::new();
    let tokens = lex(b"\"\\x4F\"", &arena, false, false);
    let expected = [Token::String(b"O".into())];
    assert_eq!(&tokens, &expected);
}

/// A `\x` escape followed by a single hex digit (`\x9`) lexes to byte 0x09.
#[test]
fn lexer_test_hex_escape_single_digit() {
    let arena = Bump::new();
    let tokens = lex(b"\"\\x9\"", &arena, false, false);
    let expected = [Token::String(b"\x09".into())];
    assert_eq!(&tokens, &expected);
}

/// With the third `lex` argument set to `true` (POSIX-strict mode, going by
/// the test name), `\x41` is not decoded: the string holds the bytes `x41`.
#[test]
fn lexer_test_hex_escape_posix_strict() {
    let arena = Bump::new();
    let tokens = lex(b"\"\\x41\"", &arena, true, false);
    let expected = [Token::String(b"x41".into())];
    assert_eq!(&tokens, &expected);
}

/// With the third `lex` argument set to `true` (POSIX-strict mode, going by
/// the test name), `\u0041` is not decoded: the string holds `u0041`.
#[test]
fn lexer_test_unicode_escape_posix_strict() {
    let arena = Bump::new();
    let tokens = lex(b"\"\\u0041\"", &arena, true, false);
    let expected = [Token::String(b"u0041".into())];
    assert_eq!(&tokens, &expected);
}

/// A `\u0041` escape naming an ASCII code point lexes to the single byte `A`.
#[test]
fn lexer_test_unicode_escape_ascii() {
    let arena = Bump::new();
    let tokens = lex(b"\"\\u0041\"", &arena, false, false);
    let expected = [Token::String(b"A".into())];
    assert_eq!(&tokens, &expected);
}

/// `\u00e9` lexes to the UTF-8 encoding of U+00E9 (a two-byte sequence).
#[test]
fn lexer_test_unicode_escape_two_byte() {
    let arena = Bump::new();
    let tokens = lex(b"\"\\u00e9\"", &arena, false, false);
    let expected = [Token::String("\u{00e9}".as_bytes().into())];
    assert_eq!(&tokens, &expected);
}

/// `\u4e2d` lexes to the UTF-8 encoding of U+4E2D (a three-byte sequence).
#[test]
fn lexer_test_unicode_escape_three_byte() {
    let arena = Bump::new();
    let tokens = lex(b"\"\\u4e2d\"", &arena, false, false);
    let expected = [Token::String("\u{4e2d}".as_bytes().into())];
    assert_eq!(&tokens, &expected);
}

/// An uppercase hex digit in `\u004F` is accepted and lexes to the byte `O`.
#[test]
fn lexer_test_unicode_escape_uppercase() {
    let arena = Bump::new();
    let tokens = lex(b"\"\\u004F\"", &arena, false, false);
    let expected = [Token::String(b"O".into())];
    assert_eq!(&tokens, &expected);
}

/// A `\u` escape followed by a single hex digit (`\u9`) lexes to U+0009.
#[test]
fn lexer_test_unicode_escape_single_digit() {
    let arena = Bump::new();
    let tokens = lex(b"\"\\u9\"", &arena, false, false);
    let expected = [Token::String("\u{9}".as_bytes().into())];
    assert_eq!(&tokens, &expected);
}

/// A `\x` escape with no hex digits after it lexes to the literal byte `x`.
#[test]
fn lexer_test_hex_escape_no_digits() {
    let arena = Bump::new();
    let tokens = lex(b"\"\\x\"", &arena, false, false);
    let expected = [Token::String(b"x".into())];
    assert_eq!(&tokens, &expected);
}

/// A `\u` escape with no hex digits after it lexes to the literal byte `u`.
#[test]
fn lexer_test_unicode_escape_no_digits() {
    let arena = Bump::new();
    let tokens = lex(b"\"\\u\"", &arena, false, false);
    let expected = [Token::String(b"u".into())];
    assert_eq!(&tokens, &expected);
}

/// A `\u` escape consumes all eight hex digits: `\u00000032` lexes to `2`.
#[test]
fn lexer_test_unicode_escape_eight_digits() {
    let arena = Bump::new();
    let tokens = lex(b"\"\\u00000032\"", &arena, false, false);
    let expected = [Token::String(b"2".into())];
    assert_eq!(&tokens, &expected);
}
Loading