From 9104b5e4086d02cb552c6edbffb4f816268a75dd Mon Sep 17 00:00:00 2001 From: yor1xd Date: Thu, 21 May 2026 21:43:32 -0300 Subject: [PATCH] lexer: implement \x and \u numeric escape sequences --- lexer/src/lib.rs | 51 +++++++++++++++++++- lexer/src/tests.rs | 117 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 2 deletions(-) diff --git a/lexer/src/lib.rs b/lexer/src/lib.rs index abe7084..f982b0f 100644 --- a/lexer/src/lib.rs +++ b/lexer/src/lib.rs @@ -387,6 +387,11 @@ fn parse_escape( ) -> Result { let mut count = 2; let is_oct = |x: char| ('0'..'8').contains(&x); + let is_hex = |i: usize| { + slice + .get(i) + .is_some_and(|&x| (x as char).is_ascii_hexdigit()) + }; let is_slice_oct = |i| slice.get(i).map(|&x| x as char).is_some_and(is_oct); let Some(to_escape) = slice.get(1).map(|x| *x as char) else { return Err(LexingError::UnexpectedEof); @@ -417,8 +422,38 @@ fn parse_escape( .take(count - 1) .fold(0, |acc, digit| acc * 8 + digit - b'0') as char } - 'x' if !posix_strict => todo!(), - 'u' => todo!(), + 'x' if !posix_strict => { + let num_digits = is_hex(2) as usize + (is_hex(2) && is_hex(3)) as usize; + + if num_digits == 0 { + 'x' + } else { + count += num_digits; + + let value = parse_hex_digits(&slice[2..2 + num_digits]) as u8; + value as char + } + } + 'u' if !posix_strict => { + let num_digits = (2..=9).take_while(|&i| is_hex(i)).count(); + + if num_digits == 0 { + 'u' + } else { + count += num_digits; + + let codepoint = parse_hex_digits(&slice[2..2 + num_digits]); + + // FIXME: assumes UTF-8 locale; replacement character and encoding may differ + // for non-UTF-8 locales. + let c = char::from_u32(codepoint).unwrap_or('\u{FFFD}'); + let mut buf = [0u8; 4]; + let encoded = c.encode_utf8(&mut buf); + out.extend_from_slice_copy(encoded.as_bytes()); + + return Ok(count); + } + } // Unspecified by POSIX; we ditto GNU. c => c, // TODO: Output warning }; @@ -426,6 +461,18 @@ fn parse_escape( Ok(count) } +fn parse_hex_digits(slice: &[u8]) -> u32 { + slice.iter().fold(0u32, |acc, &digit| { + acc * 16 + + match digit { + b'0'..=b'9' => (digit - b'0') as u32, + b'a'..=b'f' => (digit - b'a' + 10) as u32, + b'A'..=b'F' => (digit - b'A' + 10) as u32, + _ => unreachable!(), + } + }) +} + fn parse_ident<'a>(lex: &mut Lexer<'a>, index: impl SliceIndex<[u8], Output = [u8]>) -> &'a str { accept_operator(lex); // SAFETY: The regex matching ensures it is ASCII. diff --git a/lexer/src/tests.rs b/lexer/src/tests.rs index ebeb435..d56e8c1 100644 --- a/lexer/src/tests.rs +++ b/lexer/src/tests.rs @@ -259,3 +259,120 @@ fn lexer_test_regex_ambiguity() { ] ); } + +#[test] +fn lexer_test_hex_escape() { + let arena = Bump::new(); + assert_eq!( + &lex(b"\"\\x41\"", &arena, false, false), + &[Token::String(b"A".into())] + ); +} + +#[test] +fn lexer_test_hex_escape_uppercase() { + let arena = Bump::new(); + assert_eq!( + &lex(b"\"\\x4F\"", &arena, false, false), + &[Token::String(b"O".into())] + ); +} + +#[test] +fn lexer_test_hex_escape_single_digit() { + let arena = Bump::new(); + assert_eq!( + &lex(b"\"\\x9\"", &arena, false, false), + &[Token::String(b"\x09".into())] + ); +} + +#[test] +fn lexer_test_hex_escape_posix_strict() { + let arena = Bump::new(); + assert_eq!( + &lex(b"\"\\x41\"", &arena, true, false), + &[Token::String(b"x41".into())] + ); +} + +#[test] +fn lexer_test_unicode_escape_posix_strict() { + let arena = Bump::new(); + assert_eq!( + &lex(b"\"\\u0041\"", &arena, true, false), + &[Token::String(b"u0041".into())] + ); +} + +#[test] +fn lexer_test_unicode_escape_ascii() { + let arena = Bump::new(); + assert_eq!( + &lex(b"\"\\u0041\"", &arena, false, false), + &[Token::String(b"A".into())] + ); +} + +#[test] +fn lexer_test_unicode_escape_two_byte() { + let arena = Bump::new(); + assert_eq!( + &lex(b"\"\\u00e9\"", &arena, false, false), + &[Token::String("\u{00e9}".as_bytes().into())] + ); +} + +#[test] +fn lexer_test_unicode_escape_three_byte() { + let arena = Bump::new(); + assert_eq!( + &lex(b"\"\\u4e2d\"", &arena, false, false), + &[Token::String("\u{4e2d}".as_bytes().into())] + ); +} + +#[test] +fn lexer_test_unicode_escape_uppercase() { + let arena = Bump::new(); + assert_eq!( + &lex(b"\"\\u004F\"", &arena, false, false), + &[Token::String(b"O".into())] + ); +} + +#[test] +fn lexer_test_unicode_escape_single_digit() { + let arena = Bump::new(); + assert_eq!( + &lex(b"\"\\u9\"", &arena, false, false), + &[Token::String("\u{9}".as_bytes().into())] + ); +} + +#[test] +fn lexer_test_hex_escape_no_digits() { + let arena = Bump::new(); + assert_eq!( + &lex(b"\"\\x\"", &arena, false, false), + &[Token::String(b"x".into())] + ); +} + +#[test] +fn lexer_test_unicode_escape_no_digits() { + let arena = Bump::new(); + assert_eq!( + &lex(b"\"\\u\"", &arena, false, false), + &[Token::String(b"u".into())] + ); +} + +#[test] +fn lexer_test_unicode_escape_eight_digits() { + let arena = Bump::new(); + assert_eq!( + &lex(b"\"\\u00000032\"", &arena, false, false), + &[Token::String(b"2".into())] + ); +}