Skip to content

Commit c5b6c08

Browse files
committed
lexers: Add a GNU Assembler lexer
1 parent 54b6e2b commit c5b6c08

2 files changed

Lines changed: 109 additions & 0 deletions

File tree

elixir/lexers/__main__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
lexer = lexers.DTSLexer(f.read())
1818
elif filename.endswith('Kconfig'):
1919
lexer = lexers.KconfigLexer(f.read())
20+
elif filename.endswith(('.s', '.S')):
21+
lexer = lexers.GasLexer(f.read())
2022
else:
2123
raise Exception("no lexer for filetype")
2224

elixir/lexers/lexers.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,3 +255,110 @@ def __init__(self, code):
255255
def lex(self):
    """Run ``simple_lexer`` over ``self.code`` using this lexer's ``self.rules``."""
    rules, code = self.rules, self.code
    return simple_lexer(rules, code)
257257

258+
259+
# https://sourceware.org/binutils/docs/as.html#Syntax
260+
class GasLexer:
    """Lexer for GNU Assembler (gas) source files.

    The token rules are split into three groups that ``lex`` concatenates in
    this order:

    1. ``rules_before_comments`` - whitespace, operators that would otherwise
       be mistaken for comment starters, C preprocessor directives and slash
       comments.
    2. architecture specific single-line comment rules, generated by
       ``get_arch_rules`` from ``gasm_comment_chars_map``.
    3. ``rules_after_comments`` - strings, numbers, identifiers, punctuation.

    The inline comments on the rules ("don't interpret ... as a comment")
    presumably rely on ``simple_lexer`` giving precedence to earlier rules -
    confirm against ``simple_lexer``.
    """

    # https://sourceware.org/binutils/docs/as.html#Symbol-Intro
    # apparently dots are okay, BUT ctags removes the first dot from labels, for example. same with dollars
    # /musl/v1.2.5/source/src/string/aarch64/memcpy.S#L92
    gasm_identifier = r'[a-zA-Z0-9_][a-zA-Z0-9_$.]*'

    # Optional leading zero, one letter, optional sign, then digits around a
    # mandatory dot, and an optional exponent.
    # NOTE(review): in this raw string `\\s*` is a literal backslash followed
    # by zero or more 's' characters, not "backslash + whitespace". It still
    # matches a backslash directly followed by a newline - confirm the intent.
    gasm_flonum = r'0?[a-zA-Z][+-]?([0-9]|\\s*\n\s*)*\.([0-9]|\\s*\n\s*)*([eE][+-]?[0-9]+)?'
    gasm_number = regex_or(gasm_flonum, shared.common_hexidecimal_integer, shared.common_binary_integer,
                           shared.common_decimal_integer)

    # A single quote followed by one (possibly backslash-escaped) character;
    # there is no closing quote in the pattern.
    gasm_char = r"'(\\.|.|\n)"
    gasm_string = f'(({ shared.double_quote_string_with_escapes })|({ gasm_char }))'

    # Maps an architecture name to the markers that start a single-line
    # comment. Each entry is either a ready-made regex fragment (e.g. r'#\s')
    # or a single literal character, which get_arch_rules escapes if needed.
    # A leading '^' is NOT a regex anchor here: it tells get_arch_rules to
    # wrap the rest of the marker in FirstInLine.
    gasm_comment_chars_map = {
        'generic': (r'#\s',),

        'nios2': (r'#',),
        'openrisc': (r'#',),
        'powerpc': (r'#',),
        's390': (r'#',),
        'xtensa': (r'#',),
        'microblaze': (r'#',),
        'mips': (r'#',),
        'alpha': (r'#',),
        'csky': (r'#',),
        # BUT double pipe in macros is an operator... and # not in the first line in
        # /linux/v6.10.7/source/arch/m68k/ifpsp060/src/fplsp.S
        'm68k': ('|', '^#', r'#\s'),
        'arc': ('# ', ';'),

        # https://sourceware.org/binutils/docs/as.html#HPPA-Syntax
        # /linux/v6.10.7/source/arch/parisc/kernel/perf_asm.S#L28
        'parisc': (';',),
        'x86': (';',),
        'tic6x': (';', '*'),  # c6x, tms320, although the star is sketchy

        # in below, # can be a comment only if the first character of the line

        # https://sourceware.org/binutils/docs/as.html#SH-Syntax
        # /linux/v6.10.7/source/arch/sh/kernel/head_32.S#L58
        'sh': ('!', '^#'),
        # https://sourceware.org/binutils/docs/as.html#Sparc_002dSyntax
        # /linux/v6.10.7/source/arch/sparc/lib/memset.S#L125
        'sparc': ('!', '^#'),
        # used in ARM https://sourceware.org/binutils/docs/as.html#ARM-Syntax
        # /linux/v6.10.7/source/arch/arm/mach-sa1100/sleep.S#L33
        'arm32': ('@', '^#'),
        'cris': (';', '^#'),
        'avr': (';', '^#'),
        # blackfin, tile
    }

    gasm_punctuation = r'[.,\[\]()<>{}%&+*!|@#$;:^/\\=~-]'
    # TODO make sure all relevant directives are listed here
    gasm_preprocessor = r'#[ \t]*(define|ifdef|ifndef|undef|if|else|elif|endif)'

    # Rules placed ahead of the architecture specific comment rules.
    rules_before_comments = [
        (shared.whitespace, TokenType.WHITESPACE),
        # don't interpret macro concatenate as a comment
        ('##', TokenType.PUNCTUATION),
        # don't interpret or as a comment
        (r'\|\|', TokenType.PUNCTUATION),
        (FirstInLine(regex_or(shared.c_preproc_include, shared.c_preproc_warning_and_error)), TokenType.SPECIAL),
        (FirstInLine(gasm_preprocessor), TokenType.SPECIAL),
        (shared.common_slash_comment, TokenType.COMMENT),
    ]

    # Rules placed after the architecture specific comment rules.
    rules_after_comments = [
        (gasm_string, TokenType.STRING),
        (gasm_number, TokenType.NUMBER),
        (gasm_identifier, TokenType.IDENTIFIER),
        (gasm_punctuation, TokenType.PUNCTUATION),
    ]

    def __init__(self, code, arch='generic'):
        """Store the source text and select the comment markers for ``arch``.

        Args:
            code: the assembler source to tokenize.
            arch: a key of ``gasm_comment_chars_map``; defaults to 'generic'.

        Raises:
            KeyError: if ``arch`` is not a key of ``gasm_comment_chars_map``.
        """
        self.code = code
        self.comment_chars = self.gasm_comment_chars_map[arch]

    def get_arch_rules(self):
        """Build the single-line comment rules for the selected architecture.

        Returns:
            A list of ``(pattern, TokenType.COMMENT)`` tuples, one per marker in
            ``self.comment_chars``, in the same order. Markers that start with
            '^' produce a ``FirstInLine`` pattern, the others a plain regex
            string.
        """
        regex_chars = '*?+^.$\\[]|()'

        def add_slash(ch):
            # Backslash-escape a marker that is a regex metacharacter. The
            # check is a substring test, so multi-character markers that are
            # already regex fragments (e.g. r'#\s') pass through unchanged.
            return '\\' + ch if ch in regex_chars else ch

        result = []
        for comment_char in self.comment_chars:
            if comment_char[0] == '^':
                # Use everything after the '^' marker, not just the next
                # character, so longer anchored markers are not truncated.
                pattern = FirstInLine(
                    add_slash(comment_char[1:]) + shared.singleline_comment_with_escapes_base
                )
            else:
                pattern = add_slash(comment_char) + shared.singleline_comment_with_escapes_base
            result.append((pattern, TokenType.COMMENT))

        return result

    def lex(self):
        """Tokenize ``self.code`` by passing the combined rule list to ``simple_lexer``."""
        rules = (
            self.rules_before_comments
            + self.get_arch_rules()
            + self.rules_after_comments
        )

        return simple_lexer(rules, self.code)
364+

0 commit comments

Comments
 (0)