From e9fe011e656588354fc569c14d777c69f3270788 Mon Sep 17 00:00:00 2001 From: ivanharvard <144486839+ivanharvard@users.noreply.github.com> Date: Mon, 28 Jul 2025 16:26:41 -0400 Subject: [PATCH 1/4] added inferencing of code in txt files --- compare50/_data.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/compare50/_data.py b/compare50/_data.py index c9b17d7..0a13b08 100644 --- a/compare50/_data.py +++ b/compare50/_data.py @@ -183,22 +183,29 @@ def tokens(self): def lexer(self): """Determine which Pygments lexer should be used.""" + def read_and_guess_lexer(): + try: + return pygments.lexers.guess_lexer(self.read()) + except pygments.util.ClassNotFound: + return pygments.lexers.special.TextLexer() + ext = self.name.suffix try: return self._lexer_cache[ext] except KeyError: pass + # if this is a txt file, assume its some kind of code and infer its lexer + if ext == '.txt': + return read_and_guess_lexer() + # get lexer for this file type try: lexer = pygments.lexers.get_lexer_for_filename(self.name.name) self._lexer_cache[ext] = lexer return lexer except pygments.util.ClassNotFound: - try: - return pygments.lexers.guess_lexer(self.read()) - except pygments.util.ClassNotFound: - return pygments.lexers.special.TextLexer() + return read_and_guess_lexer() @classmethod def get(cls, id): @@ -207,8 +214,13 @@ def get(cls, id): def unprocessed_tokens(self): """Get the raw tokens of the file.""" - text = self.read() - lexer_tokens = self.lexer().get_tokens_unprocessed(text) + text = self.read() + lexer = self.lexer() + if lexer is None: + import termcolor + termcolor.cprint(f"{self.name.name} appears to be a plaintext file. Skipping.") + return [] + lexer_tokens = lexer.get_tokens_unprocessed(text) tokens = [] prevToken = None for token in lexer_tokens: From 69a6605c80dde2d296ace244e07025db1f36ae65 Mon Sep 17 00:00:00 2001 From: ivanharvard <144486839+ivanharvard@users.noreply.github.com> Date: Tue, 29 Jul 2025 14:21:29 -0400 Subject: [PATCH 2/4] warnings abt txt files are printed to stdout --- compare50/__main__.py | 26 ++++++++++++++++++++++++++ compare50/_data.py | 14 ++++++-------- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/compare50/__main__.py b/compare50/__main__.py index bc6212f..ad98b36 100644 --- a/compare50/__main__.py +++ b/compare50/__main__.py @@ -19,6 +19,7 @@ import attr import lib50 import termcolor +import pygments from . import comparators, _api, _data, _renderer, __version__ @@ -245,6 +246,26 @@ def get_non_empty_subs(subs): def get_undecodable_files(subs): return [sub.path / file for sub in subs for file in sub.undecodable_files] + def warn_txt_files(subs): + """Warn about plaintext or code in file when the user submits a .txt file""" + for sub in subs: + for file in sub.files: + if file.name.suffix == ".txt": + try: + lexer = pygments.lexers.guess_lexer(file.read()) + except pygments.util.ClassNotFound: + lexer = pygments.lexers.special.TextLexer() + + # If the file is interpreted as a plaintext file + if isinstance(lexer, pygments.lexers.special.TextLexer): + termcolor.cprint( + f"{file.name.name} appears to be plaintext. Scoring using `structure` could produce unexpected results.", + "yellow", attrs=["bold"]) + else: + termcolor.cprint( + f"{file.name.name} is a .txt file that appears to contain code. Interpreting as: {lexer.name}", + "yellow", attrs=["bold"]) + # Print the number of subs, archives, distro files, and the average number of files per sub n_subs = len(get_non_empty_subs(subs)) n_archives = len(get_non_empty_subs(archives)) @@ -285,6 +306,11 @@ def get_undecodable_files(subs): print_warning(undecodable, undecodable_archive, undecodable_distro, "non utf-8") did_print_warning = True + # Warn about txt files + warn_txt_files(subs) + warn_txt_files(archives) + warn_txt_files(distro_subs) + # Print suggestion to run with --verbose if any files are excluded if not verbose and did_print_warning: termcolor.cprint("Rerun with --verbose to see which files are excluded", diff --git a/compare50/_data.py b/compare50/_data.py index 0a13b08..e953545 100644 --- a/compare50/_data.py +++ b/compare50/_data.py @@ -184,21 +184,23 @@ def tokens(self): def lexer(self): """Determine which Pygments lexer should be used.""" def read_and_guess_lexer(): + """Reads a file, guesses an appropriate lexer. Fallback to plaintext.""" try: return pygments.lexers.guess_lexer(self.read()) except pygments.util.ClassNotFound: return pygments.lexers.special.TextLexer() ext = self.name.suffix - try: - return self._lexer_cache[ext] - except KeyError: - pass # if this is a txt file, assume its some kind of code and infer its lexer if ext == '.txt': return read_and_guess_lexer() + try: + return self._lexer_cache[ext] + except KeyError: + pass + # get lexer for this file type try: lexer = pygments.lexers.get_lexer_for_filename(self.name.name) @@ -216,10 +218,6 @@ def unprocessed_tokens(self): """Get the raw tokens of the file.""" text = self.read() lexer = self.lexer() - if lexer is None: - import termcolor - termcolor.cprint(f"{self.name.name} appears to be a plaintext file. Skipping.") - return [] lexer_tokens = lexer.get_tokens_unprocessed(text) tokens = [] prevToken = None From e7813223dbf25103a7805b02059c085b42b15874 Mon Sep 17 00:00:00 2001 From: ivanharvard <144486839+ivanharvard@users.noreply.github.com> Date: Tue, 29 Jul 2025 14:28:28 -0400 Subject: [PATCH 3/4] minor refactor change --- compare50/_data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/compare50/_data.py b/compare50/_data.py index e953545..8a1a1a1 100644 --- a/compare50/_data.py +++ b/compare50/_data.py @@ -217,8 +217,7 @@ def get(cls, id): def unprocessed_tokens(self): """Get the raw tokens of the file.""" text = self.read() - lexer = self.lexer() - lexer_tokens = lexer.get_tokens_unprocessed(text) + lexer_tokens = lexer.lexer().get_tokens_unprocessed(text) tokens = [] prevToken = None for token in lexer_tokens: From 2bdeab46a03d2bd8d32649a492d50c22faf9649f Mon Sep 17 00:00:00 2001 From: ivanharvard <144486839+ivanharvard@users.noreply.github.com> Date: Tue, 29 Jul 2025 14:41:56 -0400 Subject: [PATCH 4/4] minor refactoring bug fix --- compare50/_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compare50/_data.py b/compare50/_data.py index 8a1a1a1..c1b76f4 100644 --- a/compare50/_data.py +++ b/compare50/_data.py @@ -217,7 +217,7 @@ def get(cls, id): def unprocessed_tokens(self): """Get the raw tokens of the file.""" text = self.read() - lexer_tokens = lexer.lexer().get_tokens_unprocessed(text) + lexer_tokens = self.lexer().get_tokens_unprocessed(text) tokens = [] prevToken = None for token in lexer_tokens: