From 93e37f3a0afbed66851e8603a4837fffc3570d76 Mon Sep 17 00:00:00 2001 From: Kymi808 Date: Fri, 29 May 2026 20:12:21 -0500 Subject: [PATCH] Fix metadata path leaking outside cache when ancestor dir contains '.py' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_copy_script_and_other_resources_in_importable_dir` derived the cache metadata sibling via `importable_local_file.split(".py")[0] + ".json"`. The intent (per the comment immediately below) is to swap the trailing `.py` extension for `.json`, but `str.split(".py")` splits on every occurrence and `[0]` takes everything before the first. For any cache path with `.py` in an ancestor directory — `.pyenv`, `.pycache`, pypy paths — the prefix gets truncated to before that directory. On a pyenv user's machine `/home/u/.pyenv/.../accuracy/<hash>/accuracy.py` becomes `/home/u/.json`, so the metadata is written outside the cache tree (and gets clobbered by every subsequent `evaluate.load()`). Use `os.path.splitext(...)[0]` to strip only the trailing extension, matching the comment's intent. Adds a regression test using a tmp dir whose ancestor contains `.pyenv` and asserts the metadata lands next to the copied script and not at `<root>/.json`. 
--- src/evaluate/loading.py | 4 +++- tests/test_load.py | 44 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/src/evaluate/loading.py b/src/evaluate/loading.py index 505015b15..8fca32583 100644 --- a/src/evaluate/loading.py +++ b/src/evaluate/loading.py @@ -330,7 +330,9 @@ def _copy_script_and_other_resources_in_importable_dir( shutil.copyfile(original_local_path, importable_local_file) # Record metadata associating original dataset path with local unique folder - meta_path = importable_local_file.split(".py")[0] + ".json" + # (use splitext so an ancestor directory containing ".py" — pyenv, pycache, + # pypy paths — doesn't truncate the prefix to before that directory) + meta_path = os.path.splitext(importable_local_file)[0] + ".json" if not os.path.exists(meta_path): meta = {"original file path": original_local_path, "local file path": importable_local_file} # the filename is *.py in our case, so better rename to filenam.json instead of filename.py.json diff --git a/tests/test_load.py b/tests/test_load.py index e20ea671f..a1e0f003a 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -138,3 +138,47 @@ def test_cache_with_remote_community_module(self): evaluation_module_factory( metric, download_config=self.download_config, dynamic_modules_path=self.dynamic_modules_path ) + + +def test_copy_script_metadata_path_when_ancestor_dir_contains_py(): + """Regression: meta_path derivation when an ancestor directory contains ".py". + + The previous implementation used `importable_local_file.split(".py")[0]`, + which splits on every occurrence, so a cache path under ~/.pyenv/... wrote + the metadata to ~/.json — outside the cache tree — instead of next to the + copied script. + """ + import json + + from evaluate.loading import _copy_script_and_other_resources_in_importable_dir + + with tempfile.TemporaryDirectory() as root: + # Simulate a pyenv-style cache path: ancestor directory contains ".py". 
+ importable_directory_path = os.path.join(root, ".pyenv", "evaluate_modules") + # The FileLock acquired inside the function needs the parent dir to exist. + os.makedirs(os.path.dirname(importable_directory_path), exist_ok=True) + subdirectory_name = "abcd1234" + name = "accuracy" + + original_script_path = os.path.join(root, "src_accuracy.py") + with open(original_script_path, "w", encoding="utf-8") as f: + f.write("# dummy metric script\n") + + _copy_script_and_other_resources_in_importable_dir( + name=name, + importable_directory_path=importable_directory_path, + subdirectory_name=subdirectory_name, + original_local_path=original_script_path, + local_imports=[], + additional_files=[], + download_mode=None, + ) + + expected_meta = os.path.join(importable_directory_path, subdirectory_name, name + ".json") + leaked_meta = os.path.join(root, ".json") + + assert os.path.exists(expected_meta), f"meta file missing at {expected_meta}" + assert not os.path.exists(leaked_meta), f"meta file leaked to {leaked_meta}" + with open(expected_meta, "r", encoding="utf-8") as f: + meta = json.load(f) + assert meta["original file path"] == original_script_path