Skip to content

Commit ef3f7ee

Browse files
Merge pull request #16 from Anas-Elhounsri/dev
Now the tool has suggestions in JSON-LD output, and can take multiple input paths for repositories
2 parents 5899da5 + bf22137 commit ef3f7ee

3 files changed

Lines changed: 97 additions & 67 deletions

File tree

src/metacheck/cli.py

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
import argparse
22
import os
3-
from metacheck.run_somef import run_somef_single, run_somef_batch
3+
from metacheck.run_somef import run_somef_batch
44
from metacheck.run_analyzer import run_analysis
55

66
def cli():
77
parser = argparse.ArgumentParser(description="Detect metadata pitfalls in software repositories using SoMEF.")
88
parser.add_argument(
99
"--input",
10-
help="Either a single GitHub repo URL or a path to a JSON file containing multiple repos."
10+
nargs="+", # <-- accepts multiple files
11+
required=True,
12+
help="One or more JSON files containing repositories (e.g., GitHub, GitLab)."
1113
)
1214
parser.add_argument(
1315
"--pitfalls-output",
@@ -27,26 +29,18 @@ def cli():
2729
)
2830

2931
args = parser.parse_args()
30-
input_value = args.input
3132
threshold = args.threshold
32-
pitfalls_output_dir = args.pitfalls_output
33-
analysis_output_file = args.analysis_output
34-
3533
somef_output_dir = os.path.join(os.getcwd(), "somef_outputs")
3634

37-
if not input_value:
38-
input_value = input("Enter a GitHub repository URL or path to a JSON file: ").strip()
35+
print(f"Detected {len(args.input)} input files:")
36+
for json_path in args.input:
37+
if not os.path.exists(json_path):
38+
print(f"Skipping missing file: {json_path}")
39+
continue
40+
print(f"Processing repositories from {json_path}")
41+
run_somef_batch(json_path, somef_output_dir, threshold)
3942

40-
if os.path.exists(input_value) and input_value.lower().endswith(".json"):
41-
print(f"Batch mode: reading repositories from {input_value}")
42-
success = run_somef_batch(input_value, somef_output_dir, threshold)
43-
if success:
44-
run_analysis(somef_output_dir, pitfalls_output_dir, analysis_output_file)
45-
else:
46-
print(f"Single repository mode: running SoMEF on {input_value}")
47-
result_dir = run_somef_single(input_value, somef_output_dir, threshold)
48-
if result_dir:
49-
run_analysis(result_dir, pitfalls_output_dir, analysis_output_file)
43+
run_analysis(somef_output_dir, args.pitfalls_output, args.analysis_output)
5044

5145
if __name__ == "__main__":
5246
cli()

src/metacheck/run_somef.py

Lines changed: 48 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,48 @@
1-
import os
2-
import json
3-
import subprocess
4-
5-
def run_somef(repo_url, output_file, threshold):
6-
"""Run SoMEF on a given repository and save results."""
7-
try:
8-
subprocess.run(
9-
["somef", "describe", "-r", repo_url, "-o", output_file, "-t", str(threshold)],
10-
check=True
11-
)
12-
print(f"SoMEF finished for: {repo_url}")
13-
return True
14-
except subprocess.CalledProcessError as e:
15-
print(f"Error running SoMEF for {repo_url}: {e}")
16-
return False
17-
18-
def run_somef_single(repo_url, output_dir="somef_outputs", threshold=0.8):
19-
"""Run SoMEF for a single repository."""
20-
os.makedirs(output_dir, exist_ok=True)
21-
output_file = os.path.join(output_dir, "output_1.json")
22-
23-
print(f"Running SoMEF for {repo_url}...")
24-
success = run_somef(repo_url, output_file, threshold)
25-
return output_dir if success else None
26-
27-
def run_somef_batch(json_file, output_dir="somef_outputs", threshold=0.8):
28-
"""Run SoMEF for all repositories listed in a JSON file, then run analysis once."""
29-
os.makedirs(output_dir, exist_ok=True)
30-
31-
with open(json_file, "r") as f:
32-
data = json.load(f)
33-
34-
# Expected structure: {"repositories": ["repo1", "repo2", ...]}
35-
repos = data.get("repositories", [])
36-
if not repos:
37-
print("No repositories found in JSON file.")
38-
return False
39-
40-
print(f"Found {len(repos)} repositories to process.\n")
41-
42-
for idx, repo_url in enumerate(repos, start=1):
43-
output_file = os.path.join(output_dir, f"output_{idx}.json")
44-
print(f"[{idx}/{len(repos)}] Running SoMEF for: {repo_url}")
45-
run_somef(repo_url, output_file, threshold)
46-
47-
print("\nFinished running SoMEF for all repositories.")
48-
return True
1+
import os
2+
import json
3+
import subprocess
4+
5+
def run_somef(repo_url, output_file, threshold):
    """Run SoMEF on a given repository and save results.

    Invokes ``somef describe -r <repo_url> -o <output_file> -t <threshold>``
    as a subprocess and reports the outcome on stdout.

    Args:
        repo_url: Repository URL handed to SoMEF's ``-r`` option.
        output_file: Path handed to SoMEF's ``-o`` option.
        threshold: Value handed (converted with ``str``) to SoMEF's ``-t``
            option.

    Returns:
        True if the command exited with status 0. False if it exited with a
        non-zero status, or if the process could not be launched at all
        (for example when the ``somef`` executable is not on PATH).
    """
    command = [
        "somef", "describe",
        "-r", repo_url,
        "-o", output_file,
        "-t", str(threshold),
    ]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running SoMEF for {repo_url}: {e}")
        return False
    except OSError as e:
        # Raised when the executable is missing or cannot be executed; without
        # this a batch run would abort with a traceback on the first repo.
        print(f"Could not launch SoMEF for {repo_url}: {e}")
        return False

    print(f"SoMEF finished for: {repo_url}")
    return True
17+
18+
def run_somef_single(repo_url, output_dir="somef_outputs", threshold=0.8):
    """Run SoMEF for one repository, writing to ``output_1.json``.

    The output directory is created if needed. Returns ``output_dir`` when
    ``run_somef`` reports success, otherwise ``None``.
    """
    os.makedirs(output_dir, exist_ok=True)
    target_path = os.path.join(output_dir, "output_1.json")

    print(f"Running SoMEF for {repo_url}...")
    if run_somef(repo_url, target_path, threshold):
        return output_dir
    return None
26+
27+
def run_somef_batch(json_file, output_dir="somef_outputs", threshold=0.8):
    """Run SoMEF for all repositories listed in a JSON file.

    The file is expected to contain an object with a ``"repositories"`` key
    holding a list of repository URLs. For each entry, SoMEF is asked to write
    ``<json-file-stem>_output_<index>.json`` inside ``output_dir`` (indices
    start at 1).

    Args:
        json_file: Path to the JSON file listing the repositories.
        output_dir: Directory for the SoMEF result files; created if missing.
        threshold: Threshold forwarded to ``run_somef``.

    Returns:
        False if the file cannot be read or parsed, or if it lists no
        repositories. True once every listed repository has been attempted;
        failures of individual SoMEF runs are reported by ``run_somef`` and do
        not change the return value.
    """
    os.makedirs(output_dir, exist_ok=True)

    try:
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError. A
        # single bad input file must not abort a multi-file run.
        print(f"Could not read repositories from {json_file}: {e}")
        return False

    # Guard against a top-level list/string and against a non-list
    # "repositories" value (a string would be iterated per character).
    repos = data.get("repositories") if isinstance(data, dict) else None
    if not isinstance(repos, list) or not repos:
        print(f" No repositories found in {json_file}.")
        return False

    # NOTE: output names depend only on the file stem, so two input files with
    # the same stem in different directories write to the same output names.
    base_name = os.path.splitext(os.path.basename(json_file))[0]
    total = len(repos)
    print(f"Running SoMEF for {total} repositories in {base_name}...")

    for idx, repo_url in enumerate(repos, start=1):
        output_file = os.path.join(output_dir, f"{base_name}_output_{idx}.json")
        print(f"[{idx}/{total}] {repo_url}")
        run_somef(repo_url, output_file, threshold)

    print(f"Completed SoMEF for {base_name}. Results in {output_dir}")
    return True

src/metacheck/utils/json_ld_utils.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,42 @@ def extract_software_info_from_somef(somef_data: Dict) -> Dict:
399399

400400
return software_info
401401

402+
# Suggestion text for each pitfall (P...) / warning (W...) code. Kept at module
# level so the mapping is built once instead of on every lookup.
_PITFALL_SUGGESTIONS = {
    "P001": "Ensure the version in your metadata matches the latest official release. Keeping these synchronized avoids confusion for users and improves reproducibility.",
    "P002": "Update the copyright section with accurate names, organizations, and the current year. Personalizing this section ensures clarity and legal accuracy.",
    "W003": "Add version numbers to your dependencies. This provides stability for users and allows reproducibility across different environments.",
    "W004": "You need to align the version in your metadata file with your latest release tag. Automating this synchronization as part of your release process is highly recommended.",
    "P005": "You should separate multiple authors into a structured list. This allows tools and citation systems to correctly identify and credit each contributor.",
    "P006": "Update the README property so it points directly to your actual README file instead of your homepage. This helps ensure users and automated tools can access your project documentation easily.",
    "P007": "Standardize your version format across files using semantic versioning (for example, 1.2.0) or any standardized versioning scheme. Consistent versioning reduces confusion for users and systems.",
    "P008": "You need to replace local file paths with recognized SPDX license identifiers, such as MIT or GPL-3.0-only in URL form. This ensures your license can be correctly detected by automated tools.",
    "W010": "List all applicable licenses if your repository includes more than one. This avoids confusion about terms of use and ensures full transparency.",
    "P011": "Include version numbers for each programming language used. Defining these helps ensure reproducibility and compatibility across systems.",
    "W012": "Add a referencePublication field with the related DOI or citation entry to your CITATION.cff. This will help link your work to its scholarly references.",
    "P013": "Rewrite your dependencies as a proper list, with each item separated and preferably with their versions. This makes them easier to parse for metadata systems.",
    "W014": "Verify and update any dependency links to ensure they lead to valid and accessible pages.",
    "W015": "You should replace plain name in your identifier field with persistent identifiers, such as DOIs or SWHIDs, to improve discoverability and interoperability.",
    "P016": "You need to update the codeRepository field to point directly to your repository's source code instead of a homepage. Accurate links improve traceability and user access.",
    "P017": "You need to include the complete text of a recognized license such as MIT, Apache 2.0, or GPL. A full license clarifies rights and usage conditions for others",
    "P018": "You need to correct the issue tracker URL so it follows a valid format, such as https://github.com/user/repo/issues. Proper links help users engage with your development process.",
    "P019": "You need to update the downloadURL field to point to your latest release or current distribution source. Outdated links can mislead users or cause failed installations.",
    "P020": "You need to replace URLs in the developmentStatus field with descriptive text values, such as 'active', 'beta', or 'stable'. This maintains schema compliance and clarity.",
    # Trailing space removed from the emitted text.
    "W021": "Ensure givenName is a single string per person. This ensures that every author is properly credited and can be extracted automatically",
    "P022": "You should declare the specific version of the license using a recognized SPDX identifier. For example, use 'GPL-3.0-only' or 'GPL-2.0-or-later' instead of simply 'GPL'",
    "P023": "You should replace the remote-style syntax with a full web-accessible URL (e.g., https://github.com/user/repo).",
    "P024": "You should include the full DOI URL form in your metadata (e.g., https://doi.org/XX.XXXX/zenodo.XXXX)",
    # Typo fixed: "pratcie" -> "practice".
    "P025": "You need to update the outdated URLs to point to the current CI platform, or remove the property if no active CI is in place. A good practice would be to periodically test all external links, especially those related to CI or build status.",
    "P026": "Make sure that the codeRepository URL in your metadata exactly matches the repository hosting your source code.",
    "P027": "You need to synchronize all version references across metadata and build configuration files.",
    # Grammar fixed: "This will ensures" -> "This ensures".
    "P028": "Always use the full resolvable SWHID URL (e.g., https://archive.softwareheritage.org/swh:1:dir:abcd.../). This ensures that both humans and machines can access the archived software snapshot directly",
}


def get_suggestion_text(pitfall_code: str) -> str:
    """Return the remediation suggestion for a pitfall or warning code.

    Args:
        pitfall_code: Check identifier such as ``"P001"`` or ``"W003"``.

    Returns:
        The suggestion text registered for the code, or the placeholder
        ``"Suggestion for <pitfall_code>"`` when the code has no entry.
    """
    return _PITFALL_SUGGESTIONS.get(pitfall_code, f"Suggestion for {pitfall_code}")
437+
402438
def extract_description_info(somef_data: Dict) -> str:
403439
"""
404440
Extract description information from SoMEF data.
@@ -464,7 +500,7 @@ def create_pitfall_jsonld(somef_data: Dict, pitfall_results: List[Dict], file_na
464500
"status": {"@id": "schema:CompletedActionStatus"},
465501
"checkId":pitfall_code,
466502
"evidence": format_evidence_text(pitfall_code, pitfall_result),
467-
"suggestion": ""
503+
"suggestion": get_suggestion_text(pitfall_code)
468504
}
469505

470506
jsonld_output["checks"].append(check_result)

0 commit comments

Comments
 (0)