Skip to content

Commit a8be09f

Browse files
Added detailed test edge cases and minor patches for pitfalls and warnings
1 parent 5386e08 commit a8be09f

44 files changed

Lines changed: 5671 additions & 169 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.idea/.gitignore

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/metacheck/detect_pitfalls_main.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from metacheck.scripts.warnings.w008 import detect_author_name_list_warning
3636
from metacheck.scripts.warnings.w009 import detect_development_status_url_pitfall
3737
from metacheck.scripts.warnings.w010 import detect_git_remote_shorthand_pitfall
38+
from metacheck.scripts.warnings.w011 import detect_inconsistent_author_count
3839

3940

4041
def detect_all_pitfalls(json_files: Iterable[Path], pitfalls_output_dir: Union[str, Path], output_file: Union[str, Path]):
@@ -260,6 +261,13 @@ def detect_all_pitfalls(json_files: Iterable[Path], pitfalls_output_dir: Union[s
260261
"percentage": 0.0,
261262
"languages": {}
262263
},
264+
{
265+
"pitfall_code": "W011",
266+
"pitfall_desc": "The metadata file codeRepository does not have matching number of authors",
267+
"count": 0,
268+
"percentage": 0.0,
269+
"languages": {}
270+
}
263271
]
264272
}
265273

@@ -299,6 +307,7 @@ def detect_all_pitfalls(json_files: Iterable[Path], pitfalls_output_dir: Union[s
299307
(detect_author_name_list_warning, "W008"), # Index 24 -> W008
300308
(detect_development_status_url_pitfall, "W009"), # Index 25 -> W009
301309
(detect_git_remote_shorthand_pitfall, "W010"), # Index 26 -> W010
310+
(detect_inconsistent_author_count, "W011"),
302311
]
303312

304313
for json_file in json_files:

src/metacheck/scripts/pitfalls/p005.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from typing import Dict
22
import re
33

4-
54
def is_software_archive_url(url: str) -> bool:
65
"""
76
Check if URL points to a software archive instead of a research paper.

src/metacheck/scripts/pitfalls/p006.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
21
from typing import Dict
3-
import re
42
from metacheck.utils.pitfall_utils import extract_metadata_source_filename
53

64
def is_local_file_license(license_value: str) -> bool:
@@ -26,7 +24,7 @@ def is_local_file_license(license_value: str) -> bool:
2624
'copying', 'copying.md', 'copying.txt',
2725
'copyright', 'copyright.md', 'copyright.txt',
2826
'licence', 'licence.md', 'licence.txt', # British spelling
29-
'readme.md', 'doc.txt', 'file.rst' # Other common file patterns
27+
'readme.md', 'doc.txt', 'file.rst'
3028
]
3129

3230
if license_lower in license_file_names:

src/metacheck/scripts/pitfalls/p007.py

Lines changed: 33 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -13,55 +13,38 @@ def detect_citation_missing_reference_publication_pitfall(somef_data: Dict, file
1313
"citation_cff_exists": False
1414
}
1515

16-
if "citation" not in somef_data:
17-
return result
18-
19-
citation_entries = somef_data["citation"]
20-
if not isinstance(citation_entries, list):
21-
return result
22-
23-
codemeta_citation_value = None
24-
citation_cff_citation_value = None
25-
citation_cff_exists_in_somef = False
26-
27-
for entry in citation_entries:
28-
source = entry.get("source", "")
29-
technique = entry.get("technique", "")
30-
31-
if technique == "code_parser" and "codemeta.json" in source:
32-
if "result" in entry and "value" in entry["result"]:
33-
codemeta_citation_value = entry["result"]["value"]
34-
result["codemeta_has_reference"] = True
35-
elif "CITATION.cff" in source:
36-
citation_cff_exists_in_somef = True
37-
result["citation_cff_exists"] = True
38-
if "result" in entry and "value" in entry["result"]:
39-
citation_cff_citation_value = entry["result"]["value"]
40-
41-
if not citation_cff_exists_in_somef:
42-
citation_cff_sources = ["authors", "title", "description", "version", "license"]
43-
for category in citation_cff_sources:
44-
if category in somef_data:
45-
entries = somef_data[category]
46-
if isinstance(entries, list):
47-
for entry in entries:
48-
source = entry.get("source", "")
49-
if "CITATION.cff" in source:
50-
citation_cff_exists_in_somef = True
51-
result["citation_cff_exists"] = True
52-
break
53-
54-
if (codemeta_citation_value and
55-
citation_cff_exists_in_somef and
56-
(not citation_cff_citation_value or citation_cff_citation_value != codemeta_citation_value)):
57-
58-
if citation_cff_citation_value:
59-
if ("doi.org" in codemeta_citation_value or "http" in codemeta_citation_value):
60-
if not ("doi.org" in citation_cff_citation_value or "http" in citation_cff_citation_value):
61-
result["has_pitfall"] = True
62-
elif codemeta_citation_value not in citation_cff_citation_value and citation_cff_citation_value not in codemeta_citation_value:
63-
result["has_pitfall"] = True
64-
else:
65-
result["has_pitfall"] = True
16+
if "reference_publication" in somef_data:
17+
ref_pub_entries = somef_data["reference_publication"]
18+
if isinstance(ref_pub_entries, list):
19+
for entry in ref_pub_entries:
20+
source = entry.get("source", "")
21+
technique = entry.get("technique", "")
22+
23+
if technique == "code_parser" and "codemeta.json" in source:
24+
if "result" in entry and "value" in entry["result"]:
25+
result["codemeta_has_reference"] = True
26+
27+
elif "CITATION.cff" in source:
28+
if "result" in entry and "value" in entry["result"]:
29+
result["citation_cff_has_reference"] = True
30+
31+
citation_cff_sources = ["authors", "title", "description", "version", "license"]
32+
for category in citation_cff_sources:
33+
if category in somef_data:
34+
entries = somef_data[category]
35+
if isinstance(entries, list):
36+
for entry in entries:
37+
source = entry.get("source", "")
38+
if "CITATION.cff" in source:
39+
result["citation_cff_exists"] = True
40+
break
41+
42+
if result["citation_cff_exists"]:
43+
break
44+
45+
if (result["codemeta_has_reference"] and
46+
result["citation_cff_exists"] and
47+
not result["citation_cff_has_reference"]):
48+
result["has_pitfall"] = True
6649

6750
return result

src/metacheck/scripts/pitfalls/p009.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
12
from typing import Dict
23
from metacheck.utils.pitfall_utils import extract_metadata_source_filename
34

@@ -11,11 +12,16 @@ def is_repository_url(url: str) -> bool:
1112

1213
url_lower = url.lower()
1314

14-
# Valid repository indicators
15+
if 'github.io' in url_lower:
16+
return False
17+
1518
repo_indicators = [
1619
'github.com/',
20+
'github.org/',
1721
'gitlab.com/',
22+
'gitlab.org/',
1823
'bitbucket.org/',
24+
'bitbucket.net/',
1925
'sourceforge.net/projects/',
2026
'git.',
2127
'.git'
@@ -37,7 +43,9 @@ def is_homepage_url_repo(url: str) -> bool:
3743

3844
url_lower = url.lower()
3945

40-
# Homepage indicators
46+
if is_repository_url(url):
47+
return False
48+
4149
homepage_indicators = [
4250
'.org/',
4351
'.com/',
@@ -50,11 +58,6 @@ def is_homepage_url_repo(url: str) -> bool:
5058
'github.io'
5159
]
5260

53-
# If it's clearly a repository URL, it's not a homepage
54-
if is_repository_url(url):
55-
return False
56-
57-
# Check for homepage indicators
5861
for indicator in homepage_indicators:
5962
if indicator in url_lower:
6063
return True
@@ -82,15 +85,17 @@ def detect_coderepository_homepage_pitfall(somef_data: Dict, file_name: str) ->
8285
if not isinstance(repo_entries, list):
8386
return result
8487

85-
metadata_sources = ["codemeta.json", "DESCRIPTION", "composer.json", "package.json", "pom.xml", "pyproject.toml", "requirements.txt", "setup.py"]
88+
metadata_sources = ["codemeta.json", "DESCRIPTION", "composer.json", "package.json",
89+
"pom.xml", "pyproject.toml", "requirements.txt", "setup.py"]
8690

8791
for entry in repo_entries:
8892
technique = entry.get("technique", "")
8993
source = entry.get("source", "")
9094

9195
is_metadata_source = (
92-
technique in metadata_sources or
93-
any(src in source.lower() for src in metadata_sources)
96+
technique == "code_parser" or
97+
technique in metadata_sources or
98+
any(src in source.lower() for src in metadata_sources)
9499
)
95100

96101
if is_metadata_source:

src/metacheck/scripts/pitfalls/p010.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
import re
32
from typing import Dict, Optional
43

@@ -18,7 +17,6 @@ def extract_license_from_file(somef_data: Dict) -> Optional[Dict[str, str]]:
1817
for entry in license_entries:
1918
if "source" in entry:
2019
source = entry["source"]
21-
# Look for LICENSE files (LICENSE, LICENSE.md, etc.)
2220
if "LICENSE" in source.upper() and "result" in entry and "value" in entry["result"]:
2321
return {
2422
"source": source,
@@ -41,16 +39,14 @@ def check_copyright_only_license(license_content: str) -> bool:
4139
content_lower = license_content.lower().strip()
4240
content_lines = [line.strip() for line in license_content.strip().split('\n') if line.strip()]
4341

44-
# Patterns that indicate copyright-only content
4542
copyright_only_patterns = [
46-
r'year\s*:\s*\d{4}', # YEAR: 2017 (removed ^ and $ to match anywhere in text)
43+
r'year\s*:\s*\d{4}', # YEAR: 2017
4744
r'copyright\s+holder\s*:\s*[a-zA-Z]', # COPYRIGHT HOLDER: Someone
4845
r'author\s*:\s*[a-zA-Z]', # AUTHOR: Someone
4946
r'copyright\s*©?\s*\d{4}', # Copyright 2017 or Copyright © 2017
5047
r'\(c\)\s*\d{4}', # (C) 2017
5148
]
5249

53-
# Patterns that indicate actual license terms
5450
license_term_patterns = [
5551
r'permission\s+is\s+hereby\s+granted',
5652
r'subject\s+to\s+the\s+following\s+conditions',
@@ -70,33 +66,36 @@ def check_copyright_only_license(license_content: str) -> bool:
7066
has_copyright_info = any(re.search(pattern, content_lower) for pattern in copyright_only_patterns)
7167
has_license_terms = any(re.search(pattern, content_lower) for pattern in license_term_patterns)
7268

73-
# If it has copyright info but no license terms and is short, it's likely copyright-only
69+
if has_license_terms:
70+
return False
71+
72+
# If the content has copyright info but no license terms and is short, it is likely copyright-only
7473
if has_copyright_info and not has_license_terms and len(content_lines) <= 10:
7574
return True
7675

77-
# Special case: check for the exact format "YEAR: xxxx" and "COPYRIGHT HOLDER: xxxx"
76+
# Check for the exact format "YEAR: xxxx" and "COPYRIGHT HOLDER: xxxx"
7877
year_pattern_found = bool(re.search(r'year\s*:\s*\d{4}', content_lower))
7978
copyright_holder_pattern_found = bool(re.search(r'copyright\s+holder\s*:', content_lower))
8079

8180
if year_pattern_found and copyright_holder_pattern_found:
81+
if has_license_terms:
82+
return False
8283
return True
8384

84-
# Additional check: if the content is very short and only contains basic copyright info
85-
if len(content_lines) <= 5: # Increased from 3 to 5 for more flexibility
86-
# Check if all lines are just copyright/year information
85+
if len(content_lines) <= 5:
8786
meaningful_lines = []
87+
8888
for line in content_lines:
8989
line_lower = line.lower()
90-
# Skip lines that are just copyright patterns
90+
9191
if not any(re.search(pattern, line_lower) for pattern in copyright_only_patterns):
92-
# This line doesn't match copyright patterns, check if it's meaningful
92+
9393
if (len(line.strip()) > 0 and
9494
not line.strip().startswith('#') and
9595
not line.strip().startswith('//') and
9696
line.strip() not in ['', '-', '=', '*']):
9797
meaningful_lines.append(line)
9898

99-
# If we have very few meaningful lines and some copyright info, it's probably copyright-only
10099
if len(meaningful_lines) <= 1 and has_copyright_info:
101100
return True
102101

src/metacheck/scripts/pitfalls/p012.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,21 @@ def extract_version_from_download_url(url: str) -> str:
1111

1212
# Common version patterns in download URLs
1313
version_patterns = [
14-
r'/archive/(?:v)?(\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9\-\.]*)?)', # /archive/3.8.0 or /archive/v1.2.3
14+
r'/archive/(?:v)?(\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9\-\.]*)?)\.', # /archive/3.8.0. or /archive/v1.2.3.
15+
r'/archive/(?:v)?(\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9\-\.]*)?)$',
16+
# /archive/3.8.0 or /archive/v1.2.3 (end of string)
1517
r'[-_](?:v)?(\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9\-\.]*)?)\.', # -3.8.0.tar.gz or _v1.2.3.zip
1618
r'/(?:v)?(\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9\-\.]*)?)/[^/]*$', # /3.8.0/something
17-
r'[-_/](?:v)?(\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9\-\.]*)?)(?:\.tar\.gz|\.zip|$)' # More flexible ending
1819
]
1920

2021
for pattern in version_patterns:
2122
match = re.search(pattern, url)
2223
if match:
23-
return match.group(1)
24+
version = match.group(1)
25+
# Remove any trailing file extension artifacts
26+
# This handles cases where .tar, .zip etc might be captured
27+
version = re.sub(r'\.(tar|gz|zip|bz2|xz|tgz).*$', '', version)
28+
return version
2429

2530
return None
2631

@@ -32,12 +37,16 @@ def normalize_version(version: str) -> str:
3237
if not version:
3338
return None
3439

35-
# Remove 'v' prefix if present
36-
normalized = version.lower().strip()
40+
normalized = version.strip()
41+
42+
if not normalized:
43+
return None
44+
45+
normalized = normalized.lower()
3746
if normalized.startswith('v'):
3847
normalized = normalized[1:]
3948

40-
return normalized
49+
return normalized if normalized else None
4150

4251

4352
def get_latest_release_version(somef_data: Dict) -> str:
@@ -51,21 +60,17 @@ def get_latest_release_version(somef_data: Dict) -> str:
5160
if not isinstance(releases, list) or not releases:
5261
return None
5362

54-
# Get the first (latest) release
5563
latest_release = releases[0]
5664
if "result" in latest_release:
5765
result = latest_release["result"]
5866

59-
# Try to get version from tag first
6067
if "tag" in result and result["tag"]:
6168
tag = result["tag"].strip()
6269
if tag:
6370
return normalize_version(tag)
6471

65-
# Fallback to name if tag is not available
6672
if "name" in result and result["name"]:
6773
name = result["name"]
68-
# Extract version from name
6974
version_match = re.search(r'(?:v)?(\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9\-\.]*)?)', name)
7075
if version_match:
7176
return normalize_version(version_match.group(1))
@@ -96,7 +101,6 @@ def detect_outdated_download_url_pitfall(somef_data: Dict, file_name: str) -> Di
96101
codemeta_download_url = None
97102
codemeta_source = None
98103

99-
# Find download URL from codemeta.json
100104
for entry in download_entries:
101105
source = entry.get("source", "")
102106
technique = entry.get("technique", "")
@@ -111,24 +115,20 @@ def detect_outdated_download_url_pitfall(somef_data: Dict, file_name: str) -> Di
111115
if not codemeta_download_url:
112116
return result
113117

114-
# Extract version from download URL
115118
download_version = extract_version_from_download_url(codemeta_download_url)
116119
if not download_version:
117120
return result
118121

119-
# Get latest release version
120122
latest_version = get_latest_release_version(somef_data)
121123
if not latest_version:
122124
return result
123125

124-
# Normalize both versions for comparison
125126
normalized_download_version = normalize_version(download_version)
126127
normalized_latest_version = normalize_version(latest_version)
127128

128129
if not normalized_download_version or not normalized_latest_version:
129130
return result
130131

131-
# Compare versions
132132
if normalized_download_version != normalized_latest_version:
133133
result["has_pitfall"] = True
134134
result["download_url"] = codemeta_download_url

0 commit comments

Comments
 (0)