Skip to content

Commit 0d8da69

Browse files
Patch for #15 and other pitfalls/warnings that were found during unit testing
1 parent 165f332 commit 0d8da69

5 files changed

Lines changed: 518 additions & 522 deletions

File tree

src/metacheck/scripts/p005.py

Lines changed: 89 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -1,89 +1,90 @@
1-
from typing import Dict
2-
import re
3-
from metacheck.utils.pitfall_utils import extract_metadata_source_filename
4-
5-
def has_multiple_authors_in_single_field(author_value: str) -> bool:
6-
"""
7-
Check if a single author field contains multiple authors.
8-
"""
9-
if not author_value or not isinstance(author_value, str):
10-
return False
11-
12-
# Common patterns indicating multiple authors
13-
multiple_author_patterns = [
14-
r' and ', # "John Smith and Jane Doe"
15-
r' & ', # "John Smith & Jane Doe"
16-
r',(?!\s+Jr\.?)', # "John Smith, Jane Doe" (but not "Smith, Jr.")
17-
r';', # "John Smith; Jane Doe"
18-
r'\n', # Multi-line authors
19-
]
20-
21-
author_value = author_value.strip()
22-
23-
# Check for patterns that suggest multiple authors
24-
for pattern in multiple_author_patterns:
25-
if re.search(pattern, author_value, re.IGNORECASE):
26-
return True
27-
28-
return False
29-
30-
31-
def detect_multiple_authors_single_field_pitfall(somef_data: Dict, file_name: str) -> Dict:
32-
"""
33-
Detect when metadata files have multiple authors in a single field instead of a list.
34-
"""
35-
result = {
36-
"has_pitfall": False,
37-
"file_name": file_name,
38-
"author_value": None,
39-
"source": None,
40-
"metadata_source_file": None, # Add this line
41-
"multiple_authors_detected": False
42-
}
43-
44-
if "authors" not in somef_data:
45-
return result
46-
47-
authors_entries = somef_data["authors"]
48-
if not isinstance(authors_entries, list):
49-
return result
50-
51-
# Define metadata sources to check
52-
metadata_sources = ["codemeta.json", "DESCRIPTION", "composer.json", "package.json", "pom.xml", "pyproject.toml",
53-
"requirements.txt", "setup.py"]
54-
55-
for entry in authors_entries:
56-
source = entry.get("source", "")
57-
technique = entry.get("technique", "")
58-
59-
is_metadata_source = (
60-
technique == "code_parser" and
61-
any(src in source.lower() for src in metadata_sources)
62-
)
63-
64-
if is_metadata_source:
65-
if "result" in entry and "value" in entry["result"]:
66-
author_value = entry["result"]["value"]
67-
68-
# Handle different value formats
69-
if isinstance(author_value, str):
70-
if has_multiple_authors_in_single_field(author_value):
71-
result["has_pitfall"] = True
72-
result["author_value"] = author_value
73-
result["source"] = source
74-
result["metadata_source_file"] = extract_metadata_source_filename(source)
75-
result["multiple_authors_detected"] = True
76-
break
77-
78-
elif isinstance(author_value, dict) and "name" in author_value:
79-
# Handle structured author data
80-
name_value = author_value["name"]
81-
if isinstance(name_value, str) and has_multiple_authors_in_single_field(name_value):
82-
result["has_pitfall"] = True
83-
result["author_value"] = name_value
84-
result["source"] = source
85-
result["metadata_source_file"] = extract_metadata_source_filename(source)
86-
result["multiple_authors_detected"] = True
87-
break
88-
1+
2+
from typing import Dict
3+
import re
4+
from metacheck.utils.pitfall_utils import extract_metadata_source_filename
5+
6+
# Separators that suggest several people were packed into one author string.
# Compiled once at import time instead of on every call.
_MULTIPLE_AUTHOR_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r' and ',  # "John Smith and Jane Doe"
        r' & ',    # "John Smith & Jane Doe"
        # "John Smith, Jane Doe" -- but a comma that only introduces a
        # generational suffix ("Smith, Jr.", "Smith,Sr", "Smith, III") is not a
        # separator.  The \b keeps names that merely start with those letters
        # (e.g. "Doe, Ivan") from being mistaken for a suffix.
        r',(?!\s*(?:Jr|Sr|II|III|IV)\b)',
        r';',      # "John Smith; Jane Doe"
        r'\n',     # Multi-line authors
    )
)


def has_multiple_authors_in_single_field(author_value: str) -> bool:
    """
    Check if a single author field contains multiple authors.

    The check is a heuristic: it looks for " and ", " & ", a comma that is not
    followed by a generational suffix (Jr, Sr, II, III, IV), a semicolon, or an
    embedded newline.  Matching is case-insensitive and is done on the value
    with surrounding whitespace stripped.

    Args:
        author_value: Raw author string taken from a metadata file.

    Returns:
        True if any separator pattern is found; False otherwise, including
        when the value is empty or not a string.
    """
    if not author_value or not isinstance(author_value, str):
        return False

    candidate = author_value.strip()
    return any(pattern.search(candidate) for pattern in _MULTIPLE_AUTHOR_PATTERNS)
30+
31+
32+
def detect_multiple_authors_single_field_pitfall(somef_data: Dict, file_name: str) -> Dict:
    """
    Detect when metadata files have multiple authors in a single field instead of a list.

    Only "authors" entries whose technique is "code_parser" and whose source
    mentions one of the known metadata file names (case-insensitive substring
    match) are inspected.  The author value may be a plain string or a dict
    carrying a "name" string.  Scanning stops at the first offending entry.
    Malformed entries (non-dict entry, non-string source, non-dict result)
    are skipped instead of raising.

    Args:
        somef_data: Parsed extraction output; expected to hold an "authors" list.
        file_name: Name of the analysed file, echoed back in the result.

    Returns:
        Dict with keys "has_pitfall", "file_name", "author_value", "source",
        "metadata_source_file" and "multiple_authors_detected".
    """
    result = {
        "has_pitfall": False,
        "file_name": file_name,
        "author_value": None,
        "source": None,
        "metadata_source_file": None,
        "multiple_authors_detected": False
    }

    if not isinstance(somef_data, dict):
        return result

    authors_entries = somef_data.get("authors")
    if not isinstance(authors_entries, list):
        return result

    # Metadata file names to look for; kept lowercase because the source is
    # lowercased before the substring comparison.
    metadata_sources = (
        "codemeta.json", "description", "composer.json", "package.json",
        "pom.xml", "pyproject.toml", "requirements.txt", "setup.py",
    )

    for entry in authors_entries:
        if not isinstance(entry, dict):
            continue

        source = entry.get("source", "")
        if entry.get("technique", "") != "code_parser" or not isinstance(source, str):
            continue

        source_lower = source.lower()
        if not any(name in source_lower for name in metadata_sources):
            continue

        entry_result = entry.get("result")
        if not isinstance(entry_result, dict) or "value" not in entry_result:
            continue

        # Structured author data carries the text under "name"; plain strings
        # are used as they are.  Anything else is ignored.
        author_value = entry_result["value"]
        if isinstance(author_value, dict):
            author_value = author_value.get("name")

        if isinstance(author_value, str) and has_multiple_authors_in_single_field(author_value):
            result["has_pitfall"] = True
            result["author_value"] = author_value
            result["source"] = source
            result["metadata_source_file"] = extract_metadata_source_filename(source)
            result["multiple_authors_detected"] = True
            break

    return result

src/metacheck/scripts/p006.py

Lines changed: 83 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1,76 +1,84 @@
1-
from typing import Dict, List
2-
import re
3-
from urllib.parse import urlparse
4-
5-
6-
def is_homepage_url(url: str) -> bool:
7-
"""
8-
Check if a URL appears to be a homepage/wiki rather than a README file.
9-
Returns True if it's likely a homepage.
10-
"""
11-
if not url:
12-
return False
13-
14-
url_lower = url.lower()
15-
16-
# Check for documentation sites and wikis
17-
homepage_indicators = [
18-
'.readthedocs.io',
19-
'.github.io',
20-
'wiki',
21-
'docs.',
22-
'documentation',
23-
'.org',
24-
'.com',
25-
'.net'
26-
]
27-
28-
# If it contains github.com or gitlab.com but NOT pointing to a specific README file
29-
if ('github.com' in url_lower or 'gitlab.com' in url_lower):
30-
# If it's pointing to a specific README file, it's OK
31-
if 'readme' in url_lower or 'blob/' in url_lower:
32-
return False
33-
# If it's just the repository root, it's a homepage
34-
return True
35-
36-
# Check for other homepage indicators
37-
for indicator in homepage_indicators:
38-
if indicator in url_lower:
39-
return True
40-
41-
return False
42-
43-
44-
def detect_readme_homepage_pitfall(somef_data: Dict, file_name: str) -> Dict:
45-
"""
46-
Detect when README property in codemeta.json points to homepage/wiki instead of README file.
47-
"""
48-
result = {
49-
"has_pitfall": False,
50-
"file_name": file_name,
51-
"readme_url": None,
52-
"source": None,
53-
"is_homepage": False
54-
}
55-
56-
if "readme_url" not in somef_data:
57-
return result
58-
59-
readme_entries = somef_data["readme_url"]
60-
if not isinstance(readme_entries, list):
61-
return result
62-
63-
for entry in readme_entries:
64-
if "technique" in entry and entry["technique"] == "code_parser":
65-
if "source" in entry and "codemeta.json" in entry["source"]:
66-
if "result" in entry and "value" in entry["result"]:
67-
readme_url = entry["result"]["value"]
68-
69-
if is_homepage_url(readme_url):
70-
result["has_pitfall"] = True
71-
result["readme_url"] = readme_url
72-
result["source"] = entry["source"]
73-
result["is_homepage"] = True
74-
break
75-
1+
from typing import Dict, List
2+
import re
3+
from urllib.parse import urlparse
4+
5+
6+
def is_homepage_url(url: str) -> bool:
    """
    Check if a URL appears to be a homepage/wiki rather than a README file.

    Decision order:
      1. Empty or non-string values are never homepages.
      2. raw.githubusercontent.com links are never treated as homepages.
      3. github.com / gitlab.com links are homepages unless they mention
         "readme" or contain "blob/".
      4. Documentation/wiki indicators (readthedocs, github.io, "wiki",
         "docs.", "documentation") mark a homepage.
      5. For generic .org/.com/.net URLs, the URL is a homepage only when it
         has no path beyond an optional trailing slash and does not mention
         "readme".  Any non-empty path (README.md, index.html, /project, ...)
         is treated as pointing at a concrete resource.

    Args:
        url: URL found in the README property.

    Returns:
        True if the URL is likely a homepage, False otherwise.
    """
    if not url or not isinstance(url, str):
        return False

    url_lower = url.strip().lower()

    if 'raw.githubusercontent.com' in url_lower:
        return False

    if 'github.com' in url_lower or 'gitlab.com' in url_lower:
        # A link to a specific file is fine; anything else is the repository page.
        return not ('readme' in url_lower or 'blob/' in url_lower)

    # Documentation sites and wikis
    homepage_indicators = (
        '.readthedocs.io',
        '.github.io',
        'wiki',
        'docs.',
        'documentation',
    )
    if any(indicator in url_lower for indicator in homepage_indicators):
        return True

    # Generic domains (.org, .com, .net): decide on the URL *path* only, so
    # that dots or extension-like text in the host name (e.g. "www.mdpi.com")
    # and a bare trailing slash ("https://example.com/") are not mistaken for
    # a file reference.
    if any(domain in url_lower for domain in ('.org', '.com', '.net')):
        if 'readme' in url_lower:
            return False
        try:
            # Scheme-less values are prefixed with "//" so the host is parsed
            # as the network location rather than as part of the path.
            parsed = urlparse(url_lower if '://' in url_lower else '//' + url_lower)
        except ValueError:
            # Unparseable URL: do not flag what cannot be analysed.
            return False
        return not parsed.path.strip('/')

    return False
50+
51+
52+
def detect_readme_homepage_pitfall(somef_data: Dict, file_name: str) -> Dict:
    """
    Detect when README property in codemeta.json points to homepage/wiki instead of README file.

    Only "readme_url" entries whose technique is "code_parser" and whose
    source contains "codemeta.json" are inspected.  Scanning stops at the
    first entry whose value is_homepage_url() classifies as a homepage.
    Malformed entries (non-dict entry, non-string source, non-dict result)
    are skipped instead of raising.

    Args:
        somef_data: Parsed extraction output; expected to hold a "readme_url" list.
        file_name: Name of the analysed file, echoed back in the result.

    Returns:
        Dict with keys "has_pitfall", "file_name", "readme_url", "source"
        and "is_homepage".
    """
    result = {
        "has_pitfall": False,
        "file_name": file_name,
        "readme_url": None,
        "source": None,
        "is_homepage": False
    }

    if not isinstance(somef_data, dict):
        return result

    readme_entries = somef_data.get("readme_url")
    if not isinstance(readme_entries, list):
        return result

    for entry in readme_entries:
        if not isinstance(entry, dict) or entry.get("technique") != "code_parser":
            continue

        source = entry.get("source")
        if not isinstance(source, str) or "codemeta.json" not in source:
            continue

        entry_result = entry.get("result")
        if not isinstance(entry_result, dict) or "value" not in entry_result:
            continue

        readme_url = entry_result["value"]
        if is_homepage_url(readme_url):
            result["has_pitfall"] = True
            result["readme_url"] = readme_url
            result["source"] = source
            result["is_homepage"] = True
            break

    return result

0 commit comments

Comments
 (0)