Skip to content

Commit 5a7b724

Browse files
Added new Pitfall: P019 where we detect inconsistent number of authors across metadata sources
1 parent 03227ab commit 5a7b724

3 files changed

Lines changed: 749 additions & 10 deletions

File tree

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
from typing import Dict, List, Tuple
2+
from metacheck.utils.pitfall_utils import extract_metadata_source_filename
3+
4+
5+
def extract_authors_from_somef(somef_data: Dict) -> List[Dict[str, object]]:
    """
    Extract all author entries from different sources in SoMEF output.

    Args:
        somef_data: Parsed SoMEF output. Only its ``"author"`` key is read;
            it is expected to hold a list of entries that each carry a
            ``"source"`` and a ``"result"`` key.

    Returns:
        One dict per usable entry with the keys ``"source"``,
        ``"source_file"``, ``"author_count"`` and ``"authors"``. Entries that
        are not dicts, lack ``"source"``/``"result"``, or yield no authors
        are skipped. An empty list is returned when ``"author"`` is missing
        or is not a list.
    """
    if "author" not in somef_data:
        return []

    author_entries = somef_data["author"]
    if not isinstance(author_entries, list):
        return []

    results = []

    for entry in author_entries:
        # Malformed entries (None, numbers, ...) would make the `in` checks
        # below raise TypeError, so they are skipped like incomplete ones.
        if not isinstance(entry, dict):
            continue
        if "source" not in entry or "result" not in entry:
            continue

        source = entry["source"]
        result = entry["result"]

        if isinstance(result, list):
            authors_list = [get_author_identifier(author) for author in result]
        elif isinstance(result, (dict, str)):
            # A single author given directly rather than wrapped in a list.
            authors_list = [get_author_identifier(result)]
        else:
            authors_list = []

        # Sources that contribute no authors are left out of the comparison.
        if not authors_list:
            continue

        results.append({
            "source": source,
            "source_file": extract_metadata_source_filename(source),
            "author_count": len(authors_list),
            "authors": authors_list,
        })

    return results
48+
49+
50+
def get_author_identifier(author: object) -> str:
    """
    Extract a string identifier for an author from various formats.

    Args:
        author: A plain string, a dict describing the author, or any other
            object.

    Returns:
        The stripped string for string input. For a dict, the first
        non-blank value among the keys ``"name"``, ``"value"`` and
        ``"email"`` (in that order), converted to ``str`` and stripped.
        Anything else, including a dict without a usable key, is returned
        as ``str(author)``.
    """
    if isinstance(author, str):
        return author.strip()

    if isinstance(author, dict):
        for key in ("name", "value", "email"):
            value = author.get(key)
            # A key that is present but None would otherwise produce the
            # literal identifier "None"; fall through to the next key.
            if value is None:
                continue
            text = str(value).strip()
            if text:
                return text

    return str(author)
67+
68+
69+
def find_author_count_inconsistencies(author_sources: List[Dict]) -> Tuple[bool, List[Dict]]:
    """
    Compare the author counts reported by each source against one another.

    Args:
        author_sources: Per-source dicts providing the keys ``"source"``,
            ``"source_file"``, ``"author_count"`` and ``"authors"``.

    Returns:
        A ``(has_inconsistency, details)`` tuple. ``details`` holds one dict
        for every pair of sources whose counts differ, with the source
        having fewer authors listed first. ``(False, [])`` is returned when
        fewer than two sources are given or all counts agree.
    """
    if len(author_sources) < 2:
        return False, []

    # Bucket the sources by the number of authors they report.
    by_count = {}
    for info in author_sources:
        by_count.setdefault(info["author_count"], []).append(info)

    if len(by_count) <= 1:
        return False, []

    ordered = sorted(by_count)
    mismatches = []

    # Pair each count with every strictly larger one, then cross the sources
    # of both buckets so each differing pair is reported exactly once.
    for pos, fewer in enumerate(ordered):
        for more in ordered[pos + 1:]:
            for small in by_count[fewer]:
                for big in by_count[more]:
                    mismatches.append({
                        "source_with_fewer": small["source_file"],
                        "source_with_fewer_full": small["source"],
                        "fewer_count": fewer,
                        "fewer_authors": small["authors"],
                        "source_with_more": big["source_file"],
                        "source_with_more_full": big["source"],
                        "more_count": more,
                        "more_authors": big["authors"],
                        "difference": more - fewer,
                    })

    return bool(mismatches), mismatches
110+
111+
112+
def detect_inconsistent_author_count(somef_data: Dict, file_name: str) -> Dict:
    """
    Report whether the metadata sources disagree on the number of authors.

    Args:
        somef_data: Parsed SoMEF output, handed to
            ``extract_authors_from_somef``.
        file_name: Name recorded unchanged in the report.

    Returns:
        A dict with the keys ``"has_warning"``, ``"file_name"``,
        ``"author_sources"``, ``"inconsistencies"``, ``"total_sources"``,
        ``"min_author_count"`` and ``"max_author_count"``. When no author
        sources are found the defaults (no warning, empty lists, zeros) are
        returned.
    """
    report = {
        "has_warning": False,
        "file_name": file_name,
        "author_sources": [],
        "inconsistencies": [],
        "total_sources": 0,
        "min_author_count": 0,
        "max_author_count": 0
    }

    sources = extract_authors_from_somef(somef_data)
    if not sources:
        return report

    per_source_counts = [entry["author_count"] for entry in sources]
    report["author_sources"] = sources
    report["total_sources"] = len(sources)
    report["min_author_count"] = min(per_source_counts)
    report["max_author_count"] = max(per_source_counts)

    flagged, details = find_author_count_inconsistencies(sources)
    if flagged:
        # Only a detected mismatch raises the warning and fills the details.
        report["has_warning"] = True
        report["inconsistencies"] = details

    return report

0 commit comments

Comments
 (0)