from itertools import combinations, product
from typing import Any, Dict, List, Tuple

from metacheck.utils.pitfall_utils import extract_metadata_source_filename
3+
4+
def extract_authors_from_somef(somef_data: Dict) -> List[Dict[str, Any]]:
    """Collect the author entries reported by each source in SoMEF output.

    Args:
        somef_data: Parsed SoMEF output. Only its ``"author"`` key is read;
            it is expected to hold a list of entries, each a dict with
            ``"source"`` and ``"result"`` keys.

    Returns:
        One dict per usable entry with the keys ``"source"``,
        ``"source_file"``, ``"author_count"`` and ``"authors"``. Entries that
        are not dicts, lack ``"source"`` or ``"result"``, or whose result is
        not a non-empty list, a dict or a string are skipped. An empty list
        is returned when ``somef_data`` is empty/``None``, has no
        ``"author"`` key, or that key does not hold a list.
    """
    if not somef_data or "author" not in somef_data:
        return []

    author_entries = somef_data["author"]
    if not isinstance(author_entries, list):
        return []

    results = []
    for entry in author_entries:
        # Malformed entries (None, numbers, bare strings) are skipped instead
        # of raising on the membership test or the key lookups below.
        if not isinstance(entry, dict):
            continue
        if "source" not in entry or "result" not in entry:
            continue

        source = entry["source"]
        result = entry["result"]

        if isinstance(result, list):
            authors_list = [get_author_identifier(author) for author in result]
        elif isinstance(result, (dict, str)):
            # A single author may be given directly rather than inside a list.
            authors_list = [get_author_identifier(result)]
        else:
            continue

        # An empty result list carries no author information for this source.
        if not authors_list:
            continue

        results.append({
            "source": source,
            "source_file": extract_metadata_source_filename(source),
            "author_count": len(authors_list),
            "authors": authors_list,
        })

    return results
48+
49+
def get_author_identifier(author: Any) -> str:
    """Return a display string identifying a single author entry.

    Args:
        author: A plain string, a dict that may carry ``"name"``,
            ``"value"`` or ``"email"`` keys, or any other object.

    Returns:
        For a string, the string with surrounding whitespace stripped.
        For a dict, the stripped text of the first of ``"name"``,
        ``"value"``, ``"email"`` whose value is neither ``None`` nor blank.
        In every other case, ``str(author)``.
    """
    if isinstance(author, str):
        return author.strip()

    if isinstance(author, dict):
        for key in ("name", "value", "email"):
            value = author.get(key)
            # A None value would otherwise become the literal text "None"
            # and hide a usable lower-priority key.
            if value is None:
                continue
            text = str(value).strip()
            if text:
                return text

    return str(author)
67+
68+
def find_author_count_inconsistencies(author_sources: List[Dict]) -> Tuple[bool, List[Dict]]:
    """Compare author counts across sources and report every mismatching pair.

    Args:
        author_sources: Dicts that each provide the keys ``"source"``,
            ``"source_file"``, ``"author_count"`` and ``"authors"``.

    Returns:
        A ``(has_inconsistency, details)`` tuple. ``details`` holds one dict
        per pair of sources whose counts differ, describing the source with
        fewer authors, the source with more authors and the difference.
        Pairs are ordered by ascending lower count, then ascending higher
        count, then by input order within each count. ``(False, [])`` is
        returned when fewer than two sources are given or all counts agree.
    """
    if len(author_sources) < 2:
        return False, []

    # Group sources by their author count, keeping input order per group.
    sources_by_count = {}
    for source_info in author_sources:
        sources_by_count.setdefault(source_info["author_count"], []).append(source_info)

    if len(sources_by_count) < 2:
        return False, []

    inconsistencies = []
    # combinations() over the sorted counts yields each (lower, higher) pair once.
    for lower_count, higher_count in combinations(sorted(sources_by_count), 2):
        source_pairs = product(sources_by_count[lower_count], sources_by_count[higher_count])
        for lower_source, higher_source in source_pairs:
            inconsistencies.append({
                "source_with_fewer": lower_source["source_file"],
                "source_with_fewer_full": lower_source["source"],
                "fewer_count": lower_count,
                "fewer_authors": lower_source["authors"],
                "source_with_more": higher_source["source_file"],
                "source_with_more_full": higher_source["source"],
                "more_count": higher_count,
                "more_authors": higher_source["authors"],
                "difference": higher_count - lower_count,
            })

    return bool(inconsistencies), inconsistencies
110+
111+
def detect_inconsistent_author_count(somef_data: Dict, file_name: str) -> Dict:
    """Build the author-count consistency report for one SoMEF result.

    Args:
        somef_data: Parsed SoMEF output, passed to
            ``extract_authors_from_somef``.
        file_name: Name copied unchanged into the report.

    Returns:
        A dict with the keys ``"has_warning"``, ``"file_name"``,
        ``"author_sources"``, ``"inconsistencies"``, ``"total_sources"``,
        ``"min_author_count"`` and ``"max_author_count"``. When no author
        sources are found, the lists are empty, the numbers are 0 and
        ``"has_warning"`` is ``False``.
    """
    sources = extract_authors_from_somef(somef_data)

    # Nothing to compare: hand back the neutral report.
    if not sources:
        return {
            "has_warning": False,
            "file_name": file_name,
            "author_sources": [],
            "inconsistencies": [],
            "total_sources": 0,
            "min_author_count": 0,
            "max_author_count": 0,
        }

    per_source_counts = [item["author_count"] for item in sources]
    smallest = min(per_source_counts)
    largest = max(per_source_counts)

    flagged, details = find_author_count_inconsistencies(sources)

    return {
        "has_warning": bool(flagged),
        "file_name": file_name,
        "author_sources": sources,
        # Details are only reported when a mismatch was actually flagged.
        "inconsistencies": details if flagged else [],
        "total_sources": len(sources),
        "min_author_count": smallest,
        "max_author_count": largest,
    }
0 commit comments