1- from typing import Dict
2- import re
3- from metacheck .utils .pitfall_utils import extract_metadata_source_filename
4-
def has_multiple_authors_in_single_field(author_value: str) -> bool:
    """
    Check if a single author field contains multiple authors.

    This is a heuristic: the value is flagged as soon as it contains one of
    the usual separators between names (" and ", " & ", ",", ";" or a line
    break). A comma that only introduces a generational suffix
    ("Smith, Jr.", "Smith,Sr.", "Smith, III") is not counted as a separator.

    Args:
        author_value: Raw content of one author field.

    Returns:
        True if a separator is found; False otherwise, including when the
        value is empty or is not a string.
    """
    if not author_value or not isinstance(author_value, str):
        return False

    # Common patterns indicating multiple authors
    multiple_author_patterns = [
        r' and ',  # "John Smith and Jane Doe"
        r' & ',    # "John Smith & Jane Doe"
        # "John Smith, Jane Doe", but not "Smith, Jr." / "Smith,Sr." /
        # "Smith, III". The \b keeps names that merely start with a suffix
        # ("Sridhar", "Ivan") from being mistaken for one.
        r',(?!\s*(?:Jr|Sr|III|II|IV)\b\.?)',
        r';',      # "John Smith; Jane Doe"
        r'\n',     # Multi-line authors
    ]

    # Leading/trailing whitespace (including line breaks) is not a separator.
    author_value = author_value.strip()

    return any(
        re.search(pattern, author_value, re.IGNORECASE)
        for pattern in multiple_author_patterns
    )
29-
30-
def detect_multiple_authors_single_field_pitfall(somef_data: Dict, file_name: str) -> Dict:
    """
    Detect when metadata files have multiple authors in a single field instead of a list.

    Only entries of somef_data["authors"] whose technique is "code_parser"
    and whose source mentions one of the known metadata file names are
    inspected. The scan stops at the first entry that is flagged.

    Args:
        somef_data: Mapping that may contain an "authors" list of entries.
        file_name: Copied unchanged into the "file_name" key of the result.

    Returns:
        A dict with the keys "has_pitfall", "file_name", "author_value",
        "source", "metadata_source_file" and "multiple_authors_detected".
        When nothing is flagged, the two booleans are False and the other
        detail keys are None.
    """
    result = {
        "has_pitfall": False,
        "file_name": file_name,
        "author_value": None,
        "source": None,
        "metadata_source_file": None,
        "multiple_authors_detected": False,
    }

    if "authors" not in somef_data:
        return result

    authors_entries = somef_data["authors"]
    if not isinstance(authors_entries, list):
        return result

    # Metadata sources to check. They are matched against the lower-cased
    # source, so every name here must itself be lower case ("DESCRIPTION"
    # could never match).
    metadata_sources = (
        "codemeta.json", "description", "composer.json", "package.json",
        "pom.xml", "pyproject.toml", "requirements.txt", "setup.py",
    )

    for entry in authors_entries:
        # Skip malformed entries instead of failing on .get()/.lower().
        if not isinstance(entry, dict):
            continue

        source = entry.get("source", "")
        if entry.get("technique", "") != "code_parser" or not isinstance(source, str):
            continue

        lowered_source = source.lower()
        if not any(name in lowered_source for name in metadata_sources):
            continue

        extraction = entry.get("result")
        if not isinstance(extraction, dict) or "value" not in extraction:
            continue

        # The value is either the author string itself or a structured
        # author whose "name" holds that string.
        author_value = extraction["value"]
        if isinstance(author_value, dict):
            author_value = author_value.get("name")

        if isinstance(author_value, str) and has_multiple_authors_in_single_field(author_value):
            result["has_pitfall"] = True
            result["author_value"] = author_value
            result["source"] = source
            result["metadata_source_file"] = extract_metadata_source_filename(source)
            result["multiple_authors_detected"] = True
            break

    return result
1+
2+ from typing import Dict
3+ import re
4+ from metacheck .utils .pitfall_utils import extract_metadata_source_filename
5+
# One alternation of every separator that suggests several names share a field.
_MULTIPLE_AUTHOR_SEPARATOR = re.compile(
    "|".join([
        r" and ",   # "John Smith and Jane Doe"
        r" & ",     # "John Smith & Jane Doe"
        # "John Smith, Jane Doe", but not a comma that only introduces a
        # generational suffix ("Smith, Jr.", "Smith,Sr.", "Smith, III").
        # The \b stops names such as "Sridhar" from passing as a suffix.
        r",(?!\s*(?:Jr|Sr|III|II|IV)\b\.?)",
        r";",       # "John Smith; Jane Doe"
        r"\n",      # Multi-line authors
    ]),
    re.IGNORECASE,
)


def has_multiple_authors_in_single_field(author_value: str) -> bool:
    """
    Check if a single author field contains multiple authors.

    This is a heuristic based on separators (" and ", " & ", ",", ";" or a
    line break) found inside the value once surrounding whitespace has been
    removed. Commas followed only by a generational suffix (Jr, Sr, II,
    III, IV) are ignored.

    Args:
        author_value: Raw content of one author field.

    Returns:
        True if a separator is found; False otherwise, including when the
        value is empty or is not a string.
    """
    if not isinstance(author_value, str) or not author_value:
        return False

    return _MULTIPLE_AUTHOR_SEPARATOR.search(author_value.strip()) is not None
30+
31+
32+ def detect_multiple_authors_single_field_pitfall (somef_data : Dict , file_name : str ) -> Dict :
33+ """
34+ Detect when metadata files have multiple authors in a single field instead of a list.
35+ """
36+ result = {
37+ "has_pitfall" : False ,
38+ "file_name" : file_name ,
39+ "author_value" : None ,
40+ "source" : None ,
41+ "metadata_source_file" : None ,
42+ "multiple_authors_detected" : False
43+ }
44+
45+ if "authors" not in somef_data :
46+ return result
47+
48+ authors_entries = somef_data ["authors" ]
49+ if not isinstance (authors_entries , list ):
50+ return result
51+
52+ # Define metadata sources to check (convert to lowercase for comparison)
53+ metadata_sources = ["codemeta.json" , "description" , "composer.json" , "package.json" , "pom.xml" , "pyproject.toml" ,
54+ "requirements.txt" , "setup.py" ]
55+
56+ for entry in authors_entries :
57+ source = entry .get ("source" , "" )
58+ technique = entry .get ("technique" , "" )
59+
60+ is_metadata_source = (
61+ technique == "code_parser" and
62+ any (src in source .lower () for src in metadata_sources )
63+ )
64+
65+ if is_metadata_source :
66+ if "result" in entry and "value" in entry ["result" ]:
67+ author_value = entry ["result" ]["value" ]
68+
69+ # Handle different value formats
70+ if isinstance (author_value , str ):
71+ if has_multiple_authors_in_single_field (author_value ):
72+ result ["has_pitfall" ] = True
73+ result ["author_value" ] = author_value
74+ result ["source" ] = source
75+ result ["metadata_source_file" ] = extract_metadata_source_filename (source )
76+ result ["multiple_authors_detected" ] = True
77+ break
78+
79+ elif isinstance (author_value , dict ) and "name" in author_value :
80+ # Handle structured author data
81+ name_value = author_value ["name" ]
82+ if isinstance (name_value , str ) and has_multiple_authors_in_single_field (name_value ):
83+ result ["has_pitfall" ] = True
84+ result ["author_value" ] = name_value
85+ result ["source" ] = source
86+ result ["metadata_source_file" ] = extract_metadata_source_filename (source )
87+ result ["multiple_authors_detected" ] = True
88+ break
89+
8990 return result
0 commit comments