1- from typing import Dict , List
2- import re
3- from urllib .parse import urlparse
4-
5-
def is_homepage_url(url: str) -> bool:
    """
    Check if a URL appears to be a homepage/wiki rather than a README file.

    All matching is done on case-insensitive substrings of the whole URL,
    in this order:

    1. ``raw.githubusercontent.com`` URLs are never homepages.
    2. ``github.com`` / ``gitlab.com`` URLs are homepages unless they contain
       ``readme`` or ``blob/``.
    3. URLs containing a documentation/wiki indicator are homepages.
    4. URLs containing ``.org``, ``.com`` or ``.net`` are homepages unless
       they contain a file-like marker (``.md``, ``.txt``, ``.rst``,
       ``.html``, ``readme``).

    Args:
        url: The URL to classify. Empty or non-string values yield False.

    Returns:
        True if the URL is likely a homepage, False otherwise.
    """
    # Non-string values (e.g. a list or number taken from loosely-typed
    # metadata) are treated as "not a homepage" rather than raising
    # AttributeError on .lower().
    if not url or not isinstance(url, str):
        return False

    url_lower = url.lower()

    # Raw file hosting always serves file content, never a landing page.
    if 'raw.githubusercontent.com' in url_lower:
        return False

    # Repository hosts: anything that is not an explicit README/blob link
    # is considered a project landing page.
    if 'github.com' in url_lower or 'gitlab.com' in url_lower:
        return not ('readme' in url_lower or 'blob/' in url_lower)

    # Documentation sites and wikis.
    homepage_indicators = (
        '.readthedocs.io',
        '.github.io',
        'wiki',
        'docs.',
        'documentation',
    )
    if any(indicator in url_lower for indicator in homepage_indicators):
        return True

    # Generic domains (.org, .com, .net): a homepage unless the URL looks
    # like a link to a file. No path-based rejection is applied here, since
    # testing for '/' after the last dot would also reject a bare homepage
    # written with a trailing slash (e.g. "https://example.org/").
    if any(domain in url_lower for domain in ('.org', '.com', '.net')):
        file_markers = ('.md', '.txt', '.rst', '.html', 'readme')
        return not any(marker in url_lower for marker in file_markers)

    return False
50-
51-
def detect_readme_homepage_pitfall(somef_data: Dict, file_name: str) -> Dict:
    """
    Detect when README property in codemeta.json points to homepage/wiki instead of README file.

    Scans the ``"readme_url"`` entries of ``somef_data`` and stops at the
    first entry that was produced by the ``"code_parser"`` technique, whose
    ``"source"`` contains ``"codemeta.json"``, and whose ``result.value`` is
    classified as a homepage by ``is_homepage_url``.

    Args:
        somef_data: Metadata dictionary; only its ``"readme_url"`` key is read.
        file_name: Copied verbatim into the returned dictionary.

    Returns:
        A dictionary with the keys ``has_pitfall``, ``file_name``,
        ``readme_url``, ``source`` and ``is_homepage``. When no matching
        entry is found, ``has_pitfall``/``is_homepage`` are False and
        ``readme_url``/``source`` are None.
    """
    result = {
        "has_pitfall": False,
        "file_name": file_name,
        "readme_url": None,
        "source": None,
        "is_homepage": False,
    }

    # Missing/empty input or a missing key means there is nothing to inspect.
    if not somef_data or "readme_url" not in somef_data:
        return result

    readme_entries = somef_data["readme_url"]
    if not isinstance(readme_entries, list):
        return result

    for entry in readme_entries:
        # Skip malformed entries instead of raising on them.
        if not isinstance(entry, dict):
            continue
        if entry.get("technique") != "code_parser":
            continue

        # A missing or None source simply does not match.
        source = entry.get("source") or ""
        if "codemeta.json" not in source:
            continue

        payload = entry.get("result")
        if not isinstance(payload, dict) or "value" not in payload:
            continue

        readme_url = payload["value"]
        # Only string values can be classified as URLs.
        if not isinstance(readme_url, str):
            continue

        if is_homepage_url(readme_url):
            result["has_pitfall"] = True
            result["readme_url"] = readme_url
            result["source"] = entry["source"]
            result["is_homepage"] = True
            break  # the first offending entry is enough

    return result
83-
1+ from typing import Dict , List
2+ import re
3+ from urllib .parse import urlparse
4+
5+
6+
def is_homepage_url(url: str) -> bool:
    """
    Check if a URL appears to be a homepage/wiki rather than a README file.

    All matching is done on case-insensitive substrings of the whole URL,
    in this order:

    1. ``raw.githubusercontent.com`` URLs are never homepages.
    2. ``github.com`` / ``gitlab.com`` URLs are homepages unless they contain
       ``readme`` or ``blob/``.
    3. URLs containing a documentation/wiki indicator are homepages.
    4. URLs containing ``.org``, ``.com`` or ``.net`` are homepages unless
       they contain a file-like marker (``.md``, ``.txt``, ``.rst``,
       ``.html``, ``readme``).

    Args:
        url: The URL to classify. Empty or non-string values yield False.

    Returns:
        True if the URL is likely a homepage, False otherwise.
    """
    # Non-string values (e.g. a list or number taken from loosely-typed
    # metadata) are treated as "not a homepage" rather than raising
    # AttributeError on .lower().
    if not url or not isinstance(url, str):
        return False

    url_lower = url.lower()

    # Raw file hosting always serves file content, never a landing page.
    if 'raw.githubusercontent.com' in url_lower:
        return False

    # Repository hosts: anything that is not an explicit README/blob link
    # is considered a project landing page.
    if 'github.com' in url_lower or 'gitlab.com' in url_lower:
        return not ('readme' in url_lower or 'blob/' in url_lower)

    # Documentation sites and wikis.
    homepage_indicators = (
        '.readthedocs.io',
        '.github.io',
        'wiki',
        'docs.',
        'documentation',
    )
    if any(indicator in url_lower for indicator in homepage_indicators):
        return True

    # Generic domains (.org, .com, .net): a homepage unless the URL looks
    # like a link to a file.
    if any(domain in url_lower for domain in ('.org', '.com', '.net')):
        file_markers = ('.md', '.txt', '.rst', '.html', 'readme')
        return not any(marker in url_lower for marker in file_markers)

    return False
46+
47+
def detect_readme_homepage_pitfall(somef_data: Dict, file_name: str) -> Dict:
    """
    Detect when README property in codemeta.json points to homepage/wiki instead of README file.

    Scans the ``"readme_url"`` entries of ``somef_data`` and stops at the
    first entry that was produced by the ``"code_parser"`` technique, whose
    ``"source"`` contains ``"codemeta.json"``, and whose ``result.value`` is
    classified as a homepage by ``is_homepage_url``.

    Args:
        somef_data: Metadata dictionary; only its ``"readme_url"`` key is read.
        file_name: Copied verbatim into the returned dictionary.

    Returns:
        A dictionary with the keys ``has_pitfall``, ``file_name``,
        ``readme_url``, ``source`` and ``is_homepage``. When no matching
        entry is found, ``has_pitfall``/``is_homepage`` are False and
        ``readme_url``/``source`` are None.
    """
    result = {
        "has_pitfall": False,
        "file_name": file_name,
        "readme_url": None,
        "source": None,
        "is_homepage": False,
    }

    # Missing/empty input or a missing key means there is nothing to inspect.
    if not somef_data or "readme_url" not in somef_data:
        return result

    readme_entries = somef_data["readme_url"]
    if not isinstance(readme_entries, list):
        return result

    for entry in readme_entries:
        # Skip malformed entries instead of raising on them.
        if not isinstance(entry, dict):
            continue
        if entry.get("technique") != "code_parser":
            continue

        # A missing or None source simply does not match.
        source = entry.get("source") or ""
        if "codemeta.json" not in source:
            continue

        payload = entry.get("result")
        if not isinstance(payload, dict) or "value" not in payload:
            continue

        readme_url = payload["value"]
        # Only string values can be classified as URLs.
        if not isinstance(readme_url, str):
            continue

        if is_homepage_url(readme_url):
            result["has_pitfall"] = True
            result["readme_url"] = readme_url
            result["source"] = entry["source"]
            result["is_homepage"] = True
            break  # the first offending entry is enough

    return result
0 commit comments