SoftwareUnderstanding
diff --git a/‎.idea/.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.idea/.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/metacheck/detect_pitfalls_main.py‎
Lines changed: 9 additions & 0 deletions b/‎src/metacheck/detect_pitfalls_main.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎src/metacheck/scripts/pitfalls/p005.py‎
Lines changed: 0 additions & 1 deletion b/‎src/metacheck/scripts/pitfalls/p005.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/metacheck/scripts/pitfalls/p006.py‎
Lines changed: 1 addition & 3 deletions b/‎src/metacheck/scripts/pitfalls/p006.py‎
Lines changed: 1 addition & 3 deletions
diff --git a/‎src/metacheck/scripts/pitfalls/p007.py‎
Lines changed: 33 additions & 50 deletions b/‎src/metacheck/scripts/pitfalls/p007.py‎
Lines changed: 33 additions & 50 deletions
diff --git a/‎src/metacheck/scripts/pitfalls/p009.py‎
Lines changed: 15 additions & 10 deletions b/‎src/metacheck/scripts/pitfalls/p009.py‎
Lines changed: 15 additions & 10 deletions
diff --git a/‎src/metacheck/scripts/pitfalls/p010.py‎
Lines changed: 12 additions & 13 deletions b/‎src/metacheck/scripts/pitfalls/p010.py‎
Lines changed: 12 additions & 13 deletions
diff --git a/‎src/metacheck/scripts/pitfalls/p012.py‎
Lines changed: 15 additions & 15 deletions b/‎src/metacheck/scripts/pitfalls/p012.py‎
Lines changed: 15 additions & 15 deletions
@@ -35,6 +35,7 @@
 from metacheck.scripts.warnings.w008 import detect_author_name_list_warning
 from metacheck.scripts.warnings.w009 import detect_development_status_url_pitfall
 from metacheck.scripts.warnings.w010 import detect_git_remote_shorthand_pitfall
+from metacheck.scripts.warnings.w011 import detect_inconsistent_author_count
 
 
 def detect_all_pitfalls(json_files: Iterable[Path], pitfalls_output_dir: Union[str, Path], output_file: Union[str, Path]):
@@ -260,6 +261,13 @@ def detect_all_pitfalls(json_files: Iterable[Path], pitfalls_output_dir: Union[s
                 "percentage": 0.0,
                 "languages": {}
             },
+            {
+                "pitfall_code": "W011",
+                "pitfall_desc": "The metadata file codeRepository does not have matching number of authors",
+                "count": 0,
+                "percentage": 0.0,
+                "languages": {}
+            }
         ]
     }
 
@@ -299,6 +307,7 @@ def detect_all_pitfalls(json_files: Iterable[Path], pitfalls_output_dir: Union[s
         (detect_author_name_list_warning, "W008"),  # Index 24 -> W008
         (detect_development_status_url_pitfall, "W009"),  # Index 25 -> W009
         (detect_git_remote_shorthand_pitfall, "W010"),  # Index 26 -> W010
+        (detect_inconsistent_author_count, "W011"),
     ]
 
     for json_file in json_files:
 
@@ -1,7 +1,6 @@
 from typing import Dict
 import re
 
-
 def is_software_archive_url(url: str) -> bool:
     """
     Check if URL points to a software archive instead of a research paper.
 
@@ -1,6 +1,4 @@
-
 from typing import Dict
-import re
 from metacheck.utils.pitfall_utils import extract_metadata_source_filename
 
 def is_local_file_license(license_value: str) -> bool:
@@ -26,7 +24,7 @@ def is_local_file_license(license_value: str) -> bool:
         'copying', 'copying.md', 'copying.txt',
         'copyright', 'copyright.md', 'copyright.txt',
         'licence', 'licence.md', 'licence.txt',  # British spelling
-        'readme.md', 'doc.txt', 'file.rst'  # Other common file patterns
+        'readme.md', 'doc.txt', 'file.rst'
     ]
 
     if license_lower in license_file_names:
 
@@ -13,55 +13,38 @@ def detect_citation_missing_reference_publication_pitfall(somef_data: Dict, file
         "citation_cff_exists": False
     }
 
-    if "citation" not in somef_data:
-        return result
-
-    citation_entries = somef_data["citation"]
-    if not isinstance(citation_entries, list):
-        return result
-
-    codemeta_citation_value = None
-    citation_cff_citation_value = None
-    citation_cff_exists_in_somef = False
-
-    for entry in citation_entries:
-        source = entry.get("source", "")
-        technique = entry.get("technique", "")
-
-        if technique == "code_parser" and "codemeta.json" in source:
-            if "result" in entry and "value" in entry["result"]:
-                codemeta_citation_value = entry["result"]["value"]
-                result["codemeta_has_reference"] = True
-        elif "CITATION.cff" in source:
-            citation_cff_exists_in_somef = True
-            result["citation_cff_exists"] = True
-            if "result" in entry and "value" in entry["result"]:
-                citation_cff_citation_value = entry["result"]["value"]
-
-    if not citation_cff_exists_in_somef:
-        citation_cff_sources = ["authors", "title", "description", "version", "license"]
-        for category in citation_cff_sources:
-            if category in somef_data:
-                entries = somef_data[category]
-                if isinstance(entries, list):
-                    for entry in entries:
-                        source = entry.get("source", "")
-                        if "CITATION.cff" in source:
-                            citation_cff_exists_in_somef = True
-                            result["citation_cff_exists"] = True
-                            break
-
-    if (codemeta_citation_value and
-            citation_cff_exists_in_somef and
-            (not citation_cff_citation_value or citation_cff_citation_value != codemeta_citation_value)):
-
-        if citation_cff_citation_value:
-            if ("doi.org" in codemeta_citation_value or "http" in codemeta_citation_value):
-                if not ("doi.org" in citation_cff_citation_value or "http" in citation_cff_citation_value):
-                    result["has_pitfall"] = True
-                elif codemeta_citation_value not in citation_cff_citation_value and citation_cff_citation_value not in codemeta_citation_value:
-                    result["has_pitfall"] = True
-        else:
-            result["has_pitfall"] = True
+    if "reference_publication" in somef_data:
+        ref_pub_entries = somef_data["reference_publication"]
+        if isinstance(ref_pub_entries, list):
+            for entry in ref_pub_entries:
+                source = entry.get("source", "")
+                technique = entry.get("technique", "")
+
+                if technique == "code_parser" and "codemeta.json" in source:
+                    if "result" in entry and "value" in entry["result"]:
+                        result["codemeta_has_reference"] = True
+
+                elif "CITATION.cff" in source:
+                    if "result" in entry and "value" in entry["result"]:
+                        result["citation_cff_has_reference"] = True
+
+    citation_cff_sources = ["authors", "title", "description", "version", "license"]
+    for category in citation_cff_sources:
+        if category in somef_data:
+            entries = somef_data[category]
+            if isinstance(entries, list):
+                for entry in entries:
+                    source = entry.get("source", "")
+                    if "CITATION.cff" in source:
+                        result["citation_cff_exists"] = True
+                        break
+
+        if result["citation_cff_exists"]:
+            break
+
+    if (result["codemeta_has_reference"] and
+            result["citation_cff_exists"] and
+            not result["citation_cff_has_reference"]):
+        result["has_pitfall"] = True
 
     return result
@@ -1,3 +1,4 @@
+
 from typing import Dict
 from metacheck.utils.pitfall_utils import extract_metadata_source_filename
 
@@ -11,11 +12,16 @@ def is_repository_url(url: str) -> bool:
 
     url_lower = url.lower()
 
-    # Valid repository indicators
+    if 'github.io' in url_lower:
+        return False
+
     repo_indicators = [
         'github.com/',
+        'github.org/',
         'gitlab.com/',
+        'gitlab.org/',
         'bitbucket.org/',
+        'bitbucket.net/',
         'sourceforge.net/projects/',
         'git.',
         '.git'
@@ -37,7 +43,9 @@ def is_homepage_url_repo(url: str) -> bool:
 
     url_lower = url.lower()
 
-    # Homepage indicators
+    if is_repository_url(url):
+        return False
+
     homepage_indicators = [
         '.org/',
         '.com/',
@@ -50,11 +58,6 @@ def is_homepage_url_repo(url: str) -> bool:
         'github.io'
     ]
 
-    # If it's clearly a repository URL, it's not a homepage
-    if is_repository_url(url):
-        return False
-
-    # Check for homepage indicators
     for indicator in homepage_indicators:
         if indicator in url_lower:
             return True
@@ -82,15 +85,17 @@ def detect_coderepository_homepage_pitfall(somef_data: Dict, file_name: str) ->
     if not isinstance(repo_entries, list):
         return result
 
-    metadata_sources = ["codemeta.json", "DESCRIPTION", "composer.json", "package.json", "pom.xml", "pyproject.toml", "requirements.txt", "setup.py"]
+    metadata_sources = ["codemeta.json", "DESCRIPTION", "composer.json", "package.json",
+                        "pom.xml", "pyproject.toml", "requirements.txt", "setup.py"]
 
     for entry in repo_entries:
         technique = entry.get("technique", "")
         source = entry.get("source", "")
 
         is_metadata_source = (
-                technique in metadata_sources or
-                any(src in source.lower() for src in metadata_sources)
+            technique == "code_parser" or
+            technique in metadata_sources or
+            any(src in source.lower() for src in metadata_sources)
         )
 
         if is_metadata_source:
 
@@ -1,4 +1,3 @@
-
 import re
 from typing import Dict, Optional
 
@@ -18,7 +17,6 @@ def extract_license_from_file(somef_data: Dict) -> Optional[Dict[str, str]]:
     for entry in license_entries:
         if "source" in entry:
             source = entry["source"]
-            # Look for LICENSE files (LICENSE, LICENSE.md, etc.)
             if "LICENSE" in source.upper() and "result" in entry and "value" in entry["result"]:
                 return {
                     "source": source,
@@ -41,16 +39,14 @@ def check_copyright_only_license(license_content: str) -> bool:
     content_lower = license_content.lower().strip()
     content_lines = [line.strip() for line in license_content.strip().split('\n') if line.strip()]
 
-    # Patterns that indicate copyright-only content
     copyright_only_patterns = [
-        r'year\s*:\s*\d{4}',  # YEAR: 2017 (removed ^ and $ to match anywhere in text)
+        r'year\s*:\s*\d{4}',  # YEAR: 2017
         r'copyright\s+holder\s*:\s*[a-zA-Z]',  # COPYRIGHT HOLDER: Someone
         r'author\s*:\s*[a-zA-Z]',  # AUTHOR: Someone
         r'copyright\s*©?\s*\d{4}',  # Copyright 2017 or Copyright © 2017
         r'\(c\)\s*\d{4}',  # (C) 2017
     ]
 
-    # Patterns that indicate actual license terms
     license_term_patterns = [
         r'permission\s+is\s+hereby\s+granted',
         r'subject\s+to\s+the\s+following\s+conditions',
@@ -70,33 +66,36 @@ def check_copyright_only_license(license_content: str) -> bool:
     has_copyright_info = any(re.search(pattern, content_lower) for pattern in copyright_only_patterns)
     has_license_terms = any(re.search(pattern, content_lower) for pattern in license_term_patterns)
 
-    # If it has copyright info but no license terms and is short, it's likely copyright-only
+    if has_license_terms:
+        return False
+
+    # Short content that has copyright info but no license terms is likely copyright-only
     if has_copyright_info and not has_license_terms and len(content_lines) <= 10:
         return True
 
-    # Special case: check for the exact format "YEAR: xxxx" and "COPYRIGHT HOLDER: xxxx"
+    # Check for the exact format "YEAR: xxxx" and "COPYRIGHT HOLDER: xxxx"
     year_pattern_found = bool(re.search(r'year\s*:\s*\d{4}', content_lower))
     copyright_holder_pattern_found = bool(re.search(r'copyright\s+holder\s*:', content_lower))
 
     if year_pattern_found and copyright_holder_pattern_found:
+        if has_license_terms:
+            return False
         return True
 
-    # Additional check: if the content is very short and only contains basic copyright info
-    if len(content_lines) <= 5:  # Increased from 3 to 5 for more flexibility
-        # Check if all lines are just copyright/year information
+    if len(content_lines) <= 5:
         meaningful_lines = []
+
         for line in content_lines:
             line_lower = line.lower()
-            # Skip lines that are just copyright patterns
+
             if not any(re.search(pattern, line_lower) for pattern in copyright_only_patterns):
-                # This line doesn't match copyright patterns, check if it's meaningful
+
                 if (len(line.strip()) > 0 and
                     not line.strip().startswith('#') and
                     not line.strip().startswith('//') and
                     line.strip() not in ['', '-', '=', '*']):
                     meaningful_lines.append(line)
 
-        # If we have very few meaningful lines and some copyright info, it's probably copyright-only
         if len(meaningful_lines) <= 1 and has_copyright_info:
             return True
 
 
@@ -11,16 +11,21 @@ def extract_version_from_download_url(url: str) -> str:
 
     # Common version patterns in download URLs
     version_patterns = [
-        r'/archive/(?:v)?(\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9\-\.]*)?)',  # /archive/3.8.0 or /archive/v1.2.3
+        r'/archive/(?:v)?(\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9\-\.]*)?)\.',  # /archive/3.8.0. or /archive/v1.2.3.
+        r'/archive/(?:v)?(\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9\-\.]*)?)$',
+        # /archive/3.8.0 or /archive/v1.2.3 (end of string)
         r'[-_](?:v)?(\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9\-\.]*)?)\.',  # -3.8.0.tar.gz or _v1.2.3.zip
         r'/(?:v)?(\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9\-\.]*)?)/[^/]*$',  # /3.8.0/something
-        r'[-_/](?:v)?(\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9\-\.]*)?)(?:\.tar\.gz|\.zip|$)'  # More flexible ending
     ]
 
     for pattern in version_patterns:
         match = re.search(pattern, url)
         if match:
-            return match.group(1)
+            version = match.group(1)
+            # Remove any trailing file extension artifacts
+            # This handles cases where .tar, .zip etc might be captured
+            version = re.sub(r'\.(tar|gz|zip|bz2|xz|tgz).*$', '', version)
+            return version
 
     return None
 
@@ -32,12 +37,16 @@ def normalize_version(version: str) -> str:
     if not version:
         return None
 
-    # Remove 'v' prefix if present
-    normalized = version.lower().strip()
+    normalized = version.strip()
+
+    if not normalized:
+        return None
+
+    normalized = normalized.lower()
     if normalized.startswith('v'):
         normalized = normalized[1:]
 
-    return normalized
+    return normalized if normalized else None
 
 
 def get_latest_release_version(somef_data: Dict) -> str:
@@ -51,21 +60,17 @@ def get_latest_release_version(somef_data: Dict) -> str:
     if not isinstance(releases, list) or not releases:
         return None
 
-    # Get the first (latest) release
     latest_release = releases[0]
     if "result" in latest_release:
         result = latest_release["result"]
 
-        # Try to get version from tag first
         if "tag" in result and result["tag"]:
             tag = result["tag"].strip()
             if tag:
                 return normalize_version(tag)
 
-        # Fallback to name if tag is not available
         if "name" in result and result["name"]:
             name = result["name"]
-            # Extract version from name
             version_match = re.search(r'(?:v)?(\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9\-\.]*)?)', name)
             if version_match:
                 return normalize_version(version_match.group(1))
@@ -96,7 +101,6 @@ def detect_outdated_download_url_pitfall(somef_data: Dict, file_name: str) -> Di
     codemeta_download_url = None
     codemeta_source = None
 
-    # Find download URL from codemeta.json
     for entry in download_entries:
         source = entry.get("source", "")
         technique = entry.get("technique", "")
@@ -111,24 +115,20 @@ def detect_outdated_download_url_pitfall(somef_data: Dict, file_name: str) -> Di
     if not codemeta_download_url:
         return result
 
-    # Extract version from download URL
     download_version = extract_version_from_download_url(codemeta_download_url)
     if not download_version:
         return result
 
-    # Get latest release version
     latest_version = get_latest_release_version(somef_data)
     if not latest_version:
         return result
 
-    # Normalize both versions for comparison
     normalized_download_version = normalize_version(download_version)
     normalized_latest_version = normalize_version(latest_version)
 
     if not normalized_download_version or not normalized_latest_version:
         return result
 
-    # Compare versions
     if normalized_download_version != normalized_latest_version:
         result["has_pitfall"] = True
         result["download_url"] = codemeta_download_url