feat(maintainers): section extraction for ai analysis [CM-1047] (#3965)

mbani01 · web-flow · commit 003b27c24159 · 2026-03-27T11:10:13.000+01:00
Signed-off-by: Mouad BANI <mouad-mb@outlook.com>
diff --git a/services/apps/git_integration/pyproject.toml b/services/apps/git_integration/pyproject.toml
@@ -35,6 +35,7 @@ dependencies = [
     "aioboto3>=15.1.0",
     "slugify>=0.0.1",
     "orjson>=3.11.3",
+    "pyyaml>=6.0",
 ]
 
 
diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py
@@ -37,6 +37,7 @@
 from crowdgit.models.service_execution import ServiceExecution
 from crowdgit.services.base.base_service import BaseService
 from crowdgit.services.maintainer.bedrock import invoke_bedrock
+from crowdgit.services.maintainer.section_extractor import SectionExtractor
 from crowdgit.services.utils import run_shell_command
 from crowdgit.settings import MAINTAINER_RETRY_INTERVAL_DAYS, MAINTAINER_UPDATE_INTERVAL_HOURS
 
@@ -93,6 +94,7 @@ class MaintainerService(BaseService):
         "code_owners",
         "emeritus",
         "workgroup",
+        "readme",
     }
 
     VALID_EXTENSIONS = {
@@ -132,6 +134,12 @@ class MaintainerService(BaseService):
     STEM_MATCH_SCORE = 50
     PARTIAL_STEM_SCORE = 25
 
+    # Files in KNOWN_PATHS that still need section filtering (contain non-governance content)
+    SECTION_FILTERED_PATHS = {"readme.md", "governance.md"}
+    SCORING_KEYWORDS_SET = frozenset(SCORING_KEYWORDS)
+
+    _section_extractor = SectionExtractor()
+
     def make_role(self, title: str):
         title = title.lower()
         title = (
@@ -360,8 +368,10 @@ async def process_chunk(chunk_index: int, chunk: str):
                 self.get_extraction_prompt(maintainer_filename, content),
                 pydantic_model=MaintainerInfo,
             )
-        self.logger.info("Maintainers file content analyzed by AI")
-        self.logger.info(f"Maintainers response: {maintainer_info}")
+        info_count = len(maintainer_info.output.info) if maintainer_info.output.info else 0
+        self.logger.info(
+            f"Maintainers file content analyzed by AI (found={info_count}, cost={maintainer_info.cost:.4f})"
+        )
         if maintainer_info.output.info is not None:
             return AggregatedMaintainerInfo(
                 output=AggregatedMaintainerInfoItems(info=maintainer_info.output.info),
@@ -373,7 +383,7 @@ async def process_chunk(chunk_index: int, chunk: str):
             )
         else:
             self.logger.error(
-                f"Expected a list of maintainer info or an error message, got: {str(maintainer_info)}"
+                f"Expected a list of maintainer info or an error message, got error={maintainer_info.output.error}"
             )
             raise MaintanerAnalysisError(
                 error_message="Unexpected response from AI for Maintainers analysis",
@@ -586,6 +596,16 @@ async def analyze_and_build_result(self, filename: str, content: str) -> Maintai
                 f"Skipping README file '{filename}': no governance keyword found in content"
             )
             raise MaintanerAnalysisError(error_code=ErrorCode.NO_MAINTAINER_FOUND)
+
+        fname = os.path.basename(filename).lower()
+        if fname not in self.KNOWN_PATHS or fname in self.SECTION_FILTERED_PATHS:
+            extracted = self._section_extractor.extract(fname, content, self.SCORING_KEYWORDS_SET)
+            if extracted:
+                self.logger.info(f"Using extracted sections for '{filename}'")
+                content = extracted
+            else:
+                self.logger.debug(f"No sections extracted for '{filename}', using full content")
+
         result = await self.analyze_file_content(filename, content)
 
         if not result.output.info:
@@ -664,12 +684,6 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult:
         root_candidates, subdir_candidates = await self.find_candidate_files(repo_path)
         all_candidates = root_candidates + subdir_candidates
         candidate_files = [(path, score) for path, _, score in all_candidates][:100]
-        self.logger.debug(
-            f"Detection step 2: {len(root_candidates)} root candidate(s), "
-            f"{len(subdir_candidates)} subdir candidate(s); "
-            f"root={[p for p, _, _ in root_candidates]}, "
-            f"subdir_top={[p for p, _, _ in subdir_candidates[:3]]}"
-        )
 
         # Step 3: Try root-level files first (in score order), then top subdirectory file
         failed_candidates: set[str] = set()
@@ -757,7 +771,6 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult:
             f"Passing {len(ai_input_files)} files to AI for maintainer file detection "
             f"(total repo files: {len(file_names)})"
         )
-        self.logger.debug(f"AI input files: {[f for f, _ in ai_input_files]}")
         ai_file_name, ai_cost = await self.find_maintainer_file_with_ai(ai_input_files)
         ai_suggested_file = ai_file_name
         total_cost += ai_cost
diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/section_extractor.py b/services/apps/git_integration/src/crowdgit/services/maintainer/section_extractor.py
@@ -0,0 +1,90 @@
+import os
+import re
+
+import tomllib
+import yaml
+
+
+class SectionExtractor:
+    """
+    Extracts relevant sections from file content based on file format and governance keywords.
+    Returns None when no relevant sections are found or format is unsupported (caller falls back to full content).
+    """
+
+    def extract(self, filename: str, content: str, keywords: set[str]) -> str | None:
+        """
+        Returns extracted relevant section text, or None if no sections found.
+        filename should be the basename (lowercased).
+        """
+        ext = os.path.splitext(filename)[1]
+        if ext in (".md", ".markdown"):
+            return self._extract_markdown_sections(content, keywords)
+        elif ext in (".yaml", ".yml"):
+            return self._extract_yaml_sections(content, keywords)
+        elif ext == ".toml":
+            return self._extract_toml_sections(content, keywords)
+        return None
+
+    def _extract_markdown_sections(self, content: str, keywords: set[str]) -> str | None:
+        """
+        Splits content on `#`-style heading lines only (# / ## / ###...).
+        Includes a section if its heading text contains any keyword.
+        Returns joined matching sections, or None if none match.
+        """
+        heading_pattern = re.compile(r"^#{1,6}\s+", re.MULTILINE)
+        # Split into (heading_line, body) pairs; first element may be pre-heading content
+        parts = heading_pattern.split(content)
+        headings = heading_pattern.findall(content)
+
+        # parts[0] is text before the first heading (skip it)
+        # parts[1..] correspond to headings[0..]
+        matching_sections = []
+        for i, heading_marker in enumerate(headings):
+            block = parts[i + 1]  # block starts right after the heading marker
+            # The first line of block is the heading text
+            first_newline = block.find("\n")
+            heading_text = block[:first_newline].strip() if first_newline != -1 else block.strip()
+            if any(kw in heading_text.lower() for kw in keywords):
+                matching_sections.append(f"{heading_marker}{block}")
+
+        return "".join(matching_sections) if matching_sections else None
+
+    def _extract_yaml_sections(self, content: str, keywords: set[str]) -> str | None:
+        """
+        Parses YAML and returns top-level keys whose name contains any keyword, serialized back to YAML.
+        Returns None if no keys match or parsing fails.
+        """
+        try:
+            data = yaml.safe_load(content)
+        except yaml.YAMLError:
+            return None
+
+        if not isinstance(data, dict):
+            return None
+
+        matching = {k: v for k, v in data.items() if any(kw in str(k).lower() for kw in keywords)}
+        if not matching:
+            return None
+
+        return yaml.dump(matching, default_flow_style=False, allow_unicode=True)
+
+    def _extract_toml_sections(self, content: str, keywords: set[str]) -> str | None:
+        """
+        Parses TOML and returns top-level keys whose name contains any keyword,
+        serialized as Python repr key=value lines (not valid TOML syntax).
+        Returns None if no keys match or parsing fails.
+        """
+        try:
+            data = tomllib.loads(content)
+        except tomllib.TOMLDecodeError:
+            return None
+
+        matching = {k: v for k, v in data.items() if any(kw in k.lower() for kw in keywords)}
+        if not matching:
+            return None
+
+        # Serialize matching keys back as simple TOML representation
+        lines = []
+        for k, v in matching.items():
+            lines.append(f"{k} = {repr(v)}")
+        return "\n".join(lines)
diff --git a/services/apps/git_integration/uv.lock b/services/apps/git_integration/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,7 @@ dependencies = [`
`35`	`35`	`"aioboto3>=15.1.0",`
`36`	`36`	`"slugify>=0.0.1",`
`37`	`37`	`"orjson>=3.11.3",`
	`38`	`+ "pyyaml>=6.0",`
`38`	`39`	`]`
`39`	`40`
`40`	`41`