Skip to content

Commit 003b27c

Browse files
authored
feat(maintainers): section extraction for ai analysis [CM-1047] (#3965)
Signed-off-by: Mouad BANI <mouad-mb@outlook.com>
1 parent b6aff89 commit 003b27c

4 files changed

Lines changed: 117 additions & 11 deletions

File tree

services/apps/git_integration/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ dependencies = [
3535
"aioboto3>=15.1.0",
3636
"slugify>=0.0.1",
3737
"orjson>=3.11.3",
38+
"pyyaml>=6.0",
3839
]
3940

4041

services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from crowdgit.models.service_execution import ServiceExecution
3838
from crowdgit.services.base.base_service import BaseService
3939
from crowdgit.services.maintainer.bedrock import invoke_bedrock
40+
from crowdgit.services.maintainer.section_extractor import SectionExtractor
4041
from crowdgit.services.utils import run_shell_command
4142
from crowdgit.settings import MAINTAINER_RETRY_INTERVAL_DAYS, MAINTAINER_UPDATE_INTERVAL_HOURS
4243

@@ -93,6 +94,7 @@ class MaintainerService(BaseService):
9394
"code_owners",
9495
"emeritus",
9596
"workgroup",
97+
"readme",
9698
}
9799

98100
VALID_EXTENSIONS = {
@@ -132,6 +134,12 @@ class MaintainerService(BaseService):
132134
STEM_MATCH_SCORE = 50
133135
PARTIAL_STEM_SCORE = 25
134136

137+
# Files in KNOWN_PATHS that still need section filtering (contain non-governance content)
138+
SECTION_FILTERED_PATHS = {"readme.md", "governance.md"}
139+
SCORING_KEYWORDS_SET = frozenset(SCORING_KEYWORDS)
140+
141+
_section_extractor = SectionExtractor()
142+
135143
def make_role(self, title: str):
136144
title = title.lower()
137145
title = (
@@ -360,8 +368,10 @@ async def process_chunk(chunk_index: int, chunk: str):
360368
self.get_extraction_prompt(maintainer_filename, content),
361369
pydantic_model=MaintainerInfo,
362370
)
363-
self.logger.info("Maintainers file content analyzed by AI")
364-
self.logger.info(f"Maintainers response: {maintainer_info}")
371+
info_count = len(maintainer_info.output.info) if maintainer_info.output.info else 0
372+
self.logger.info(
373+
f"Maintainers file content analyzed by AI (found={info_count}, cost={maintainer_info.cost:.4f})"
374+
)
365375
if maintainer_info.output.info is not None:
366376
return AggregatedMaintainerInfo(
367377
output=AggregatedMaintainerInfoItems(info=maintainer_info.output.info),
@@ -373,7 +383,7 @@ async def process_chunk(chunk_index: int, chunk: str):
373383
)
374384
else:
375385
self.logger.error(
376-
f"Expected a list of maintainer info or an error message, got: {str(maintainer_info)}"
386+
f"Expected a list of maintainer info or an error message, got error={maintainer_info.output.error}"
377387
)
378388
raise MaintanerAnalysisError(
379389
error_message="Unexpected response from AI for Maintainers analysis",
@@ -586,6 +596,16 @@ async def analyze_and_build_result(self, filename: str, content: str) -> Maintai
586596
f"Skipping README file '{filename}': no governance keyword found in content"
587597
)
588598
raise MaintanerAnalysisError(error_code=ErrorCode.NO_MAINTAINER_FOUND)
599+
600+
fname = os.path.basename(filename).lower()
601+
if fname not in self.KNOWN_PATHS or fname in self.SECTION_FILTERED_PATHS:
602+
extracted = self._section_extractor.extract(fname, content, self.SCORING_KEYWORDS_SET)
603+
if extracted:
604+
self.logger.info(f"Using extracted sections for '{filename}'")
605+
content = extracted
606+
else:
607+
self.logger.debug(f"No sections extracted for '{filename}', using full content")
608+
589609
result = await self.analyze_file_content(filename, content)
590610

591611
if not result.output.info:
@@ -664,12 +684,6 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult:
664684
root_candidates, subdir_candidates = await self.find_candidate_files(repo_path)
665685
all_candidates = root_candidates + subdir_candidates
666686
candidate_files = [(path, score) for path, _, score in all_candidates][:100]
667-
self.logger.debug(
668-
f"Detection step 2: {len(root_candidates)} root candidate(s), "
669-
f"{len(subdir_candidates)} subdir candidate(s); "
670-
f"root={[p for p, _, _ in root_candidates]}, "
671-
f"subdir_top={[p for p, _, _ in subdir_candidates[:3]]}"
672-
)
673687

674688
# Step 3: Try root-level files first (in score order), then top subdirectory file
675689
failed_candidates: set[str] = set()
@@ -757,7 +771,6 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult:
757771
f"Passing {len(ai_input_files)} files to AI for maintainer file detection "
758772
f"(total repo files: {len(file_names)})"
759773
)
760-
self.logger.debug(f"AI input files: {[f for f, _ in ai_input_files]}")
761774
ai_file_name, ai_cost = await self.find_maintainer_file_with_ai(ai_input_files)
762775
ai_suggested_file = ai_file_name
763776
total_cost += ai_cost
services/apps/git_integration/src/crowdgit/services/maintainer/section_extractor.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import os
2+
import re
3+
4+
import tomllib
5+
import yaml
6+
7+
8+
class SectionExtractor:
9+
"""
10+
Extracts relevant sections from file content based on file format and governance keywords.
11+
Returns None when no relevant sections are found or format is unsupported (caller falls back to full content).
12+
"""
13+
14+
def extract(self, filename: str, content: str, keywords: set[str]) -> str | None:
15+
"""
16+
Returns extracted relevant section text, or None if no sections found.
17+
filename should be the basename (lowercased).
18+
"""
19+
ext = os.path.splitext(filename)[1]
20+
if ext in (".md", ".markdown"):
21+
return self._extract_markdown_sections(content, keywords)
22+
elif ext in (".yaml", ".yml"):
23+
return self._extract_yaml_sections(content, keywords)
24+
elif ext == ".toml":
25+
return self._extract_toml_sections(content, keywords)
26+
return None
27+
28+
def _extract_markdown_sections(self, content: str, keywords: set[str]) -> str | None:
29+
"""
30+
Splits content on `#`-style heading lines only (# / ## / ###...).
31+
Includes a section if its heading text contains any keyword.
32+
Returns joined matching sections, or None if none match.
33+
"""
34+
heading_pattern = re.compile(r"^#{1,6}\s+", re.MULTILINE)
35+
# Split into (heading_line, body) pairs; first element may be pre-heading content
36+
parts = heading_pattern.split(content)
37+
headings = heading_pattern.findall(content)
38+
39+
# parts[0] is text before the first heading (skip it)
40+
# parts[1..] correspond to headings[0..]
41+
matching_sections = []
42+
for i, heading_marker in enumerate(headings):
43+
block = parts[i + 1] # block starts right after the heading marker
44+
# The first line of block is the heading text
45+
first_newline = block.find("\n")
46+
heading_text = block[:first_newline].strip() if first_newline != -1 else block.strip()
47+
if any(kw in heading_text.lower() for kw in keywords):
48+
matching_sections.append(f"{heading_marker}{block}")
49+
50+
return "".join(matching_sections) if matching_sections else None
51+
52+
def _extract_yaml_sections(self, content: str, keywords: set[str]) -> str | None:
53+
"""
54+
Parses YAML and returns top-level keys whose name contains any keyword, serialized back to YAML.
55+
Returns None if no keys match or parsing fails.
56+
"""
57+
try:
58+
data = yaml.safe_load(content)
59+
except yaml.YAMLError:
60+
return None
61+
62+
if not isinstance(data, dict):
63+
return None
64+
65+
matching = {k: v for k, v in data.items() if any(kw in str(k).lower() for kw in keywords)}
66+
if not matching:
67+
return None
68+
69+
return yaml.dump(matching, default_flow_style=False, allow_unicode=True)
70+
71+
def _extract_toml_sections(self, content: str, keywords: set[str]) -> str | None:
72+
"""
73+
Parses TOML and returns top-level keys whose name contains any keyword,
74+
serialized as Python repr key=value lines (not valid TOML syntax).
75+
Returns None if no keys match or parsing fails.
76+
"""
77+
try:
78+
data = tomllib.loads(content)
79+
except tomllib.TOMLDecodeError:
80+
return None
81+
82+
matching = {k: v for k, v in data.items() if any(kw in k.lower() for kw in keywords)}
83+
if not matching:
84+
return None
85+
86+
# Serialize matching keys back as simple TOML representation
87+
lines = []
88+
for k, v in matching.items():
89+
lines.append(f"{k} = {repr(v)}")
90+
return "\n".join(lines)

services/apps/git_integration/uv.lock

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)