-"""Inner Loop 2: Results analysis and explanation.
+"""Inner Loop 2: Results analysis and interpretation.

-Iterates: LLM forms claims about simulation results → extracts
-specific numbers → citation verifier checks → adversarial questioning
-→ revise until explanation is complete.
+The metrics pipeline extracts verified numbers. The LLM interprets them.
+Clear separation: facts are machine-generated, explanations are LLM-generated.

-Stateless and in-memory. All state is passed through function arguments.
+Flow:
+  results JSON → metrics_report (verified numbers) → LLM interpretation → next hypothesis
 """

 from __future__ import annotations

-from dataclasses import dataclass, field
+from dataclasses import dataclass

 from .backend import LLMBackend
-from .citation_verifier import (
-    VerificationResult,
-    extract_claims_from_text,
-    verify_claims,
-)
-
-
-@dataclass
-class Finding:
-    """A single verified finding from the analysis loop."""
-
-    claim: str
-    evidence: str  # specific numbers cited
-    verification: VerificationResult
-    adversarial_check: str  # what would disprove this
+from .metrics_report import build_metrics_report


 @dataclass
 class AnalysisResult:
     """Output of the analysis loop."""

-    findings: list[Finding] = field(default_factory=list)
+    metrics_report: str  # machine-generated, verified
+    interpretation: str  # LLM-generated explanation
+    next_hypothesis: str  # LLM-generated suggestion for next experiment
     iterations_used: int = 0
     complete: bool = False

     def summary(self) -> str:
-        n = len(self.findings)
-        verified = sum(1 for f in self.findings if f.verification.all_verified)
-        return f"{n} findings ({verified} fully verified), {self.iterations_used} iterations"
+        lines = self.interpretation.strip().splitlines()
+        n_lines = len(lines)
+        preview = lines[0][:80] if lines else "(empty)"
+        return f"{n_lines} lines, {self.iterations_used} iterations: {preview}..."

 _ANALYSIS_SYSTEM_PROMPT = """\
-You are analyzing network simulation results. Your job is to explain
-WHY the results look the way they do, grounded in specific numbers.
-
-Rules:
-1. Every claim must cite specific values from the data using dot-path
-   notation: steps.step_name.data.field.subfield = value
-2. After each claim, state what would disprove it.
-3. Be precise. "BAC drops significantly" is not acceptable.
-   "steps.tm_lh_path.data.flow_results.0.summary.overall_ratio = 0.3333" is.
+You are a network reliability engineer analyzing simulation results.
+
+You will receive a METRICS REPORT containing verified numbers from the
+simulation. These numbers are machine-computed and correct — do not
+question or re-derive them.
+
+Your job: explain WHY the results look the way they do. Connect the
+numbers to the topology structure. Identify what matters and what doesn't.
+
+Be direct. No filler. Every sentence should convey an insight about the
+topology's behavior under failure.
 """

-_ANALYSIS_PROMPT_TEMPLATE = """\
+_ANALYSIS_PROMPT = """\
 Hypothesis being tested:
 {hypothesis}

-Simulation results summary:
-{results_summary}
-
-{feedback}
+{metrics_report}

-Provide your analysis as a series of findings. For each finding:
-1. State the claim
-2. Cite specific values using dot-path notation (path = value)
-3. State what would disprove this claim
+Explain these results. For each failure mode:
+1. Why does BAC have this value? What structural property causes it?
+2. Does latency degrade under failure, or just bandwidth?
+3. Are both directions affected equally?

-Format each finding as:
-
-CLAIM: <your claim>
-EVIDENCE: <dot.path = value, one per line>
-DISPROOF: <what would disprove this>
+Then summarize: what are the key design strengths and weaknesses of this topology?
 """

-_ADVERSARIAL_PROMPT = """\
-You previously found:
-{finding_text}
+_NEXT_HYPOTHESIS_PROMPT = """\
+Based on this analysis:

-The citations were verified against the actual data:
-{verification_summary}
+{interpretation}

-Now critically question this finding:
-1. Is there a simpler explanation?
-2. Could this be a coincidence or artifact?
-3. What additional evidence would strengthen or weaken this claim?
+Original hypothesis:
+{hypothesis}

-If the finding stands, respond with "CONFIRMED".
-If it needs revision, provide the revised finding in the same format.
-"""
+Metrics summary:
+{metrics_summary}

+Propose the next topology experiment. Consider:
+- What structural variation might improve resilience?
+- What tradeoff hasn't been explored (cost vs redundancy, latency vs bandwidth)?
+- What aspect of the results is surprising or unexplained?

-def _parse_findings(response: str) -> list[dict[str, str]]:
-    """Parse LLM response into finding dicts."""
-    findings: list[dict[str, str]] = []
-    current: dict[str, str] = {}
-
-    for line in response.splitlines():
-        line = line.strip()
-        if line.startswith("CLAIM:"):
-            if current.get("claim"):
-                findings.append(current)
-            current = {"claim": line[6:].strip()}
-        elif line.startswith("EVIDENCE:"):
-            current["evidence"] = line[9:].strip()
-        elif line.startswith("DISPROOF:"):
-            current["disproof"] = line[9:].strip()
-        elif current.get("evidence") is not None and "=" in line and "." in line:
-            # Continuation of evidence lines
-            current["evidence"] += "\n" + line
-
-    if current.get("claim"):
-        findings.append(current)
-
-    return findings
-
-
-def _build_results_summary(results: dict) -> str:
-    """Build a concise summary of simulation results for the LLM."""
-    lines: list[str] = []
-    steps = results.get("steps", {})
-
-    for step_name, step_data in steps.items():
-        data = step_data.get("data", {})
-
-        baseline = data.get("baseline", {})
-        flow_results = data.get("flow_results", [])
-
-        if baseline and isinstance(baseline, dict):
-            summary = baseline.get("summary", {})
-            lines.append(
-                f"{step_name}: baseline ratio={summary.get('overall_ratio', 'N/A')}, "
-                f"placed={summary.get('total_placed', 'N/A')}, "
-                f"demand={summary.get('total_demand', 'N/A')}"
-            )
-
-        if flow_results:
-            n_patterns = len(flow_results)
-            total_iters = sum(fr.get("occurrence_count", 1) for fr in flow_results)
-            lines.append(
-                f" {n_patterns} unique failure patterns, {total_iters} iterations"
-            )
-            for fr in flow_results[:3]:
-                s = fr.get("summary", {})
-                lines.append(
-                    f" pattern (count={fr.get('occurrence_count', 1)}): "
-                    f"ratio={s.get('overall_ratio', 'N/A')}, "
-                    f"placed={s.get('total_placed', 'N/A')}"
-                )
-
-        # MSD alpha
-        alpha = data.get("alpha_star")
-        if alpha is not None:
-            lines.append(f"{step_name}: alpha_star={alpha}")
-
-    return "\n".join(lines)
+Respond with a clear, actionable topology description that can be
+directly used to generate an ngraph scenario. Be specific about
+node counts, link capacities, and failure modes to test.
+"""


 def run_analysis_loop(
     results: dict,
     hypothesis: str,
     backend: LLMBackend,
-    max_iterations: int = 10,
+    max_iterations: int = 3,
 ) -> AnalysisResult:
-    """Run the analysis loop on simulation results.
+    """Analyze simulation results using verified metrics + LLM interpretation.

-    Iterates: LLM forms claims → extract citations → verify → adversarial check
-    until findings are complete or budget is exhausted.
+    1. Compute metrics programmatically (trustworthy)
+    2. Ask LLM to interpret the metrics (where it adds value)
+    3. Ask LLM to propose the next hypothesis (closes the outer loop)

     Args:
         results: ngraph simulation results dict.
-        hypothesis: The hypothesis being tested (natural language).
-        backend: LLM backend for analysis.
-        max_iterations: Maximum analysis iterations.
+        hypothesis: The hypothesis being tested.
+        backend: LLM backend for interpretation.
+        max_iterations: Maximum attempts if the LLM returns an empty response.

     Returns:
-        AnalysisResult with verified findings.
+        AnalysisResult with verified metrics, interpretation, and next hypothesis.
     """
-    results_summary = _build_results_summary(results)
-    all_findings: list[Finding] = []
-    feedback = ""
+    # Step 1: compute verified metrics (no LLM involved)
+    metrics_report = build_metrics_report(results)

-    for iteration in range(max_iterations):
-        # Ask LLM to analyze
-        prompt = _ANALYSIS_PROMPT_TEMPLATE.format(
+    # Step 2: ask LLM to interpret
+    interpretation = ""
+    for _attempt in range(max_iterations):
+        prompt = _ANALYSIS_PROMPT.format(
             hypothesis=hypothesis,
-            results_summary=results_summary,
-            feedback=feedback,
+            metrics_report=metrics_report,
         )
         response = backend.generate(prompt, system=_ANALYSIS_SYSTEM_PROMPT)
+        interpretation = response.strip()
+        if interpretation:
+            break

-        # Parse findings
-        raw_findings = _parse_findings(response)
-        if not raw_findings:
-            feedback = "Your previous response did not contain any findings in the expected format. Please use CLAIM: / EVIDENCE: / DISPROOF: format."
-            continue
-
-        # Verify citations for each finding
-        new_findings: list[Finding] = []
-        for raw in raw_findings:
-            evidence_text = raw.get("evidence", "")
-            claims = extract_claims_from_text(evidence_text)
-            verification = verify_claims(claims, results)
-
-            finding = Finding(
-                claim=raw.get("claim", ""),
-                evidence=evidence_text,
-                verification=verification,
-                adversarial_check=raw.get("disproof", ""),
-            )
-            new_findings.append(finding)
-
-        all_findings.extend(new_findings)
-
-        # Check if any claims failed verification
-        mismatches = [f for f in new_findings if f.verification.mismatches]
-        if mismatches:
-            mismatch_details = []
-            for f in mismatches:
-                for c in f.verification.mismatches:
-                    mismatch_details.append(
-                        f" {c.path}: claimed {c.claimed_value}, actual {c.actual_value}"
-                    )
-            feedback = (
-                "Some of your cited values do not match the actual data:\n"
-                + "\n".join(mismatch_details)
-                + "\nPlease recheck and revise your analysis."
-            )
-            continue
-
-        # All findings verified — analysis is complete
+    if not interpretation:
         return AnalysisResult(
-            findings=all_findings,
-            iterations_used=iteration + 1,
-            complete=True,
+            metrics_report=metrics_report,
+            interpretation="(LLM produced no interpretation)",
+            next_hypothesis="",
+            iterations_used=max_iterations,
+            complete=False,
         )

-    # Budget exhausted
+    # Step 3: ask LLM to propose next hypothesis
+    # Use a brief metrics summary (first 20 lines) to avoid token bloat
+    metrics_lines = metrics_report.splitlines()
+    metrics_summary = "\n".join(metrics_lines[:20])
+
+    next_prompt = _NEXT_HYPOTHESIS_PROMPT.format(
+        interpretation=interpretation,
+        hypothesis=hypothesis,
+        metrics_summary=metrics_summary,
+    )
+    next_hypothesis = backend.generate(
+        next_prompt, system=_ANALYSIS_SYSTEM_PROMPT
+    ).strip()
+
     return AnalysisResult(
-        findings=all_findings,
-        iterations_used=max_iterations,
-        complete=False,
+        metrics_report=metrics_report,
+        interpretation=interpretation,
+        next_hypothesis=next_hypothesis,
+        iterations_used=1,
+        complete=True,
     )
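
A minimal usage sketch of the reworked `run_analysis_loop`, assuming a module path of `analysis_loop`, a stubbed backend, and a `results.json` produced by a prior ngraph run; those names are illustrative assumptions, not part of the change. Any object providing the `generate(prompt, system=...)` call used in the loop can stand in for `LLMBackend` at runtime.

```python
# Hypothetical driver script; import path, stub backend, and results.json
# are assumptions for illustration only.
import json

from analysis_loop import run_analysis_loop  # hypothetical module path


class ScriptedBackend:
    """Stub backend returning canned text, mirroring the generate() calls
    made by run_analysis_loop. Useful for dry runs without a real LLM."""

    def generate(self, prompt, system=None):
        # The next-hypothesis prompt contains this phrase per _NEXT_HYPOTHESIS_PROMPT.
        if "Propose the next topology experiment" in prompt:
            return "Try a 4-plane Clos with 400G spine links; fail two spines."
        return "BAC stays high because every leaf keeps two independent uplinks."


# Load simulation results from an earlier ngraph run (path is assumed).
with open("results.json") as fh:
    results = json.load(fh)

analysis = run_analysis_loop(
    results=results,
    hypothesis="The topology keeps >= 90% of demand under any single link failure.",
    backend=ScriptedBackend(),
)

print(analysis.summary())        # line count, iterations, and a preview of the interpretation
print(analysis.metrics_report)   # machine-generated, verified numbers
print(analysis.next_hypothesis)  # LLM-proposed follow-up experiment
```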