|
| 1 | +"""Inner Loop 2: Results analysis and explanation. |
| 2 | +
|
| 3 | +Iterates: LLM forms claims about simulation results → extracts |
| 4 | +specific numbers → citation verifier checks → adversarial questioning |
| 5 | +→ revise until explanation is complete. |
| 6 | +
|
| 7 | +Stateless and in-memory. All state is passed through function arguments. |
| 8 | +""" |
| 9 | + |
| 10 | +from __future__ import annotations |
| 11 | + |
| 12 | +from dataclasses import dataclass, field |
| 13 | + |
| 14 | +from .backend import LLMBackend |
| 15 | +from .citation_verifier import ( |
| 16 | + VerificationResult, |
| 17 | + extract_claims_from_text, |
| 18 | + verify_claims, |
| 19 | +) |
| 20 | + |
| 21 | + |
@dataclass
class Finding:
    """One claim produced by the analysis loop, with its supporting data.

    Fields mirror the CLAIM / EVIDENCE / DISPROOF structure the LLM is
    prompted to emit, plus the verifier's judgement of the evidence.
    """

    claim: str  # natural-language statement about the results
    evidence: str  # dot-path citations ("path = value"), one per line
    verification: VerificationResult  # outcome of checking evidence vs. data
    adversarial_check: str  # stated condition that would disprove the claim
| 30 | + |
| 31 | + |
@dataclass
class AnalysisResult:
    """Aggregate outcome of the analysis loop."""

    findings: list[Finding] = field(default_factory=list)  # all produced findings
    iterations_used: int = 0  # loop iterations consumed
    complete: bool = False  # True when the loop finished within budget

    def summary(self) -> str:
        """Return a one-line human-readable summary of this result."""
        fully_verified = len(
            [f for f in self.findings if f.verification.all_verified]
        )
        return (
            f"{len(self.findings)} findings ({fully_verified} fully verified), "
            f"{self.iterations_used} iterations"
        )
| 44 | + |
| 45 | + |
# System prompt for the analysis LLM: forces claims that are grounded in
# specific cited numbers and explicitly falsifiable.
_ANALYSIS_SYSTEM_PROMPT = """\
You are analyzing network simulation results. Your job is to explain
WHY the results look the way they do, grounded in specific numbers.

Rules:
1. Every claim must cite specific values from the data using dot-path
   notation: steps.step_name.data.field.subfield = value
2. After each claim, state what would disprove it.
3. Be precise. "BAC drops significantly" is not acceptable.
   "steps.tm_lh_path.data.flow_results.0.summary.overall_ratio = 0.3333" is.
"""

# Per-iteration user prompt. {feedback} carries verifier complaints from the
# previous round; it is an empty string on the first iteration.
_ANALYSIS_PROMPT_TEMPLATE = """\
Hypothesis being tested:
{hypothesis}

Simulation results summary:
{results_summary}

{feedback}

Provide your analysis as a series of findings. For each finding:
1. State the claim
2. Cite specific values using dot-path notation (path = value)
3. State what would disprove this claim

Format each finding as:

CLAIM: <your claim>
EVIDENCE: <dot.path = value, one per line>
DISPROOF: <what would disprove this>
"""

# Follow-up prompt for adversarial self-questioning of a verified finding.
# NOTE(review): not referenced by run_analysis_loop in this file — presumably
# used by a caller elsewhere (the module docstring mentions adversarial
# questioning); confirm before removing.
_ADVERSARIAL_PROMPT = """\
You previously found:
{finding_text}

The citations were verified against the actual data:
{verification_summary}

Now critically question this finding:
1. Is there a simpler explanation?
2. Could this be a coincidence or artifact?
3. What additional evidence would strengthen or weaken this claim?

If the finding stands, respond with "CONFIRMED".
If it needs revision, provide the revised finding in the same format.
"""
| 94 | + |
| 95 | + |
| 96 | +def _parse_findings(response: str) -> list[dict[str, str]]: |
| 97 | + """Parse LLM response into finding dicts.""" |
| 98 | + findings: list[dict[str, str]] = [] |
| 99 | + current: dict[str, str] = {} |
| 100 | + |
| 101 | + for line in response.splitlines(): |
| 102 | + line = line.strip() |
| 103 | + if line.startswith("CLAIM:"): |
| 104 | + if current.get("claim"): |
| 105 | + findings.append(current) |
| 106 | + current = {"claim": line[6:].strip()} |
| 107 | + elif line.startswith("EVIDENCE:"): |
| 108 | + current["evidence"] = line[9:].strip() |
| 109 | + elif line.startswith("DISPROOF:"): |
| 110 | + current["disproof"] = line[9:].strip() |
| 111 | + elif current.get("evidence") is not None and "=" in line and "." in line: |
| 112 | + # Continuation of evidence lines |
| 113 | + current["evidence"] += "\n" + line |
| 114 | + |
| 115 | + if current.get("claim"): |
| 116 | + findings.append(current) |
| 117 | + |
| 118 | + return findings |
| 119 | + |
| 120 | + |
| 121 | +def _build_results_summary(results: dict) -> str: |
| 122 | + """Build a concise summary of simulation results for the LLM.""" |
| 123 | + lines: list[str] = [] |
| 124 | + steps = results.get("steps", {}) |
| 125 | + |
| 126 | + for step_name, step_data in steps.items(): |
| 127 | + data = step_data.get("data", {}) |
| 128 | + |
| 129 | + baseline = data.get("baseline", {}) |
| 130 | + flow_results = data.get("flow_results", []) |
| 131 | + |
| 132 | + if baseline and isinstance(baseline, dict): |
| 133 | + summary = baseline.get("summary", {}) |
| 134 | + lines.append( |
| 135 | + f"{step_name}: baseline ratio={summary.get('overall_ratio', 'N/A')}, " |
| 136 | + f"placed={summary.get('total_placed', 'N/A')}, " |
| 137 | + f"demand={summary.get('total_demand', 'N/A')}" |
| 138 | + ) |
| 139 | + |
| 140 | + if flow_results: |
| 141 | + n_patterns = len(flow_results) |
| 142 | + total_iters = sum(fr.get("occurrence_count", 1) for fr in flow_results) |
| 143 | + lines.append( |
| 144 | + f" {n_patterns} unique failure patterns, {total_iters} iterations" |
| 145 | + ) |
| 146 | + for fr in flow_results[:3]: |
| 147 | + s = fr.get("summary", {}) |
| 148 | + lines.append( |
| 149 | + f" pattern (count={fr.get('occurrence_count', 1)}): " |
| 150 | + f"ratio={s.get('overall_ratio', 'N/A')}, " |
| 151 | + f"placed={s.get('total_placed', 'N/A')}" |
| 152 | + ) |
| 153 | + |
| 154 | + # MSD alpha |
| 155 | + alpha = data.get("alpha_star") |
| 156 | + if alpha is not None: |
| 157 | + lines.append(f"{step_name}: alpha_star={alpha}") |
| 158 | + |
| 159 | + return "\n".join(lines) |
| 160 | + |
| 161 | + |
def run_analysis_loop(
    results: dict,
    hypothesis: str,
    backend: LLMBackend,
    max_iterations: int = 10,
) -> AnalysisResult:
    """Run the analysis loop on simulation results.

    Iterates: LLM forms claims → extract citations → verify → feedback,
    until every cited value checks out or the budget is exhausted.

    Fix vs. previous revision: findings were appended across iterations, so a
    "complete" result also contained stale, mis-cited findings from earlier
    rounds. Each iteration regenerates the full analysis, so the newest batch
    now supersedes the previous one.

    Args:
        results: ngraph simulation results dict.
        hypothesis: The hypothesis being tested (natural language).
        backend: LLM backend for analysis.
        max_iterations: Maximum analysis iterations.

    Returns:
        AnalysisResult with the most recent batch of findings. ``complete``
        is True only when an iteration produced findings whose citations all
        verified.
    """
    results_summary = _build_results_summary(results)
    latest_findings: list[Finding] = []
    feedback = ""

    for iteration in range(max_iterations):
        # Ask the LLM for a full analysis; feedback from the previous round
        # (if any) is embedded into the prompt.
        prompt = _ANALYSIS_PROMPT_TEMPLATE.format(
            hypothesis=hypothesis,
            results_summary=results_summary,
            feedback=feedback,
        )
        response = backend.generate(prompt, system=_ANALYSIS_SYSTEM_PROMPT)

        # Parse CLAIM/EVIDENCE/DISPROOF findings out of the response.
        raw_findings = _parse_findings(response)
        if not raw_findings:
            feedback = "Your previous response did not contain any findings in the expected format. Please use CLAIM: / EVIDENCE: / DISPROOF: format."
            continue

        # Verify every citation in every finding against the actual data.
        new_findings: list[Finding] = []
        for raw in raw_findings:
            evidence_text = raw.get("evidence", "")
            claims = extract_claims_from_text(evidence_text)
            verification = verify_claims(claims, results)

            new_findings.append(
                Finding(
                    claim=raw.get("claim", ""),
                    evidence=evidence_text,
                    verification=verification,
                    adversarial_check=raw.get("disproof", ""),
                )
            )

        # The newest batch replaces (not extends) earlier findings: each
        # iteration re-derives the complete analysis.
        latest_findings = new_findings

        # Any mismatched citation sends the loop around with targeted feedback.
        mismatches = [f for f in new_findings if f.verification.mismatches]
        if mismatches:
            mismatch_details = []
            for f in mismatches:
                for c in f.verification.mismatches:
                    mismatch_details.append(
                        f"  {c.path}: claimed {c.claimed_value}, actual {c.actual_value}"
                    )
            feedback = (
                "Some of your cited values do not match the actual data:\n"
                + "\n".join(mismatch_details)
                + "\nPlease recheck and revise your analysis."
            )
            continue

        # All citations verified — analysis is complete.
        return AnalysisResult(
            findings=latest_findings,
            iterations_used=iteration + 1,
            complete=True,
        )

    # Budget exhausted: return the most recent batch, flagged incomplete.
    return AnalysisResult(
        findings=latest_findings,
        iterations_used=max_iterations,
        complete=False,
    )
0 commit comments