Skip to content

Commit 7ece2dc

Browse files
fix(demo): graceful power cap skip, cloud cost in idle waste, JSON input for Demo 5
- Demo 1: Pre-flight checks if power capping works before running. Shows clear "not available on this platform" panel on cloud GPUs instead of spamming WARNING lines.
- Demo 3: Shows cloud instance cost alongside electricity cost. H100 at $3.99/hr = $2,873/month idle waste — much scarier than the $11.98/month electricity number alone.
- Demo 5: Accepts --result-file PATH for standalone scaling projections without needing to run Demos 1/2/4 first.
- power_control.set_power_limit: Added quiet parameter to suppress prints during pre-flight checks.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8bc0669 commit 7ece2dc

2 files changed

Lines changed: 92 additions & 16 deletions

File tree

agent/demos/investor_demo.py

Lines changed: 80 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from __future__ import annotations
2828

2929
import argparse
30+
import json
3031
import os
3132
import subprocess
3233
import sys
@@ -65,8 +66,26 @@ def _get_carbon_client():
6566

6667

6768
def _get_power_control():
68-
from efficiency.power_control import get_default_power_limit
69-
return get_default_power_limit
69+
from efficiency.power_control import get_default_power_limit, set_power_limit
70+
return get_default_power_limit, set_power_limit
71+
72+
73+
# ── Cloud cost lookup ────────────────────────────────────────────────────────
74+
75+
# Approximate spot/on-demand rates ($/hr) for common cloud GPUs
76+
CLOUD_RATES_PER_HOUR: dict[str, float] = {
77+
"H100": 3.99, "H200": 5.49, "A100": 1.89,
78+
"RTX 4090": 0.59, "RTX 3090": 0.44, "L40S": 1.14,
79+
"L40": 0.89, "A10G": 0.50, "T4": 0.20, "V100": 0.80,
80+
}
81+
82+
83+
def _lookup_cloud_rate(gpu_name: str) -> float | None:
84+
"""Match GPU name to cloud hourly rate via substring."""
85+
for key, rate in CLOUD_RATES_PER_HOUR.items():
86+
if key in gpu_name:
87+
return rate
88+
return None
7089

7190

7291
# ── Console helpers ──────────────────────────────────────────────────────────
@@ -109,11 +128,28 @@ def demo_1_powercap(gpu: int, duration: int, iterations: int, warmup: int) -> di
109128
"""'Free Money' power cap test — same workload, lower TDP."""
110129
_banner(1, 5, '"Free Money" Power Cap A/B')
111130

112-
get_default = _get_power_control()
131+
get_default, set_limit = _get_power_control()
113132
try:
114133
default_w = get_default(gpu)
115134
except Exception:
116135
default_w = 450 # RTX 4090 fallback
136+
137+
# Pre-flight: check if power capping is available
138+
can_cap = set_limit(gpu, default_w, quiet=True)
139+
if not can_cap:
140+
if _rich:
141+
Console().print(Panel(
142+
"[bold yellow]Power capping not available on this platform[/bold yellow]\n"
143+
"Cloud containers typically block GPU power limit changes.\n"
144+
"This demo requires bare-metal or on-prem GPU access.\n"
145+
"Skipping Demo 1.",
146+
border_style="yellow",
147+
))
148+
else:
149+
print(" Power capping not available on this platform (cloud container).")
150+
print(" Skipping Demo 1.")
151+
return None
152+
117153
capped_w = int(default_w * 0.70)
118154

119155
print(f" Default TDP: {default_w}W")
@@ -230,23 +266,44 @@ def demo_3_idle_waste(gpu: int, sample_duration: int = 15) -> dict | None:
230266
avg_power = result.avg_power_w
231267
idle_fraction = result.idle_fraction
232268
monthly_waste_kwh = (avg_power / 1000.0) * 720 # kWh per month
233-
monthly_waste_usd = monthly_waste_kwh * 0.12
269+
electricity_waste_usd = monthly_waste_kwh * 0.12
270+
271+
# Cloud instance cost (the real number)
272+
cloud_rate = _lookup_cloud_rate(gpu_name)
273+
cloud_monthly = cloud_rate * 720 if cloud_rate else None
234274

235275
from optimize import _print_rich as opt_rich, _print_plain as opt_plain
236276
if _rich:
237277
opt_rich(result)
238278
console = Console()
239279
console.print()
240-
console.print(Panel(
241-
f"[bold red]This GPU is burning ${monthly_waste_usd:.2f}/month doing nothing[/bold red]\n"
280+
281+
waste_lines = []
282+
if cloud_monthly:
283+
waste_lines.append(
284+
f"[bold red]This GPU is burning ${cloud_monthly:,.0f}/month in cloud costs doing nothing[/bold red]"
285+
)
286+
waste_lines.append(
287+
f"Cloud instance: ${cloud_rate:.2f}/hr = [bold]${cloud_monthly:,.0f}/month[/bold] | "
288+
f"Electricity: ${electricity_waste_usd:.2f}/month"
289+
)
290+
else:
291+
waste_lines.append(
292+
f"[bold red]This GPU is burning ${electricity_waste_usd:.2f}/month doing nothing[/bold red]"
293+
)
294+
waste_lines.append(
242295
f"Idle fraction: {idle_fraction*100:.0f}% | "
243296
f"Avg power: {avg_power:.0f}W | "
244-
f"Monthly waste: {monthly_waste_kwh:.0f} kWh",
245-
border_style="red",
246-
))
297+
f"Monthly waste: {monthly_waste_kwh:.0f} kWh"
298+
)
299+
console.print(Panel("\n".join(waste_lines), border_style="red"))
247300
else:
248301
opt_plain(result)
249-
print(f"\n >>> This GPU is burning ${monthly_waste_usd:.2f}/month doing nothing")
302+
if cloud_monthly:
303+
print(f"\n >>> This GPU is burning ${cloud_monthly:,.0f}/month in cloud costs doing nothing")
304+
print(f" Cloud: ${cloud_rate:.2f}/hr = ${cloud_monthly:,.0f}/mo | Electricity: ${electricity_waste_usd:.2f}/mo")
305+
else:
306+
print(f"\n >>> This GPU is burning ${electricity_waste_usd:.2f}/month doing nothing")
250307
print(f" Idle: {idle_fraction*100:.0f}% | Power: {avg_power:.0f}W | Waste: {monthly_waste_kwh:.0f} kWh/mo")
251308

252309
return asdict(result)
@@ -369,8 +426,18 @@ def run_demo(args: argparse.Namespace) -> int:
369426
ab_results.append(r)
370427

371428
if demo_choice in ("all", "5"):
429+
# Load from file if provided
430+
if hasattr(args, "result_file") and args.result_file:
431+
try:
432+
with open(args.result_file) as f:
433+
ab_results = [json.load(f)]
434+
except (FileNotFoundError, json.JSONDecodeError) as e:
435+
print(f" ERROR: Could not load result file: {e}")
436+
return 1
437+
372438
if not ab_results:
373-
print(" Demo 5 requires results from Demos 1, 2, or 4. Run --demo all first.")
439+
print(" Demo 5 requires results from Demos 1, 2, or 4.")
440+
print(" Run --demo all first, or provide --result-file PATH.")
374441
else:
375442
demo_5_scaling(ab_results)
376443

@@ -402,4 +469,6 @@ def make_parser() -> argparse.ArgumentParser:
402469
help="Quick mode: 30s, 1 iteration (default)")
403470
p.add_argument("--full", action="store_true", default=False,
404471
help="Full mode: 120s, 3 iterations")
472+
p.add_argument("--result-file", type=str, metavar="PATH",
473+
help="Path to ABResult JSON file (for Demo 5 standalone)")
405474
return p

agent/efficiency/power_control.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,20 +27,24 @@
2727
pynvml = None # type: ignore[assignment]
2828

2929

30-
def set_power_limit(gpu_index: int, watts: int) -> bool:
30+
def set_power_limit(gpu_index: int, watts: int, quiet: bool = False) -> bool:
3131
"""
3232
Set GPU power limit via NVML.
3333
3434
Falls back to nvidia-smi if NVML persistence mode isn't enabled.
3535
Requires root/sudo on bare metal.
36+
37+
Args:
38+
quiet: If True, suppress all print output (for pre-flight checks).
3639
"""
3740
if pynvml is not None:
3841
try:
3942
pynvml.nvmlInit()
4043
handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
4144
pynvml.nvmlDeviceSetPowerManagementLimit(handle, watts * 1000) # mW
4245
actual = pynvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000.0
43-
print(f" Power limit set to {actual:.0f}W via NVML")
46+
if not quiet:
47+
print(f" Power limit set to {actual:.0f}W via NVML")
4448
return True
4549
except pynvml.NVMLError:
4650
pass
@@ -52,13 +56,16 @@ def set_power_limit(gpu_index: int, watts: int) -> bool:
5256
capture_output=True, text=True, timeout=10,
5357
)
5458
if result.returncode == 0:
55-
print(f" Power limit set to {watts}W via nvidia-smi")
59+
if not quiet:
60+
print(f" Power limit set to {watts}W via nvidia-smi")
5661
return True
5762
else:
58-
print(f" WARNING: nvidia-smi -pl failed: {result.stderr.strip()}")
63+
if not quiet:
64+
print(f" WARNING: nvidia-smi -pl failed: {result.stderr.strip()}")
5965
return False
6066
except (FileNotFoundError, subprocess.TimeoutExpired) as e:
61-
print(f" WARNING: Could not set power limit: {e}")
67+
if not quiet:
68+
print(f" WARNING: Could not set power limit: {e}")
6269
return False
6370

6471

0 commit comments

Comments
 (0)