Skip to content

Commit 1a4f1c5

Browse files
feat: add all untracked files — carbon, benchmarks, dashboard, migrations, agent APIs
Includes: - agent/machine_id.py, tag_client.py - app/api: agent heartbeat, benchmarks, carbon, cron, dashboard, metrics ingest, reports, user profile - app: carbon page, dashboard pages (main + settings) - components/dashboard: AgentsPanel, CarbonExportButton, ClusterFilter, JobsCarbonTable, Footer - database/migrations: 025–030 (machine identity, metrics fields, cluster summary, benchmarks, carbon, carbon leaderboard) - lib: api-auth, rate-limiter, supabase-client, supabase-server - tsconfig.json, vercel.json, sitemap.ts Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 84bb889 commit 1a4f1c5

40 files changed

Lines changed: 3385 additions & 0 deletions

File tree

agent/machine_id.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Copyright 2026 Kevin (AluminatiAI)
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""
15+
Stable machine identity for AluminatAI GPU Agent.
16+
17+
Generates a UUID the first time it runs and persists it to
18+
~/.config/aluminatai/machine_id so it survives hostname changes and
19+
process restarts. On I/O failure (read-only FS, permission denied, etc.)
20+
an ephemeral UUID is returned — the agent never crashes due to identity
21+
issues.
22+
"""
23+
from __future__ import annotations
24+
25+
import uuid
26+
from pathlib import Path
27+
28+
_MACHINE_ID_PATH = Path("~/.config/aluminatai/machine_id")
29+
30+
31+
def get_machine_id() -> str:
32+
"""Return a stable UUID string for this machine.
33+
34+
Reads from ~/.config/aluminatai/machine_id if it exists; generates and
35+
persists a new UUID on first call. Returns an ephemeral (non-persisted)
36+
UUID if any I/O error occurs.
37+
"""
38+
path = _MACHINE_ID_PATH.expanduser()
39+
try:
40+
path.parent.mkdir(parents=True, exist_ok=True, mode=0o700)
41+
if path.exists():
42+
mid = path.read_text().strip()
43+
if mid:
44+
return mid
45+
mid = str(uuid.uuid4())
46+
path.write_text(mid)
47+
return mid
48+
except OSError:
49+
# Ephemeral fallback — non-fatal, agent continues running
50+
return str(uuid.uuid4())

agent/tag_client.py

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
# Copyright 2026 Kevin (AluminatiAI)
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# AluminatiAI — https://github.com/AgentMulder404/AluminatAI
16+
"""
17+
TagClient: polls GET /api/v1/tag and caches active job registrations.
18+
19+
Usage:
20+
client = TagClient(api_endpoint, api_key, poll_interval=30)
21+
client.start() # starts background polling thread
22+
23+
match = client.match(gpu_index=0, pid=12345, ts=datetime.now(UTC))
24+
if match:
25+
# match.job_id, match.team_id, match.model_tag
26+
"""
27+
28+
from __future__ import annotations
29+
30+
import logging
31+
import threading
32+
import time
33+
from dataclasses import dataclass
34+
from datetime import datetime, timezone, timedelta
35+
from typing import Optional
36+
37+
import requests
38+
39+
logger = logging.getLogger(__name__)
40+
41+
# ── Data model ─────────────────────────────────────────────────────────────────
42+
43+
44+
@dataclass
45+
class TagRecord:
46+
id: str
47+
job_id: str
48+
team_id: Optional[str]
49+
model_tag: Optional[str]
50+
gpu_indices: Optional[list[int]] # None = all GPUs
51+
start_time: datetime
52+
end_time: Optional[datetime] # None = open-ended
53+
pid: Optional[int]
54+
55+
56+
# ── Client ─────────────────────────────────────────────────────────────────────
57+
58+
59+
class TagClient:
60+
"""
61+
Background-polling client for the /api/v1/tag REST endpoint.
62+
63+
Thread-safe: `match()` can be called from any thread.
64+
"""
65+
66+
def __init__(
67+
self,
68+
api_endpoint: str, # e.g. "https://aluminatiai.com/v1/metrics/ingest"
69+
api_key: str,
70+
poll_interval: int = 30, # seconds between polls
71+
):
72+
# Derive base URL from the ingest endpoint (strip trailing path)
73+
# Expected: https://host/v1/metrics/ingest → https://host
74+
parts = api_endpoint.split("/v1/")
75+
self._base_url = parts[0].rstrip("/")
76+
self._tag_url = f"{self._base_url}/api/v1/tag"
77+
78+
self._api_key = api_key
79+
self._poll_interval = poll_interval
80+
81+
self._lock = threading.Lock()
82+
self._tags: list[TagRecord] = []
83+
self._last_poll: Optional[datetime] = None
84+
self._thread: Optional[threading.Thread] = None
85+
self._stop_event = threading.Event()
86+
87+
# ── Lifecycle ──────────────────────────────────────────────────────────────
88+
89+
def start(self) -> None:
90+
"""Start the background polling thread (idempotent)."""
91+
if self._thread and self._thread.is_alive():
92+
return
93+
self._stop_event.clear()
94+
self._thread = threading.Thread(target=self._poll_loop, daemon=True, name="tag-client")
95+
self._thread.start()
96+
logger.info("TagClient started — polling %s every %ds", self._tag_url, self._poll_interval)
97+
98+
def stop(self) -> None:
99+
"""Signal the polling thread to stop."""
100+
self._stop_event.set()
101+
102+
# ── Matching ───────────────────────────────────────────────────────────────
103+
104+
def match(
105+
self,
106+
gpu_index: int,
107+
pid: Optional[int],
108+
ts: datetime,
109+
) -> Optional[TagRecord]:
110+
"""
111+
Return the highest-priority tag that covers this (gpu_index, pid, timestamp).
112+
113+
Priority: most-recently-started tag wins when multiple match.
114+
Returns None if no tag matches.
115+
"""
116+
if ts.tzinfo is None:
117+
ts = ts.replace(tzinfo=timezone.utc)
118+
119+
with self._lock:
120+
candidates: list[TagRecord] = []
121+
for tag in self._tags:
122+
# Time window: start_time ≤ ts ≤ end_time (or open-ended)
123+
if ts < tag.start_time:
124+
continue
125+
if tag.end_time is not None and ts > tag.end_time:
126+
continue
127+
# GPU index filter (None = all GPUs)
128+
if tag.gpu_indices is not None and gpu_index not in tag.gpu_indices:
129+
continue
130+
# PID filter (None = any PID)
131+
if tag.pid is not None and pid != tag.pid:
132+
continue
133+
candidates.append(tag)
134+
135+
if not candidates:
136+
return None
137+
138+
# Most recently started tag wins
139+
return max(candidates, key=lambda t: t.start_time)
140+
141+
# ── Internal polling ───────────────────────────────────────────────────────
142+
143+
def _poll_loop(self) -> None:
144+
while not self._stop_event.is_set():
145+
try:
146+
self._fetch()
147+
except Exception as exc:
148+
logger.warning("TagClient poll error: %s", exc)
149+
self._stop_event.wait(self._poll_interval)
150+
151+
def _fetch(self) -> None:
152+
# Ask for tags from the last 25 hours (slightly more than a day to catch
153+
# long-running jobs that started before the last poll window).
154+
since = (datetime.now(timezone.utc) - timedelta(hours=25)).isoformat()
155+
headers = {"X-API-Key": self._api_key}
156+
157+
resp = requests.get(
158+
self._tag_url,
159+
params={"since": since},
160+
headers=headers,
161+
timeout=10,
162+
)
163+
resp.raise_for_status()
164+
165+
raw_tags = resp.json().get("tags", [])
166+
parsed: list[TagRecord] = []
167+
for t in raw_tags:
168+
try:
169+
parsed.append(TagRecord(
170+
id=t["id"],
171+
job_id=t["job_id"],
172+
team_id=t.get("team_id"),
173+
model_tag=t.get("model_tag"),
174+
gpu_indices=t.get("gpu_indices"),
175+
start_time=_parse_dt(t["start_time"]),
176+
end_time=_parse_dt(t["end_time"]) if t.get("end_time") else None,
177+
pid=t.get("pid"),
178+
))
179+
except Exception as exc:
180+
logger.debug("TagClient: skipping malformed tag %s: %s", t.get("id"), exc)
181+
182+
with self._lock:
183+
self._tags = parsed
184+
self._last_poll = datetime.now(timezone.utc)
185+
186+
logger.debug("TagClient: fetched %d tag(s)", len(parsed))
187+
188+
@property
189+
def tag_count(self) -> int:
190+
with self._lock:
191+
return len(self._tags)
192+
193+
194+
def _parse_dt(value: str) -> datetime:
195+
"""Parse ISO 8601 timestamp to an aware datetime (UTC)."""
196+
dt = datetime.fromisoformat(value.replace("Z", "+00:00"))
197+
if dt.tzinfo is None:
198+
dt = dt.replace(tzinfo=timezone.utc)
199+
return dt

app/api/agent/agents/route.ts

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import { NextRequest, NextResponse } from "next/server";
2+
import { createSupabaseCookieClient } from "@/lib/supabase-server";
3+
import { createSupabaseServerClient } from "@/lib/supabase-client";
4+
import { rateLimit, getRateLimitHeaders } from "@/lib/rate-limiter";
5+
6+
export const runtime = "edge";
7+
8+
const ONLINE_THRESHOLD_MIN = 10;
9+
10+
export async function GET(req: NextRequest) {
11+
// Auth via dashboard session
12+
const cookieClient = createSupabaseCookieClient();
13+
const {
14+
data: { user },
15+
error: authError,
16+
} = await cookieClient.auth.getUser();
17+
18+
if (authError || !user) {
19+
return NextResponse.json({ error: "Unauthorized" }, { status: 401 });
20+
}
21+
22+
// Rate limit: 60 req/min per user
23+
const rl = await rateLimit(`agents:${user.id}`, 60);
24+
if (!rl.success) {
25+
return NextResponse.json(
26+
{ error: "Rate limit exceeded" },
27+
{ status: 429, headers: getRateLimitHeaders(rl) }
28+
);
29+
}
30+
31+
const supabase = createSupabaseServerClient();
32+
const { data, error } = await supabase
33+
.from("agent_heartbeats")
34+
.select(
35+
"hostname, machine_id, cluster_tag, location_hint, gpu_count, gpu_names, agent_version, scheduler, last_seen"
36+
)
37+
.eq("user_id", user.id)
38+
.order("last_seen", { ascending: false });
39+
40+
if (error) {
41+
return NextResponse.json({ error: error.message }, { status: 500 });
42+
}
43+
44+
const cutoff = new Date(Date.now() - ONLINE_THRESHOLD_MIN * 60 * 1000);
45+
const agents = (data ?? []).map((row) => ({
46+
...row,
47+
is_online: new Date(row.last_seen) > cutoff,
48+
}));
49+
50+
return NextResponse.json(agents, {
51+
headers: getRateLimitHeaders(rl),
52+
});
53+
}

app/api/agent/heartbeat/route.ts

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import { NextRequest, NextResponse } from "next/server";
2+
import { validateApiKey } from "@/lib/api-auth";
3+
import { createSupabaseServerClient } from "@/lib/supabase-client";
4+
5+
export const runtime = "edge";
6+
7+
export async function POST(req: NextRequest) {
8+
const apiKey = req.headers.get("x-api-key") ?? "";
9+
const auth = await validateApiKey(apiKey);
10+
if (!auth.valid) {
11+
return NextResponse.json({ error: "Unauthorized" }, { status: 401 });
12+
}
13+
14+
let body: Record<string, unknown>;
15+
try {
16+
body = await req.json();
17+
} catch {
18+
return NextResponse.json({ error: "Invalid JSON" }, { status: 400 });
19+
}
20+
21+
const {
22+
agent_version,
23+
hostname,
24+
gpu_count,
25+
gpu_uuids,
26+
scheduler,
27+
uptime_sec,
28+
config_hash,
29+
machine_id,
30+
cluster_tag,
31+
location_hint,
32+
gpu_names,
33+
} = body as {
34+
agent_version?: string;
35+
hostname?: string;
36+
gpu_count?: number;
37+
gpu_uuids?: string[];
38+
scheduler?: string;
39+
uptime_sec?: number;
40+
config_hash?: string;
41+
machine_id?: string;
42+
cluster_tag?: string;
43+
location_hint?: string;
44+
gpu_names?: string[];
45+
};
46+
47+
const supabase = createSupabaseServerClient();
48+
const { error } = await supabase.from("agent_heartbeats").upsert(
49+
{
50+
user_id: auth.userId,
51+
hostname: hostname ?? "",
52+
agent_version: agent_version ?? "",
53+
gpu_count: gpu_count ?? 0,
54+
gpu_uuids: gpu_uuids ?? [],
55+
scheduler: scheduler ?? "none",
56+
uptime_sec: uptime_sec ?? 0,
57+
config_hash: config_hash ?? "",
58+
machine_id: machine_id ?? null,
59+
cluster_tag: cluster_tag ?? "",
60+
location_hint: location_hint ?? "",
61+
gpu_names: gpu_names ?? [],
62+
last_seen: new Date().toISOString(),
63+
},
64+
{ onConflict: "user_id,hostname" }
65+
);
66+
67+
if (error) {
68+
return NextResponse.json({ error: error.message }, { status: 500 });
69+
}
70+
71+
return NextResponse.json({ ok: true });
72+
}

0 commit comments

Comments
 (0)