Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion scripts/cloud_run_runtime_guard.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,19 @@ def _load_services() -> list[str]:
return unique


def _scheduler_job_pattern_for_services(services: list[str]) -> str:
    """Build the default scheduler-job regex for the given services.

    Every non-blank service name contributes its own name as one regex
    alternative. A name ending in ``-service`` additionally contributes the
    alias job name ``<name without "-service">-scheduler``.

    The alias is deliberately emitted together with its ``-scheduler``
    suffix instead of as a bare prefix. A bare ``longbridge-quant-hk``
    alternative would also match unrelated sibling jobs that merely share
    the prefix, such as ``longbridge-quant-hk-verify-service-scheduler``,
    and a failure of that job would then be attributed to the wrong
    service.

    Args:
        services: Service names. ``None``, empty and whitespace-only entries
            are ignored.

    Returns:
        The ``re.escape``-d candidates joined with ``|`` in first-seen
        order with duplicates removed, or ``""`` when no candidate remains.
    """
    service_suffix = "-service"
    scheduler_suffix = "-scheduler"
    candidates: list[str] = []
    for service in services:
        service_name = str(service or "").strip()
        if not service_name:
            continue
        candidates.append(service_name)
        if service_name.endswith(service_suffix):
            alias = service_name.removesuffix(service_suffix)
            # A service literally named "-service" has an empty alias; skip
            # it rather than emit an alternative unrelated to any service.
            if alias:
                candidates.append(f"{alias}{scheduler_suffix}")
    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique = dict.fromkeys(candidates)
    return "|".join(re.escape(candidate) for candidate in unique)


def _run_gcloud_logging(project: str, log_filter: str, limit: int) -> list[dict[str, Any]]:
command = [
"gcloud",
Expand Down Expand Up @@ -214,7 +227,6 @@ def main() -> int:
require_success = _env_bool("RUNTIME_GUARD_REQUIRE_SUCCESS", False)
fail_workflow = _env_bool("RUNTIME_GUARD_FAIL_WORKFLOW_ON_ALERT", True)
check_scheduler = _env_bool("RUNTIME_GUARD_CHECK_SCHEDULER", True)
scheduler_pattern = os.environ.get("RUNTIME_GUARD_SCHEDULER_JOB_PATTERN") or ""

since = (
dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=lookback_minutes)
Expand All @@ -230,6 +242,10 @@ def main() -> int:
except RuntimeError as exc:
services = []
issues.append(f"service configuration error: {exc}")
scheduler_pattern = (
os.environ.get("RUNTIME_GUARD_SCHEDULER_JOB_PATTERN")
or _scheduler_job_pattern_for_services(services)
)

for service in services:
log_filter = (
Expand Down
18 changes: 15 additions & 3 deletions scripts/execution_report_heartbeat.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,12 +313,24 @@ def _describe_scheduler_jobs_for_services(
) -> list[dict[str, Any]]:
jobs = []
for service in services:
job = _describe_scheduler_job(f"{service}-scheduler", project=project)
if job:
jobs.append(job)
for job_name in _scheduler_job_name_candidates(service):
job = _describe_scheduler_job(job_name, project=project)
if job:
jobs.append(job)
break
return jobs


def _scheduler_job_name_candidates(service: str) -> list[str]:
    """Return the scheduler job names to try for *service*, in order.

    The first candidate is ``<service>-scheduler``. When the service name
    ends in ``-service`` a second candidate is added with that suffix
    removed before ``-scheduler`` is appended. A blank service yields an
    empty list.
    """
    base = str(service or "").strip()
    if base == "":
        return []

    service_suffix = "-service"
    names = [base + "-scheduler"]
    if base.endswith(service_suffix):
        names.append(base.removesuffix(service_suffix) + "-scheduler")
    return _unique_values(names)


def _scheduler_job_targets_strategy_run(job: dict[str, Any], service: str) -> bool:
if str(job.get("state") or "").strip().upper() not in {"", "ENABLED"}:
return False
Expand Down
13 changes: 13 additions & 0 deletions tests/test_cloud_run_runtime_guard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from __future__ import annotations

import re

from scripts import cloud_run_runtime_guard as guard


def test_scheduler_job_pattern_includes_service_alias():
    """The generated pattern matches both job spellings of one service.

    It must find the ``-service-scheduler`` and the alias ``-scheduler`` job
    of the configured service, and must not find another market's job.
    """
    compiled = re.compile(
        guard._scheduler_job_pattern_for_services(["longbridge-quant-hk-service"])
    )
    expectations = {
        "longbridge-quant-hk-service-scheduler": True,
        "longbridge-quant-hk-scheduler": True,
        "longbridge-quant-sg-scheduler": False,
    }

    for job_name, should_match in expectations.items():
        assert bool(compiled.search(job_name)) is should_match, job_name
38 changes: 38 additions & 0 deletions tests/test_execution_report_heartbeat.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,44 @@ def test_scheduler_aware_required_services_fall_back_to_named_scheduler_describe
assert scheduler_checked is True


def test_scheduler_aware_named_fallback_uses_service_alias(monkeypatch):
    """Named-describe fallback tries the alias job after the full name.

    Listing scheduler jobs is made to fail, and only the alias job
    ``longbridge-quant-hk-scheduler`` can be described. The test checks that
    both names are requested in order and that the resolved result reports
    no required services with the scheduler marked as checked.
    """
    _clear_runtime_env(monkeypatch)
    monkeypatch.setenv("CLOUD_RUN_SERVICE", "longbridge-quant-hk-service")

    def deny_list_scheduler_jobs(**_kwargs):
        raise RuntimeError("cloudscheduler.jobs.list denied")

    monkeypatch.setattr(heartbeat, "_list_scheduler_jobs", deny_list_scheduler_jobs)

    described = []

    def fake_describe_scheduler_job(job_name, **_kwargs):
        described.append(job_name)
        if job_name == "longbridge-quant-hk-scheduler":
            return {
                "state": "ENABLED",
                "schedule": "45 15 1-7 * *",
                "timeZone": "Asia/Hong_Kong",
                "httpTarget": {"uri": "https://longbridge-quant-hk-service.example.run.app/"},
            }
        return None

    monkeypatch.setattr(heartbeat, "_describe_scheduler_job", fake_describe_scheduler_job)

    window_start = dt.datetime(2026, 6, 10, 0, 0, tzinfo=dt.timezone.utc)
    window_end = dt.datetime(2026, 6, 10, 2, 0, tzinfo=dt.timezone.utc)
    required, skip_reason, scheduler_checked = heartbeat._resolve_required_services(
        project="project-1",
        since=window_start,
        now=window_end,
    )

    # The full service name is tried first, then the alias.
    assert described == [
        "longbridge-quant-hk-service-scheduler",
        "longbridge-quant-hk-scheduler",
    ]
    assert required == []
    assert skip_reason
    assert "no configured Cloud Scheduler main job was due" in skip_reason
    assert scheduler_checked is True


def test_main_skips_when_no_scheduler_main_job_is_due(monkeypatch, capsys):
_clear_runtime_env(monkeypatch)
monkeypatch.setenv("GCP_PROJECT_ID", "longbridgequant")
Expand Down