From a706fe3a1b6ac05800087ccfc809e8cd2d59d056 Mon Sep 17 00:00:00 2001
From: Pigbibi <20649888+Pigbibi@users.noreply.github.com>
Date: Wed, 10 Jun 2026 14:32:46 +0800
Subject: [PATCH] Harden scheduler switch guards

---
 scripts/cloud_run_runtime_guard.py       | 18 ++++++++++-
 scripts/execution_report_heartbeat.py    | 18 +++++++++--
 tests/test_cloud_run_runtime_guard.py    | 13 ++++++++
 tests/test_execution_report_heartbeat.py | 38 ++++++++++++++++++++++++
 4 files changed, 83 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_cloud_run_runtime_guard.py

diff --git a/scripts/cloud_run_runtime_guard.py b/scripts/cloud_run_runtime_guard.py
index bab869b..9e6dc22 100644
--- a/scripts/cloud_run_runtime_guard.py
+++ b/scripts/cloud_run_runtime_guard.py
@@ -85,6 +85,19 @@ def _load_services() -> list[str]:
     return unique
 
 
+def _scheduler_job_pattern_for_services(services: list[str]) -> str:
+    candidates: list[str] = []
+    for service in services:
+        service_name = str(service or "").strip()
+        if not service_name:
+            continue
+        candidates.append(service_name)
+        if service_name.endswith("-service"):
+            candidates.append(service_name.removesuffix("-service"))
+    unique = list(dict.fromkeys(candidates))
+    return "|".join(re.escape(candidate) for candidate in unique)
+
+
 def _run_gcloud_logging(project: str, log_filter: str, limit: int) -> list[dict[str, Any]]:
     command = [
         "gcloud",
@@ -214,7 +227,6 @@ def main() -> int:
     require_success = _env_bool("RUNTIME_GUARD_REQUIRE_SUCCESS", False)
     fail_workflow = _env_bool("RUNTIME_GUARD_FAIL_WORKFLOW_ON_ALERT", True)
     check_scheduler = _env_bool("RUNTIME_GUARD_CHECK_SCHEDULER", True)
-    scheduler_pattern = os.environ.get("RUNTIME_GUARD_SCHEDULER_JOB_PATTERN") or ""
 
     since = (
         dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=lookback_minutes)
@@ -230,6 +242,10 @@ def main() -> int:
     except RuntimeError as exc:
         services = []
         issues.append(f"service configuration error: {exc}")
+    scheduler_pattern = (
+        os.environ.get("RUNTIME_GUARD_SCHEDULER_JOB_PATTERN")
+        or _scheduler_job_pattern_for_services(services)
+    )
 
     for service in services:
         log_filter = (
diff --git a/scripts/execution_report_heartbeat.py b/scripts/execution_report_heartbeat.py
index f9ea3b6..e10d2ab 100644
--- a/scripts/execution_report_heartbeat.py
+++ b/scripts/execution_report_heartbeat.py
@@ -313,12 +313,24 @@ def _describe_scheduler_jobs_for_services(
 ) -> list[dict[str, Any]]:
     jobs = []
     for service in services:
-        job = _describe_scheduler_job(f"{service}-scheduler", project=project)
-        if job:
-            jobs.append(job)
+        for job_name in _scheduler_job_name_candidates(service):
+            job = _describe_scheduler_job(job_name, project=project)
+            if job:
+                jobs.append(job)
+                break
     return jobs
 
 
+def _scheduler_job_name_candidates(service: str) -> list[str]:
+    service_name = str(service or "").strip()
+    if not service_name:
+        return []
+    candidates = [f"{service_name}-scheduler"]
+    if service_name.endswith("-service"):
+        candidates.append(f"{service_name.removesuffix('-service')}-scheduler")
+    return _unique_values(candidates)
+
+
 def _scheduler_job_targets_strategy_run(job: dict[str, Any], service: str) -> bool:
     if str(job.get("state") or "").strip().upper() not in {"", "ENABLED"}:
         return False
diff --git a/tests/test_cloud_run_runtime_guard.py b/tests/test_cloud_run_runtime_guard.py
new file mode 100644
index 0000000..ef0bcd4
--- /dev/null
+++ b/tests/test_cloud_run_runtime_guard.py
@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+import re
+
+from scripts import cloud_run_runtime_guard as guard
+
+
+def test_scheduler_job_pattern_includes_service_alias():
+    pattern = guard._scheduler_job_pattern_for_services(["longbridge-quant-hk-service"])
+
+    assert re.search(pattern, "longbridge-quant-hk-service-scheduler")
+    assert re.search(pattern, "longbridge-quant-hk-scheduler")
+    assert not re.search(pattern, "longbridge-quant-sg-scheduler")
diff --git a/tests/test_execution_report_heartbeat.py b/tests/test_execution_report_heartbeat.py
index 23e90ee..287574a 100644
--- a/tests/test_execution_report_heartbeat.py
+++ b/tests/test_execution_report_heartbeat.py
@@ -172,6 +172,44 @@ def test_scheduler_aware_required_services_fall_back_to_named_scheduler_describe
     assert scheduler_checked is True
 
 
+def test_scheduler_aware_named_fallback_uses_service_alias(monkeypatch):
+    _clear_runtime_env(monkeypatch)
+    monkeypatch.setenv("CLOUD_RUN_SERVICE", "longbridge-quant-hk-service")
+    monkeypatch.setattr(
+        heartbeat,
+        "_list_scheduler_jobs",
+        lambda **_kwargs: (_ for _ in ()).throw(RuntimeError("cloudscheduler.jobs.list denied")),
+    )
+    requested_job_names = []
+
+    def fake_describe_scheduler_job(job_name, **_kwargs):
+        requested_job_names.append(job_name)
+        if job_name != "longbridge-quant-hk-scheduler":
+            return None
+        return {
+            "state": "ENABLED",
+            "schedule": "45 15 1-7 * *",
+            "timeZone": "Asia/Hong_Kong",
+            "httpTarget": {"uri": "https://longbridge-quant-hk-service.example.run.app/"},
+        }
+
+    monkeypatch.setattr(heartbeat, "_describe_scheduler_job", fake_describe_scheduler_job)
+
+    required, skip_reason, scheduler_checked = heartbeat._resolve_required_services(
+        project="project-1",
+        since=dt.datetime(2026, 6, 10, 0, 0, tzinfo=dt.timezone.utc),
+        now=dt.datetime(2026, 6, 10, 2, 0, tzinfo=dt.timezone.utc),
+    )
+
+    assert requested_job_names == [
+        "longbridge-quant-hk-service-scheduler",
+        "longbridge-quant-hk-scheduler",
+    ]
+    assert required == []
+    assert skip_reason and "no configured Cloud Scheduler main job was due" in skip_reason
+    assert scheduler_checked is True
+
+
 def test_main_skips_when_no_scheduler_main_job_is_due(monkeypatch, capsys):
     _clear_runtime_env(monkeypatch)
     monkeypatch.setenv("GCP_PROJECT_ID", "longbridgequant")