QuantStrategyLab · Pigbibi · Jun 2, 2026 · Jun 2, 2026
@@ -0,0 +1,98 @@
+# Runs scripts/cloud_run_runtime_guard.py once per GitHub Environment in the
+# matrix below, on a fixed schedule and on manual dispatch.
+name: Runtime Guard
+
+on:
+  # Manual runs. Each input has a non-empty default, so on workflow_dispatch the
+  # input value wins over the Environment-level RUNTIME_GUARD_* variables.
+  workflow_dispatch:
+    inputs:
+      lookback_minutes:
+        description: "Cloud Logging lookback window in minutes."
+        required: false
+        type: string
+        default: "180"
+      require_success:
+        description: "Alert if no successful Cloud Run request exists in the lookback window."
+        required: false
+        type: choice
+        default: "false"
+        options:
+          - "false"
+          - "true"
+      fail_workflow_on_alert:
+        description: "Fail this workflow when an alert is emitted."
+        required: false
+        type: choice
+        default: "true"
+        options:
+          - "true"
+          - "false"
+  # Scheduled runs at minutes 29 and 59 of every hour. `inputs` is empty here,
+  # so the Environment variables (or the literal defaults) apply instead.
+  schedule:
+    - cron: "29,59 * * * *"
+
+env:
+  GCP_PROJECT_ID: longbridgequant
+  GCP_WORKLOAD_IDENTITY_PROVIDER: projects/252919773759/locations/global/workloadIdentityPools/github-actions/providers/github-main
+  GCP_WORKLOAD_IDENTITY_SERVICE_ACCOUNT: longbridge-platform-deploy@longbridgequant.iam.gserviceaccount.com
+
+jobs:
+  guard:
+    name: Check ${{ matrix.target.label }} Cloud Run runtime
+    runs-on: ubuntu-latest
+    # Bound the job so a stalled log query cannot hold a runner for the 6-hour
+    # default while new runs keep being scheduled every 30 minutes.
+    timeout-minutes: 15
+    strategy:
+      # One target failing must not cancel the other targets.
+      fail-fast: false
+      matrix:
+        target:
+          - label: PAPER
+            environment: longbridge-paper
+          - label: HK
+            environment: longbridge-hk
+          - label: SG
+            environment: longbridge-sg
+    permissions:
+      contents: read
+      # Needed to mint the OIDC token used for Workload Identity Federation.
+      id-token: write
+    # Selects which Environment's `vars` and `secrets` are visible to the job.
+    environment: ${{ matrix.target.environment }}
+    env:
+      RUNTIME_GUARD_NAME: LongBridgePlatform ${{ matrix.target.label }}
+      RUNTIME_GUARD_CLOUD_RUN_SERVICES: ${{ vars.RUNTIME_GUARD_CLOUD_RUN_SERVICES }}
+      RUNTIME_GUARD_LOOKBACK_MINUTES: ${{ inputs.lookback_minutes || vars.RUNTIME_GUARD_LOOKBACK_MINUTES || '180' }}
+      RUNTIME_GUARD_REQUIRE_SUCCESS: ${{ inputs.require_success || vars.RUNTIME_GUARD_REQUIRE_SUCCESS || 'false' }}
+      RUNTIME_GUARD_FAIL_WORKFLOW_ON_ALERT: ${{ inputs.fail_workflow_on_alert || vars.RUNTIME_GUARD_FAIL_WORKFLOW_ON_ALERT || 'true' }}
+      RUNTIME_GUARD_SCHEDULER_JOB_PATTERN: ${{ vars.RUNTIME_GUARD_SCHEDULER_JOB_PATTERN || vars.CLOUD_RUN_SERVICE }}
+      CLOUD_RUN_SERVICE: ${{ vars.CLOUD_RUN_SERVICE }}
+      GLOBAL_TELEGRAM_CHAT_ID: ${{ vars.GLOBAL_TELEGRAM_CHAT_ID }}
+      CRISIS_ALERT_TELEGRAM_CHAT_IDS: ${{ vars.CRISIS_ALERT_TELEGRAM_CHAT_IDS }}
+      CRISIS_ALERT_TELEGRAM_API_BASE_URL: ${{ vars.CRISIS_ALERT_TELEGRAM_API_BASE_URL }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v6
+        with:
+          # Read-only job: do not persist the GITHUB_TOKEN for later git commands.
+          persist-credentials: false
+
+      - name: Authenticate to Google Cloud
+        uses: google-github-actions/auth@v3
+        with:
+          workload_identity_provider: ${{ env.GCP_WORKLOAD_IDENTITY_PROVIDER }}
+          service_account: ${{ env.GCP_WORKLOAD_IDENTITY_SERVICE_ACCOUNT }}
+
+      - name: Set up gcloud
+        uses: google-github-actions/setup-gcloud@v3
+
+      - name: Check Cloud Scheduler and Cloud Run logs
+        # Telegram secrets are scoped to this step only, so the third-party
+        # actions above never receive them in their environment.
+        env:
+          TELEGRAM_TOKEN: ${{ secrets.TELEGRAM_TOKEN }}
+          CRISIS_ALERT_TELEGRAM_BOT_TOKEN: ${{ secrets.CRISIS_ALERT_TELEGRAM_BOT_TOKEN }}
+        run: python scripts/cloud_run_runtime_guard.py
@@ -183,6 +183,38 @@ Important:
 - Here "shared" only means **shared inside this repository** between the `paper`, `HK`, and `SG` Cloud Run services. The Telegram token can still be shared, but LongPort app credentials should live in Secret Manager and be referenced by per-environment secret-name variables; they are not meant to be a global secret set reused by unrelated quant repos.
 - If you want one cross-project shared layer across multiple quant repos, keep it small: `GLOBAL_TELEGRAM_CHAT_ID`, `NOTIFY_LANG`, `CRISIS_ALERT_CHANNELS`, and shared crisis alert settings under `CRISIS_ALERT_EMAIL_*`/`CRISIS_ALERT_PUSH_*` are reasonable when the same alert policy applies; account credentials, deployment keys, and alert secrets are not.
 
+### Runtime guard alerting
+
+`.github/workflows/runtime-guard.yml` is a second notification layer for failures
+outside the LongBridge Flask handler. It runs once per GitHub Environment
+(`longbridge-paper`, `longbridge-hk`, and `longbridge-sg`), reads Cloud Logging
+for recent Cloud Scheduler errors and Cloud Run request/runtime failures, then
+sends a Telegram alert directly using `CRISIS_ALERT_TELEGRAM_BOT_TOKEN` +
+`CRISIS_ALERT_TELEGRAM_CHAT_IDS`, falling back to `TELEGRAM_TOKEN` +
+`GLOBAL_TELEGRAM_CHAT_ID`.
+
+The guard does not invoke Cloud Run trading routes. It is meant to catch cases
+where Scheduler cannot reach the service, OIDC/IAM/audience is wrong, Cloud Run
+returns 4xx/5xx, or the container fails before app-level Telegram fallback code
+can run.
+
+Required setup:
+
+- keep each Environment's `CLOUD_RUN_SERVICE` set, or set
+  `RUNTIME_GUARD_CLOUD_RUN_SERVICES`
+- grant the GitHub deploy service account `roles/logging.viewer` on the
+  `longbridgequant` project
+- keep Telegram chat/token variables or secrets configured in GitHub
+- optionally set `RUNTIME_GUARD_SCHEDULER_JOB_PATTERN` per Environment; by
+  default the workflow filters Scheduler logs by that Environment's
+  `CLOUD_RUN_SERVICE`
+
+The scheduled guard runs every 30 minutes (at minutes 29 and 59 of each hour).
+For a missed-run heartbeat, set `RUNTIME_GUARD_REQUIRE_SUCCESS=true` and choose
+`RUNTIME_GUARD_LOOKBACK_MINUTES` (default `180`) so the window covers the expected
+Scheduler run for that Environment. The default leaves the heartbeat check off to
+avoid false alerts outside active market windows.
+
 ### Deployment unit and naming
 
 - `QuantPlatformKit` is only a shared dependency; Cloud Run still deploys `LongBridgePlatform` itself.
@@ -379,6 +411,30 @@ Secret Manager 中需存在 `LONGPORT_SECRET_NAME` 指定的密钥（默认: `lo
 - 这里的“共享”只是指 **同一个仓库里的 paper / HK / SG 服务共享**。Telegram token 可以继续共用，但 LongPort app 凭据建议放到 Secret Manager，并通过各自 Environment 里的 secret-name 变量引用，不建议把它们当成所有 quant 共用的全局 secrets。
 - 如果你真的要在多个 quant 仓库之间保留一层全局共享，建议只保留 `GLOBAL_TELEGRAM_CHAT_ID`、`NOTIFY_LANG`、`CRISIS_ALERT_CHANNELS`，以及同一套危机告警策略下的 `CRISIS_ALERT_EMAIL_*`/`CRISIS_ALERT_PUSH_*` 这种低耦合配置。账户凭据、部署 key 和告警 secret 不要做成全局共享。
 
+### Runtime Guard 告警
+
+`.github/workflows/runtime-guard.yml` 是 LongBridge Flask handler 之外的第二层通知。它按
+GitHub Environment 分别运行一次（`longbridge-paper`、`longbridge-hk`、`longbridge-sg`），
+只读取 Cloud Logging 中最近的 Cloud Scheduler 错误和 Cloud Run 请求/运行失败，然后直接通过
+`CRISIS_ALERT_TELEGRAM_BOT_TOKEN` + `CRISIS_ALERT_TELEGRAM_CHAT_IDS`，或作为 fallback 的
+`TELEGRAM_TOKEN` + `GLOBAL_TELEGRAM_CHAT_ID` 发送 Telegram 告警。
+
+这个 guard 不会调用 Cloud Run 的交易路由，主要覆盖 Scheduler 没打到服务、
+OIDC/IAM/audience 配错、Cloud Run 返回 4xx/5xx、或容器在 app-level Telegram fallback
+执行前就失败的情况。
+
+需要的配置：
+
+- 每个 Environment 保持 `CLOUD_RUN_SERVICE` 正确，或设置 `RUNTIME_GUARD_CLOUD_RUN_SERVICES`
+- GitHub deploy service account 需要 `longbridgequant` 项目级 `roles/logging.viewer`
+- GitHub 中继续配置 Telegram chat/token 变量或 secrets
+- 可选按 Environment 设置 `RUNTIME_GUARD_SCHEDULER_JOB_PATTERN`；默认会按该 Environment 的
+  `CLOUD_RUN_SERVICE` 过滤 Scheduler 日志
+
+默认计划每 30 分钟检查一次。若要做 missed-run 心跳，按 Environment 设置
+`RUNTIME_GUARD_REQUIRE_SUCCESS=true`，并把 `RUNTIME_GUARD_LOOKBACK_MINUTES` 设成覆盖该环境预期
+Scheduler 运行时间的窗口。默认不强制心跳，避免非交易窗口误报。
+
 ### 部署单元和命名建议
 
 - `QuantPlatformKit` 只是共享依赖，不单独部署；Cloud Run 继续只部署 `LongBridgePlatform`。