From 5d9cc429438b0467bd93fe8552431ed5789ae902 Mon Sep 17 00:00:00 2001 From: Chuck Smith Date: Thu, 28 May 2026 08:02:03 -0400 Subject: [PATCH 1/2] feat: stale process webhook alert Fire an alert when the count of workers with a heartbeat older than 5 minutes meets or exceeds `alert_stale_process_threshold`. Reuses the existing `alert_webhook_url` and `alert_webhook_cooldown` config. Payload type is `"stale_processes"`. Stale count is read from the already-computed `queue_stats` hash to avoid a redundant DB query. Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 1 + app/models/solid_stack_web/alert_webhook.rb | 5 +++ .../install/templates/initializer.rb | 5 ++- lib/solid_stack_web.rb | 6 ++- .../solid_stack_web/alert_webhook_spec.rb | 40 +++++++++++++++++-- 5 files changed, 50 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 988e004..2bd95f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Slow job webhook alert — fires when the number of currently-claimed jobs exceeding `slow_job_threshold` reaches `alert_slow_job_count_threshold`; respects the shared `alert_webhook_url` and `alert_webhook_cooldown` settings; payload includes `type: "slow_jobs"`, `count`, and `threshold` +- Stale process webhook alert — fires when the number of workers with a heartbeat older than 5 minutes meets or exceeds `alert_stale_process_threshold`; reuses the shared `alert_webhook_url` and `alert_webhook_cooldown` settings; payload includes `type: "stale_processes"`, `count`, and `threshold` ## [1.2.0] - 2026-05-27 diff --git a/app/models/solid_stack_web/alert_webhook.rb b/app/models/solid_stack_web/alert_webhook.rb index 6af1c37..618c43c 100644 --- a/app/models/solid_stack_web/alert_webhook.rb +++ b/app/models/solid_stack_web/alert_webhook.rb @@ -44,6 +44,11 @@ def build_alerts alerts << { type: "slow_jobs", count: count, threshold: count_threshold } if count >= count_threshold end + if
(threshold = SolidStackWeb.alert_stale_process_threshold) + count = @queue_stats[:processes_stale] + alerts << { type: "stale_processes", count: count, threshold: threshold } if count >= threshold + end + alerts end diff --git a/lib/generators/solid_stack_web/install/templates/initializer.rb b/lib/generators/solid_stack_web/install/templates/initializer.rb index 1832bef..ef8e6cf 100644 --- a/lib/generators/solid_stack_web/install/templates/initializer.rb +++ b/lib/generators/solid_stack_web/install/templates/initializer.rb @@ -47,6 +47,7 @@ # "critical" => 50, # "default" => 500 # } - # config.alert_slow_job_count_threshold = 3 # fire when N+ claimed jobs exceed slow_job_threshold duration - # config.alert_webhook_cooldown = 3600 # seconds between repeat alerts + # config.alert_slow_job_count_threshold = 3 # fire when N+ claimed jobs exceed slow_job_threshold duration + # config.alert_stale_process_threshold = 1 # fire when N+ workers have a stale heartbeat (>5 min old) + # config.alert_webhook_cooldown = 3600 # seconds between repeat alerts end diff --git a/lib/solid_stack_web.rb b/lib/solid_stack_web.rb index 025bb34..2f9b813 100644 --- a/lib/solid_stack_web.rb +++ b/lib/solid_stack_web.rb @@ -6,7 +6,7 @@ class << self attr_writer :page_size, :connects_to, :slow_job_threshold, :alert_webhook_url, :alert_webhook_cooldown, :alert_failure_threshold, :alert_queue_thresholds, - :alert_slow_job_count_threshold, + :alert_slow_job_count_threshold, :alert_stale_process_threshold, :dashboard_refresh_interval, :default_refresh_interval, :search_results_limit, :allow_value_preview @@ -42,6 +42,10 @@ def alert_slow_job_count_threshold @alert_slow_job_count_threshold end + def alert_stale_process_threshold + @alert_stale_process_threshold + end + def dashboard_refresh_interval @dashboard_refresh_interval || 5_000 end diff --git a/spec/models/solid_stack_web/alert_webhook_spec.rb b/spec/models/solid_stack_web/alert_webhook_spec.rb index 54fbb60..3e02f68 100644 --- 
a/spec/models/solid_stack_web/alert_webhook_spec.rb +++ b/spec/models/solid_stack_web/alert_webhook_spec.rb @@ -2,15 +2,16 @@ RSpec.describe SolidStackWeb::AlertWebhook do let(:webhook_url) { "https://hooks.example.com/alert" } - let(:queue_stats) { { failed: 0, ready: 0 } } + let(:queue_stats) { { failed: 0, ready: 0, processes_stale: 0 } } after do SolidStackWeb.alert_webhook_url = nil SolidStackWeb.alert_failure_threshold = nil SolidStackWeb.alert_queue_thresholds = nil - SolidStackWeb.alert_slow_job_count_threshold = nil - SolidStackWeb.alert_webhook_cooldown = nil - SolidStackWeb.slow_job_threshold = nil + SolidStackWeb.alert_slow_job_count_threshold = nil + SolidStackWeb.alert_stale_process_threshold = nil + SolidStackWeb.alert_webhook_cooldown = nil + SolidStackWeb.slow_job_threshold = nil Rails.cache.clear end @@ -137,6 +138,37 @@ def claimed_job(created_at:) expect(stub).not_to have_been_requested end end + + context "stale process threshold" do + before { SolidStackWeb.alert_stale_process_threshold = 1 } + + it "does not POST when stale process count is below the threshold" do + stub = stub_request(:post, webhook_url).to_return(status: 200) + described_class.check(queue_stats.merge(processes_stale: 0)) + expect(stub).not_to have_been_requested + end + + it "POSTs when stale process count meets the threshold" do + stub = stub_request(:post, webhook_url).to_return(status: 200) + described_class.check(queue_stats.merge(processes_stale: 1)) + expect(stub).to have_been_requested + end + + it "includes stale_processes type in the payload" do + stub = stub_request(:post, webhook_url) + .with { |req| JSON.parse(req.body)["alerts"].any? 
{ |a| a["type"] == "stale_processes" } } + .to_return(status: 200) + described_class.check(queue_stats.merge(processes_stale: 2)) + expect(stub).to have_been_requested + end + + it "does not POST when threshold is not configured" do + SolidStackWeb.alert_stale_process_threshold = nil + stub = stub_request(:post, webhook_url).to_return(status: 200) + described_class.check(queue_stats.merge(processes_stale: 5)) + expect(stub).not_to have_been_requested + end + end end end end From 03c42e81064f5be984fd5017cca0e65c2764a2d3 Mon Sep 17 00:00:00 2001 From: Chuck Smith Date: Thu, 28 May 2026 08:03:01 -0400 Subject: [PATCH 2/2] docs: remove shipped stale process webhook alert from ROADMAP v1.3 Co-Authored-By: Claude Sonnet 4.6 --- ROADMAP.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/ROADMAP.md b/ROADMAP.md index 4e0af80..f6f0079 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -2,8 +2,6 @@ `solid_stack_web` aims to be the definitive operational dashboard for the full Rails Solid Stack — a single mountable engine covering **Solid Queue**, **Solid Cache**, and **Solid Cable** with the depth needed for day-to-day production operations, not just a status page. -The project has shipped through v1.2.0, covering full Solid Queue depth — job management, queue controls, worker visibility, failed job handling, job history, sortable columns, and persistent filter preferences. The roadmap ahead deepens the observability story: configurable alerting (v1.3), opt-in audit logging (v1.4), and extensibility hooks for host apps (v2.0). 
- --- @@ -11,7 +9,6 @@ The project has shipped through v1.2.0, covering full Solid Queue depth — job > _More signals, fewer blind spots._ -- **Process stale webhook alert** — fire when a worker's `last_heartbeat_at` expires; a worker going silent means jobs stop processing without any visible signal - **Job wait time column** — show time from `enqueued_at` to `created_at` on claimed executions; a direct measure of queue SLA (how long jobs waited before a worker picked them up) ---