From 4f55522d97605eeb79407815ba70ddb0f0707224 Mon Sep 17 00:00:00 2001 From: Chuck Smith Date: Thu, 21 May 2026 16:39:06 -0400 Subject: [PATCH 1/2] feat: add queue depth alert webhook New alert_queue_thresholds config accepts a hash of queue_name => ready-job-count limits. A webhook fires when any queue's ready count meets or exceeds its threshold, using the same alert_webhook_url endpoint(s) with event "queue_depth_threshold_exceeded". Cooldown is tracked independently per queue so a busy queue does not suppress alerts for others. Co-Authored-By: Claude Sonnet 4.6 --- .../solid_queue_web/dashboard_controller.rb | 1 + .../solid_queue_web/queue_depth_alert.rb | 74 ++++++++++ lib/solid_queue_web.rb | 6 +- .../solid_queue_web/queue_depth_alert_spec.rb | 129 ++++++++++++++++++ 4 files changed, 209 insertions(+), 1 deletion(-) create mode 100644 app/services/solid_queue_web/queue_depth_alert.rb create mode 100644 spec/services/solid_queue_web/queue_depth_alert_spec.rb diff --git a/app/controllers/solid_queue_web/dashboard_controller.rb b/app/controllers/solid_queue_web/dashboard_controller.rb index 4d00f6f..0a97a46 100644 --- a/app/controllers/solid_queue_web/dashboard_controller.rb +++ b/app/controllers/solid_queue_web/dashboard_controller.rb @@ -3,6 +3,7 @@ class DashboardController < ApplicationController def index @stats = DashboardStats.new AlertWebhook.call(failure_count: @stats.counts[:failed]) + QueueDepthAlert.call end end end diff --git a/app/services/solid_queue_web/queue_depth_alert.rb b/app/services/solid_queue_web/queue_depth_alert.rb new file mode 100644 index 0000000..46a9f09 --- /dev/null +++ b/app/services/solid_queue_web/queue_depth_alert.rb @@ -0,0 +1,74 @@ +require "net/http" +require "json" +require "uri" + +module SolidQueueWeb + class QueueDepthAlert + MUTEX = Mutex.new + + class << self + def call + return unless configured? + + queue_depths = SolidQueue::ReadyExecution + .joins(:job) + .group("solid_queue_jobs.queue_name") + .count + + queue_depths.each do |queue_name, depth| + threshold = SolidQueueWeb.alert_queue_thresholds[queue_name.to_s] + next unless threshold && depth >= threshold + next unless should_fire?(queue_name) + + urls = webhook_urls + Thread.new { urls.each { |url| post(url, queue_name, depth, threshold) } } + end + end + + def reset! + MUTEX.synchronize { @last_fired_at = {} } + end + + private + + def configured? + SolidQueueWeb.alert_queue_thresholds.any? && webhook_urls.any? + end + + def webhook_urls + Array(SolidQueueWeb.alert_webhook_url).flatten.compact.select(&:present?) + end + + def should_fire?(queue_name) + MUTEX.synchronize do + @last_fired_at ||= {} + cooldown = SolidQueueWeb.alert_webhook_cooldown + return false if @last_fired_at[queue_name] && Time.current - @last_fired_at[queue_name] < cooldown + + @last_fired_at[queue_name] = Time.current + true + end + end + + def post(url_string, queue_name, depth, threshold) + uri = URI.parse(url_string) + payload = JSON.generate( + event: "queue_depth_threshold_exceeded", + queue_name: queue_name, + depth: depth, + threshold: threshold, + fired_at: Time.current.iso8601 + ) + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = uri.scheme == "https" + http.open_timeout = 5 + http.read_timeout = 10 + request = Net::HTTP::Post.new(uri.path.presence || "/", "Content-Type" => "application/json") + request.body = payload + http.request(request) + rescue => e + Rails.logger.error("[SolidQueueWeb] Queue depth alert webhook failed: #{e.message}") + end + end + end +end diff --git a/lib/solid_queue_web.rb b/lib/solid_queue_web.rb index 1ee1b77..09c19f2 100644 --- a/lib/solid_queue_web.rb +++ b/lib/solid_queue_web.rb @@ -6,7 +6,7 @@ module SolidQueueWeb class << self attr_writer :page_size, :dashboard_refresh_interval, :default_refresh_interval, :search_results_limit, :slow_job_threshold, :alert_webhook_url, :alert_failure_threshold, :alert_webhook_cooldown, - :connects_to + :alert_queue_thresholds, :connects_to def page_size @page_size || 25 @@ -40,6 +40,10 @@ def alert_webhook_cooldown @alert_webhook_cooldown || 3600 end + def alert_queue_thresholds + @alert_queue_thresholds || {} + end + def connects_to @connects_to end diff --git a/spec/services/solid_queue_web/queue_depth_alert_spec.rb b/spec/services/solid_queue_web/queue_depth_alert_spec.rb new file mode 100644 index 0000000..4bdfc05 --- /dev/null +++ b/spec/services/solid_queue_web/queue_depth_alert_spec.rb @@ -0,0 +1,129 @@ +require "rails_helper" + +RSpec.describe SolidQueueWeb::QueueDepthAlert do + let(:webhook_url) { "http://example.com/webhook" } + + before do + SolidQueueWeb.alert_webhook_url = webhook_url + SolidQueueWeb.alert_queue_thresholds = { "critical" => 5 } + SolidQueueWeb.alert_webhook_cooldown = 3600 + allow(Thread).to receive(:new).and_yield + allow_any_instance_of(Net::HTTP).to receive(:request).and_return(Net::HTTPSuccess.new("1.1", "200", "OK")) + end + + after do + SolidQueueWeb.alert_webhook_url = nil + SolidQueueWeb.alert_queue_thresholds = nil + SolidQueueWeb.alert_webhook_cooldown = nil + described_class.reset! + end + + def enqueue(queue_name:, count:) + count.times do + job = SolidQueue::Job.create!( + queue_name: queue_name, + class_name: "TestJob", + arguments: {}, + active_job_id: SecureRandom.uuid + ) + job.ready_execution + end + end + + describe ".call" do + it "fires when a queue's ready count meets the threshold" do + enqueue(queue_name: "critical", count: 5) + expect_any_instance_of(Net::HTTP).to receive(:request) + described_class.call + end + + it "fires when a queue's ready count exceeds the threshold" do + enqueue(queue_name: "critical", count: 10) + expect_any_instance_of(Net::HTTP).to receive(:request) + described_class.call + end + + it "does not fire when ready count is below the threshold" do + enqueue(queue_name: "critical", count: 4) + expect_any_instance_of(Net::HTTP).not_to receive(:request) + described_class.call + end + + it "does not fire for queues without a configured threshold" do + enqueue(queue_name: "default", count: 100) + expect_any_instance_of(Net::HTTP).not_to receive(:request) + described_class.call + end + + it "does not fire when alert_queue_thresholds is empty" do + SolidQueueWeb.alert_queue_thresholds = {} + enqueue(queue_name: "critical", count: 10) + expect_any_instance_of(Net::HTTP).not_to receive(:request) + described_class.call + end + + it "does not fire when no webhook URL is configured" do + SolidQueueWeb.alert_webhook_url = nil + enqueue(queue_name: "critical", count: 10) + expect_any_instance_of(Net::HTTP).not_to receive(:request) + described_class.call + end + + it "fires independently per queue" do + SolidQueueWeb.alert_queue_thresholds = { "critical" => 5, "default" => 10 } + enqueue(queue_name: "critical", count: 5) + enqueue(queue_name: "default", count: 3) + expect(Net::HTTP).to receive(:new).once.and_call_original + described_class.call + end + + it "does not fire again within the cooldown window" do + enqueue(queue_name: "critical", count: 5) + described_class.call + expect_any_instance_of(Net::HTTP).not_to receive(:request) + described_class.call + end + + it "fires again after the cooldown window expires" do + enqueue(queue_name: "critical", count: 5) + described_class.call + described_class.instance_variable_get(:@last_fired_at)["critical"] = 2.hours.ago + expect_any_instance_of(Net::HTTP).to receive(:request) + described_class.call + end + + it "tracks cooldown independently per queue" do + SolidQueueWeb.alert_queue_thresholds = { "critical" => 5, "default" => 5 } + enqueue(queue_name: "critical", count: 5) + enqueue(queue_name: "default", count: 5) + described_class.call + described_class.instance_variable_get(:@last_fired_at)["critical"] = 2.hours.ago + expect(Net::HTTP).to receive(:new).once.and_call_original + described_class.call + end + + it "sends a JSON payload with the correct fields" do + enqueue(queue_name: "critical", count: 5) + posted_body = nil + allow_any_instance_of(Net::HTTP).to receive(:request) do |_, req| + posted_body = JSON.parse(req.body) + Net::HTTPSuccess.new("1.1", "200", "OK") + end + + described_class.call + + expect(posted_body["event"]).to eq("queue_depth_threshold_exceeded") + expect(posted_body["queue_name"]).to eq("critical") + expect(posted_body["depth"]).to eq(5) + expect(posted_body["threshold"]).to eq(5) + expect(posted_body["fired_at"]).to be_present + end + + it "logs an error and does not raise when the HTTP request fails" do + enqueue(queue_name: "critical", count: 5) + allow_any_instance_of(Net::HTTP).to receive(:request).and_raise(RuntimeError, "connection refused") + expect(Rails.logger).to receive(:error).with(/Queue depth alert webhook failed/) + expect { described_class.call }.not_to raise_error + end + end +end From fe0b0f86766fbdc545601c95379aff433947ad98 Mon Sep 17 00:00:00 2001 From: Chuck Smith Date: Thu, 21 May 2026 16:39:14 -0400 Subject: [PATCH 2/2] docs: update README, ROADMAP, and CHANGELOG for queue depth alert Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 1 + README.md | 30 ++++++++++++++++++++++++++++-- ROADMAP.md | 2 +- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32e6473..d5250f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Queue depth alert — `alert_queue_thresholds` accepts a hash of `queue_name => ready_job_count`; a webhook fires when any configured queue's ready count meets or exceeds its threshold; cooldown is tracked independently per queue so a busy queue doesn't suppress alerts for others; uses the same `alert_webhook_url` endpoint(s) with `event: "queue_depth_threshold_exceeded"` and `queue_name`, `depth`, and `threshold` fields in the payload - Multiple webhook targets — `alert_webhook_url` now accepts an array of URL strings; all configured endpoints receive the same payload when the failure threshold is exceeded; a failure posting to one URL is logged and skipped without blocking the remaining targets - Retry failed job with modified arguments — the Arguments card on the job detail page becomes an editable textarea for failed jobs; editing the JSON and clicking "Retry with these arguments" updates the job record and retries in one step; invalid JSON redirects back with an error message without touching the failed execution diff --git a/README.md b/README.md index 2bf94ad..8552e3f 100644 --- a/README.md +++ b/README.md @@ -100,9 +100,10 @@ SolidQueueWeb.configure do |config| config.default_refresh_interval = 30_000 # jobs/processes/history auto-refresh in ms (default: 10_000) config.search_results_limit = 10 # max results per status in global search (default: 25) config.slow_job_threshold = 5.minutes # flag claimed jobs running longer than this (default: nil = disabled) - config.alert_webhook_url = "https://hooks.example.com/solid-queue" # POST target (default: nil = disabled) + config.alert_webhook_url = "https://hooks.example.com/solid-queue" # POST target — string or array (default: nil = disabled) config.alert_failure_threshold = 10 # fire when failed count >= this (default: nil = disabled) - config.alert_webhook_cooldown = 1800 # seconds between repeated alerts (default: 3600) + config.alert_queue_thresholds = { "critical" => 50, "default" => 200 } # fire when queue depth >= threshold (default: {}) + config.alert_webhook_cooldown = 1800 # seconds between repeated alerts per alert type (default: 3600) config.connects_to = { reading: :reading, writing: :writing } # read replica (default: nil) end @@ -151,6 +152,31 @@ The request body is JSON: The webhook fires asynchronously in a background thread so dashboard page loads are never delayed. HTTP errors are logged to `Rails.logger` and swallowed. The cooldown window prevents repeated alerts while the count stays elevated — the clock resets on each app restart. +## Queue depth alerts + +Set `alert_queue_thresholds` to fire a webhook when any queue's ready job count meets or exceeds a per-queue limit: + +```ruby +SolidQueueWeb.configure do |config| + config.alert_webhook_url = "https://hooks.example.com/solid-queue" + config.alert_queue_thresholds = { "critical" => 50, "default" => 200 } +end +``` + +The same `alert_webhook_url` endpoint(s) receive the payload, with a distinct event type so you can route it differently: + +```json +{ + "event": "queue_depth_threshold_exceeded", + "queue_name": "critical", + "depth": 63, + "threshold": 50, + "fired_at": "2026-05-21T12:34:56Z" +} +``` + +Cooldown is tracked independently per queue, so a persistently deep "critical" queue does not suppress alerts for "default". The shared `alert_webhook_cooldown` setting applies to each queue separately. + ## Metrics endpoint `GET /jobs/metrics.json` returns a machine-readable JSON document suitable for Prometheus scraping, uptime monitors, or external dashboards. No configuration is required — the endpoint is available as soon as the engine is mounted. diff --git a/ROADMAP.md b/ROADMAP.md index 40c3cee..f09404b 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -14,7 +14,7 @@ Pull requests for any of these are welcome. See [Contributing](README.md#contrib |---|---| | ~~**Retry failed job with modified arguments**~~ | ✓ Shipped — editable textarea on the job detail page; submitting updates the job record and retries in one step. | | ~~**Multiple webhook targets**~~ | ✓ Shipped — `alert_webhook_url` accepts a string or an array; all URLs receive the same payload; one failure doesn't block the rest. | -| **Queue depth alert** | Fire a webhook when a queue's ready count exceeds a per-queue threshold (e.g. `alert_queue_thresholds: { critical: 50 }`). | +| ~~**Queue depth alert**~~ | ✓ Shipped — `alert_queue_thresholds` hash maps queue names to ready-count limits; cooldown tracked independently per queue; same `alert_webhook_url` endpoint(s) used with a distinct event type. | ---