From 54902bfde263b8366283abce58809977d1dde004 Mon Sep 17 00:00:00 2001 From: Chuck Smith Date: Thu, 28 May 2026 06:18:19 -0400 Subject: [PATCH 1/2] feat: slow job webhook alert Add SlowJobAlert service and alert_slow_job_count_threshold config option. Fires a webhook (event: slow_job_threshold_exceeded) when the number of currently-running slow claimed jobs meets or exceeds the configured count. Requires slow_job_threshold to be set (defines what "slow" means). Uses the same alert_webhook_url and alert_webhook_cooldown settings as other alert types. Triggered on every dashboard page load, subject to cooldown. Co-Authored-By: Claude Sonnet 4.6 --- .../solid_queue_web/dashboard_controller.rb | 1 + .../solid_queue_web/slow_job_alert.rb | 70 ++++++++ lib/solid_queue_web.rb | 6 +- .../solid_queue_web/slow_job_alert_spec.rb | 160 ++++++++++++++++++ 4 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 app/services/solid_queue_web/slow_job_alert.rb create mode 100644 spec/services/solid_queue_web/slow_job_alert_spec.rb diff --git a/app/controllers/solid_queue_web/dashboard_controller.rb b/app/controllers/solid_queue_web/dashboard_controller.rb index 0a97a46..1e8c081 100644 --- a/app/controllers/solid_queue_web/dashboard_controller.rb +++ b/app/controllers/solid_queue_web/dashboard_controller.rb @@ -4,6 +4,7 @@ def index @stats = DashboardStats.new AlertWebhook.call(failure_count: @stats.counts[:failed]) QueueDepthAlert.call + SlowJobAlert.call end end end diff --git a/app/services/solid_queue_web/slow_job_alert.rb b/app/services/solid_queue_web/slow_job_alert.rb new file mode 100644 index 0000000..65e2563 --- /dev/null +++ b/app/services/solid_queue_web/slow_job_alert.rb @@ -0,0 +1,70 @@ +require "net/http" +require "json" +require "uri" + +module SolidQueueWeb + class SlowJobAlert + MUTEX = Mutex.new + + class << self + def call + return unless configured? 
+ + slow_count = SolidQueue::ClaimedExecution + .where("created_at <= ?", SolidQueueWeb.slow_job_threshold.ago) + .count + + return if slow_count < SolidQueueWeb.alert_slow_job_count_threshold + return unless should_fire? + + urls = webhook_urls + Thread.new { urls.each { |url| post(url, slow_count) } } + end + + def reset! + MUTEX.synchronize { @last_fired_at = nil } + end + + private + + def configured? + SolidQueueWeb.slow_job_threshold.present? && + SolidQueueWeb.alert_slow_job_count_threshold.present? && + webhook_urls.any? + end + + def webhook_urls + Array(SolidQueueWeb.alert_webhook_url).flatten.compact.select(&:present?) + end + + def should_fire? + MUTEX.synchronize do + cooldown = SolidQueueWeb.alert_webhook_cooldown + return false if @last_fired_at && Time.current - @last_fired_at < cooldown + + @last_fired_at = Time.current + true + end + end + + def post(url_string, slow_count) + uri = URI.parse(url_string) + payload = JSON.generate( + event: "slow_job_threshold_exceeded", + slow_job_count: slow_count, + threshold: SolidQueueWeb.alert_slow_job_count_threshold, + fired_at: Time.current.iso8601 + ) + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = uri.scheme == "https" + http.open_timeout = 5 + http.read_timeout = 10 + request = Net::HTTP::Post.new(uri.path.presence || "/", "Content-Type" => "application/json") + request.body = payload + http.request(request) + rescue => e + Rails.logger.error("[SolidQueueWeb] Slow job alert webhook failed: #{e.message}") + end + end + end +end diff --git a/lib/solid_queue_web.rb b/lib/solid_queue_web.rb index ad8f95b..81db3a0 100644 --- a/lib/solid_queue_web.rb +++ b/lib/solid_queue_web.rb @@ -6,7 +6,7 @@ module SolidQueueWeb class << self attr_writer :page_size, :dashboard_refresh_interval, :default_refresh_interval, :search_results_limit, :slow_job_threshold, :alert_webhook_url, :alert_failure_threshold, :alert_webhook_cooldown, - :alert_queue_thresholds, :connects_to, :time_zone + :alert_queue_thresholds, 
:alert_slow_job_count_threshold, :connects_to, :time_zone def page_size @page_size || 25 @@ -44,6 +44,10 @@ def alert_queue_thresholds @alert_queue_thresholds || {} end + def alert_slow_job_count_threshold + @alert_slow_job_count_threshold + end + def connects_to @connects_to end diff --git a/spec/services/solid_queue_web/slow_job_alert_spec.rb b/spec/services/solid_queue_web/slow_job_alert_spec.rb new file mode 100644 index 0000000..f2314cf --- /dev/null +++ b/spec/services/solid_queue_web/slow_job_alert_spec.rb @@ -0,0 +1,160 @@ +require "rails_helper" + +RSpec.describe SolidQueueWeb::SlowJobAlert do + let(:webhook_url) { "http://example.com/webhook" } + + before do + SolidQueueWeb.alert_webhook_url = webhook_url + SolidQueueWeb.slow_job_threshold = 5.minutes + SolidQueueWeb.alert_slow_job_count_threshold = 3 + SolidQueueWeb.alert_webhook_cooldown = 3600 + allow(Thread).to receive(:new).and_yield + allow_any_instance_of(Net::HTTP).to receive(:request).and_return(Net::HTTPSuccess.new("1.1", "200", "OK")) + end + + after do + SolidQueueWeb.alert_webhook_url = nil + SolidQueueWeb.slow_job_threshold = nil + SolidQueueWeb.alert_slow_job_count_threshold = nil + SolidQueueWeb.alert_webhook_cooldown = nil + described_class.reset! 
+ end + + let(:worker_process) do + SolidQueue::Process.create!( + kind: "Worker", pid: 99_999, hostname: "test-host", + name: "worker-slow-alert-test", last_heartbeat_at: Time.current + ) + end + + def create_slow_claimed_job(count: 1) + count.times do |i| + job = SolidQueue::Job.create!( + queue_name: "default", + class_name: "SlowTestJob", + arguments: {}, + active_job_id: SecureRandom.uuid + ) + execution = SolidQueue::ClaimedExecution.create!(job: job, process: worker_process) + execution.update_columns(created_at: 10.minutes.ago) + end + end + + describe ".call" do + it "fires when slow job count meets the threshold" do + create_slow_claimed_job(count: 3) + expect_any_instance_of(Net::HTTP).to receive(:request) + described_class.call + end + + it "fires when slow job count exceeds the threshold" do + create_slow_claimed_job(count: 5) + expect_any_instance_of(Net::HTTP).to receive(:request) + described_class.call + end + + it "does not fire when slow job count is below the threshold" do + create_slow_claimed_job(count: 2) + expect_any_instance_of(Net::HTTP).not_to receive(:request) + described_class.call + end + + it "does not count jobs that are not yet slow" do + job = SolidQueue::Job.create!( + queue_name: "default", + class_name: "FastTestJob", + arguments: {}, + active_job_id: SecureRandom.uuid + ) + execution = SolidQueue::ClaimedExecution.create!(job: job, process: worker_process) + execution.update_columns(created_at: 1.minute.ago) + + expect_any_instance_of(Net::HTTP).not_to receive(:request) + described_class.call + end + + it "does not fire when slow_job_threshold is not configured" do + SolidQueueWeb.slow_job_threshold = nil + create_slow_claimed_job(count: 5) + expect_any_instance_of(Net::HTTP).not_to receive(:request) + described_class.call + end + + it "does not fire when alert_slow_job_count_threshold is not configured" do + SolidQueueWeb.alert_slow_job_count_threshold = nil + create_slow_claimed_job(count: 5) + 
expect_any_instance_of(Net::HTTP).not_to receive(:request) + described_class.call + end + + it "does not fire when webhook url is not configured" do + SolidQueueWeb.alert_webhook_url = nil + create_slow_claimed_job(count: 5) + expect_any_instance_of(Net::HTTP).not_to receive(:request) + described_class.call + end + + it "does not fire again within the cooldown window" do + create_slow_claimed_job(count: 3) + described_class.call + expect_any_instance_of(Net::HTTP).not_to receive(:request) + described_class.call + end + + it "fires again after the cooldown window expires" do + create_slow_claimed_job(count: 3) + described_class.call + described_class.instance_variable_set(:@last_fired_at, 2.hours.ago) + expect_any_instance_of(Net::HTTP).to receive(:request) + described_class.call + end + + it "sends a JSON payload with the correct fields" do + create_slow_claimed_job(count: 4) + posted_body = nil + allow_any_instance_of(Net::HTTP).to receive(:request) do |_, req| + posted_body = JSON.parse(req.body) + Net::HTTPSuccess.new("1.1", "200", "OK") + end + + described_class.call + + expect(posted_body["event"]).to eq("slow_job_threshold_exceeded") + expect(posted_body["slow_job_count"]).to eq(4) + expect(posted_body["threshold"]).to eq(3) + expect(posted_body["fired_at"]).to be_present + end + + it "sets Content-Type to application/json" do + create_slow_claimed_job(count: 3) + sent_request = nil + allow_any_instance_of(Net::HTTP).to receive(:request) do |_, req| + sent_request = req + Net::HTTPSuccess.new("1.1", "200", "OK") + end + + described_class.call + + expect(sent_request["Content-Type"]).to eq("application/json") + end + + it "logs an error and does not raise when the HTTP request fails" do + create_slow_claimed_job(count: 3) + allow_any_instance_of(Net::HTTP).to receive(:request).and_raise(RuntimeError, "connection refused") + expect(Rails.logger).to receive(:error).with(/Slow job alert webhook failed/) + expect { described_class.call }.not_to raise_error + end + 
+ context "when alert_webhook_url is an array" do + let(:second_url) { "http://example.com/second-webhook" } + + before { SolidQueueWeb.alert_webhook_url = [webhook_url, second_url] } + + it "posts to all configured URLs" do + create_slow_claimed_job(count: 3) + expect(Net::HTTP).to receive(:new).twice.and_call_original + described_class.call + end + end + end +end From be7810c3f005d1206f9d73d2522b2567a5927873 Mon Sep 17 00:00:00 2001 From: Chuck Smith Date: Thu, 28 May 2026 06:18:26 -0400 Subject: [PATCH 2/2] docs: update CHANGELOG and README for slow job webhook alert Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 4 ++++ README.md | 33 ++++++++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a082a25..f2e6a49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Slow job webhook alert — set `alert_slow_job_count_threshold` (integer) to fire a webhook whenever the number of currently-running slow jobs meets or exceeds the configured count; requires `slow_job_threshold` to define what "slow" means; uses the same `alert_webhook_url` and `alert_webhook_cooldown` settings as other alert types; event name `slow_job_threshold_exceeded` + ## [1.3.0] - 2026-05-27 ### Added diff --git a/README.md b/README.md index 9063eed..95cb1ca 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ SolidQueueWeb surfaces all of this in a browser UI available at any route you ch - **Dashboard quick actions** — "Retry All Failed" and "Discard All Blocked" cards appear on the dashboard only when the respective count is non-zero; one-click bulk operations with confirm dialogs, keeping the dashboard clean when everything is healthy - **CSV export** — "Export CSV" button on the jobs, failed jobs, and history pages downloads all records matching the current filters; columns are tailored per view 
- **Slow job detection** — when `slow_job_threshold` is configured, claimed jobs running longer than the threshold are flagged with an orange row, a "slow" badge, and a "Running For" duration column on the Running tab; a "Slow Jobs" warning card appears on the dashboard with a link to the Running tab -- **Webhook alerts** — set `alert_webhook_url` and `alert_failure_threshold` to receive a POST request whenever the failed job count meets or exceeds the threshold; fires asynchronously so dashboard performance is unaffected; a configurable cooldown (default 1 h) prevents repeated alerts while the count stays elevated +- **Webhook alerts** — set `alert_webhook_url` and `alert_failure_threshold` to receive a POST request whenever the failed job count meets or exceeds the threshold; set `alert_queue_thresholds` for per-queue depth alerts; set `alert_slow_job_count_threshold` (requires `slow_job_threshold`) for slow-job count alerts; all fire asynchronously with a configurable cooldown (default 1 h) to prevent repeated alerts - **Performance analytics** — per-job-class statistics at `/jobs/performance` showing run count, average, p50, p95, p99, standard deviation, min, and max duration; sorted by p95 descending so the slowest classes surface first; high std dev surfaces inconsistent jobs worth investigating; period filter scopes to 1h / 24h / 7d or all time; each class name links to the filtered History view - **Failed job trend chart** — a "Failures — Last 12 Hours" bar chart on the dashboard shows failures per hour over the last 12 hours; bars are red, making failure spikes visible before clicking into the failed jobs list - **Error frequency report** — `GET /jobs/failed_jobs/errors` groups all failed jobs by error class and message prefix, shows a count per group, and surfaces a sample backtrace in an expandable row; sorted by count descending so the most common errors appear first; accessible via the "Error Summary" button on the Failed Jobs page @@ -106,8 +106,9 @@ 
SolidQueueWeb.configure do |config| config.slow_job_threshold = 5.minutes # flag claimed jobs running longer than this (default: nil = disabled) config.alert_webhook_url = "https://hooks.example.com/solid-queue" # POST target — string or array (default: nil = disabled) config.alert_failure_threshold = 10 # fire when failed count >= this (default: nil = disabled) - config.alert_queue_thresholds = { "critical" => 50, "default" => 200 } # fire when queue depth >= threshold (default: {}) - config.alert_webhook_cooldown = 1800 # seconds between repeated alerts per alert type (default: 3600) + config.alert_queue_thresholds = { "critical" => 50, "default" => 200 } # fire when queue depth >= threshold (default: {}) + config.alert_slow_job_count_threshold = 5 # fire when slow job count >= this (default: nil = disabled) + config.alert_webhook_cooldown = 1800 # seconds between repeated alerts per alert type (default: 3600) config.connects_to = { reading: :reading, writing: :writing } # read replica (default: nil) config.time_zone = "America/New_York" # display timezone for all timestamps (default: nil = UTC) end @@ -182,6 +183,32 @@ The same `alert_webhook_url` endpoint(s) receive the payload, with a distinct ev Cooldown is tracked independently per queue, so a persistently deep "critical" queue does not suppress alerts for "default". The shared `alert_webhook_cooldown` setting applies to each queue separately. +## Slow job alerts + +Set `alert_slow_job_count_threshold` to fire a webhook when the number of currently-running slow jobs meets or exceeds a count. This requires `slow_job_threshold` to also be configured — it defines what "slow" means. 
+ +```ruby +SolidQueueWeb.configure do |config| + config.slow_job_threshold = 5.minutes # a job is "slow" if it has been claimed longer than this + config.alert_slow_job_count_threshold = 3 # fire when >= 3 jobs are slow + config.alert_webhook_url = "https://hooks.example.com/solid-queue" + config.alert_webhook_cooldown = 1800 # don't re-fire for 30 minutes (default: 3600) +end +``` + +The same `alert_webhook_url` endpoint(s) receive the payload with a distinct event type: + +```json +{ + "event": "slow_job_threshold_exceeded", + "slow_job_count": 5, + "threshold": 3, + "fired_at": "2026-05-28T08:00:00Z" +} +``` + +The slow-job check runs on each dashboard page load; while the condition persists, the alert fires at most once per cooldown window. Because the check is tied to page loads, it is not evaluated while nobody is viewing the dashboard. + ## Metrics endpoint `GET /jobs/metrics.json` returns a machine-readable JSON document suitable for Prometheus scraping, uptime monitors, or external dashboards. No configuration is required — the endpoint is available as soon as the engine is mounted.