diff --git a/CHANGELOG.md b/CHANGELOG.md
index e148945..988e004 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Slow job webhook alert — fires when the number of currently-claimed jobs exceeding `slow_job_threshold` reaches `alert_slow_job_count_threshold`; respects the shared `alert_webhook_url` and `alert_webhook_cooldown` settings; payload includes `type: "slow_jobs"`, `count`, and `threshold`
+
 ## [1.2.0] - 2026-05-27
 
 ### Added
diff --git a/app/controllers/solid_stack_web/dashboard_controller.rb b/app/controllers/solid_stack_web/dashboard_controller.rb
index 90e1947..5dab3ed 100644
--- a/app/controllers/solid_stack_web/dashboard_controller.rb
+++ b/app/controllers/solid_stack_web/dashboard_controller.rb
@@ -6,6 +6,8 @@ def index
       @cable_stats = CableStats.new.to_h
       @throughput = ThroughputSparkline.new
       @failures = FailedJobSparkline.new
+      # Run the alert threshold checks against the queue stats on each dashboard load.
+      AlertWebhook.check(@queue_stats)
     end
   end
 end
diff --git a/app/models/solid_stack_web/alert_webhook.rb b/app/models/solid_stack_web/alert_webhook.rb
index d71f64a..6af1c37 100644
--- a/app/models/solid_stack_web/alert_webhook.rb
+++ b/app/models/solid_stack_web/alert_webhook.rb
@@ -39,6 +39,13 @@ def build_alerts
         alerts << { type: "queue_depth", queue: queue_name.to_s, count: count, threshold: threshold } if count >= threshold
       end
 
+      # Slow-job alert: only evaluated when both slow_job_threshold (seconds)
+      # and alert_slow_job_count_threshold are configured.
+      if (job_threshold = SolidStackWeb.slow_job_threshold) && (count_threshold = SolidStackWeb.alert_slow_job_count_threshold)
+        count = ::SolidQueue::ClaimedExecution.where("created_at <= ?", job_threshold.seconds.ago).count
+        alerts << { type: "slow_jobs", count: count, threshold: count_threshold } if count >= count_threshold
+      end
+
       alerts
     end
 
diff --git a/lib/generators/solid_stack_web/install/templates/initializer.rb b/lib/generators/solid_stack_web/install/templates/initializer.rb
index 6554ed0..1832bef 100644
--- a/lib/generators/solid_stack_web/install/templates/initializer.rb
+++ b/lib/generators/solid_stack_web/install/templates/initializer.rb
@@ -41,11 +41,12 @@
   # Alert webhook — POST to this URL when a threshold is breached.
   # Delivery failures are silently swallowed; configure a cooldown to avoid storms.
   #
-  # config.alert_webhook_url       = "https://hooks.example.com/my-alert"
-  # config.alert_failure_threshold = 10       # fire when failed jobs >= this
-  # config.alert_queue_thresholds  = {        # fire when a queue's ready depth >= value
+  # config.alert_webhook_url              = "https://hooks.example.com/my-alert"
+  # config.alert_failure_threshold        = 10       # fire when failed jobs >= this
+  # config.alert_queue_thresholds         = {        # fire when a queue's ready depth >= value
   #   "critical" => 50,
   #   "default" => 500
   # }
-  # config.alert_webhook_cooldown  = 3600     # seconds between repeat alerts
+  # config.alert_slow_job_count_threshold = 3        # fire when N+ claimed jobs exceed slow_job_threshold duration
+  # config.alert_webhook_cooldown         = 3600     # seconds between repeat alerts
 end
diff --git a/lib/solid_stack_web.rb b/lib/solid_stack_web.rb
index 30c16b9..025bb34 100644
--- a/lib/solid_stack_web.rb
+++ b/lib/solid_stack_web.rb
@@ -6,6 +6,7 @@ class << self
     attr_writer :page_size, :connects_to, :slow_job_threshold,
                 :alert_webhook_url, :alert_webhook_cooldown,
                 :alert_failure_threshold, :alert_queue_thresholds,
+                :alert_slow_job_count_threshold,
                 :dashboard_refresh_interval, :default_refresh_interval,
                 :search_results_limit, :allow_value_preview
 
@@ -37,6 +38,12 @@ def alert_queue_thresholds
       @alert_queue_thresholds || {}
     end
 
+    # Slow-job count at which the slow-job alert fires. Unlike the readers
+    # with defaults, this returns nil until a value is assigned.
+    def alert_slow_job_count_threshold
+      @alert_slow_job_count_threshold
+    end
+
     def dashboard_refresh_interval
       @dashboard_refresh_interval || 5_000
     end
diff --git a/spec/models/solid_stack_web/alert_webhook_spec.rb b/spec/models/solid_stack_web/alert_webhook_spec.rb
index 00fd8e9..54fbb60 100644
--- a/spec/models/solid_stack_web/alert_webhook_spec.rb
+++ b/spec/models/solid_stack_web/alert_webhook_spec.rb
@@ -5,10 +5,12 @@
   let(:queue_stats) { { failed: 0, ready: 0 } }
 
   after do
-    SolidStackWeb.alert_webhook_url       = nil
-    SolidStackWeb.alert_failure_threshold = nil
-    SolidStackWeb.alert_queue_thresholds  = nil
-    SolidStackWeb.alert_webhook_cooldown  = nil
+    SolidStackWeb.alert_webhook_url              = nil
+    SolidStackWeb.alert_failure_threshold        = nil
+    SolidStackWeb.alert_queue_thresholds         = nil
+    SolidStackWeb.alert_slow_job_count_threshold = nil
+    SolidStackWeb.alert_webhook_cooldown         = nil
+    SolidStackWeb.slow_job_threshold             = nil
     Rails.cache.clear
   end
 
@@ -76,6 +78,72 @@
         stub_request(:post, webhook_url).to_raise(Net::OpenTimeout)
         expect { described_class.check(queue_stats.merge(failed: 5)) }.not_to raise_error
       end
+
+      context "slow job threshold" do
+        let(:worker) do
+          SolidQueue::Process.create!(kind: "Worker", name: "worker-spec", pid: 99_999,
+                                      hostname: "test", last_heartbeat_at: Time.current)
+        end
+
+        # Creates a job plus a claimed execution stamped with +created_at+.
+        # The after-create :prepare_for_execution callback is skipped during
+        # create! and restored in an ensure, so a failed create! cannot leave
+        # it disabled for later examples.
+        def claimed_job(created_at:)
+          SolidQueue::Job.skip_callback(:create, :after, :prepare_for_execution)
+          job = begin
+            SolidQueue::Job.create!(class_name: "SlowJob", queue_name: "default")
+          ensure
+            SolidQueue::Job.set_callback(:create, :after, :prepare_for_execution)
+          end
+          SolidQueue::ClaimedExecution.create!(job: job, process_id: worker.id, created_at: created_at)
+        end
+
+        before do
+          SolidStackWeb.slow_job_threshold = 300
+          SolidStackWeb.alert_slow_job_count_threshold = 2
+        end
+
+        it "does not POST when slow claimed job count is below the count threshold" do
+          claimed_job(created_at: 10.minutes.ago)
+          stub = stub_request(:post, webhook_url).to_return(status: 200)
+          described_class.check(queue_stats)
+          expect(stub).not_to have_been_requested
+        end
+
+        it "POSTs when slow claimed job count meets the count threshold" do
+          2.times { claimed_job(created_at: 10.minutes.ago) }
+          stub = stub_request(:post, webhook_url).to_return(status: 200)
+          described_class.check(queue_stats)
+          expect(stub).to have_been_requested
+        end
+
+        it "does not POST for claimed jobs within the slow_job_threshold window" do
+          claimed_job(created_at: 1.minute.ago)
+          claimed_job(created_at: 1.minute.ago)
+          stub = stub_request(:post, webhook_url).to_return(status: 200)
+          described_class.check(queue_stats)
+          expect(stub).not_to have_been_requested
+        end
+
+        it "includes slow_jobs type in the payload" do
+          2.times { claimed_job(created_at: 10.minutes.ago) }
+          stub = stub_request(:post, webhook_url)
+            .with { |req| JSON.parse(req.body)["alerts"].any? { |a| a["type"] == "slow_jobs" } }
+            .to_return(status: 200)
+          described_class.check(queue_stats)
+          expect(stub).to have_been_requested
+        end
+
+        it "does not POST when slow_job_threshold is not set" do
+          SolidStackWeb.slow_job_threshold = nil
+          claimed_job(created_at: 10.minutes.ago)
+          claimed_job(created_at: 10.minutes.ago)
+          stub = stub_request(:post, webhook_url).to_return(status: 200)
+          described_class.check(queue_stats)
+          expect(stub).not_to have_been_requested
+        end
+      end
     end
   end
 end