Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

- Slow job webhook alert — fires when the number of currently claimed jobs that have been claimed for at least `slow_job_threshold` seconds reaches `alert_slow_job_count_threshold`; both settings must be configured for the alert to be evaluated; respects the shared `alert_webhook_url` and `alert_webhook_cooldown` settings; payload includes `type: "slow_jobs"`, `count`, and `threshold`

## [1.2.0] - 2026-05-27

### Added
Expand Down
1 change: 1 addition & 0 deletions app/controllers/solid_stack_web/dashboard_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ def index
@cable_stats = CableStats.new.to_h
@throughput = ThroughputSparkline.new
@failures = FailedJobSparkline.new
AlertWebhook.check(@queue_stats)
end
end
end
5 changes: 5 additions & 0 deletions app/models/solid_stack_web/alert_webhook.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ def build_alerts
alerts << { type: "queue_depth", queue: queue_name.to_s, count: count, threshold: threshold } if count >= threshold
end

# Slow-job alert. Evaluated only when BOTH SolidStackWeb.slow_job_threshold and
# SolidStackWeb.alert_slow_job_count_threshold are truthy; if either is nil the
# whole check (including the query) is skipped. The assignments inside the
# condition are intentional: they capture the two settings for use below.
if (job_threshold = SolidStackWeb.slow_job_threshold) && (count_threshold = SolidStackWeb.alert_slow_job_count_threshold)
# Count claimed executions whose created_at is at least job_threshold seconds in the past.
count = ::SolidQueue::ClaimedExecution.where("created_at <= ?", job_threshold.seconds.ago).count
# Append a "slow_jobs" alert once that count reaches the configured count threshold.
alerts << { type: "slow_jobs", count: count, threshold: count_threshold } if count >= count_threshold
end

alerts
end

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,12 @@
# Alert webhook — POST to this URL when a threshold is breached.
# Delivery failures are silently swallowed; configure a cooldown to avoid storms.
#
# config.alert_webhook_url = "https://hooks.example.com/my-alert"
# config.alert_failure_threshold = 10 # fire when failed jobs >= this
# config.alert_queue_thresholds = { # fire when a queue's ready depth >= value
# config.alert_webhook_url = "https://hooks.example.com/my-alert"
# config.alert_failure_threshold = 10 # fire when failed jobs >= this
# config.alert_queue_thresholds = { # fire when a queue's ready depth >= value
# "critical" => 50,
# "default" => 500
# }
# config.alert_webhook_cooldown = 3600 # seconds between repeat alerts
# config.alert_slow_job_count_threshold = 3 # fire when this many claimed jobs have been claimed for >= slow_job_threshold seconds (requires slow_job_threshold to be set)
# config.alert_webhook_cooldown = 3600 # seconds between repeat alerts
end
5 changes: 5 additions & 0 deletions lib/solid_stack_web.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ class << self
# Setter methods for the configuration options. Only writers are generated
# here; the readers visible in this file are defined as explicit methods.
attr_writer :page_size, :connects_to, :slow_job_threshold,
:alert_webhook_url, :alert_webhook_cooldown,
:alert_failure_threshold, :alert_queue_thresholds,
:alert_slow_job_count_threshold,
:dashboard_refresh_interval, :default_refresh_interval,
:search_results_limit, :allow_value_preview

Expand Down Expand Up @@ -37,6 +38,10 @@ def alert_queue_thresholds
@alert_queue_thresholds || {}
end

# Returns the configured count threshold for the "slow_jobs" alert.
# No default is applied: the value is nil until it has been assigned.
def alert_slow_job_count_threshold
@alert_slow_job_count_threshold
end

# Returns the configured dashboard refresh interval, falling back to 5_000
# when no truthy value has been assigned.
# NOTE(review): the unit is presumably milliseconds — confirm against the view code.
def dashboard_refresh_interval
  return @dashboard_refresh_interval if @dashboard_refresh_interval

  5_000
end
Expand Down
69 changes: 65 additions & 4 deletions spec/models/solid_stack_web/alert_webhook_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
let(:queue_stats) { { failed: 0, ready: 0 } }

# Reset every SolidStackWeb setting these examples modify back to nil and
# clear Rails.cache so state does not carry over between examples.
# NOTE(review): the cache clear presumably resets the webhook cooldown — confirm.
after do
SolidStackWeb.alert_webhook_url = nil
SolidStackWeb.alert_failure_threshold = nil
SolidStackWeb.alert_queue_thresholds = nil
SolidStackWeb.alert_webhook_cooldown = nil
SolidStackWeb.alert_webhook_url = nil
SolidStackWeb.alert_failure_threshold = nil
SolidStackWeb.alert_queue_thresholds = nil
SolidStackWeb.alert_slow_job_count_threshold = nil
SolidStackWeb.alert_webhook_cooldown = nil
SolidStackWeb.slow_job_threshold = nil
Rails.cache.clear
end

Expand Down Expand Up @@ -76,6 +78,65 @@
stub_request(:post, webhook_url).to_raise(Net::OpenTimeout)
expect { described_class.check(queue_stats.merge(failed: 5)) }.not_to raise_error
end

context "slow job threshold" do
  # Worker process row that the claimed executions below are attached to.
  let(:worker) do
    SolidQueue::Process.create!(kind: "Worker", name: "worker-spec", pid: 99_999,
                                hostname: "test", last_heartbeat_at: Time.current)
  end

  # Creates a job plus a ClaimedExecution for it with the given created_at.
  #
  # The after-create :prepare_for_execution callback is skipped while the job
  # row is created and restored afterwards. The restore runs in an `ensure` so
  # that a failing create! cannot leave the callback disabled for every
  # example that runs later.
  def claimed_job(created_at:)
    SolidQueue::Job.skip_callback(:create, :after, :prepare_for_execution)
    job = begin
      SolidQueue::Job.create!(class_name: "SlowJob", queue_name: "default")
    ensure
      SolidQueue::Job.set_callback(:create, :after, :prepare_for_execution)
    end
    SolidQueue::ClaimedExecution.create!(job: job, process_id: worker.id, created_at: created_at)
  end

  # Jobs claimed for 300+ seconds count as slow; alert once there are 2 of them.
  before do
    SolidStackWeb.slow_job_threshold = 300
    SolidStackWeb.alert_slow_job_count_threshold = 2
  end

  it "does not POST when slow claimed job count is below the count threshold" do
    claimed_job(created_at: 10.minutes.ago)
    stub = stub_request(:post, webhook_url).to_return(status: 200)
    described_class.check(queue_stats)
    expect(stub).not_to have_been_requested
  end

  it "POSTs when slow claimed job count meets the count threshold" do
    2.times { claimed_job(created_at: 10.minutes.ago) }
    stub = stub_request(:post, webhook_url).to_return(status: 200)
    described_class.check(queue_stats)
    expect(stub).to have_been_requested
  end

  it "does not POST for claimed jobs within the slow_job_threshold window" do
    2.times { claimed_job(created_at: 1.minute.ago) }
    stub = stub_request(:post, webhook_url).to_return(status: 200)
    described_class.check(queue_stats)
    expect(stub).not_to have_been_requested
  end

  it "includes slow_jobs type in the payload" do
    2.times { claimed_job(created_at: 10.minutes.ago) }
    stub = stub_request(:post, webhook_url)
           .with { |req| JSON.parse(req.body)["alerts"].any? { |a| a["type"] == "slow_jobs" } }
           .to_return(status: 200)
    described_class.check(queue_stats)
    expect(stub).to have_been_requested
  end

  it "does not POST when slow_job_threshold is not set" do
    SolidStackWeb.slow_job_threshold = nil
    2.times { claimed_job(created_at: 10.minutes.ago) }
    stub = stub_request(:post, webhook_url).to_return(status: 200)
    described_class.check(queue_stats)
    expect(stub).not_to have_been_requested
  end

  # Mirror of the example above: the alert needs both settings, so leaving
  # the count threshold unset must also disable it.
  it "does not POST when alert_slow_job_count_threshold is not set" do
    SolidStackWeb.alert_slow_job_count_threshold = nil
    2.times { claimed_job(created_at: 10.minutes.ago) }
    stub = stub_request(:post, webhook_url).to_return(status: 200)
    described_class.check(queue_stats)
    expect(stub).not_to have_been_requested
  end
end
end
end
end