diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d90a43..5b2bbbf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Webhook alert config — `alert_webhook_url` and `alert_failure_threshold` settings POST a JSON payload (`event`, `failure_count`, `threshold`, `fired_at`) to any URL when the failed job count meets or exceeds the threshold; fires asynchronously in a background thread so dashboard requests are never blocked; a configurable `alert_webhook_cooldown` (default 3600 s) prevents repeated alerts while the count stays elevated; HTTP errors are logged and swallowed - Bulk retry with delay — "+5s", "+10s", "+30s", and "+1m" stagger buttons on the Failed Jobs page retry all matched jobs with a configurable interval between each; the first job runs immediately, subsequent jobs are scheduled at incremental offsets; uses per-execution `retry` so `scheduled_at` is respected by SolidQueue's dispatcher; buttons only appear when more than one job is present - Scheduled job management — "Run Now" promotes a scheduled job to run immediately by back-dating its `scheduled_at`; "+1h", "+24h", and "+7d" buttons push `scheduled_at` forward by the chosen offset; both actions update the execution and the underlying job record; Turbo Stream responses remove the row on "Run Now" and update the `scheduled_at` cell in place on postpone diff --git a/README.md b/README.md index 1503d85..fdc37a6 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ SolidQueueWeb surfaces all of this in a browser UI available at any route you ch - **Dashboard quick actions** — "Retry All Failed" and "Discard All Blocked" cards appear on the dashboard only when the respective count is non-zero; one-click bulk operations with confirm dialogs, keeping the dashboard clean when everything is healthy - **CSV export** — "Export CSV" button on the jobs, failed jobs, and history pages downloads all records 
matching the current filters; columns are tailored per view - **Slow job detection** — when `slow_job_threshold` is configured, claimed jobs running longer than the threshold are flagged with an orange row, a "slow" badge, and a "Running For" duration column on the Running tab; a "Slow Jobs" warning card appears on the dashboard with a link to the Running tab +- **Webhook alerts** — set `alert_webhook_url` and `alert_failure_threshold` to receive a POST request whenever the failed job count meets or exceeds the threshold; fires asynchronously so dashboard performance is unaffected; a configurable cooldown (default 1 h) prevents repeated alerts while the count stays elevated ## Screenshots @@ -99,6 +100,9 @@ SolidQueueWeb.configure do |config| config.default_refresh_interval = 30_000 # jobs/processes/history auto-refresh in ms (default: 10_000) config.search_results_limit = 10 # max results per status in global search (default: 25) config.slow_job_threshold = 5.minutes # flag claimed jobs running longer than this (default: nil = disabled) + config.alert_webhook_url = "https://hooks.example.com/solid-queue" # POST target (default: nil = disabled) + config.alert_failure_threshold = 10 # fire when failed count >= this (default: nil = disabled) + config.alert_webhook_cooldown = 1800 # seconds between repeated alerts (default: 3600) end SolidQueueWeb.authenticate do @@ -118,7 +122,6 @@ Planned features, roughly ordered by priority: - Admin audit log — record who retried or discarded which jobs and when (requires host-app user identity) **Infrastructure** -- Webhook / alert config — POST to a URL when the failure count exceeds a threshold - Multi-database support — when Solid Queue runs on a separate database from the host app - Read replica support — route dashboard queries to a replica to avoid impacting the primary diff --git a/app/controllers/solid_queue_web/application_controller.rb b/app/controllers/solid_queue_web/application_controller.rb index 30aab2e..1ead939 
require "net/http"
require "json"
require "time"
require "uri"

module SolidQueueWeb
  # Sends a JSON webhook when the failed-job count reaches the configured
  # threshold.
  #
  # Configuration is read from SolidQueueWeb.alert_webhook_url,
  # SolidQueueWeb.alert_failure_threshold and
  # SolidQueueWeb.alert_webhook_cooldown.
  #
  # The "last fired" timestamp lives in a class-level instance variable, so
  # the cooldown is tracked in memory, separately in every Ruby process, and
  # is lost on restart.
  class AlertWebhook
    # Guards every read and write of @last_fired_at.
    MUTEX = Mutex.new

    # Connection and response timeouts for the webhook request, in seconds.
    OPEN_TIMEOUT = 5
    READ_TIMEOUT = 10

    LOG_PREFIX = "[SolidQueueWeb] Alert webhook failed"

    class << self
      # Fires the webhook in a background thread when alerting is configured,
      # +failure_count+ is at or above the threshold, and the cooldown window
      # has elapsed.
      #
      # @param failure_count [Integer, nil] current number of failed jobs;
      #   nil is treated as "nothing to report"
      # @return [Thread, nil] the thread performing the POST, or nil when no
      #   alert was sent
      def call(failure_count:)
        return unless configured?
        return if failure_count.nil? || failure_count < SolidQueueWeb.alert_failure_threshold
        return unless should_fire?

        # Capture the URL now so a later configuration change cannot alter
        # the target of an alert that has already been decided on.
        url = SolidQueueWeb.alert_webhook_url
        Thread.new { post(url, failure_count) }
      end

      # Clears the cooldown so the next qualifying call fires immediately.
      def reset!
        MUTEX.synchronize { @last_fired_at = nil }
      end

      private

      # True when both the webhook URL and the failure threshold are set.
      def configured?
        SolidQueueWeb.alert_webhook_url.present? && SolidQueueWeb.alert_failure_threshold.present?
      end

      # Checks the cooldown and, when it allows firing, records the current
      # time in the same critical section, so concurrent callers cannot both
      # get true. The timestamp is recorded before the POST is attempted, so
      # a failed delivery still starts the cooldown.
      def should_fire?
        MUTEX.synchronize do
          now = Time.current
          next false if @last_fired_at && now - @last_fired_at < SolidQueueWeb.alert_webhook_cooldown

          @last_fired_at = now
          true
        end
      end

      # POSTs the alert payload to +url_string+. Never raises: transport
      # errors, unsupported URLs and non-2xx responses are logged through
      # Rails.logger instead.
      def post(url_string, failure_count)
        uri = URI.parse(url_string)
        unless uri.is_a?(URI::HTTP) && uri.host
          # The URL itself is not logged because it may embed a secret token.
          log_failure("alert_webhook_url must be an absolute http(s) URL")
          return
        end

        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = uri.scheme == "https"
        http.open_timeout = OPEN_TIMEOUT
        http.read_timeout = READ_TIMEOUT

        # request_uri keeps the query string (path alone would drop it) and
        # yields "/" when the URL has no path.
        request = Net::HTTP::Post.new(uri.request_uri, "Content-Type" => "application/json")
        request.body = build_payload(failure_count)

        # Net::HTTP does not raise on 4xx/5xx, so check the response class.
        response = http.request(request)
        log_failure("HTTP #{response.code}") unless response.is_a?(Net::HTTPSuccess)
      rescue StandardError => e
        log_failure(e.message)
      end

      # Serialises the alert body sent to the webhook.
      def build_payload(failure_count)
        JSON.generate(
          event: "failure_threshold_exceeded",
          failure_count: failure_count,
          threshold: SolidQueueWeb.alert_failure_threshold,
          fired_at: Time.current.iso8601
        )
      end

      def log_failure(detail)
        Rails.logger.error("#{LOG_PREFIX}: #{detail}")
      end
    end
  end
end
require "rails_helper"

# Unit coverage for SolidQueueWeb::AlertWebhook. Thread.new is stubbed to run
# its block inline and Net::HTTP#request is stubbed, so every example is
# synchronous and never touches the network.
RSpec.describe SolidQueueWeb::AlertWebhook do
  let(:webhook_url) { "http://example.com/webhook" }
  let(:ok_response) { Net::HTTPSuccess.new("1.1", "200", "OK") }

  # Invokes the service with the given failed-job count.
  def fire(count)
    described_class.call(failure_count: count)
  end

  # Replaces the default request stub with one that records the outgoing
  # request; returns the hash the request is stored in under :request.
  def capture_request
    {}.tap do |captured|
      allow_any_instance_of(Net::HTTP).to receive(:request) do |_http, req|
        captured[:request] = req
        ok_response
      end
    end
  end

  before do
    SolidQueueWeb.configure do |config|
      config.alert_webhook_url = webhook_url
      config.alert_failure_threshold = 5
      config.alert_webhook_cooldown = 3600
    end

    allow(Thread).to receive(:new).and_yield
    allow_any_instance_of(Net::HTTP).to receive(:request).and_return(ok_response)
  end

  after do
    %i[alert_webhook_url alert_failure_threshold alert_webhook_cooldown].each do |setting|
      SolidQueueWeb.public_send("#{setting}=", nil)
    end
    described_class.reset!
  end

  describe ".call" do
    it "fires when failure count meets the threshold" do
      expect_any_instance_of(Net::HTTP).to receive(:request)
      fire(5)
    end

    it "fires when failure count exceeds the threshold" do
      expect_any_instance_of(Net::HTTP).to receive(:request)
      fire(10)
    end

    it "does not fire when failure count is below threshold" do
      expect_any_instance_of(Net::HTTP).not_to receive(:request)
      fire(4)
    end

    it "does not fire when webhook url is not configured" do
      SolidQueueWeb.alert_webhook_url = nil
      expect_any_instance_of(Net::HTTP).not_to receive(:request)
      fire(10)
    end

    it "does not fire when threshold is not configured" do
      SolidQueueWeb.alert_failure_threshold = nil
      expect_any_instance_of(Net::HTTP).not_to receive(:request)
      fire(10)
    end

    it "does not fire again within the cooldown window" do
      fire(5)
      expect_any_instance_of(Net::HTTP).not_to receive(:request)
      fire(5)
    end

    it "fires again after the cooldown window expires" do
      fire(5)
      # Back-date the last alert so the 3600 s cooldown has already elapsed.
      described_class.instance_variable_set(:@last_fired_at, 2.hours.ago)
      expect_any_instance_of(Net::HTTP).to receive(:request)
      fire(5)
    end

    it "posts to the configured URL" do
      target = URI.parse(webhook_url)
      expect(Net::HTTP).to receive(:new).with(target.host, target.port).and_call_original
      fire(5)
    end

    it "sends a JSON payload with event, failure_count, threshold, and fired_at" do
      captured = capture_request

      fire(7)

      posted_body = JSON.parse(captured[:request].body)
      expect(posted_body["event"]).to eq("failure_threshold_exceeded")
      expect(posted_body["failure_count"]).to eq(7)
      expect(posted_body["threshold"]).to eq(5)
      expect(posted_body["fired_at"]).to be_present
    end

    it "sets Content-Type to application/json" do
      captured = capture_request

      fire(5)

      expect(captured[:request]["Content-Type"]).to eq("application/json")
    end

    it "logs an error and does not raise when the HTTP request fails" do
      allow_any_instance_of(Net::HTTP).to receive(:request).and_raise(RuntimeError, "connection refused")
      expect(Rails.logger).to receive(:error).with(/Alert webhook failed/)
      expect { fire(5) }.not_to raise_error
    end
  end
end