From b2c322d206c2c5b82e5e59f68b2a8bd5f55f5df7 Mon Sep 17 00:00:00 2001 From: Chuck Smith Date: Thu, 21 May 2026 09:56:27 -0400 Subject: [PATCH 1/4] feat: metrics/health endpoint at GET /jobs/metrics.json Adds a JSON-only endpoint exposing job counts (ready/scheduled/claimed/ blocked/failed), throughput (completed_1h/24h), per-queue depth and paused status, and process health (total/healthy/stale/by_kind). When slow_job_threshold is configured, a slow_jobs count is also included. Goes through the same auth and connects_to middleware as all other routes. Co-Authored-By: Claude Sonnet 4.6 --- .../solid_queue_web/metrics_controller.rb | 48 +++++++ config/routes.rb | 1 + spec/requests/solid_queue_web/metrics_spec.rb | 127 ++++++++++++++++++ 3 files changed, 176 insertions(+) create mode 100644 app/controllers/solid_queue_web/metrics_controller.rb create mode 100644 spec/requests/solid_queue_web/metrics_spec.rb diff --git a/app/controllers/solid_queue_web/metrics_controller.rb b/app/controllers/solid_queue_web/metrics_controller.rb new file mode 100644 index 0000000..2edfe0b --- /dev/null +++ b/app/controllers/solid_queue_web/metrics_controller.rb @@ -0,0 +1,48 @@ +module SolidQueueWeb + class MetricsController < ApplicationController + def index + now = Time.current + processes = SolidQueue::Process.all.to_a + threshold = SolidQueue.process_alive_threshold.ago + + queues = SolidQueue::Queue.all.sort_by(&:name) + queue_depths = SolidQueue::ReadyExecution + .joins(:job) + .group("solid_queue_jobs.queue_name") + .count + + finished_times = SolidQueue::Job.where(finished_at: 24.hours.ago..now).pluck(:finished_at) + + payload = { + generated_at: now.iso8601, + jobs: { + ready: SolidQueue::ReadyExecution.count, + scheduled: SolidQueue::ScheduledExecution.count, + claimed: SolidQueue::ClaimedExecution.count, + blocked: SolidQueue::BlockedExecution.count, + failed: SolidQueue::FailedExecution.count + }, + throughput: { + completed_1h: finished_times.count { |t| t >= 1.hour.ago 
}, + completed_24h: finished_times.size + }, + queues: queues.map { |q| + { name: q.name, depth: queue_depths[q.name] || 0, paused: q.paused? } + }, + processes: { + total: processes.size, + healthy: processes.count { |p| p.last_heartbeat_at >= threshold }, + stale: processes.count { |p| p.last_heartbeat_at < threshold }, + by_kind: processes.group_by(&:kind).transform_values(&:size) + } + } + + slow_threshold = SolidQueueWeb.slow_job_threshold + if slow_threshold + payload[:slow_jobs] = SolidQueue::ClaimedExecution.where("created_at <= ?", slow_threshold.ago).count + end + + render json: payload + end + end +end diff --git a/config/routes.rb b/config/routes.rb index 3756d92..d3cc83b 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -2,6 +2,7 @@ root to: "dashboard#index" resource :blocked_jobs, only: [:destroy] + get "metrics", to: "metrics#index", as: :metrics, defaults: { format: :json } get "search", to: "search#index", as: :search get "history", to: "history#index", as: :history diff --git a/spec/requests/solid_queue_web/metrics_spec.rb b/spec/requests/solid_queue_web/metrics_spec.rb new file mode 100644 index 0000000..dd83d82 --- /dev/null +++ b/spec/requests/solid_queue_web/metrics_spec.rb @@ -0,0 +1,127 @@ +require "rails_helper" + +RSpec.describe "Metrics", type: :request do + describe "GET /jobs/metrics.json" do + it "returns HTTP success" do + get "/jobs/metrics.json" + expect(response).to have_http_status(:ok) + end + + it "responds with JSON content type" do + get "/jobs/metrics.json" + expect(response.content_type).to match(%r{application/json}) + end + + it "includes a generated_at timestamp" do + get "/jobs/metrics.json" + data = JSON.parse(response.body) + expect(data["generated_at"]).to match(/\A\d{4}-\d{2}-\d{2}T/) + end + + it "includes job counts for all statuses" do + get "/jobs/metrics.json" + data = JSON.parse(response.body) + expect(data["jobs"].keys).to match_array(%w[ready scheduled claimed blocked failed]) + end + + it "reflects 
actual job counts" do + SolidQueue::Job.create!( + queue_name: "default", class_name: "TestJob", + arguments: {}.to_json, priority: 0, active_job_id: SecureRandom.uuid + ) + get "/jobs/metrics.json" + data = JSON.parse(response.body) + expect(data["jobs"]["ready"]).to be >= 1 + end + + it "includes throughput with completed_1h and completed_24h" do + get "/jobs/metrics.json" + data = JSON.parse(response.body) + expect(data["throughput"].keys).to match_array(%w[completed_1h completed_24h]) + expect(data["throughput"]["completed_1h"]).to be_a(Integer) + expect(data["throughput"]["completed_24h"]).to be_a(Integer) + end + + it "includes queues array with name, depth, and paused" do + SolidQueue::Job.create!( + queue_name: "metrics-test", class_name: "TestJob", + arguments: {}.to_json, priority: 0, active_job_id: SecureRandom.uuid + ) + get "/jobs/metrics.json" + data = JSON.parse(response.body) + expect(data["queues"]).to be_an(Array) + queue = data["queues"].find { |q| q["name"] == "metrics-test" } + expect(queue).to include("name" => "metrics-test", "depth" => 1, "paused" => false) + end + + it "includes process summary" do + SolidQueue::Process.create!( + kind: "Worker", pid: 77_777, hostname: "test-host", + name: "worker-metrics-test", last_heartbeat_at: Time.current + ) + get "/jobs/metrics.json" + data = JSON.parse(response.body) + expect(data["processes"].keys).to match_array(%w[total healthy stale by_kind]) + expect(data["processes"]["total"]).to be >= 1 + end + + it "counts healthy vs stale processes correctly" do + SolidQueue::Process.create!( + kind: "Worker", pid: 77_778, hostname: "test-host", + name: "worker-healthy", last_heartbeat_at: Time.current + ) + stale = SolidQueue::Process.create!( + kind: "Worker", pid: 77_779, hostname: "test-host", + name: "worker-stale", last_heartbeat_at: Time.current + ) + stale.update_columns(last_heartbeat_at: 2.hours.ago) + + get "/jobs/metrics.json" + data = JSON.parse(response.body) + 
expect(data["processes"]["healthy"]).to be >= 1 + expect(data["processes"]["stale"]).to be >= 1 + end + + it "does not include slow_jobs key when threshold is not configured" do + get "/jobs/metrics.json" + data = JSON.parse(response.body) + expect(data).not_to have_key("slow_jobs") + end + + it "includes slow_jobs count when threshold is configured" do + SolidQueueWeb.slow_job_threshold = 1.second + process = SolidQueue::Process.create!( + kind: "Worker", pid: 77_780, hostname: "test-host", + name: "worker-slow-m", last_heartbeat_at: Time.current + ) + job = SolidQueue::Job.create!( + queue_name: "default", class_name: "SlowJob", + arguments: {}.to_json, priority: 0, active_job_id: SecureRandom.uuid + ) + SolidQueue::ClaimedExecution.create!(job: job, process: process) + .tap { |e| e.update_columns(created_at: 10.minutes.ago) } + + get "/jobs/metrics.json" + data = JSON.parse(response.body) + expect(data["slow_jobs"]).to be >= 1 + ensure + SolidQueueWeb.slow_job_threshold = nil + end + + describe "authentication" do + after { SolidQueueWeb.instance_variable_set(:@authenticate, nil) } + + it "allows access when auth block returns truthy" do + SolidQueueWeb.authenticate { true } + get "/jobs/metrics.json" + expect(response).to have_http_status(:ok) + end + + it "returns 401 when auth block returns falsy" do + SolidQueueWeb.authenticate { false } + get "/jobs/metrics.json" + expect(response).to have_http_status(:unauthorized) + end + end + end +end From 112123d3a5cda322663236bffbd4e7c2c3b09c8d Mon Sep 17 00:00:00 2001 From: Chuck Smith Date: Thu, 21 May 2026 09:57:25 -0400 Subject: [PATCH 2/4] docs: update CHANGELOG and README for metrics endpoint Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 1 + README.md | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ec44e5..8a03632 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic 
Versioning](https://semver.org/spec/v2.0.0 ### Added +- Metrics / health endpoint — `GET /jobs/metrics.json` returns a JSON document with job counts (`ready`, `scheduled`, `claimed`, `blocked`, `failed`), throughput (`completed_1h`, `completed_24h`), per-queue depth and pause state, and process health (`total`, `healthy`, `stale`, `by_kind`); when `slow_job_threshold` is configured, a `slow_jobs` count is also included; the endpoint goes through the same authentication and `connects_to` middleware as all other routes - Recurring task "Run Now" — a "Run Now" button on the Recurring Tasks page triggers `task.enqueue(at: Time.current)` to enqueue the job immediately without waiting for its next scheduled run; SolidQueue's `RecurringExecution` deduplication prevents double-enqueuing - Read replica support — when `connects_to` is set to `{ reading: , writing: }`, the engine automatically routes GET requests to the reading role and mutating requests (POST/DELETE/PATCH) to the writing role via `ActiveRecord::Base.connected_to(role:)`; passing any other hash (e.g. 
`{ role: :writing }`, `{ shard: :name }`) falls through to `connected_to` directly; defaults to `nil` so single-database setups are unaffected - Webhook alert config — `alert_webhook_url` and `alert_failure_threshold` settings POST a JSON payload (`event`, `failure_count`, `threshold`, `fired_at`) to any URL when the failed job count meets or exceeds the threshold; fires asynchronously in a background thread so dashboard requests are never blocked; a configurable `alert_webhook_cooldown` (default 3600 s) prevents repeated alerts while the count stays elevated; HTTP errors are logged and swallowed diff --git a/README.md b/README.md index 01e248b..5aba563 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ SolidQueueWeb surfaces all of this in a browser UI available at any route you ch - **CSV export** — "Export CSV" button on the jobs, failed jobs, and history pages downloads all records matching the current filters; columns are tailored per view - **Slow job detection** — when `slow_job_threshold` is configured, claimed jobs running longer than the threshold are flagged with an orange row, a "slow" badge, and a "Running For" duration column on the Running tab; a "Slow Jobs" warning card appears on the dashboard with a link to the Running tab - **Webhook alerts** — set `alert_webhook_url` and `alert_failure_threshold` to receive a POST request whenever the failed job count meets or exceeds the threshold; fires asynchronously so dashboard performance is unaffected; a configurable cooldown (default 1 h) prevents repeated alerts while the count stays elevated +- **Metrics / health endpoint** — `GET /jobs/metrics.json` returns a machine-readable JSON document with job counts, throughput, per-queue depth and pause state, and process health summary; suitable for uptime monitors, external dashboards, or Prometheus via a JSON exporter (Prometheus does not scrape raw JSON); `slow_jobs` count included when `slow_job_threshold` is configured ## Screenshots @@ -169,7 +170,6 @@ Planned features, roughly ordered by
priority: - Bulk scheduled job actions — "Run All Now" button on the Scheduled tab, mirroring the "Retry All" pattern on the Failed Jobs page **Observability** -- Metrics / health endpoint — `GET /jobs/metrics.json` exposing job counts, queue depths, and process health for Prometheus scraping or external dashboards - Performance analytics — average and percentile (p50/p95) duration per job class derived from the history table; surfaces slow job types before they become a problem - Priority filter — filter and sort the jobs list by Solid Queue job priority From 2e4f13eca029c1f90903c3fc0f9aeb7905dfe052 Mon Sep 17 00:00:00 2001 From: Chuck Smith Date: Thu, 21 May 2026 09:59:09 -0400 Subject: [PATCH 3/4] docs: add metrics endpoint section to README with example response Co-Authored-By: Claude Sonnet 4.6 --- README.md | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/README.md b/README.md index 5aba563..036b33b 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,48 @@ The request body is JSON: The webhook fires asynchronously in a background thread so dashboard page loads are never delayed. HTTP errors are logged to `Rails.logger` and swallowed. The cooldown window prevents repeated alerts while the count stays elevated — the clock resets on each app restart. +## Metrics endpoint + +`GET /jobs/metrics.json` returns a machine-readable JSON document suitable for uptime monitors, external dashboards, or Prometheus via a JSON exporter (Prometheus does not scrape raw JSON). No configuration is required — the endpoint is available as soon as the engine is mounted.
+ +``` +GET /jobs/metrics.json +``` + +Example response: + +```json +{ + "generated_at": "2026-05-21T12:00:00Z", + "jobs": { + "ready": 12, + "scheduled": 8, + "claimed": 3, + "blocked": 5, + "failed": 9 + }, + "throughput": { + "completed_1h": 15, + "completed_24h": 87 + }, + "queues": [ + { "name": "critical", "depth": 2, "paused": false }, + { "name": "default", "depth": 4, "paused": false }, + { "name": "mailers", "depth": 3, "paused": true } + ], + "processes": { + "total": 4, + "healthy": 4, + "stale": 0, + "by_kind": { "Dispatcher": 1, "Supervisor": 1, "Worker": 2 } + } +} +``` + +When `slow_job_threshold` is configured, a `slow_jobs` integer is also included at the top level. + +The endpoint respects the same authentication and `connects_to` settings as the rest of the dashboard. A process is counted as **stale** when its `last_heartbeat_at` is older than `SolidQueue.process_alive_threshold` (default: 5 minutes). + ## Read replica support Set `connects_to` with both `reading:` and `writing:` keys to enable automatic role switching. GET requests are routed to the reading role; POST/DELETE/PATCH requests use the writing role. From e1910f09e715c0f8c80a64731354d321c7201781 Mon Sep 17 00:00:00 2001 From: Chuck Smith Date: Thu, 21 May 2026 10:01:20 -0400 Subject: [PATCH 4/4] refactor: extract MetricsPayload service from MetricsController Moves all query and aggregation logic into a dedicated service object, matching the DashboardStats/QueueStats pattern. Controller becomes a single render call. 
Co-Authored-By: Claude Sonnet 4.6
---
 .../solid_queue_web/metrics_controller.rb     | 43 +---------
 .../solid_queue_web/metrics_payload.rb        | 85 +++++++++++++++++++
 2 files changed, 86 insertions(+), 42 deletions(-)
 create mode 100644 app/services/solid_queue_web/metrics_payload.rb

diff --git a/app/controllers/solid_queue_web/metrics_controller.rb b/app/controllers/solid_queue_web/metrics_controller.rb
index 2edfe0b..83fef16 100644
--- a/app/controllers/solid_queue_web/metrics_controller.rb
+++ b/app/controllers/solid_queue_web/metrics_controller.rb
@@ -1,48 +1,7 @@
 module SolidQueueWeb
   class MetricsController < ApplicationController
     def index
-      now = Time.current
-      processes = SolidQueue::Process.all.to_a
-      threshold = SolidQueue.process_alive_threshold.ago
-
-      queues = SolidQueue::Queue.all.sort_by(&:name)
-      queue_depths = SolidQueue::ReadyExecution
-        .joins(:job)
-        .group("solid_queue_jobs.queue_name")
-        .count
-
-      finished_times = SolidQueue::Job.where(finished_at: 24.hours.ago..now).pluck(:finished_at)
-
-      payload = {
-        generated_at: now.iso8601,
-        jobs: {
-          ready: SolidQueue::ReadyExecution.count,
-          scheduled: SolidQueue::ScheduledExecution.count,
-          claimed: SolidQueue::ClaimedExecution.count,
-          blocked: SolidQueue::BlockedExecution.count,
-          failed: SolidQueue::FailedExecution.count
-        },
-        throughput: {
-          completed_1h: finished_times.count { |t| t >= 1.hour.ago },
-          completed_24h: finished_times.size
-        },
-        queues: queues.map { |q|
-          { name: q.name, depth: queue_depths[q.name] || 0, paused: q.paused? }
-        },
-        processes: {
-          total: processes.size,
-          healthy: processes.count { |p| p.last_heartbeat_at >= threshold },
-          stale: processes.count { |p| p.last_heartbeat_at < threshold },
-          by_kind: processes.group_by(&:kind).transform_values(&:size)
-        }
-      }
-
-      slow_threshold = SolidQueueWeb.slow_job_threshold
-      if slow_threshold
-        payload[:slow_jobs] = SolidQueue::ClaimedExecution.where("created_at <= ?", slow_threshold.ago).count
-      end
-
-      render json: payload
+      render json: MetricsPayload.new.to_h
     end
   end
 end
diff --git a/app/services/solid_queue_web/metrics_payload.rb b/app/services/solid_queue_web/metrics_payload.rb
new file mode 100644
index 0000000..59ae2f5
--- /dev/null
+++ b/app/services/solid_queue_web/metrics_payload.rb
@@ -0,0 +1,85 @@
+module SolidQueueWeb
+  # Builds the Hash rendered as JSON by MetricsController#index.
+  #
+  # Time windows are all measured back from the instant captured in
+  # #initialize, so every section of one payload describes the same moment.
+  class MetricsPayload
+    def initialize
+      @now = Time.current
+    end
+
+    # Returns the payload Hash: :generated_at, :jobs, :throughput, :queues
+    # and :processes, plus :slow_jobs when a slow-job threshold is set.
+    def to_h
+      payload = {
+        generated_at: @now.iso8601,
+        jobs: job_counts,
+        throughput: throughput,
+        queues: queue_list,
+        processes: process_summary
+      }
+      slow = slow_jobs_count
+      payload[:slow_jobs] = slow unless slow.nil?
+      payload
+    end
+
+    private
+
+    # Row count of each execution table, keyed by job state.
+    def job_counts
+      {
+        ready: SolidQueue::ReadyExecution.count,
+        scheduled: SolidQueue::ScheduledExecution.count,
+        claimed: SolidQueue::ClaimedExecution.count,
+        blocked: SolidQueue::BlockedExecution.count,
+        failed: SolidQueue::FailedExecution.count
+      }
+    end
+
+    # Jobs finished in the last hour and in the last 24 hours.
+    def throughput
+      {
+        completed_1h: finished_within(1.hour),
+        completed_24h: finished_within(24.hours)
+      }
+    end
+
+    # Counts jobs whose finished_at falls in the `window` ending at @now.
+    # Counting in SQL keeps memory flat; plucking every timestamp into Ruby
+    # grows with job volume on an endpoint that monitors poll repeatedly.
+    def finished_within(window)
+      SolidQueue::Job.where(finished_at: (@now - window)..@now).count
+    end
+
+    # One entry per queue, sorted by name: ready depth and pause state.
+    def queue_list
+      # Ready executions store queue_name themselves; no join to jobs needed.
+      depths = SolidQueue::ReadyExecution.group(:queue_name).count
+      SolidQueue::Queue.all.sort_by(&:name).map do |queue|
+        { name: queue.name, depth: depths.fetch(queue.name, 0), paused: queue.paused? }
+      end
+    end
+
+    # Process totals, split by heartbeat freshness and grouped by kind.
+    def process_summary
+      processes = SolidQueue::Process.all.to_a
+      cutoff = @now - SolidQueue.process_alive_threshold
+      healthy, stale = processes.partition { |process| process.last_heartbeat_at >= cutoff }
+      {
+        total: processes.size,
+        healthy: healthy.size,
+        stale: stale.size,
+        by_kind: processes.group_by(&:kind).transform_values(&:size)
+      }
+    end
+
+    # Claimed executions created at least slow_job_threshold before @now,
+    # or nil when no threshold is configured.
+    def slow_jobs_count
+      threshold = SolidQueueWeb.slow_job_threshold
+      return unless threshold
+
+      SolidQueue::ClaimedExecution.where("created_at <= ?", @now - threshold).count
+    end
+  end
+end