From 792b61c84848bacaba79386e3b4c325e81c9ac4b Mon Sep 17 00:00:00 2001 From: Chuck Smith Date: Wed, 27 May 2026 14:50:04 -0400 Subject: [PATCH 1/4] feat: error frequency report at /jobs/failed_jobs/errors Co-Authored-By: Claude Sonnet 4.6 --- .../solid_queue_web/_08_detail.css | 7 ++ .../failed_jobs/errors_controller.rb | 9 ++ .../solid_queue_web/error_frequency_report.rb | 34 +++++++ .../failed_jobs/errors/index.html.erb | 44 +++++++++ .../failed_jobs/index.html.erb | 1 + config/routes.rb | 2 + .../solid_queue_web/failed_job_errors_spec.rb | 98 +++++++++++++++++++ 7 files changed, 195 insertions(+) create mode 100644 app/controllers/solid_queue_web/failed_jobs/errors_controller.rb create mode 100644 app/services/solid_queue_web/error_frequency_report.rb create mode 100644 app/views/solid_queue_web/failed_jobs/errors/index.html.erb create mode 100644 spec/requests/solid_queue_web/failed_job_errors_spec.rb diff --git a/app/assets/stylesheets/solid_queue_web/_08_detail.css b/app/assets/stylesheets/solid_queue_web/_08_detail.css index 7425d0a..392e783 100644 --- a/app/assets/stylesheets/solid_queue_web/_08_detail.css +++ b/app/assets/stylesheets/solid_queue_web/_08_detail.css @@ -75,6 +75,13 @@ .sqd-pre--muted { color: var(--muted); } +.sqd-error-details summary { + cursor: pointer; + list-style: none; +} +.sqd-error-details summary::-webkit-details-marker { display: none; } +.sqd-error-details .sqd-pre { margin-top: 0.5rem; } + .sqd-error-header { font-size: 13px; padding: 0.5rem 0.75rem; diff --git a/app/controllers/solid_queue_web/failed_jobs/errors_controller.rb b/app/controllers/solid_queue_web/failed_jobs/errors_controller.rb new file mode 100644 index 0000000..efbfe34 --- /dev/null +++ b/app/controllers/solid_queue_web/failed_jobs/errors_controller.rb @@ -0,0 +1,9 @@ +module SolidQueueWeb + module FailedJobs + class ErrorsController < ApplicationController + def index + @groups = ErrorFrequencyReport.new.groups + end + end + end +end diff --git 
a/app/services/solid_queue_web/error_frequency_report.rb b/app/services/solid_queue_web/error_frequency_report.rb
new file mode 100644
index 0000000..cb0318c
--- /dev/null
+++ b/app/services/solid_queue_web/error_frequency_report.rb
@@ -0,0 +1,52 @@
+module SolidQueueWeb
+  # Builds the data behind the Error Summary page: failed executions grouped
+  # by exception class and (truncated) message, with a count and one sample
+  # backtrace per group.
+  class ErrorFrequencyReport
+    Row = Data.define(:exception_class, :message_prefix, :count, :sample_backtrace)
+
+    # Messages longer than this many characters are cut and suffixed with "…"
+    # so that failures differing only in a long tail share a group.
+    MESSAGE_LIMIT = 120
+
+    # Returns an Array of Row, highest count first. Groups with equal counts
+    # keep the order in which they were first encountered (newest id first),
+    # so the page order is deterministic between requests.
+    def groups
+      tallies = {}
+
+      # find_each loads records in batches instead of materialising every
+      # failed execution (and its full backtrace) at once. Batching iterates
+      # by primary key, so "most recent" here means highest id rather than
+      # latest created_at.
+      SolidQueue::FailedExecution.find_each(order: :desc) do |execution|
+        key = [execution.exception_class.to_s, message_prefix(execution.message)]
+        entry = tallies[key] ||= { count: 0, sample_backtrace: nil }
+        entry[:count] += 1
+        # First execution seen for a group supplies the sample.
+        entry[:sample_backtrace] ||= execution.backtrace
+      end
+
+      rows = tallies.map do |(exception_class, prefix), data|
+        Row.new(
+          exception_class: exception_class,
+          message_prefix: prefix,
+          count: data[:count],
+          sample_backtrace: data[:sample_backtrace]
+        )
+      end
+
+      # Array#sort_by is not stable; the index tie-break pins the order.
+      rows.sort_by.with_index { |row, position| [-row.count, position] }
+    end
+
+    private
+
+    # Returns the grouping text for +message+: "" for nil, otherwise the
+    # message limited to MESSAGE_LIMIT characters plus an ellipsis when cut.
+    def message_prefix(message)
+      text = message.to_s
+      text.length > MESSAGE_LIMIT ? "#{text[0, MESSAGE_LIMIT]}…" : text
+    end
+  end
+end
diff --git a/app/views/solid_queue_web/failed_jobs/errors/index.html.erb b/app/views/solid_queue_web/failed_jobs/errors/index.html.erb
new file mode 100644
index 0000000..d81a3cf
--- /dev/null
+++ b/app/views/solid_queue_web/failed_jobs/errors/index.html.erb
@@ -0,0 +1,44 @@
+
+

Error Summary

+
+ <%= link_to "← Failed Jobs", failed_jobs_path, class: "sqd-btn sqd-btn--muted sqd-btn--sm" %> +
+
+ +<% if @groups.any? %> +
+ + + + + + + + + + <% @groups.each do |group| %> + + + + + + <% end %> + +
Error ClassMessageCount
<%= group.exception_class.presence || "—" %> + <% if group.sample_backtrace.present? %> +
+ + <%= group.message_prefix.presence || "—" %> + +
<%= Array(group.sample_backtrace).first(10).join("\n") %>
+
+ <% else %> + <%= group.message_prefix.presence || "—" %> + <% end %> +
<%= group.count %>
+
+<% else %> +
+
No failed jobs. All clear!
+
+<% end %> \ No newline at end of file diff --git a/app/views/solid_queue_web/failed_jobs/index.html.erb b/app/views/solid_queue_web/failed_jobs/index.html.erb index b817251..7a82e37 100644 --- a/app/views/solid_queue_web/failed_jobs/index.html.erb +++ b/app/views/solid_queue_web/failed_jobs/index.html.erb @@ -2,6 +2,7 @@

Failed Jobs

<% if @failed_jobs.any? %>
+ <%= link_to "Error Summary", failed_job_errors_path, class: "sqd-btn sqd-btn--muted sqd-btn--sm" %> <%= link_to "Export CSV", failed_jobs_path(format: :csv, queue: @queue, q: @search, period: @period), class: "sqd-btn sqd-btn--muted", data: { turbo: false } %> <%= button_to "Retry All", retry_all_failed_jobs_path, diff --git a/config/routes.rb b/config/routes.rb index 3e10857..282e6bb 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -35,6 +35,8 @@ end end + get "failed_jobs/errors", to: "failed_jobs/errors#index", as: :failed_job_errors + resource :failed_job_selection, path: "failed_jobs/selection", only: [:create, :destroy], controller: "failed_jobs/selections" resources :failed_jobs, only: [:index, :destroy] do diff --git a/spec/requests/solid_queue_web/failed_job_errors_spec.rb b/spec/requests/solid_queue_web/failed_job_errors_spec.rb new file mode 100644 index 0000000..d172b3f --- /dev/null +++ b/spec/requests/solid_queue_web/failed_job_errors_spec.rb @@ -0,0 +1,98 @@ +require "rails_helper" + +RSpec.describe "FailedJobErrors", type: :request do + def failed_execution(class_name: "TestJob", exception_class: "RuntimeError", message: "boom", backtrace: ["app/jobs/test_job.rb:10"]) + job = SolidQueue::Job.create!( + queue_name: "default", class_name: class_name, + arguments: {}, active_job_id: SecureRandom.uuid + ) + job.ready_execution&.destroy + SolidQueue::FailedExecution.create!( + job: job, + error: { exception_class: exception_class, message: message, backtrace: backtrace } + ) + end + + describe "GET /jobs/failed_jobs/errors" do + it "returns HTTP success" do + get "/jobs/failed_jobs/errors" + expect(response).to have_http_status(:ok) + end + + it "displays the Error Summary heading" do + get "/jobs/failed_jobs/errors" + expect(response.body).to include("Error Summary") + end + + it "shows an empty state when no failed jobs exist" do + get "/jobs/failed_jobs/errors" + expect(response.body).to include("No failed jobs") + end + + it "renders a row 
for each distinct error class" do + failed_execution(exception_class: "ArgumentError", message: "bad arg") + failed_execution(exception_class: "TimeoutError", message: "timed out") + + get "/jobs/failed_jobs/errors" + expect(response.body).to include("ArgumentError") + expect(response.body).to include("TimeoutError") + end + + it "aggregates multiple failures with the same error class and message" do + 2.times { failed_execution(exception_class: "RuntimeError", message: "boom") } + + get "/jobs/failed_jobs/errors" + expect(response.body).to include("RuntimeError") + expect(response.body.scan("RuntimeError").size).to eq(1) + end + + it "sorts groups by count descending" do + 3.times { failed_execution(exception_class: "FrequentError", message: "often") } + 1.times { failed_execution(exception_class: "RareError", message: "once") } + + get "/jobs/failed_jobs/errors" + frequent_pos = response.body.index("FrequentError") + rare_pos = response.body.index("RareError") + expect(frequent_pos).to be < rare_pos + end + + it "truncates long messages to MESSAGE_LIMIT characters" do + long_message = "x" * 200 + failed_execution(exception_class: "RuntimeError", message: long_message) + + get "/jobs/failed_jobs/errors" + expect(response.body).not_to include(long_message) + expect(response.body).to include("x" * 120) + end + + it "renders a backtrace inside a details element when present" do + failed_execution(exception_class: "RuntimeError", message: "boom", backtrace: ["app/jobs/test_job.rb:10"]) + + get "/jobs/failed_jobs/errors" + expect(response.body).to include(" Date: Wed, 27 May 2026 14:50:09 -0400 Subject: [PATCH 2/4] docs: update CHANGELOG for error frequency report Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 19f28da..d53d795 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## 
[Unreleased] +### Added + +- Error frequency report — a new Error Summary page (`/jobs/failed_jobs/errors`) groups all failed jobs by error class and message prefix, shows a count per group, and displays a sample backtrace (first 10 lines) in an expandable `<details>` element; groups are sorted by count descending so the most common errors appear first; accessible via an "Error Summary" button on the Failed Jobs page + ## [1.1.0] - 2026-05-21 ### Added From 9b1f23383ecd1045baf8c0dcca304b127eee298c Mon Sep 17 00:00:00 2001 From: Chuck Smith Date: Wed, 27 May 2026 14:51:07 -0400 Subject: [PATCH 3/4] docs: update README for error frequency report Co-Authored-By: Claude Sonnet 4.6 --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index fa3ca5d..9d6dd99 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,7 @@ SolidQueueWeb surfaces all of this in a browser UI available at any route you ch - **Slow job detection** — when `slow_job_threshold` is configured, claimed jobs running longer than the threshold are flagged with an orange row, a "slow" badge, and a "Running For" duration column on the Running tab; a "Slow Jobs" warning card appears on the dashboard with a link to the Running tab - **Webhook alerts** — set `alert_webhook_url` and `alert_failure_threshold` to receive a POST request whenever the failed job count meets or exceeds the threshold; fires asynchronously so dashboard performance is unaffected; a configurable cooldown (default 1 h) prevents repeated alerts while the count stays elevated - **Performance analytics** — per-job-class statistics at `/jobs/performance` showing run count, average, p50, p95, min, and max duration; sorted by p95 descending so the slowest classes surface first; period filter scopes to 1h / 24h / 7d or all time; each class name links to the filtered History view +- **Error frequency report** — `GET /jobs/failed_jobs/errors` groups all failed jobs by error class and message prefix, shows a count per group, and surfaces a sample backtrace in an expandable row; sorted by count descending so the most common errors appear first; accessible via the "Error Summary" button on the Failed Jobs page - **Metrics / health endpoint** — `GET /jobs/metrics.json` returns a
machine-readable JSON document with job counts, throughput, per-queue depth and pause state, and process health summary; suitable for Prometheus scraping, uptime monitors, or external dashboards; `slow_jobs` count included when `slow_job_threshold` is configured ## Compatibility From 0f67a41d9646fa90fb30fa0f619f142d35f9f312 Mon Sep 17 00:00:00 2001 From: Chuck Smith Date: Wed, 27 May 2026 14:51:45 -0400 Subject: [PATCH 4/4] docs: mark error frequency report as shipped in ROADMAP Co-Authored-By: Claude Sonnet 4.6 --- ROADMAP.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ROADMAP.md b/ROADMAP.md index b1fb0bd..fe8bd55 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -12,7 +12,7 @@ Pull requests for any of these are welcome. See [Contributing](README.md#contrib | Feature | Notes | |---|---| -| **Error frequency report** | Group all failed jobs by error class + message prefix, show count and a sample backtrace. When you have hundreds of failed jobs, you want to see "ArgumentError (x212), TimeoutError (x88)" at a glance. | +| ~~**Error frequency report**~~ | ✓ Shipped — `/jobs/failed_jobs/errors` groups all failed jobs by error class and message prefix, shows count and a sample backtrace in an expandable row; sorted by count descending; accessible via "Error Summary" button on the Failed Jobs page. | | **Failed job trend chart** | A "Failures — Last 12 Hours" sparkline on the dashboard (same pattern as the existing throughput and queue depth charts). Makes failure spikes visible before you click into the failed jobs list. | | **P99 + std dev in performance analytics** | Extend `JobPerformanceStats` with a 99th percentile and standard deviation column. High std dev signals inconsistent jobs worth investigating. |