From e9b0def39317b119ff0587e0fee32504947abb12 Mon Sep 17 00:00:00 2001 From: Gil Desmarais Date: Fri, 27 Mar 2026 23:57:39 +0100 Subject: [PATCH 1/3] feat(config): add E1 browserless configs and fetch lane --- Makefile | 5 ++- README.md | 13 +++++++ bin/rspec_browserless_configs | 19 ++++++++++ lib/html2rss/configs/apple.com/newsroom.yml | 18 ++++++++++ lib/html2rss/configs/deepmind.google/blog.yml | 19 ++++++++++ lib/html2rss/configs/notion.com/blog.yml | 19 ++++++++++ lib/html2rss/configs/shopify.com/blog.yml | 16 +++++++++ lib/html2rss/configs/spotify.com/newsroom.yml | 17 +++++++++ spec/browserless_fetch_configs_spec.rb | 36 +++++++++++++++++++ spec/support/browserless_fetch_configs.rb | 30 ++++++++++++++++ .../shared_examples/config.yml_spec.rb | 31 ++++++++-------- 11 files changed, 208 insertions(+), 15 deletions(-) create mode 100755 bin/rspec_browserless_configs create mode 100644 lib/html2rss/configs/apple.com/newsroom.yml create mode 100644 lib/html2rss/configs/deepmind.google/blog.yml create mode 100644 lib/html2rss/configs/notion.com/blog.yml create mode 100644 lib/html2rss/configs/shopify.com/blog.yml create mode 100644 lib/html2rss/configs/spotify.com/newsroom.yml create mode 100644 spec/browserless_fetch_configs_spec.rb create mode 100644 spec/support/browserless_fetch_configs.rb diff --git a/Makefile b/Makefile index 4126c24..71c26de 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,10 @@ test-fetch-changed-configs: bin/rspec_changed_configs test-fetch-all-configs: - bundle exec rspec --tag fetch spec/html2rss/configs + bundle exec rspec --tag fetch spec/html2rss/configs_dynamic_spec.rb + +test-fetch-browserless-configs: + bin/rspec_browserless_configs test-all: test test-fetch-all-configs diff --git a/README.md b/README.md index ae1cbbb..b62430b 100644 --- a/README.md +++ b/README.md @@ -76,10 +76,23 @@ make test-config CONFIG=github.com/releases.yml # Test domain make test-domain DOMAIN=github.com + +# Run live fetch tests for the full corpus +make test-fetch-all-configs + +# Run the Browserless-backed fetch subset +BROWSERLESS_IO_WEBSOCKET_URL=ws://127.0.0.1:4002 \ +BROWSERLESS_IO_API_TOKEN=... \ +make test-fetch-browserless-configs ``` **Adding new configs**: Create the YAML file, run `make validate`, then run the generated tests. No dedicated spec file is needed. +The fetch suite has two lanes: + +- `make test-fetch-all-configs` runs all `:fetch` examples. Configs marked as Browserless-backed are skipped unless Browserless env vars are configured. +- `make test-fetch-browserless-configs` runs only the Browserless-backed config subset and requires `BROWSERLESS_IO_WEBSOCKET_URL`. Custom endpoints also require `BROWSERLESS_IO_API_TOKEN`. + **Config folder convention**: Place configs under the registrable domain folder (e.g., `example.com/` or `bbc.co.uk/`). Legacy subdomain folders (e.g., `news.example.com/`) are allowed but not preferred. ## Editor Setup (JSON Schema) diff --git a/bin/rspec_browserless_configs b/bin/rspec_browserless_configs new file mode 100755 index 0000000..7af76f2 --- /dev/null +++ b/bin/rspec_browserless_configs @@ -0,0 +1,19 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require_relative '../spec/support/browserless_fetch_configs' + +unless BrowserlessFetchConfigs.browserless_env_configured? + warn 'BROWSERLESS_IO_WEBSOCKET_URL is required for browserless fetch tests.' + warn 'Set BROWSERLESS_IO_API_TOKEN as well when using a custom websocket endpoint.' + exit 1 +end + +args = ['bundle', 'exec', 'rspec', '--tag', 'fetch'] +BrowserlessFetchConfigs::CONFIGS.each do |config| + args << '--example' + args << config +end +args << 'spec/html2rss/configs_dynamic_spec.rb' + +exec(*args) diff --git a/lib/html2rss/configs/apple.com/newsroom.yml b/lib/html2rss/configs/apple.com/newsroom.yml new file mode 100644 index 0000000..a5b7b46 --- /dev/null +++ b/lib/html2rss/configs/apple.com/newsroom.yml @@ -0,0 +1,18 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json +strategy: browserless + +channel: + url: https://www.apple.com/newsroom/ + language: en + time_zone: UTC + ttl: 360 +selectors: + items: + selector: 'li.tile-item a.tile-hero, li.tile-item a.tile-2up, li.tile-item a.tile-3up, li.tile-item a.tile-list' + enhance: false + title: + selector: .tile__headline + url: + extractor: href + published_at: + selector: .tile__timestamp diff --git a/lib/html2rss/configs/deepmind.google/blog.yml b/lib/html2rss/configs/deepmind.google/blog.yml new file mode 100644 index 0000000..21c3b05 --- /dev/null +++ b/lib/html2rss/configs/deepmind.google/blog.yml @@ -0,0 +1,19 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json +strategy: browserless + +channel: + url: https://deepmind.google/blog/ + language: en + time_zone: UTC + ttl: 360 +selectors: + items: + selector: .card__inner + enhance: false + title: + selector: h3 + url: + selector: .card__overlay-link + extractor: href + published_at: + selector: time diff --git a/lib/html2rss/configs/notion.com/blog.yml b/lib/html2rss/configs/notion.com/blog.yml new file mode 100644 index 0000000..bbeecd9 --- /dev/null +++ b/lib/html2rss/configs/notion.com/blog.yml @@ -0,0 +1,19 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json +strategy: browserless + +channel: + url: https://www.notion.com/blog + language: en + time_zone: UTC + ttl: 360 +selectors: + items: + selector: article.post-preview + enhance: false + title: + selector: h3 a[href*="/blog/"] + url: + selector: h3 a[href*="/blog/"] + extractor: href + description: + selector: '> a[href*="/blog/"]:not([title])' diff --git a/lib/html2rss/configs/shopify.com/blog.yml b/lib/html2rss/configs/shopify.com/blog.yml new file mode 100644 index 0000000..6c796c9 --- /dev/null +++ b/lib/html2rss/configs/shopify.com/blog.yml @@ -0,0 +1,16 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json +strategy: browserless + +channel: + url: https://www.shopify.com/blog/latest + time_zone: UTC + ttl: 360 +selectors: + items: + selector: article.article--index + enhance: false + title: + selector: '.blogPost a[href*="/blog/"]:not([href*="/topics/"])' + url: + selector: '.blogPost a[href*="/blog/"]:not([href*="/topics/"])' + extractor: href diff --git a/lib/html2rss/configs/spotify.com/newsroom.yml b/lib/html2rss/configs/spotify.com/newsroom.yml new file mode 100644 index 0000000..6638e66 --- /dev/null +++ b/lib/html2rss/configs/spotify.com/newsroom.yml @@ -0,0 +1,17 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json +strategy: browserless + +channel: + url: https://newsroom.spotify.com/ + language: en + time_zone: UTC + ttl: 360 +selectors: + items: + selector: '.post-box.v2' + enhance: false + title: + selector: 'h3 a[href*="/20"]' + url: + selector: 'h3 a[href*="/20"]' + extractor: href diff --git a/spec/browserless_fetch_configs_spec.rb b/spec/browserless_fetch_configs_spec.rb new file mode 100644 index 0000000..3337533 --- /dev/null +++ b/spec/browserless_fetch_configs_spec.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +RSpec.describe BrowserlessFetchConfigs do + describe '.browserless_env_configured?' do + around do |example| + original_ws_url = ENV['BROWSERLESS_IO_WEBSOCKET_URL'] + original_api_token = ENV['BROWSERLESS_IO_API_TOKEN'] + + example.run + ensure + ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = original_ws_url + ENV['BROWSERLESS_IO_API_TOKEN'] = original_api_token + end + + it 'accepts the documented local Browserless websocket URL without a token' do + ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = 'ws://127.0.0.1:4002' + ENV['BROWSERLESS_IO_API_TOKEN'] = '' + + expect(described_class.browserless_env_configured?).to be(true) + end + + it 'accepts the legacy local Browserless websocket URL without a token' do + ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = 'ws://127.0.0.1:3000' + ENV['BROWSERLESS_IO_API_TOKEN'] = '' + + expect(described_class.browserless_env_configured?).to be(true) + end + + it 'requires a token for non-local websocket URLs' do + ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = 'wss://production.browserless.example/ws' + ENV['BROWSERLESS_IO_API_TOKEN'] = '' + + expect(described_class.browserless_env_configured?).to be(false) + end + end +end diff --git a/spec/support/browserless_fetch_configs.rb b/spec/support/browserless_fetch_configs.rb new file mode 100644 index 0000000..c4ebd6d --- /dev/null +++ b/spec/support/browserless_fetch_configs.rb @@ -0,0 +1,30 @@ +# frozen_string_literal: true + +module BrowserlessFetchConfigs + LOCAL_WS_URLS = %w[ + ws://127.0.0.1:3000 + ws://127.0.0.1:4002 + ].freeze + + CONFIGS = %w[ + apple.com/newsroom.yml + deepmind.google/blog.yml + notion.com/blog.yml + shopify.com/blog.yml + spotify.com/newsroom.yml + ].freeze + + module_function + + def include?(file_name) + CONFIGS.include?(file_name) + end + + def browserless_env_configured? + ws_url = ENV['BROWSERLESS_IO_WEBSOCKET_URL'].to_s + return false if ws_url.empty? + return true if LOCAL_WS_URLS.include?(ws_url) + + !ENV['BROWSERLESS_IO_API_TOKEN'].to_s.empty? + end +end diff --git a/spec/support/shared_examples/config.yml_spec.rb b/spec/support/shared_examples/config.yml_spec.rb index 0b7a788..31ba8f6 100644 --- a/spec/support/shared_examples/config.yml_spec.rb +++ b/spec/support/shared_examples/config.yml_spec.rb @@ -9,23 +9,12 @@ File.expand_path(File.join(__dir__, '..', '..', '..', 'lib', 'html2rss', 'configs', file_name)) end - let(:global_config) do - { - 'headers' => { - 'User-Agent': <<~UA.delete("\n") - Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) - AppleWebKit/537.36 (KHTML, like Gecko) - Chrome/134.0.0.0 - Safari/537.36' - UA - } - } - end let(:config) do feed_name = file_path.split(File::Separator)[-2..].join(File::Separator) config = {}.merge Html2rss::Configs.find_by_name(feed_name) - - config.merge!(global_config.dup) + # Reuse runtime browser defaults so fetch specs exercise the same header shape as production. + config[:headers] = Html2rss::Config::RequestHeaders.browser_defaults.merge(config.fetch(:headers, {})) + config[:strategy] = :browserless if BrowserlessFetchConfigs.include?(file_name) # Use provided params or extract defaults from parameters section if params @@ -115,6 +104,13 @@ context "when fetching #{params}", :fetch do subject(:feed) { Html2rss.feed(config.dup) } + before do + next unless BrowserlessFetchConfigs.include?(file_name) + next if BrowserlessFetchConfigs.browserless_env_configured? + + skip "Browserless fetch for #{file_name} requires BROWSERLESS_IO_WEBSOCKET_URL and, for custom endpoints, BROWSERLESS_IO_API_TOKEN" + end + it 'has positive amount of items' do expect(feed.items.count).to be_positive, <<~MSG No items fetched. @@ -141,6 +137,13 @@ let(:specified_attributes) { Html2rss::Selectors::ITEM_TAGS & %w[title description author category] } let(:text_attributes) { specified_attributes & %w[title description author] } + before do + next unless BrowserlessFetchConfigs.include?(file_name) + next if BrowserlessFetchConfigs.browserless_env_configured? + + skip "Browserless fetch for #{file_name} requires BROWSERLESS_IO_WEBSOCKET_URL and, for custom endpoints, BROWSERLESS_IO_API_TOKEN" + end + it 'has no empty text attributes', :aggregate_failures do text_attributes.each do |attribute_name| expect(item.public_send(attribute_name).to_s).not_to be_empty, attribute_name.to_s From f96147bd5a0a96834327eba8faa9143c52cfa4f4 Mon Sep 17 00:00:00 2001 From: Gil Desmarais Date: Sat, 28 Mar 2026 00:07:28 +0100 Subject: [PATCH 2/3] test: fix rubocop offenses in browserless specs --- spec/browserless_fetch_configs_spec.rb | 4 ++-- spec/support/shared_examples/config.yml_spec.rb | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/spec/browserless_fetch_configs_spec.rb b/spec/browserless_fetch_configs_spec.rb index 3337533..af1e190 100644 --- a/spec/browserless_fetch_configs_spec.rb +++ b/spec/browserless_fetch_configs_spec.rb @@ -3,8 +3,8 @@ RSpec.describe BrowserlessFetchConfigs do describe '.browserless_env_configured?' do around do |example| - original_ws_url = ENV['BROWSERLESS_IO_WEBSOCKET_URL'] - original_api_token = ENV['BROWSERLESS_IO_API_TOKEN'] + original_ws_url = ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', nil) + original_api_token = ENV.fetch('BROWSERLESS_IO_API_TOKEN', nil) example.run ensure diff --git a/spec/support/shared_examples/config.yml_spec.rb b/spec/support/shared_examples/config.yml_spec.rb index 31ba8f6..f01a58c 100644 --- a/spec/support/shared_examples/config.yml_spec.rb +++ b/spec/support/shared_examples/config.yml_spec.rb @@ -30,7 +30,7 @@ config end - context 'with the file' do # rubocop:disable RSpec/MultipleMemoizedHelpers + context 'with the file' do let(:host_name) { Helper.url_to_host_name yaml['channel']['url'] } let(:domain_name) { Helper.url_to_registrable_domain yaml['channel']['url'] } let(:dirname) { File.dirname(file_path).split(File::Separator).last } @@ -108,7 +108,10 @@ next unless BrowserlessFetchConfigs.include?(file_name) next if BrowserlessFetchConfigs.browserless_env_configured? - skip "Browserless fetch for #{file_name} requires BROWSERLESS_IO_WEBSOCKET_URL and, for custom endpoints, BROWSERLESS_IO_API_TOKEN" + skip( + "Browserless fetch for #{file_name} requires BROWSERLESS_IO_WEBSOCKET_URL and, " \ + 'for custom endpoints, BROWSERLESS_IO_API_TOKEN' + ) end it 'has positive amount of items' do @@ -141,7 +144,10 @@ next unless BrowserlessFetchConfigs.include?(file_name) next if BrowserlessFetchConfigs.browserless_env_configured? - skip "Browserless fetch for #{file_name} requires BROWSERLESS_IO_WEBSOCKET_URL and, for custom endpoints, BROWSERLESS_IO_API_TOKEN" + skip( + "Browserless fetch for #{file_name} requires BROWSERLESS_IO_WEBSOCKET_URL and, " \ + 'for custom endpoints, BROWSERLESS_IO_API_TOKEN' + ) end it 'has no empty text attributes', :aggregate_failures do From f0a22f3563676b372b3f199ef24b3d0f21be1ee4 Mon Sep 17 00:00:00 2001 From: Gil Desmarais Date: Sat, 28 Mar 2026 00:10:30 +0100 Subject: [PATCH 3/3] test: address PR review feedback for browserless lane --- README.md | 1 - lib/html2rss/configs/apple.com/newsroom.yml | 2 +- lib/html2rss/configs/spotify.com/newsroom.yml | 2 +- spec/browserless_fetch_configs_spec.rb | 7 +++++++ spec/support/shared_examples/config.yml_spec.rb | 5 ++--- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index b62430b..c585bb4 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,6 @@ make test-fetch-all-configs # Run the Browserless-backed fetch subset BROWSERLESS_IO_WEBSOCKET_URL=ws://127.0.0.1:4002 \ -BROWSERLESS_IO_API_TOKEN=... \ make test-fetch-browserless-configs ``` diff --git a/lib/html2rss/configs/apple.com/newsroom.yml b/lib/html2rss/configs/apple.com/newsroom.yml index a5b7b46..cfb0a85 100644 --- a/lib/html2rss/configs/apple.com/newsroom.yml +++ b/lib/html2rss/configs/apple.com/newsroom.yml @@ -8,7 +8,7 @@ channel: ttl: 360 selectors: items: - selector: 'li.tile-item a.tile-hero, li.tile-item a.tile-2up, li.tile-item a.tile-3up, li.tile-item a.tile-list' + selector: "li.tile-item a.tile-hero, li.tile-item a.tile-2up, li.tile-item a.tile-3up, li.tile-item a.tile-list" enhance: false title: selector: .tile__headline diff --git a/lib/html2rss/configs/spotify.com/newsroom.yml b/lib/html2rss/configs/spotify.com/newsroom.yml index 6638e66..9fc281c 100644 --- a/lib/html2rss/configs/spotify.com/newsroom.yml +++ b/lib/html2rss/configs/spotify.com/newsroom.yml @@ -8,7 +8,7 @@ channel: ttl: 360 selectors: items: - selector: '.post-box.v2' + selector: ".post-box.v2" enhance: false title: selector: 'h3 a[href*="/20"]' diff --git a/spec/browserless_fetch_configs_spec.rb b/spec/browserless_fetch_configs_spec.rb index af1e190..a46848b 100644 --- a/spec/browserless_fetch_configs_spec.rb +++ b/spec/browserless_fetch_configs_spec.rb @@ -32,5 +32,12 @@ expect(described_class.browserless_env_configured?).to be(false) end + + it 'accepts non-local websocket URLs when a token is present' do + ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = 'wss://production.browserless.example/ws' + ENV['BROWSERLESS_IO_API_TOKEN'] = 'secret-token' + + expect(described_class.browserless_env_configured?).to be(true) + end end end diff --git a/spec/support/shared_examples/config.yml_spec.rb b/spec/support/shared_examples/config.yml_spec.rb index f01a58c..a2daa4f 100644 --- a/spec/support/shared_examples/config.yml_spec.rb +++ b/spec/support/shared_examples/config.yml_spec.rb @@ -14,7 +14,6 @@ config = {}.merge Html2rss::Configs.find_by_name(feed_name) # Reuse runtime browser defaults so fetch specs exercise the same header shape as production. config[:headers] = Html2rss::Config::RequestHeaders.browser_defaults.merge(config.fetch(:headers, {})) - config[:strategy] = :browserless if BrowserlessFetchConfigs.include?(file_name) # Use provided params or extract defaults from parameters section if params @@ -105,7 +104,7 @@ subject(:feed) { Html2rss.feed(config.dup) } before do - next unless BrowserlessFetchConfigs.include?(file_name) + next unless config[:strategy].to_s == 'browserless' next if BrowserlessFetchConfigs.browserless_env_configured? skip( @@ -141,7 +140,7 @@ let(:text_attributes) { specified_attributes & %w[title description author] } before do - next unless BrowserlessFetchConfigs.include?(file_name) + next unless config[:strategy].to_s == 'browserless' next if BrowserlessFetchConfigs.browserless_env_configured? skip(