NOTE(review): this patch had lost its line breaks; they are restored below so that every
hunk matches the line counts in its @@ header. Leading indentation and Makefile tabs were
collapsed too and have been inferred -- verify context-line whitespace against the tree
before applying, or apply with `git apply --ignore-whitespace`.

diff --git a/Makefile b/Makefile
index 4126c24..71c26de 100644
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,10 @@ test-fetch-changed-configs:
 	bin/rspec_changed_configs
 
 test-fetch-all-configs:
-	bundle exec rspec --tag fetch spec/html2rss/configs
+	bundle exec rspec --tag fetch spec/html2rss/configs_dynamic_spec.rb
+
+test-fetch-browserless-configs:
+	bin/rspec_browserless_configs
 
 test-all: test test-fetch-all-configs
 
diff --git a/README.md b/README.md
index ae1cbbb..c585bb4 100644
--- a/README.md
+++ b/README.md
@@ -76,10 +76,22 @@ make test-config CONFIG=github.com/releases.yml
 
 # Test domain
 make test-domain DOMAIN=github.com
+
+# Run live fetch tests for the full corpus
+make test-fetch-all-configs
+
+# Run the Browserless-backed fetch subset
+BROWSERLESS_IO_WEBSOCKET_URL=ws://127.0.0.1:4002 \
+make test-fetch-browserless-configs
 ```
 
 **Adding new configs**: Create the YAML file, run `make validate`, then run the generated tests. No dedicated spec file is needed.
 
+The fetch suite has two lanes:
+
+- `make test-fetch-all-configs` runs all `:fetch` examples. Configs marked as Browserless-backed are skipped unless Browserless env vars are configured.
+- `make test-fetch-browserless-configs` runs only the Browserless-backed config subset and requires `BROWSERLESS_IO_WEBSOCKET_URL`. Custom endpoints also require `BROWSERLESS_IO_API_TOKEN`.
+
 **Config folder convention**: Place configs under the registrable domain folder (e.g., `example.com/` or `bbc.co.uk/`). Legacy subdomain folders (e.g., `news.example.com/`) are allowed but not preferred.
 
 ## Editor Setup (JSON Schema)
diff --git a/bin/rspec_browserless_configs b/bin/rspec_browserless_configs
new file mode 100755
index 0000000..7af76f2
--- /dev/null
+++ b/bin/rspec_browserless_configs
@@ -0,0 +1,19 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require_relative '../spec/support/browserless_fetch_configs'
+
+unless BrowserlessFetchConfigs.browserless_env_configured?
+  warn 'BROWSERLESS_IO_WEBSOCKET_URL is required for browserless fetch tests.'
+  warn 'Set BROWSERLESS_IO_API_TOKEN as well when using a custom websocket endpoint.'
+  exit 1
+end
+
+args = ['bundle', 'exec', 'rspec', '--tag', 'fetch']
+BrowserlessFetchConfigs::CONFIGS.each do |config|
+  args << '--example'
+  args << config
+end
+args << 'spec/html2rss/configs_dynamic_spec.rb'
+
+exec(*args)
diff --git a/lib/html2rss/configs/apple.com/newsroom.yml b/lib/html2rss/configs/apple.com/newsroom.yml
new file mode 100644
index 0000000..cfb0a85
--- /dev/null
+++ b/lib/html2rss/configs/apple.com/newsroom.yml
@@ -0,0 +1,18 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json
+strategy: browserless
+
+channel:
+  url: https://www.apple.com/newsroom/
+  language: en
+  time_zone: UTC
+  ttl: 360
+selectors:
+  items:
+    selector: "li.tile-item a.tile-hero, li.tile-item a.tile-2up, li.tile-item a.tile-3up, li.tile-item a.tile-list"
+    enhance: false
+  title:
+    selector: .tile__headline
+  url:
+    extractor: href
+  published_at:
+    selector: .tile__timestamp
diff --git a/lib/html2rss/configs/deepmind.google/blog.yml b/lib/html2rss/configs/deepmind.google/blog.yml
new file mode 100644
index 0000000..21c3b05
--- /dev/null
+++ b/lib/html2rss/configs/deepmind.google/blog.yml
@@ -0,0 +1,19 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json
+strategy: browserless
+
+channel:
+  url: https://deepmind.google/blog/
+  language: en
+  time_zone: UTC
+  ttl: 360
+selectors:
+  items:
+    selector: .card__inner
+    enhance: false
+  title:
+    selector: h3
+  url:
+    selector: .card__overlay-link
+    extractor: href
+  published_at:
+    selector: time
diff --git a/lib/html2rss/configs/notion.com/blog.yml b/lib/html2rss/configs/notion.com/blog.yml
new file mode 100644
index 0000000..bbeecd9
--- /dev/null
+++ b/lib/html2rss/configs/notion.com/blog.yml
@@ -0,0 +1,19 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json
+strategy: browserless
+
+channel:
+  url: https://www.notion.com/blog
+  language: en
+  time_zone: UTC
+  ttl: 360
+selectors:
+  items:
+    selector: article.post-preview
+    enhance: false
+  title:
+    selector: h3 a[href*="/blog/"]
+  url:
+    selector: h3 a[href*="/blog/"]
+    extractor: href
+  description:
+    selector: '> a[href*="/blog/"]:not([title])'
diff --git a/lib/html2rss/configs/shopify.com/blog.yml b/lib/html2rss/configs/shopify.com/blog.yml
new file mode 100644
index 0000000..6c796c9
--- /dev/null
+++ b/lib/html2rss/configs/shopify.com/blog.yml
@@ -0,0 +1,16 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json
+strategy: browserless
+
+channel:
+  url: https://www.shopify.com/blog/latest
+  time_zone: UTC
+  ttl: 360
+selectors:
+  items:
+    selector: article.article--index
+    enhance: false
+  title:
+    selector: '.blogPost a[href*="/blog/"]:not([href*="/topics/"])'
+  url:
+    selector: '.blogPost a[href*="/blog/"]:not([href*="/topics/"])'
+    extractor: href
diff --git a/lib/html2rss/configs/spotify.com/newsroom.yml b/lib/html2rss/configs/spotify.com/newsroom.yml
new file mode 100644
index 0000000..9fc281c
--- /dev/null
+++ b/lib/html2rss/configs/spotify.com/newsroom.yml
@@ -0,0 +1,17 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json
+strategy: browserless
+
+channel:
+  url: https://newsroom.spotify.com/
+  language: en
+  time_zone: UTC
+  ttl: 360
+selectors:
+  items:
+    selector: ".post-box.v2"
+    enhance: false
+  title:
+    selector: 'h3 a[href*="/20"]'
+  url:
+    selector: 'h3 a[href*="/20"]'
+    extractor: href
diff --git a/spec/browserless_fetch_configs_spec.rb b/spec/browserless_fetch_configs_spec.rb
new file mode 100644
index 0000000..a46848b
--- /dev/null
+++ b/spec/browserless_fetch_configs_spec.rb
@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+
+RSpec.describe BrowserlessFetchConfigs do
+  describe '.browserless_env_configured?' do
+    around do |example|
+      original_ws_url = ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', nil)
+      original_api_token = ENV.fetch('BROWSERLESS_IO_API_TOKEN', nil)
+
+      example.run
+    ensure
+      ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = original_ws_url
+      ENV['BROWSERLESS_IO_API_TOKEN'] = original_api_token
+    end
+
+    it 'accepts the documented local Browserless websocket URL without a token' do
+      ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = 'ws://127.0.0.1:4002'
+      ENV['BROWSERLESS_IO_API_TOKEN'] = ''
+
+      expect(described_class.browserless_env_configured?).to be(true)
+    end
+
+    it 'accepts the legacy local Browserless websocket URL without a token' do
+      ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = 'ws://127.0.0.1:3000'
+      ENV['BROWSERLESS_IO_API_TOKEN'] = ''
+
+      expect(described_class.browserless_env_configured?).to be(true)
+    end
+
+    it 'requires a token for non-local websocket URLs' do
+      ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = 'wss://production.browserless.example/ws'
+      ENV['BROWSERLESS_IO_API_TOKEN'] = ''
+
+      expect(described_class.browserless_env_configured?).to be(false)
+    end
+
+    it 'accepts non-local websocket URLs when a token is present' do
+      ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = 'wss://production.browserless.example/ws'
+      ENV['BROWSERLESS_IO_API_TOKEN'] = 'secret-token'
+
+      expect(described_class.browserless_env_configured?).to be(true)
+    end
+  end
+end
diff --git a/spec/support/browserless_fetch_configs.rb b/spec/support/browserless_fetch_configs.rb
new file mode 100644
index 0000000..c4ebd6d
--- /dev/null
+++ b/spec/support/browserless_fetch_configs.rb
@@ -0,0 +1,30 @@
+# frozen_string_literal: true
+
+module BrowserlessFetchConfigs
+  LOCAL_WS_URLS = %w[
+    ws://127.0.0.1:3000
+    ws://127.0.0.1:4002
+  ].freeze
+
+  CONFIGS = %w[
+    apple.com/newsroom.yml
+    deepmind.google/blog.yml
+    notion.com/blog.yml
+    shopify.com/blog.yml
+    spotify.com/newsroom.yml
+  ].freeze
+
+  module_function
+
+  def include?(file_name)
+    CONFIGS.include?(file_name)
+  end
+
+  def browserless_env_configured?
+    ws_url = ENV['BROWSERLESS_IO_WEBSOCKET_URL'].to_s
+    return false if ws_url.empty?
+    return true if LOCAL_WS_URLS.include?(ws_url)
+
+    !ENV['BROWSERLESS_IO_API_TOKEN'].to_s.empty?
+  end
+end
diff --git a/spec/support/shared_examples/config.yml_spec.rb b/spec/support/shared_examples/config.yml_spec.rb
index 0b7a788..a2daa4f 100644
--- a/spec/support/shared_examples/config.yml_spec.rb
+++ b/spec/support/shared_examples/config.yml_spec.rb
@@ -9,23 +9,11 @@
     File.expand_path(File.join(__dir__, '..', '..', '..', 'lib', 'html2rss', 'configs', file_name))
   end
 
-  let(:global_config) do
-    {
-      'headers' => {
-        'User-Agent': <<~UA.delete("\n")
-          Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)
-          AppleWebKit/537.36 (KHTML, like Gecko)
-          Chrome/134.0.0.0
-          Safari/537.36'
-        UA
-      }
-    }
-  end
   let(:config) do
     feed_name = file_path.split(File::Separator)[-2..].join(File::Separator)
     config = {}.merge Html2rss::Configs.find_by_name(feed_name)
-
-    config.merge!(global_config.dup)
+    # Reuse runtime browser defaults so fetch specs exercise the same header shape as production.
+    config[:headers] = Html2rss::Config::RequestHeaders.browser_defaults.merge(config.fetch(:headers, {}))
 
     # Use provided params or extract defaults from parameters section
     if params
@@ -41,7 +29,7 @@
     config
   end
 
-  context 'with the file' do # rubocop:disable RSpec/MultipleMemoizedHelpers
+  context 'with the file' do
     let(:host_name) { Helper.url_to_host_name yaml['channel']['url'] }
     let(:domain_name) { Helper.url_to_registrable_domain yaml['channel']['url'] }
     let(:dirname) { File.dirname(file_path).split(File::Separator).last }
@@ -115,6 +103,16 @@
   context "when fetching #{params}", :fetch do
     subject(:feed) { Html2rss.feed(config.dup) }
 
+    before do
+      next unless config[:strategy].to_s == 'browserless'
+      next if BrowserlessFetchConfigs.browserless_env_configured?
+
+      skip(
+        "Browserless fetch for #{file_name} requires BROWSERLESS_IO_WEBSOCKET_URL and, " \
+        'for custom endpoints, BROWSERLESS_IO_API_TOKEN'
+      )
+    end
+
     it 'has positive amount of items' do
       expect(feed.items.count).to be_positive, <<~MSG
         No items fetched.
@@ -141,6 +139,16 @@
       let(:specified_attributes) { Html2rss::Selectors::ITEM_TAGS & %w[title description author category] }
       let(:text_attributes) { specified_attributes & %w[title description author] }
 
+      before do
+        next unless config[:strategy].to_s == 'browserless'
+        next if BrowserlessFetchConfigs.browserless_env_configured?
+
+        skip(
+          "Browserless fetch for #{file_name} requires BROWSERLESS_IO_WEBSOCKET_URL and, " \
+          'for custom endpoints, BROWSERLESS_IO_API_TOKEN'
+        )
+      end
+
       it 'has no empty text attributes', :aggregate_failures do
         text_attributes.each do |attribute_name|
           expect(item.public_send(attribute_name).to_s).not_to be_empty, attribute_name.to_s