Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ test-fetch-changed-configs:
bin/rspec_changed_configs

test-fetch-all-configs:
bundle exec rspec --tag fetch spec/html2rss/configs
bundle exec rspec --tag fetch spec/html2rss/configs_dynamic_spec.rb

test-fetch-browserless-configs:
bin/rspec_browserless_configs

test-all: test test-fetch-all-configs

Expand Down
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,22 @@ make test-config CONFIG=github.com/releases.yml

# Test domain
make test-domain DOMAIN=github.com

# Run live fetch tests for the full corpus
make test-fetch-all-configs

# Run the Browserless-backed fetch subset
BROWSERLESS_IO_WEBSOCKET_URL=ws://127.0.0.1:4002 \
make test-fetch-browserless-configs
```

**Adding new configs**: Create the YAML file, run `make validate`, then run the generated tests. No dedicated spec file is needed.

The fetch suite has two lanes:

- `make test-fetch-all-configs` runs all `:fetch` examples. Configs marked as Browserless-backed are skipped unless Browserless env vars are configured.
- `make test-fetch-browserless-configs` runs only the Browserless-backed config subset and requires `BROWSERLESS_IO_WEBSOCKET_URL`. Any endpoint other than the local defaults (`ws://127.0.0.1:3000` or `ws://127.0.0.1:4002`) also requires `BROWSERLESS_IO_API_TOKEN`.

**Config folder convention**: Place configs under the registrable domain folder (e.g., `example.com/` or `bbc.co.uk/`). Legacy subdomain folders (e.g., `news.example.com/`) are allowed but not preferred.

## Editor Setup (JSON Schema)
Expand Down
19 changes: 19 additions & 0 deletions bin/rspec_browserless_configs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

# Runs the `fetch`-tagged examples of the dynamic config spec, restricted to
# the configs listed in BrowserlessFetchConfigs::CONFIGS.
#
# Exits with status 1 (after printing a hint to stderr) when
# BrowserlessFetchConfigs reports that the Browserless environment is not
# configured. Otherwise this process is replaced by the rspec command.

require_relative '../spec/support/browserless_fetch_configs'

env_ready = BrowserlessFetchConfigs.browserless_env_configured?

unless env_ready
  warn(
    'BROWSERLESS_IO_WEBSOCKET_URL is required for browserless fetch tests.',
    'Set BROWSERLESS_IO_API_TOKEN as well when using a custom websocket endpoint.'
  )
  exit 1
end

# One `--example <config>` pair per Browserless-backed config.
example_filters = BrowserlessFetchConfigs::CONFIGS.flat_map do |config_name|
  ['--example', config_name]
end

command = %w[bundle exec rspec --tag fetch] +
          example_filters +
          ['spec/html2rss/configs_dynamic_spec.rb']

exec(*command)
18 changes: 18 additions & 0 deletions lib/html2rss/configs/apple.com/newsroom.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json
strategy: browserless

channel:
url: https://www.apple.com/newsroom/
language: en
time_zone: UTC
ttl: 360
selectors:
items:
selector: "li.tile-item a.tile-hero, li.tile-item a.tile-2up, li.tile-item a.tile-3up, li.tile-item a.tile-list"
enhance: false
title:
selector: .tile__headline
url:
extractor: href
published_at:
selector: .tile__timestamp
19 changes: 19 additions & 0 deletions lib/html2rss/configs/deepmind.google/blog.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json
strategy: browserless

channel:
url: https://deepmind.google/blog/
language: en
time_zone: UTC
ttl: 360
selectors:
items:
selector: .card__inner
enhance: false
title:
selector: h3
url:
selector: .card__overlay-link
extractor: href
published_at:
selector: time
19 changes: 19 additions & 0 deletions lib/html2rss/configs/notion.com/blog.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json
strategy: browserless

channel:
url: https://www.notion.com/blog
language: en
time_zone: UTC
ttl: 360
selectors:
items:
selector: article.post-preview
enhance: false
title:
selector: h3 a[href*="/blog/"]
url:
selector: h3 a[href*="/blog/"]
extractor: href
description:
selector: '> a[href*="/blog/"]:not([title])'
16 changes: 16 additions & 0 deletions lib/html2rss/configs/shopify.com/blog.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json
strategy: browserless

channel:
url: https://www.shopify.com/blog/latest
time_zone: UTC
ttl: 360
selectors:
items:
selector: article.article--index
enhance: false
title:
selector: '.blogPost a[href*="/blog/"]:not([href*="/topics/"])'
url:
selector: '.blogPost a[href*="/blog/"]:not([href*="/topics/"])'
extractor: href
17 changes: 17 additions & 0 deletions lib/html2rss/configs/spotify.com/newsroom.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json
strategy: browserless

channel:
url: https://newsroom.spotify.com/
language: en
time_zone: UTC
ttl: 360
selectors:
items:
selector: ".post-box.v2"
enhance: false
title:
selector: 'h3 a[href*="/20"]'
url:
selector: 'h3 a[href*="/20"]'
extractor: href
43 changes: 43 additions & 0 deletions spec/browserless_fetch_configs_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# frozen_string_literal: true

# Specs for the environment check that decides whether Browserless-backed
# fetch examples can run.
RSpec.describe BrowserlessFetchConfigs do
  describe '.browserless_env_configured?' do
    let(:ws_url_key) { 'BROWSERLESS_IO_WEBSOCKET_URL' }
    let(:api_token_key) { 'BROWSERLESS_IO_API_TOKEN' }

    # Snapshot both variables before each example and restore them afterwards,
    # so examples can mutate ENV freely without leaking into other specs.
    around do |example|
      original_ws_url = ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', nil)
      original_api_token = ENV.fetch('BROWSERLESS_IO_API_TOKEN', nil)

      example.run
    ensure
      # Assigning nil to an ENV key deletes the variable, which restores the
      # "was not set" state as well as any previous value.
      ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = original_ws_url
      ENV['BROWSERLESS_IO_API_TOKEN'] = original_api_token
    end

    it 'is not configured when the websocket URL is unset, even with a token' do
      ENV.delete(ws_url_key)
      ENV[api_token_key] = 'secret-token'

      expect(described_class.browserless_env_configured?).to be(false)
    end

    it 'accepts the documented local Browserless websocket URL without a token' do
      ENV[ws_url_key] = 'ws://127.0.0.1:4002'
      ENV[api_token_key] = ''

      expect(described_class.browserless_env_configured?).to be(true)
    end

    it 'accepts the legacy local Browserless websocket URL without a token' do
      ENV[ws_url_key] = 'ws://127.0.0.1:3000'
      ENV[api_token_key] = ''

      expect(described_class.browserless_env_configured?).to be(true)
    end

    it 'requires a token for non-local websocket URLs' do
      ENV[ws_url_key] = 'wss://production.browserless.example/ws'
      ENV[api_token_key] = ''

      expect(described_class.browserless_env_configured?).to be(false)
    end

    it 'accepts non-local websocket URLs when a token is present' do
      ENV[ws_url_key] = 'wss://production.browserless.example/ws'
      ENV[api_token_key] = 'secret-token'

      expect(described_class.browserless_env_configured?).to be(true)
    end
  end
end
30 changes: 30 additions & 0 deletions spec/support/browserless_fetch_configs.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# frozen_string_literal: true

# Spec-support helpers describing which configs are fetched through
# Browserless and whether the environment provides what is needed to run them.
module BrowserlessFetchConfigs
  # Websocket URLs that are accepted without an API token.
  LOCAL_WS_URLS = %w[
    ws://127.0.0.1:3000
    ws://127.0.0.1:4002
  ].freeze

  # Config file names (relative to the configs directory) in the Browserless subset.
  CONFIGS = %w[
    apple.com/newsroom.yml
    deepmind.google/blog.yml
    notion.com/blog.yml
    shopify.com/blog.yml
    spotify.com/newsroom.yml
  ].freeze

  module_function

  # @param file_name [String] config file name, e.g. "apple.com/newsroom.yml"
  # @return [Boolean] whether the name is listed in CONFIGS
  def include?(file_name)
    CONFIGS.include?(file_name)
  end

  # Reports whether the Browserless environment variables are usable.
  #
  # BROWSERLESS_IO_WEBSOCKET_URL must be non-empty. A URL listed in
  # LOCAL_WS_URLS is sufficient on its own; any other URL additionally needs a
  # non-empty BROWSERLESS_IO_API_TOKEN.
  #
  # @return [Boolean]
  def browserless_env_configured?
    endpoint = ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', '')

    if endpoint.empty?
      false
    elsif LOCAL_WS_URLS.include?(endpoint)
      true
    else
      !ENV.fetch('BROWSERLESS_IO_API_TOKEN', '').empty?
    end
  end
end
38 changes: 23 additions & 15 deletions spec/support/shared_examples/config.yml_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,11 @@
File.expand_path(File.join(__dir__, '..', '..', '..', 'lib', 'html2rss', 'configs', file_name))
end

let(:global_config) do
{
'headers' => {
'User-Agent': <<~UA.delete("\n")
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)
AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/134.0.0.0
Safari/537.36'
UA
}
}
end
let(:config) do
feed_name = file_path.split(File::Separator)[-2..].join(File::Separator)
config = {}.merge Html2rss::Configs.find_by_name(feed_name)

config.merge!(global_config.dup)
# Reuse runtime browser defaults so fetch specs exercise the same header shape as production.
config[:headers] = Html2rss::Config::RequestHeaders.browser_defaults.merge(config.fetch(:headers, {}))

# Use provided params or extract defaults from parameters section
if params
Expand All @@ -41,7 +29,7 @@
config
end

context 'with the file' do # rubocop:disable RSpec/MultipleMemoizedHelpers
context 'with the file' do
let(:host_name) { Helper.url_to_host_name yaml['channel']['url'] }
let(:domain_name) { Helper.url_to_registrable_domain yaml['channel']['url'] }
let(:dirname) { File.dirname(file_path).split(File::Separator).last }
Expand Down Expand Up @@ -115,6 +103,16 @@
context "when fetching #{params}", :fetch do
subject(:feed) { Html2rss.feed(config.dup) }

# Skip the examples in this context when the config's strategy is
# 'browserless' but BrowserlessFetchConfigs reports the Browserless
# environment as not configured. Any other strategy runs unconditionally.
before do
next unless config[:strategy].to_s == 'browserless'
next if BrowserlessFetchConfigs.browserless_env_configured?

skip(
"Browserless fetch for #{file_name} requires BROWSERLESS_IO_WEBSOCKET_URL and, " \
'for custom endpoints, BROWSERLESS_IO_API_TOKEN'
)
end

it 'has positive amount of items' do
expect(feed.items.count).to be_positive, <<~MSG
No items fetched.
Expand All @@ -141,6 +139,16 @@
let(:specified_attributes) { Html2rss::Selectors::ITEM_TAGS & %w[title description author category] }
let(:text_attributes) { specified_attributes & %w[title description author] }

# Skip the examples in this context when the config's strategy is
# 'browserless' but BrowserlessFetchConfigs reports the Browserless
# environment as not configured. Any other strategy runs unconditionally.
before do
next unless config[:strategy].to_s == 'browserless'
next if BrowserlessFetchConfigs.browserless_env_configured?

skip(
"Browserless fetch for #{file_name} requires BROWSERLESS_IO_WEBSOCKET_URL and, " \
'for custom endpoints, BROWSERLESS_IO_API_TOKEN'
)
end

it 'has no empty text attributes', :aggregate_failures do
text_attributes.each do |attribute_name|
expect(item.public_send(attribute_name).to_s).not_to be_empty, attribute_name.to_s
Expand Down
Loading