Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ test-fetch-changed-configs:
bin/rspec_changed_configs

test-fetch-all-configs:
bundle exec rspec --tag fetch spec/html2rss/configs
bundle exec rspec --tag fetch spec/html2rss/configs_dynamic_spec.rb

test-fetch-browserless-configs:
bin/rspec_browserless_configs

test-all: test test-fetch-all-configs

Expand Down
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,23 @@ make test-config CONFIG=github.com/releases.yml

# Test domain
make test-domain DOMAIN=github.com

# Run live fetch tests for the full corpus
make test-fetch-all-configs

# Run the Browserless-backed fetch subset
BROWSERLESS_IO_WEBSOCKET_URL=ws://127.0.0.1:4002 \
BROWSERLESS_IO_API_TOKEN=... \
Comment thread
gildesmarais marked this conversation as resolved.
Outdated
make test-fetch-browserless-configs
```

**Adding new configs**: Create the YAML file, run `make validate`, then run the generated tests. No dedicated spec file is needed.

The fetch suite has two lanes:

- `make test-fetch-all-configs` runs all `:fetch` examples. Configs marked as Browserless-backed are skipped unless Browserless env vars are configured.
- `make test-fetch-browserless-configs` runs only the Browserless-backed config subset and requires `BROWSERLESS_IO_WEBSOCKET_URL`. Custom endpoints also require `BROWSERLESS_IO_API_TOKEN`.

**Config folder convention**: Place configs under the registrable domain folder (e.g., `example.com/` or `bbc.co.uk/`). Legacy subdomain folders (e.g., `news.example.com/`) are allowed but not preferred.

## Editor Setup (JSON Schema)
Expand Down
19 changes: 19 additions & 0 deletions bin/rspec_browserless_configs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

require_relative '../spec/support/browserless_fetch_configs'

# Launcher for the Browserless-backed fetch subset: checks the environment,
# then replaces this process with an rspec run limited to the listed configs.

# Bail out with exit status 1 when the Browserless environment is not configured.
unless BrowserlessFetchConfigs.browserless_env_configured?
  warn(
    'BROWSERLESS_IO_WEBSOCKET_URL is required for browserless fetch tests.',
    'Set BROWSERLESS_IO_API_TOKEN as well when using a custom websocket endpoint.'
  )
  exit(1)
end

# One `--example <config>` pair per Browserless-backed config.
# NOTE(review): confirm how rspec-core combines `--tag fetch` with `--example`;
# description filters may take precedence over tag filters.
example_filters = BrowserlessFetchConfigs::CONFIGS.flat_map { |config_name| ['--example', config_name] }
command = %w[bundle exec rspec --tag fetch] + example_filters + ['spec/html2rss/configs_dynamic_spec.rb']

exec(*command)
18 changes: 18 additions & 0 deletions lib/html2rss/configs/apple.com/newsroom.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json
# Feed config for the Apple Newsroom landing page, requested with the browserless strategy.
strategy: browserless

channel:
url: https://www.apple.com/newsroom/
language: en
time_zone: UTC
ttl: 360
selectors:
# An item is any of the four tile link variants found inside an li.tile-item.
items:
selector: 'li.tile-item a.tile-hero, li.tile-item a.tile-2up, li.tile-item a.tile-3up, li.tile-item a.tile-list'
enhance: false
title:
selector: .tile__headline
# No selector given: presumably the href is read from the matched item link itself — confirm.
url:
extractor: href
published_at:
selector: .tile__timestamp
19 changes: 19 additions & 0 deletions lib/html2rss/configs/deepmind.google/blog.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json
# Feed config for the deepmind.google blog index, requested with the browserless strategy.
strategy: browserless

channel:
url: https://deepmind.google/blog/
language: en
time_zone: UTC
ttl: 360
selectors:
# Each .card__inner element is treated as one feed item.
items:
selector: .card__inner
enhance: false
title:
selector: h3
# The item link is the href of the card's overlay link.
url:
selector: .card__overlay-link
extractor: href
# Publication date is taken from the card's <time> element.
published_at:
selector: time
19 changes: 19 additions & 0 deletions lib/html2rss/configs/notion.com/blog.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json
# Feed config for the Notion blog index, requested with the browserless strategy.
strategy: browserless

channel:
url: https://www.notion.com/blog
language: en
time_zone: UTC
ttl: 360
selectors:
# Each article.post-preview element is treated as one feed item.
items:
selector: article.post-preview
enhance: false
# Title and url are both read from the same heading link pointing into /blog/.
title:
selector: h3 a[href*="/blog/"]
url:
selector: h3 a[href*="/blog/"]
extractor: href
# Leading `>` presumably restricts the match to direct children of the item element;
# the :not([title]) part excludes links carrying a title attribute — confirm against the live markup.
description:
selector: '> a[href*="/blog/"]:not([title])'
16 changes: 16 additions & 0 deletions lib/html2rss/configs/shopify.com/blog.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json
# Feed config for the Shopify blog "latest" listing, requested with the browserless strategy.
strategy: browserless

# NOTE(review): no `language` key is set for this channel — confirm that is intentional.
channel:
url: https://www.shopify.com/blog/latest
time_zone: UTC
ttl: 360
selectors:
# Each article.article--index element is treated as one feed item.
items:
selector: article.article--index
enhance: false
# Title and url share one link selector: links into /blog/ whose href does not contain /topics/.
title:
selector: '.blogPost a[href*="/blog/"]:not([href*="/topics/"])'
url:
selector: '.blogPost a[href*="/blog/"]:not([href*="/topics/"])'
extractor: href
17 changes: 17 additions & 0 deletions lib/html2rss/configs/spotify.com/newsroom.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json
# Feed config for the Spotify newsroom front page, requested with the browserless strategy.
strategy: browserless

channel:
url: https://newsroom.spotify.com/
language: en
time_zone: UTC
ttl: 360
selectors:
# Each element carrying both the post-box and v2 classes is treated as one feed item.
items:
selector: '.post-box.v2'
enhance: false
# Title and url share one selector: heading links whose href contains "/20"
# (presumably year-prefixed post paths — confirm against the live markup).
title:
selector: 'h3 a[href*="/20"]'
url:
selector: 'h3 a[href*="/20"]'
extractor: href
36 changes: 36 additions & 0 deletions spec/browserless_fetch_configs_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# frozen_string_literal: true

# Exercises the environment gate that decides whether Browserless-backed fetch specs can run.
RSpec.describe BrowserlessFetchConfigs do
  describe '.browserless_env_configured?' do
    subject(:configured) { described_class.browserless_env_configured? }

    # Every example mutates both variables, so snapshot them first and restore them afterwards.
    around do |example|
      snapshot = %w[BROWSERLESS_IO_WEBSOCKET_URL BROWSERLESS_IO_API_TOKEN].to_h { |name| [name, ENV[name]] }

      begin
        example.run
      ensure
        # Assigning nil removes a variable that was unset before the example ran.
        snapshot.each { |name, value| ENV[name] = value }
      end
    end

    # Points the environment at +ws_url+ and blanks the API token.
    def use_endpoint_without_token(ws_url)
      ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = ws_url
      ENV['BROWSERLESS_IO_API_TOKEN'] = ''
    end

    it 'accepts the documented local Browserless websocket URL without a token' do
      use_endpoint_without_token('ws://127.0.0.1:4002')

      expect(configured).to be(true)
    end

    it 'accepts the legacy local Browserless websocket URL without a token' do
      use_endpoint_without_token('ws://127.0.0.1:3000')

      expect(configured).to be(true)
    end

    it 'requires a token for non-local websocket URLs' do
      use_endpoint_without_token('wss://production.browserless.example/ws')

      expect(configured).to be(false)
    end
  end
end
30 changes: 30 additions & 0 deletions spec/support/browserless_fetch_configs.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# frozen_string_literal: true

# Lists the configs that belong to the Browserless-backed fetch subset and
# answers whether the current environment is configured for Browserless.
module BrowserlessFetchConfigs
  # Websocket endpoints that are accepted without an API token.
  LOCAL_WS_URLS = %w[
    ws://127.0.0.1:3000
    ws://127.0.0.1:4002
  ].freeze

  # Config file names, written as "<folder>/<file>.yml".
  CONFIGS = %w[
    apple.com/newsroom.yml
    deepmind.google/blog.yml
    notion.com/blog.yml
    shopify.com/blog.yml
    spotify.com/newsroom.yml
  ].freeze

  module_function

  # @param file_name [String] config name, e.g. "apple.com/newsroom.yml"
  # @return [Boolean] true when the name is listed in CONFIGS
  def include?(file_name)
    CONFIGS.any? { |config_name| config_name == file_name }
  end

  # Reads BROWSERLESS_IO_WEBSOCKET_URL and BROWSERLESS_IO_API_TOKEN from ENV.
  #
  # @return [Boolean] false when no websocket URL is set; true for a URL listed
  #   in LOCAL_WS_URLS; otherwise true only when the API token is non-empty
  def browserless_env_configured?
    endpoint = ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', '')

    if endpoint.empty?
      false
    elsif LOCAL_WS_URLS.include?(endpoint)
      true
    else
      # Any endpoint outside the local list needs a token.
      api_token = ENV.fetch('BROWSERLESS_IO_API_TOKEN', '')
      !api_token.empty?
    end
  end
end
31 changes: 17 additions & 14 deletions spec/support/shared_examples/config.yml_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,12 @@
File.expand_path(File.join(__dir__, '..', '..', '..', 'lib', 'html2rss', 'configs', file_name))
end

let(:global_config) do
{
'headers' => {
'User-Agent': <<~UA.delete("\n")
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)
AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/134.0.0.0
Safari/537.36'
UA
}
}
end
let(:config) do
feed_name = file_path.split(File::Separator)[-2..].join(File::Separator)
config = {}.merge Html2rss::Configs.find_by_name(feed_name)

config.merge!(global_config.dup)
# Reuse runtime browser defaults so fetch specs exercise the same header shape as production.
config[:headers] = Html2rss::Config::RequestHeaders.browser_defaults.merge(config.fetch(:headers, {}))
config[:strategy] = :browserless if BrowserlessFetchConfigs.include?(file_name)
Comment thread
gildesmarais marked this conversation as resolved.
Outdated

# Use provided params or extract defaults from parameters section
if params
Expand Down Expand Up @@ -115,6 +104,13 @@
context "when fetching #{params}", :fetch do
subject(:feed) { Html2rss.feed(config.dup) }

before do
next unless BrowserlessFetchConfigs.include?(file_name)
next if BrowserlessFetchConfigs.browserless_env_configured?

skip "Browserless fetch for #{file_name} requires BROWSERLESS_IO_WEBSOCKET_URL and, for custom endpoints, BROWSERLESS_IO_API_TOKEN"
end

it 'has positive amount of items' do
expect(feed.items.count).to be_positive, <<~MSG
No items fetched.
Expand All @@ -141,6 +137,13 @@
let(:specified_attributes) { Html2rss::Selectors::ITEM_TAGS & %w[title description author category] }
let(:text_attributes) { specified_attributes & %w[title description author] }

before do
next unless BrowserlessFetchConfigs.include?(file_name)
next if BrowserlessFetchConfigs.browserless_env_configured?

skip "Browserless fetch for #{file_name} requires BROWSERLESS_IO_WEBSOCKET_URL and, for custom endpoints, BROWSERLESS_IO_API_TOKEN"
end

it 'has no empty text attributes', :aggregate_failures do
text_attributes.each do |attribute_name|
expect(item.public_send(attribute_name).to_s).not_to be_empty, attribute_name.to_s
Expand Down
Loading