diff --git a/AGENTS.md b/AGENTS.md index 968ef33..e31829d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -68,6 +68,13 @@ Recommended sequence: 4. Confirm the title and URL live inside that boundary. 5. Record the final URL if the page redirects by locale or renders a different surface than expected. +If Chrome MCP is unavailable (`Transport closed` or page-lock errors), do this recovery sequence: + +1. Kill stale Chrome MCP processes (`pkill -9 -f 'chrome-devtools-mcp|Chrome for Testing'`). +2. Retry Chrome MCP once before continuing. +3. If still unavailable, continue with `curl -I -L`, runtime `feed`, and HTML inspection in a temporary file. +4. Explicitly report Chrome MCP outage in the final handoff. + ## Browserless Use Browserless when: @@ -158,6 +165,20 @@ bundle exec rspec --tag fetch --example 'example.com/feed.yml' spec/html2rss/con - the chosen surface is too noisy or too dynamic - the candidate should be downgraded or dropped +7. Cross-runtime mismatch check (required when core feed works but fetch specs fail): + +- confirm canonical URL with redirect tracing: + +```bash +curl -I -L -s https://example.com | sed -n '1,20p' +``` + +- compare behavior in both runtimes: + - core repo (`../html2rss`) via `html2rss feed` + - configs repo fetch lane (`bundle exec rspec --tag fetch --example ...`) +- if selectors are valid in core but fetch lane still returns zero items, treat this as request-strategy/runtime mismatch, not selector success. +- in that case: prefer Browserless-backed verification if available; otherwise mark as downgraded/deferred with evidence. + ## Runtime Debugging Use the core CLI as the authority for single-config debugging. The quickest loop is: @@ -170,6 +191,13 @@ Use the core CLI as the authority for single-config debugging. The quickest loop If Browserless works but Faraday does not, keep the config narrow and classify it as Browserless-backed instead of trying to rescue it with brittle tweaks. 
+Additional high-value checks: + +- Always normalize `channel.url` to the final canonical host/path (`www` vs non-`www`, retired legacy paths). +- Prefer selectors anchored to content links (`h3 a`, `a[href*='/article/']`) over container-only selectors. +- Remove optional fields first when quality drops (`categories`, synthetic IDs, weak descriptions) before adding selector complexity. +- Set `enhance: false` early if enhancement starts pulling nav/hero/market widgets. + ## Auto-Source Use `auto` for reconnaissance, not as proof that a config is ready. @@ -211,3 +239,5 @@ When finishing config work, report: - dropped or deferred candidates and why - commands actually run - residual risks, especially selector drift, localization dependence, or Browserless dependence +- whether Chrome MCP was available during validation +- whether focused fetch specs matched core runtime behavior diff --git a/lib/html2rss/configs/deraktionaer.de/meistgelesen.yml b/lib/html2rss/configs/deraktionaer.de/meistgelesen.yml index e538909..9b493e8 100644 --- a/lib/html2rss/configs/deraktionaer.de/meistgelesen.yml +++ b/lib/html2rss/configs/deraktionaer.de/meistgelesen.yml @@ -1,21 +1,16 @@ # yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json channel: title: "deraktionaer.de: meistgelesen" - url: https://deraktionaer.de/ + url: https://www.deraktionaer.de/ time_zone: Europe/Berlin ttl: 360 language: de +strategy: browserless selectors: items: - selector: "#most-viewed ol > li" + selector: "section#top-articles article.top-article a.top-article-content[href^='/artikel/']" + enhance: false title: - selector: "> a" + extractor: "text" url: - selector: "> a" extractor: "href" - isin: - selector: ".stock-info" - extractor: attribute - attribute: "data-quote" - categories: - - isin diff --git a/lib/html2rss/configs/elastic.co/elasticsearch-release-notes.yml 
b/lib/html2rss/configs/elastic.co/elasticsearch-release-notes.yml new file mode 100644 index 0000000..2018008 --- /dev/null +++ b/lib/html2rss/configs/elastic.co/elasticsearch-release-notes.yml @@ -0,0 +1,15 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json +channel: + url: https://www.elastic.co/docs/release-notes/elasticsearch + language: en + time_zone: UTC + ttl: 360 +strategy: browserless +selectors: + items: + selector: 'a[href^="#elasticsearch-"][href$="-release-notes"]' + enhance: false + title: + extractor: text + url: + extractor: href diff --git a/lib/html2rss/configs/avherald.com/index.yml b/lib/html2rss/configs/go.dev/release-history.yml similarity index 61% rename from lib/html2rss/configs/avherald.com/index.yml rename to lib/html2rss/configs/go.dev/release-history.yml index 92ad4cc..ac79323 100644 --- a/lib/html2rss/configs/avherald.com/index.yml +++ b/lib/html2rss/configs/go.dev/release-history.yml @@ -1,14 +1,15 @@ # yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json ---- channel: - url: https://avherald.com/ + url: https://go.dev/doc/devel/release language: en - ttl: 120 time_zone: UTC + ttl: 360 +strategy: browserless selectors: items: - selector: "table table a" + selector: 'a[href^="/doc/go1."]' + enhance: false title: - selector: span + extractor: text url: extractor: href diff --git a/lib/html2rss/configs/grafana.com/whatsnew.yml b/lib/html2rss/configs/grafana.com/whatsnew.yml new file mode 100644 index 0000000..ce765a2 --- /dev/null +++ b/lib/html2rss/configs/grafana.com/whatsnew.yml @@ -0,0 +1,15 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json +channel: + url: https://grafana.com/docs/grafana/latest/whatsnew/ + language: en + time_zone: UTC + ttl: 360 +strategy: browserless 
+selectors: + items: + selector: 'a.docs__menu-a[href^="/docs/grafana/latest/whatsnew/whats-new-in-v"]' + enhance: false + title: + extractor: text + url: + extractor: href diff --git a/lib/html2rss/configs/iaapa.org/news.yml b/lib/html2rss/configs/iaapa.org/news.yml index d4820b5..a1f5d9b 100644 --- a/lib/html2rss/configs/iaapa.org/news.yml +++ b/lib/html2rss/configs/iaapa.org/news.yml @@ -1,15 +1,15 @@ # yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json channel: - url: https://www.iaapa.org/news + url: https://iaapa.org/news-funworld time_zone: UTC ttl: 720 +strategy: browserless selectors: items: selector: ".views-row > article" + enhance: false title: selector: h3 - description: - selector: ".event-card__summary" url: selector: "a" extractor: "href" diff --git a/lib/html2rss/configs/mozilla.org/security-advisories.yml b/lib/html2rss/configs/mozilla.org/security-advisories.yml new file mode 100644 index 0000000..13dc8af --- /dev/null +++ b/lib/html2rss/configs/mozilla.org/security-advisories.yml @@ -0,0 +1,16 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json +channel: + url: https://www.mozilla.org/en-US/security/advisories/ + language: en + time_zone: UTC + ttl: 360 +strategy: browserless +selectors: + items: + selector: "main li" + enhance: false + title: + selector: 'a[href*="/security/advisories/mfsa"]' + url: + selector: 'a[href*="/security/advisories/mfsa"]' + extractor: href diff --git a/lib/html2rss/configs/tourismusnetzwerk-brandenburg.de/aktuelle_nachrichten.yml b/lib/html2rss/configs/tourismusnetzwerk-brandenburg.de/aktuelle_nachrichten.yml index 1892fef..7e4be2a 100644 --- a/lib/html2rss/configs/tourismusnetzwerk-brandenburg.de/aktuelle_nachrichten.yml +++ b/lib/html2rss/configs/tourismusnetzwerk-brandenburg.de/aktuelle_nachrichten.yml @@ -1,20 +1,16 @@ # yaml-language-server: 
$schema=https://raw.githubusercontent.com/html2rss/html2rss/refs/heads/master/schema/html2rss-config.schema.json channel: - url: https://www.tourismusnetzwerk-brandenburg.de/nc/aktuelle-nachrichten/ + url: https://tourismusnetzwerk-brandenburg.de/ time_zone: Europe/Berlin ttl: 720 language: de +strategy: browserless selectors: items: - selector: "article.article" + selector: "article.node.article.wall-floating" + enhance: false title: - selector: "h3" + selector: "h3.title a[rel='bookmark']" url: - selector: "a" + selector: "h3.title a[rel='bookmark']" extractor: "href" - topic: - selector: ".field--item" - categories: - - topic - description: - selector: "p"