Skip to content

Commit f816526

Browse files
author
Vladimir Prelovac
committed
meta refresh
1 parent 1b930ec commit f816526

2 files changed

Lines changed: 82 additions & 18 deletions

File tree

src/fastfeedparser/main.py

Lines changed: 61 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
except ImportError:
1616
HAS_BROTLI = False
1717
from typing import Any, Callable, Optional, TYPE_CHECKING, Literal
18+
from urllib.parse import urljoin
1819
from urllib.request import (
1920
HTTPErrorProcessor,
2021
HTTPRedirectHandler,
@@ -597,6 +598,31 @@ def _raise_for_non_feed_root(
597598
raise ValueError(f"Not a valid feed: {root_tag_local} element found - {error_msg[:100]}")
598599

599600

601+
# Captures the target of a meta-refresh ``content`` value such as
# ``0; url=https://example.com/feed.xml``; the target may optionally be quoted.
_RE_META_REFRESH_URL = re.compile(
    r'url\s*=\s*["\']?\s*([^"\'>\s]+)', re.IGNORECASE
)


def _extract_meta_refresh_url(content: str | bytes, base_url: str) -> str | None:
    """Extract the redirect URL from an HTML meta-refresh tag.

    Args:
        content: HTML document, as text or raw bytes.
        base_url: URL the document came from; relative targets are resolved
            against it.

    Returns:
        The absolute redirect URL of the first ``<meta http-equiv="refresh">``
        tag whose target points at a different resource than ``base_url``,
        or None if the document cannot be parsed or has no such tag.
    """
    # Local import keeps this block self-contained; urldefrag is stdlib.
    from urllib.parse import urldefrag

    try:
        # Encoding is inside the try so an unencodable string (e.g. lone
        # surrogates) yields None like any other unparseable document.
        html_bytes = content.encode("utf-8") if isinstance(content, str) else content
        doc = etree.fromstring(html_bytes, parser=etree.HTMLParser())
    except Exception:
        # Deliberate best-effort: any parse failure simply means "no redirect".
        return None
    if doc is None:
        return None

    # A fragment never changes which resource is fetched, so compare URLs
    # without it; otherwise "url=#top" would look like a redirect.
    base_resource = urldefrag(base_url).url

    for meta in doc.iter("meta"):
        if (meta.get("http-equiv") or "").strip().lower() != "refresh":
            continue
        match = _RE_META_REFRESH_URL.search(meta.get("content") or "")
        if match is None:
            continue
        url = urljoin(base_url, match.group(1))
        if urldefrag(url).url != base_resource:
            return url
    return None
624+
625+
600626
def _detect_feed_structure(
601627
root: _Element, xml_content: bytes, root_tag_local: str
602628
) -> tuple[_FeedType, _Element, list[_Element], Optional[str]]:
@@ -704,24 +730,8 @@ def _detect_feed_structure(
704730
raise ValueError(f"Unknown feed type: {root.tag}")
705731

706732

707-
def parse(source: str | bytes) -> FastFeedParserDict:
708-
"""Parse a feed from a URL or XML content.
709-
710-
Args:
711-
source: URL string or XML content string/bytes
712-
713-
Returns:
714-
FastFeedParserDict containing parsed feed data
715-
716-
Raises:
717-
ValueError: If content is empty or invalid
718-
HTTPError: If URL fetch fails
719-
"""
720-
if isinstance(source, str) and source.startswith(("http://", "https://")):
721-
xml_content = _fetch_url_content(source)
722-
else:
723-
xml_content = source
724-
733+
def _parse_content(xml_content: str | bytes) -> FastFeedParserDict:
734+
"""Parse feed content (XML or JSON) that has already been fetched."""
725735
json_feed = _maybe_parse_json_feed(xml_content)
726736
if json_feed is not None:
727737
return json_feed
@@ -750,6 +760,39 @@ def parse(source: str | bytes) -> FastFeedParserDict:
750760
return feed
751761

752762

763+
# Upper bound on HTML meta-refresh hops followed for a single parse() call.
_MAX_META_REFRESH_REDIRECTS = 5


def parse(source: str | bytes) -> FastFeedParserDict:
    """Parse a feed from a URL or XML content.

    When ``source`` is an http(s) URL and the fetched document is rejected as
    HTML / not a feed, an HTML meta-refresh redirect is followed (up to
    ``_MAX_META_REFRESH_REDIRECTS`` hops, never revisiting a URL) and the
    target is parsed instead.

    Args:
        source: URL string or XML content string/bytes

    Returns:
        FastFeedParserDict containing parsed feed data

    Raises:
        ValueError: If content is empty or invalid
        HTTPError: If URL fetch fails
    """
    is_url = isinstance(source, str) and source.startswith(("http://", "https://"))
    if not is_url:
        return _parse_content(source)

    url = source
    visited = {url}
    # Iterate instead of recursing so a redirect cycle or a long redirect
    # chain cannot exhaust the stack or fetch without bound.
    while True:
        content = _fetch_url_content(url)
        try:
            return _parse_content(content)
        except ValueError as e:
            err_msg = str(e)
            # Only "this is HTML / not a feed" failures can be meta-refresh pages.
            if "HTML" not in err_msg and "not a valid RSS/Atom feed" not in err_msg:
                raise
            redirect_url = _extract_meta_refresh_url(content, url)
            if (
                redirect_url is None
                # A non-http(s) target cannot be fetched; surface the original error.
                or not redirect_url.startswith(("http://", "https://"))
                or redirect_url in visited
                or len(visited) > _MAX_META_REFRESH_REDIRECTS
            ):
                raise
            visited.add(redirect_url)
            url = redirect_url
794+
795+
753796
def _parse_feed_info(
754797
channel: _Element, feed_type: _FeedType, atom_namespace: Optional[str] = None
755798
) -> FastFeedParserDict:

tests/test_encoding.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from fastfeedparser import parse
2+
from fastfeedparser.main import _extract_meta_refresh_url
23

34

45
def test_parse_str_with_non_utf8_xml_declaration():
@@ -30,3 +31,23 @@ def test_parse_bytes_with_non_utf8_encoding():
3031
assert feed.feed.title == "café"
3132
assert feed.entries[0].title == "café"
3233

34+
35+
def test_meta_refresh_extraction():
    """An absolute target in a str document with an unquoted http-equiv is returned."""
    page = '<!doctype html><html><head><meta http-equiv=refresh content="0; url=https://example.com/feed.xml"></head></html>'
    redirect = _extract_meta_refresh_url(page, "https://example.com/feed/")
    assert redirect == "https://example.com/feed.xml"
38+
39+
40+
def test_meta_refresh_relative_url():
    """A root-relative target in a bytes document is resolved against the base URL."""
    page = b'<html><head><meta http-equiv="refresh" content="0;url=/index.xml"></head></html>'
    redirect = _extract_meta_refresh_url(page, "https://example.com/feed/")
    assert redirect == "https://example.com/index.xml"
43+
44+
45+
def test_meta_refresh_none_when_missing():
    """A document without any meta-refresh tag yields None."""
    page = "<html><head><title>Hello</title></head><body></body></html>"
    redirect = _extract_meta_refresh_url(page, "https://example.com/")
    assert redirect is None
48+
49+
50+
def test_meta_refresh_none_when_same_url():
    """A meta-refresh that points back at the base URL itself yields None."""
    page = '<html><head><meta http-equiv="refresh" content="0; url=https://example.com/"></head></html>'
    redirect = _extract_meta_refresh_url(page, "https://example.com/")
    assert redirect is None
53+

0 commit comments

Comments
 (0)