|
15 | 15 | except ImportError: |
16 | 16 | HAS_BROTLI = False |
17 | 17 | from typing import Any, Callable, Optional, TYPE_CHECKING, Literal |
| 18 | +from urllib.parse import urljoin |
18 | 19 | from urllib.request import ( |
19 | 20 | HTTPErrorProcessor, |
20 | 21 | HTTPRedirectHandler, |
@@ -597,6 +598,31 @@ def _raise_for_non_feed_root( |
597 | 598 | raise ValueError(f"Not a valid feed: {root_tag_local} element found - {error_msg[:100]}") |
598 | 599 |
|
599 | 600 |
|
# Captures the target of a meta-refresh ``content`` value such as
# ``5; url='https://example.com/feed'``: everything after ``url=`` (optionally
# quoted) up to the next quote, ``>`` or whitespace. Case-insensitive.
_RE_META_REFRESH_URL = re.compile(
    r'url\s*=\s*["\']?\s*([^"\'>\s]+)', re.IGNORECASE
)


def _extract_meta_refresh_url(content: str | bytes, base_url: str) -> str | None:
    """Extract the redirect URL from an HTML meta-refresh tag.

    Args:
        content: HTML document as text or bytes; text is encoded as UTF-8
            before being handed to the HTML parser.
        base_url: URL the document came from, used to resolve relative
            refresh targets.

    Returns:
        The absolute ``http://`` / ``https://`` URL of the first usable
        meta-refresh target, or None when the document cannot be parsed,
        has no meta-refresh tag, or only refreshes to a non-HTTP target or
        to ``base_url`` itself (ignoring any ``#fragment``).
    """
    html_bytes = content.encode("utf-8") if isinstance(content, str) else content
    try:
        doc = etree.fromstring(html_bytes, parser=etree.HTMLParser())
    except Exception:
        # Deliberately broad: this is a best-effort fallback, so any parser
        # failure simply means "no redirect found".
        return None
    if doc is None:
        return None

    # Fragments are never sent to the server, so a refresh that differs from
    # the current page only by its fragment would fetch the same document.
    base_without_fragment = base_url.partition("#")[0]

    for meta in doc.iter("meta"):
        if (meta.get("http-equiv") or "").strip().lower() != "refresh":
            continue
        match = _RE_META_REFRESH_URL.search(meta.get("content", ""))
        if match is None:
            continue
        url = urljoin(base_url, match.group(1))
        # Only hand back targets that can actually be fetched as a feed;
        # schemes such as ``javascript:`` or ``mailto:`` are not redirects
        # a feed fetcher can follow.
        if not url.startswith(("http://", "https://")):
            continue
        if url.partition("#")[0] == base_without_fragment:
            continue
        return url
    return None
| 624 | + |
| 625 | + |
600 | 626 | def _detect_feed_structure( |
601 | 627 | root: _Element, xml_content: bytes, root_tag_local: str |
602 | 628 | ) -> tuple[_FeedType, _Element, list[_Element], Optional[str]]: |
@@ -704,24 +730,8 @@ def _detect_feed_structure( |
704 | 730 | raise ValueError(f"Unknown feed type: {root.tag}") |
705 | 731 |
|
706 | 732 |
|
707 | | -def parse(source: str | bytes) -> FastFeedParserDict: |
708 | | - """Parse a feed from a URL or XML content. |
709 | | -
|
710 | | - Args: |
711 | | - source: URL string or XML content string/bytes |
712 | | -
|
713 | | - Returns: |
714 | | - FastFeedParserDict containing parsed feed data |
715 | | -
|
716 | | - Raises: |
717 | | - ValueError: If content is empty or invalid |
718 | | - HTTPError: If URL fetch fails |
719 | | - """ |
720 | | - if isinstance(source, str) and source.startswith(("http://", "https://")): |
721 | | - xml_content = _fetch_url_content(source) |
722 | | - else: |
723 | | - xml_content = source |
724 | | - |
| 733 | +def _parse_content(xml_content: str | bytes) -> FastFeedParserDict: |
| 734 | + """Parse feed content (XML or JSON) that has already been fetched.""" |
725 | 735 | json_feed = _maybe_parse_json_feed(xml_content) |
726 | 736 | if json_feed is not None: |
727 | 737 | return json_feed |
@@ -750,6 +760,39 @@ def parse(source: str | bytes) -> FastFeedParserDict: |
750 | 760 | return feed |
751 | 761 |
|
752 | 762 |
|
# Upper bound on chained HTML meta-refresh hops followed by ``parse``.
_MAX_META_REFRESH_REDIRECTS = 5


def parse(source: str | bytes) -> FastFeedParserDict:
    """Parse a feed from a URL or XML content.

    When ``source`` is an ``http://`` / ``https://`` URL whose response is an
    HTML page rather than a feed, an HTML meta-refresh redirect on that page
    is followed and the target is parsed instead. At most
    ``_MAX_META_REFRESH_REDIRECTS`` such hops are followed and no URL is
    fetched twice, so pages that refresh to each other cannot loop forever.

    Args:
        source: URL string or XML content string/bytes

    Returns:
        FastFeedParserDict containing parsed feed data

    Raises:
        ValueError: If content is empty or invalid
        HTTPError: If URL fetch fails
    """
    is_url = isinstance(source, str) and source.startswith(("http://", "https://"))
    if not is_url:
        # Raw content: parse errors propagate unchanged, no redirect handling.
        return _parse_content(source)

    url = source
    visited = {url}
    hops = 0
    while True:
        content = _fetch_url_content(url)
        try:
            return _parse_content(content)
        except ValueError as e:
            err_msg = str(e)
            # Only errors indicating that an HTML page (not a feed) was
            # served are candidates for the meta-refresh fallback.
            if "HTML" not in err_msg and "not a valid RSS/Atom feed" not in err_msg:
                raise
            redirect_url = _extract_meta_refresh_url(content, url)
            if (
                redirect_url is None
                # A non-HTTP target cannot be fetched; re-raise the real
                # parse error instead of parsing the URL text as a feed.
                or not redirect_url.startswith(("http://", "https://"))
                # Refresh cycle (A -> B -> A) or an overly long chain.
                or redirect_url in visited
                or hops >= _MAX_META_REFRESH_REDIRECTS
            ):
                raise
        visited.add(redirect_url)
        url = redirect_url
        hops += 1
| 794 | + |
| 795 | + |
753 | 796 | def _parse_feed_info( |
754 | 797 | channel: _Element, feed_type: _FeedType, atom_namespace: Optional[str] = None |
755 | 798 | ) -> FastFeedParserDict: |
|
0 commit comments