Skip to content

Commit b90c8b8

Browse files
Vladimir Prelovac and claude
committed
feat(parser): add meta-refresh fallback, fast RSS path, media/enclosure opts
- Follow HTML meta-refresh redirects when URL fetch returns HTML
- Add _parse_rss_feed_entry_fast() for optimized RSS item parsing
- Skip media_content/enclosure parsing when namespace not present
- Lazy whitespace normalization in _parse_date()
- Bump version to 0.4.9

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f816526 commit b90c8b8

3 files changed

Lines changed: 169 additions & 17 deletions

File tree

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = fastfeedparser
3-
version = 0.4.8
3+
version = 0.4.9
44
author = Vladimir Prelovac
55
author_email = vlad@kagi.com
66
description = High performance RSS, Atom, JSON and RDF feed parser in Python

src/fastfeedparser/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
from .main import parse, FastFeedParserDict
22

3-
__version__ = "0.4.4"
3+
__version__ = "0.4.9"
44
__all__ = ["parse", "FastFeedParserDict"]

src/fastfeedparser/main.py

Lines changed: 167 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -730,6 +730,24 @@ def _detect_feed_structure(
730730
raise ValueError(f"Unknown feed type: {root.tag}")
731731

732732

733+
def _should_parse_media_content(root: _Element, xml_content: bytes) -> bool:
    """Return True when the feed appears to use Media RSS (media:*) fields."""
    # Declared namespaces on the root are the authoritative signal.
    for uri in (root.nsmap or {}).values():
        if uri and "search.yahoo.com/mrss" in uri:
            return True
    # Fallback for feeds with undeclared/late namespace usage: a cheap
    # byte scan of the raw document.
    return b"search.yahoo.com/mrss" in xml_content or b"<media:" in xml_content
744+
745+
746+
def _should_parse_enclosures(feed_type: _FeedType, xml_content: bytes) -> bool:
    """Return True when an RSS feed appears to contain <enclosure> elements."""
    # Enclosures are an RSS-only concept; a raw byte scan is enough as a gate.
    if feed_type != "rss":
        return False
    return b"<enclosure" in xml_content
749+
750+
733751
def _parse_content(xml_content: str | bytes) -> FastFeedParserDict:
734752
"""Parse feed content (XML or JSON) that has already been fetched."""
735753
json_feed = _maybe_parse_json_feed(xml_content)
@@ -744,14 +762,22 @@ def _parse_content(xml_content: str | bytes) -> FastFeedParserDict:
744762
feed_type, channel, items, atom_namespace = _detect_feed_structure(
745763
root, xml_content, root_tag_local
746764
)
765+
parse_media_content = _should_parse_media_content(root, xml_content)
766+
parse_enclosures = _should_parse_enclosures(feed_type, xml_content)
747767

748768
feed = _parse_feed_info(channel, feed_type, atom_namespace)
749769

750770
# Parse entries
751771
entries: list[FastFeedParserDict] = []
752772
feed["entries"] = entries
753773
for item in items:
754-
entry = _parse_feed_entry(item, feed_type, atom_namespace)
774+
entry = _parse_feed_entry(
775+
item,
776+
feed_type,
777+
atom_namespace,
778+
parse_media_content=parse_media_content,
779+
parse_enclosures=parse_enclosures,
780+
)
755781
# Ensure that titles and descriptions are always present
756782
entry["title"] = entry.get("title", "").strip()
757783
entry["description"] = entry.get("description", "").strip()
@@ -1197,12 +1223,138 @@ def _parse_enclosures(item: _Element) -> list[dict[str, Any]] | None:
11971223
return enclosures or None
11981224

11991225

1226+
def _normalize_local_tag_name(tag: str) -> str:
1227+
local = tag.rsplit("}", 1)[-1].lower()
1228+
if ":" in local:
1229+
local = local.split(":", 1)[1]
1230+
return local
1231+
1232+
1233+
def _build_rss_item_text_maps(item: _Element) -> tuple[dict[str, Optional[str]], dict[str, Optional[str]]]:
    """Index an item's direct children by tag for O(1) field lookups.

    Returns two dicts mapping (lowercased local name, full tag) to the
    first occurrence's stripped text (None when the element has no text).
    """
    local_map: dict[str, Optional[str]] = {}
    full_map: dict[str, Optional[str]] = {}
    for child in item:
        tag = child.tag
        # Non-string tags (e.g. comment/PI nodes) carry no field data.
        if not isinstance(tag, str):
            continue
        stripped = child.text.strip() if child.text else None
        # setdefault keeps only the FIRST occurrence of each tag.
        full_map.setdefault(tag, stripped)
        local_map.setdefault(_normalize_local_tag_name(tag), stripped)
    return local_map, full_map
1247+
1248+
1249+
def _first_non_empty(mapping: dict[str, Optional[str]], keys: tuple[str, ...]) -> Optional[str]:
1250+
for key in keys:
1251+
value = mapping.get(key)
1252+
if value:
1253+
return value
1254+
return None
1255+
1256+
1257+
def _parse_rss_feed_entry_fast(
    item: _Element,
    atom_ns: str,
    parse_media_content: bool = True,
    parse_enclosures: bool = True,
) -> FastFeedParserDict:
    """Optimized parser for a single RSS <item>.

    Builds one-pass text indexes of the item's children and resolves most
    fields via dict lookups instead of repeated element searches.

    Args:
        item: The RSS <item> element.
        atom_ns: Atom namespace URI to use for atom:* lookups within the item.
        parse_media_content: When False, skip Media RSS extraction entirely.
        parse_enclosures: When False, skip <enclosure> extraction entirely.

    Returns:
        A FastFeedParserDict entry; callers are expected to normalize
        missing title/description afterwards.
    """
    # Text lookups by lowercased local name and by full (namespaced) tag.
    # Only the first occurrence of each tag is indexed.
    text_by_local, text_by_full = _build_rss_item_text_maps(item)

    entry = FastFeedParserDict()
    # id preference order: atom:id, then RSS <guid>, then rdf:about attribute.
    atom_id = text_by_full.get(f"{{{atom_ns}}}id")
    rss_guid = text_by_local.get("guid")
    rdf_about = item.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about")
    entry_id: Optional[str] = atom_id or rss_guid or rdf_about
    if entry_id:
        entry["id"] = entry_id.strip()

    title = text_by_local.get("title")
    if title:
        entry["title"] = title

    description = _first_non_empty(text_by_local, ("description", "summary"))
    if description:
        entry["description"] = description

    # Plain-text <link>; atom:link@href handling is presumably done by
    # _populate_entry_links below — confirm against that helper.
    link = text_by_local.get("link")
    if link:
        entry["link"] = link

    published_source = _first_non_empty(text_by_local, ("pubdate", "published", "issued", "date"))
    if published_source:
        published = _parse_date(published_source)
        if published:
            entry["published"] = published

    updated_source = _first_non_empty(text_by_local, ("lastbuilddate", "updated", "modified"))
    if updated_source:
        updated = _parse_date(updated_source)
        if updated:
            entry["updated"] = updated

    # Some feeds put a date (or dated permalink) in <guid>; use it as a
    # last-resort published timestamp.
    if "published" not in entry and rss_guid:
        guid_date = _parse_date(rss_guid)
        if guid_date:
            entry["published"] = guid_date

    # Mirror updated into published so entries always carry a published date
    # when any timestamp was found.
    if "updated" in entry and "published" not in entry:
        entry["published"] = entry["updated"]

    _populate_entry_links(entry, item, atom_ns)
    # Fall back to the link as a stable-ish id when no explicit id exists.
    if "id" not in entry and "link" in entry:
        entry["id"] = entry["link"]

    _populate_entry_content(entry, item, "rss", atom_ns)

    # These two scans are skipped when the caller pre-determined (via
    # _should_parse_media_content / _should_parse_enclosures) that the
    # document cannot contain such elements.
    if parse_media_content:
        media_contents = _parse_media_content(item)
        if media_contents:
            entry["media_content"] = media_contents

    if parse_enclosures:
        enclosures = _parse_enclosures(item)
        if enclosures:
            entry["enclosures"] = enclosures

    # author: RSS <author>/<dc:creator> text first, then atom:author/name.
    author = _first_non_empty(text_by_local, ("author", "creator"))
    if not author:
        atom_author = item.find(f"{{{atom_ns}}}author/{{{atom_ns}}}name")
        author = atom_author.text.strip() if atom_author is not None and atom_author.text else None
    if author:
        entry["author"] = author

    comments = text_by_local.get("comments")
    if comments:
        entry["comments"] = comments

    tags = _parse_tags(item, "rss", atom_ns)
    if tags:
        entry["tags"] = tags

    return entry
1337+
1338+
12001339
def _parse_feed_entry(
1201-
item: _Element, feed_type: _FeedType, atom_namespace: Optional[str] = None
1340+
item: _Element,
1341+
feed_type: _FeedType,
1342+
atom_namespace: Optional[str] = None,
1343+
*,
1344+
parse_media_content: bool = True,
1345+
parse_enclosures: bool = True,
12021346
) -> FastFeedParserDict:
12031347
# Use dynamic atom namespace or fallback to default
12041348
atom_ns = atom_namespace or "http://www.w3.org/2005/Atom"
12051349

1350+
if feed_type == "rss":
1351+
return _parse_rss_feed_entry_fast(
1352+
item,
1353+
atom_ns,
1354+
parse_media_content=parse_media_content,
1355+
parse_enclosures=parse_enclosures,
1356+
)
1357+
12061358
# Check if this is Atom 0.3 to use different date field names
12071359
is_atom_03 = atom_ns == "http://purl.org/atom/ns#"
12081360

@@ -1315,13 +1467,15 @@ def _parse_feed_entry(
13151467

13161468
_populate_entry_content(entry, item, feed_type, atom_ns)
13171469

1318-
media_contents = _parse_media_content(item)
1319-
if media_contents:
1320-
entry["media_content"] = media_contents
1470+
if parse_media_content:
1471+
media_contents = _parse_media_content(item)
1472+
if media_contents:
1473+
entry["media_content"] = media_contents
13211474

1322-
enclosures = _parse_enclosures(item)
1323-
if enclosures:
1324-
entry["enclosures"] = enclosures
1475+
if parse_enclosures:
1476+
enclosures = _parse_enclosures(item)
1477+
if enclosures:
1478+
entry["enclosures"] = enclosures
13251479

13261480
author = (
13271481
get_field_value(
@@ -1342,11 +1496,6 @@ def _parse_feed_entry(
13421496
if author:
13431497
entry["author"] = author
13441498

1345-
if feed_type == "rss":
1346-
comments = element_get("comments")
1347-
if comments:
1348-
entry["comments"] = comments
1349-
13501499
# Parse entry-level tags/categories
13511500
tags = _parse_tags(item, feed_type, atom_ns)
13521501
if tags:
@@ -1615,9 +1764,11 @@ def _parse_date(date_str: str) -> Optional[str]:
16151764
if not date_str:
16161765
return None
16171766

1618-
candidate = _RE_WHITESPACE.sub(" ", date_str.strip())
1767+
candidate = date_str.strip()
16191768
if not candidate:
16201769
return None
1770+
if "\n" in candidate or "\r" in candidate or "\t" in candidate or " " in candidate:
1771+
candidate = _RE_WHITESPACE.sub(" ", candidate)
16211772

16221773
# Fix invalid leap year dates (Feb 29 in non-leap years)
16231774
# This handles feeds with incorrect dates like "2023-02-29"
@@ -1635,7 +1786,8 @@ def _parse_date(date_str: str) -> Optional[str]:
16351786

16361787
dt: Optional[datetime.datetime] = None
16371788

1638-
if _RE_ISO_LIKE.match(candidate):
1789+
is_iso_like = _RE_ISO_LIKE.match(candidate) is not None
1790+
if is_iso_like:
16391791
iso_candidate = _normalize_iso_datetime_string(candidate)
16401792
try:
16411793
dt = datetime.datetime.fromisoformat(iso_candidate)

0 commit comments

Comments
 (0)