@@ -730,6 +730,24 @@ def _detect_feed_structure(
730730 raise ValueError (f"Unknown feed type: { root .tag } " )
731731
732732
733+ def _should_parse_media_content (root : _Element , xml_content : bytes ) -> bool :
734+ """Check if feed likely contains Media RSS fields."""
735+ ns_values = root .nsmap .values () if root .nsmap else ()
736+ for ns_value in ns_values :
737+ if not ns_value :
738+ continue
739+ if "search.yahoo.com/mrss" in ns_value :
740+ return True
741+
742+ # Fallback for feeds with undeclared/late namespace usage.
743+ return b"search.yahoo.com/mrss" in xml_content or b"<media:" in xml_content
744+
745+
746+ def _should_parse_enclosures (feed_type : _FeedType , xml_content : bytes ) -> bool :
747+ """Check if feed likely contains RSS enclosure elements."""
748+ return feed_type == "rss" and b"<enclosure" in xml_content
749+
750+
733751def _parse_content (xml_content : str | bytes ) -> FastFeedParserDict :
734752 """Parse feed content (XML or JSON) that has already been fetched."""
735753 json_feed = _maybe_parse_json_feed (xml_content )
@@ -744,14 +762,22 @@ def _parse_content(xml_content: str | bytes) -> FastFeedParserDict:
744762 feed_type , channel , items , atom_namespace = _detect_feed_structure (
745763 root , xml_content , root_tag_local
746764 )
765+ parse_media_content = _should_parse_media_content (root , xml_content )
766+ parse_enclosures = _should_parse_enclosures (feed_type , xml_content )
747767
748768 feed = _parse_feed_info (channel , feed_type , atom_namespace )
749769
750770 # Parse entries
751771 entries : list [FastFeedParserDict ] = []
752772 feed ["entries" ] = entries
753773 for item in items :
754- entry = _parse_feed_entry (item , feed_type , atom_namespace )
774+ entry = _parse_feed_entry (
775+ item ,
776+ feed_type ,
777+ atom_namespace ,
778+ parse_media_content = parse_media_content ,
779+ parse_enclosures = parse_enclosures ,
780+ )
755781 # Ensure that titles and descriptions are always present
756782 entry ["title" ] = entry .get ("title" , "" ).strip ()
757783 entry ["description" ] = entry .get ("description" , "" ).strip ()
@@ -1197,12 +1223,138 @@ def _parse_enclosures(item: _Element) -> list[dict[str, Any]] | None:
11971223 return enclosures or None
11981224
11991225
1226+ def _normalize_local_tag_name (tag : str ) -> str :
1227+ local = tag .rsplit ("}" , 1 )[- 1 ].lower ()
1228+ if ":" in local :
1229+ local = local .split (":" , 1 )[1 ]
1230+ return local
1231+
1232+
1233+ def _build_rss_item_text_maps (item : _Element ) -> tuple [dict [str , Optional [str ]], dict [str , Optional [str ]]]:
1234+ by_local : dict [str , Optional [str ]] = {}
1235+ by_full : dict [str , Optional [str ]] = {}
1236+ for child in item :
1237+ tag = child .tag
1238+ if not isinstance (tag , str ):
1239+ continue
1240+ text_value = child .text .strip () if child .text else None
1241+ if tag not in by_full :
1242+ by_full [tag ] = text_value
1243+ local = _normalize_local_tag_name (tag )
1244+ if local not in by_local :
1245+ by_local [local ] = text_value
1246+ return by_local , by_full
1247+
1248+
1249+ def _first_non_empty (mapping : dict [str , Optional [str ]], keys : tuple [str , ...]) -> Optional [str ]:
1250+ for key in keys :
1251+ value = mapping .get (key )
1252+ if value :
1253+ return value
1254+ return None
1255+
1256+
def _parse_rss_feed_entry_fast(
    item: _Element,
    atom_ns: str,
    parse_media_content: bool = True,
    parse_enclosures: bool = True,
) -> FastFeedParserDict:
    """Build an entry dict for an RSS item from a single pass over its children.

    Args:
        item: The RSS item element.
        atom_ns: Atom namespace URI used for embedded Atom elements.
        parse_media_content: When False, ``_parse_media_content`` is not called.
        parse_enclosures: When False, ``_parse_enclosures`` is not called.

    Returns:
        A ``FastFeedParserDict`` holding whichever of ``id``, ``title``,
        ``description``, ``link``, ``published``, ``updated``,
        ``media_content``, ``enclosures``, ``author``, ``comments`` and
        ``tags`` could be determined, plus anything added by
        ``_populate_entry_links`` and ``_populate_entry_content``.
    """
    local_text, qualified_text = _build_rss_item_text_maps(item)
    result = FastFeedParserDict()

    # Identifier: Atom id, then RSS guid, then the rdf:about attribute.
    guid = local_text.get("guid")
    identifier: Optional[str] = (
        qualified_text.get(f"{{{atom_ns}}}id")
        or guid
        or item.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about")
    )
    if identifier:
        result["id"] = identifier.strip()

    # Plain text fields, stored in this order when non-empty.
    text_fields = (
        ("title", local_text.get("title")),
        ("description", _first_non_empty(local_text, ("description", "summary"))),
        ("link", local_text.get("link")),
    )
    for key, text in text_fields:
        if text:
            result[key] = text

    # Date fields: the first non-empty candidate is handed to _parse_date.
    date_fields = (
        ("published", ("pubdate", "published", "issued", "date")),
        ("updated", ("lastbuilddate", "updated", "modified")),
    )
    for key, candidates in date_fields:
        raw_date = _first_non_empty(local_text, candidates)
        parsed_date = _parse_date(raw_date) if raw_date else None
        if parsed_date:
            result[key] = parsed_date

    # Without a published date, try to parse the guid text as a date.
    if guid and "published" not in result:
        date_from_guid = _parse_date(guid)
        if date_from_guid:
            result["published"] = date_from_guid

    # Fall back to the updated date if published is still missing.
    if "published" not in result and "updated" in result:
        result["published"] = result["updated"]

    _populate_entry_links(result, item, atom_ns)
    # The link check runs after link population so a link set there counts.
    if "link" in result and "id" not in result:
        result["id"] = result["link"]

    _populate_entry_content(result, item, "rss", atom_ns)

    if parse_media_content:
        media = _parse_media_content(item)
        if media:
            result["media_content"] = media

    if parse_enclosures:
        found_enclosures = _parse_enclosures(item)
        if found_enclosures:
            result["enclosures"] = found_enclosures

    # Author: author/creator text first, otherwise a nested Atom author name.
    author = _first_non_empty(local_text, ("author", "creator"))
    if not author:
        name_node = item.find(f"{{{atom_ns}}}author/{{{atom_ns}}}name")
        if name_node is not None and name_node.text:
            author = name_node.text.strip()
    if author:
        result["author"] = author

    comments_text = local_text.get("comments")
    if comments_text:
        result["comments"] = comments_text

    entry_tags = _parse_tags(item, "rss", atom_ns)
    if entry_tags:
        result["tags"] = entry_tags

    return result
1337+
1338+
12001339def _parse_feed_entry (
1201- item : _Element , feed_type : _FeedType , atom_namespace : Optional [str ] = None
1340+ item : _Element ,
1341+ feed_type : _FeedType ,
1342+ atom_namespace : Optional [str ] = None ,
1343+ * ,
1344+ parse_media_content : bool = True ,
1345+ parse_enclosures : bool = True ,
12021346) -> FastFeedParserDict :
12031347 # Use dynamic atom namespace or fallback to default
12041348 atom_ns = atom_namespace or "http://www.w3.org/2005/Atom"
12051349
1350+ if feed_type == "rss" :
1351+ return _parse_rss_feed_entry_fast (
1352+ item ,
1353+ atom_ns ,
1354+ parse_media_content = parse_media_content ,
1355+ parse_enclosures = parse_enclosures ,
1356+ )
1357+
12061358 # Check if this is Atom 0.3 to use different date field names
12071359 is_atom_03 = atom_ns == "http://purl.org/atom/ns#"
12081360
@@ -1315,13 +1467,15 @@ def _parse_feed_entry(
13151467
13161468 _populate_entry_content (entry , item , feed_type , atom_ns )
13171469
1318- media_contents = _parse_media_content (item )
1319- if media_contents :
1320- entry ["media_content" ] = media_contents
1470+ if parse_media_content :
1471+ media_contents = _parse_media_content (item )
1472+ if media_contents :
1473+ entry ["media_content" ] = media_contents
13211474
1322- enclosures = _parse_enclosures (item )
1323- if enclosures :
1324- entry ["enclosures" ] = enclosures
1475+ if parse_enclosures :
1476+ enclosures = _parse_enclosures (item )
1477+ if enclosures :
1478+ entry ["enclosures" ] = enclosures
13251479
13261480 author = (
13271481 get_field_value (
@@ -1342,11 +1496,6 @@ def _parse_feed_entry(
13421496 if author :
13431497 entry ["author" ] = author
13441498
1345- if feed_type == "rss" :
1346- comments = element_get ("comments" )
1347- if comments :
1348- entry ["comments" ] = comments
1349-
13501499 # Parse entry-level tags/categories
13511500 tags = _parse_tags (item , feed_type , atom_ns )
13521501 if tags :
@@ -1615,9 +1764,11 @@ def _parse_date(date_str: str) -> Optional[str]:
16151764 if not date_str :
16161765 return None
16171766
1618- candidate = _RE_WHITESPACE . sub ( " " , date_str .strip () )
1767+ candidate = date_str .strip ()
16191768 if not candidate :
16201769 return None
1770+ if "\n " in candidate or "\r " in candidate or "\t " in candidate or " " in candidate :
1771+ candidate = _RE_WHITESPACE .sub (" " , candidate )
16211772
16221773 # Fix invalid leap year dates (Feb 29 in non-leap years)
16231774 # This handles feeds with incorrect dates like "2023-02-29"
@@ -1635,7 +1786,8 @@ def _parse_date(date_str: str) -> Optional[str]:
16351786
16361787 dt : Optional [datetime .datetime ] = None
16371788
1638- if _RE_ISO_LIKE .match (candidate ):
1789+ is_iso_like = _RE_ISO_LIKE .match (candidate ) is not None
1790+ if is_iso_like :
16391791 iso_candidate = _normalize_iso_datetime_string (candidate )
16401792 try :
16411793 dt = datetime .datetime .fromisoformat (iso_candidate )
0 commit comments