5656_RE_ISO_TZ_NO_COLON = re .compile (r"([+-]\d{2})(\d{2})$" )
5757_RE_ISO_TZ_HOUR_ONLY = re .compile (r"([+-]\d{2})$" )
5858_RE_ISO_FRACTION = re .compile (r"\.(\d{7,})(?=(?:[+-]\d{2}:?\d{2}|Z|$))" , re .IGNORECASE )
59+ _RE_RFC822 = re .compile (
60+ r"(?:\w{3},\s+)?(\d{1,2})\s+(\w{3})\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([+-]\d{4}|[A-Z]{2,5})"
61+ )
62+ _MONTHS_RFC822 : dict [str , int ] = {
63+ "jan" : 1 , "feb" : 2 , "mar" : 3 , "apr" : 4 , "may" : 5 , "jun" : 6 ,
64+ "jul" : 7 , "aug" : 8 , "sep" : 9 , "oct" : 10 , "nov" : 11 , "dec" : 12 ,
65+ }
66+ _TZ_OFFSETS_RFC822 : dict [str , int ] = {
67+ "GMT" : 0 , "UTC" : 0 , "UT" : 0 ,
68+ "EST" : - 18000 , "EDT" : - 14400 , "CST" : - 21600 , "CDT" : - 18000 ,
69+ "MST" : - 25200 , "MDT" : - 21600 , "PST" : - 28800 , "PDT" : - 25200 ,
70+ }
5971
6072
6173class FastFeedParserDict (dict ):
@@ -382,24 +394,26 @@ def _maybe_parse_json_feed(content: str | bytes) -> FastFeedParserDict | None:
382394 return None
383395
384396
397+ _STRICT_XML_PARSER = etree .XMLParser (
398+ ns_clean = True ,
399+ recover = False ,
400+ collect_ids = False ,
401+ resolve_entities = False ,
402+ )
403+ _RECOVER_XML_PARSER = etree .XMLParser (
404+ ns_clean = True ,
405+ recover = True ,
406+ collect_ids = False ,
407+ resolve_entities = False ,
408+ )
409+
410+
385411def _parse_xml_root (xml_content : bytes ) -> _Element :
386412 try :
387- strict_parser = etree .XMLParser (
388- ns_clean = True ,
389- recover = False ,
390- collect_ids = False ,
391- resolve_entities = False ,
392- )
393- root = etree .fromstring (xml_content , parser = strict_parser )
413+ root = etree .fromstring (xml_content , parser = _STRICT_XML_PARSER )
394414 except etree .XMLSyntaxError :
395- recover_parser = etree .XMLParser (
396- ns_clean = True ,
397- recover = True ,
398- collect_ids = False ,
399- resolve_entities = False ,
400- )
401415 try :
402- root = etree .fromstring (xml_content , parser = recover_parser )
416+ root = etree .fromstring (xml_content , parser = _RECOVER_XML_PARSER )
403417 except etree .XMLSyntaxError as e :
404418 raise ValueError (f"Failed to parse XML content: { str (e )} " )
405419
@@ -642,6 +656,9 @@ def _parse_content(xml_content: str | bytes) -> FastFeedParserDict:
642656
643657 feed = _parse_feed_info (channel , feed_type , atom_namespace )
644658
659+ # Detect once whether media namespace is used anywhere in the document
660+ has_media_ns = b"search.yahoo.com/mrss" in xml_content if isinstance (xml_content , bytes ) else "search.yahoo.com/mrss" in xml_content
661+
645662 # Parse entries
646663 entries : list [FastFeedParserDict ] = []
647664 feed ["entries" ] = entries
@@ -650,6 +667,7 @@ def _parse_content(xml_content: str | bytes) -> FastFeedParserDict:
650667 item ,
651668 feed_type ,
652669 atom_namespace ,
670+ has_media_ns ,
653671 )
654672 # Ensure that titles and descriptions are always present
655673 entry ["title" ] = entry .get ("title" , "" ).strip ()
@@ -998,7 +1016,10 @@ def _populate_entry_content(
9981016 if "<" in content_value :
9991017 content_value = _RE_HTML_TAGS .sub (" " , content_value [:2048 ])
10001018 content_value = _html_mod .unescape (content_value )
1001- content_value = _RE_WHITESPACE .sub (" " , content_value ).strip ()
1019+ if " " in content_value or "\n " in content_value or "\t " in content_value or "\r " in content_value :
1020+ content_value = _RE_WHITESPACE .sub (" " , content_value ).strip ()
1021+ else :
1022+ content_value = content_value .strip ()
10021023 entry ["description" ] = content_value [:512 ]
10031024
10041025
@@ -1099,9 +1120,13 @@ def _build_rss_item_text_maps(item: _Element) -> tuple[dict[str, Optional[str]],
10991120 text_value = child .text or None
11001121 if tag not in by_full :
11011122 by_full [tag ] = text_value
1102- local = tag .rsplit ("}" , 1 )[- 1 ].lower ()
1103- if ":" in local :
1104- local = local .split (":" , 1 )[1 ]
1123+ # Fast path: ~80% of RSS tags have no namespace or colon prefix
1124+ if "{" in tag :
1125+ local = tag .rsplit ("}" , 1 )[1 ].lower ()
1126+ elif ":" in tag :
1127+ local = tag .split (":" , 1 )[1 ].lower ()
1128+ else :
1129+ local = tag .lower ()
11051130 if local not in by_local :
11061131 by_local [local ] = text_value
11071132 return by_local , by_full
@@ -1118,6 +1143,7 @@ def _first_non_empty(mapping: dict[str, Optional[str]], keys: tuple[str, ...]) -
11181143def _parse_rss_feed_entry_fast (
11191144 item : _Element ,
11201145 atom_ns : str ,
1146+ has_media_ns : bool = True ,
11211147) -> FastFeedParserDict :
11221148 text_by_local , text_by_full = _build_rss_item_text_maps (item )
11231149
@@ -1161,15 +1187,26 @@ def _parse_rss_feed_entry_fast(
11611187 if "updated" in entry and "published" not in entry :
11621188 entry ["published" ] = entry ["updated" ]
11631189
1164- _populate_entry_links (entry , item , atom_ns )
1190+ # Inline link population for RSS (avoids redundant findall/find for 98.8% of entries)
1191+ atom_links = item .findall (f"{{{ atom_ns } }}link" )
1192+ if atom_links :
1193+ # Has atom:link elements - use full logic
1194+ _populate_entry_links (entry , item , atom_ns )
1195+ else :
1196+ # Common RSS case: no atom:link elements
1197+ entry ["links" ] = []
1198+ if "link" not in entry and rss_guid and rss_guid .startswith (("http://" , "https://" )):
1199+ entry ["link" ] = rss_guid
1200+
11651201 if "id" not in entry and "link" in entry :
11661202 entry ["id" ] = entry ["link" ]
11671203
11681204 _populate_entry_content (entry , item , "rss" , atom_ns )
11691205
1170- media_contents = _parse_media_content (item )
1171- if media_contents :
1172- entry ["media_content" ] = media_contents
1206+ if has_media_ns :
1207+ media_contents = _parse_media_content (item )
1208+ if media_contents :
1209+ entry ["media_content" ] = media_contents
11731210
11741211 enclosures = _parse_enclosures (item )
11751212 if enclosures :
@@ -1196,6 +1233,7 @@ def _parse_rss_feed_entry_fast(
11961233def _parse_atom_feed_entry_fast (
11971234 item : _Element ,
11981235 atom_ns : str ,
1236+ has_media_ns : bool = True ,
11991237) -> FastFeedParserDict :
12001238 ns = f"{{{ atom_ns } }}"
12011239 entry = FastFeedParserDict ()
@@ -1266,9 +1304,10 @@ def _parse_atom_feed_entry_fast(
12661304
12671305 _populate_entry_content (entry , item , "atom" , atom_ns )
12681306
1269- media_contents = _parse_media_content (item )
1270- if media_contents :
1271- entry ["media_content" ] = media_contents
1307+ if has_media_ns :
1308+ media_contents = _parse_media_content (item )
1309+ if media_contents :
1310+ entry ["media_content" ] = media_contents
12721311
12731312 enclosures = _parse_enclosures (item )
12741313 if enclosures :
@@ -1290,15 +1329,16 @@ def _parse_feed_entry(
12901329 item : _Element ,
12911330 feed_type : _FeedType ,
12921331 atom_namespace : Optional [str ] = None ,
1332+ has_media_ns : bool = True ,
12931333) -> FastFeedParserDict :
12941334 # Use dynamic atom namespace or fallback to default
12951335 atom_ns = atom_namespace or "http://www.w3.org/2005/Atom"
12961336
12971337 if feed_type == "rss" :
1298- return _parse_rss_feed_entry_fast (item , atom_ns )
1338+ return _parse_rss_feed_entry_fast (item , atom_ns , has_media_ns )
12991339
13001340 if feed_type == "atom" :
1301- return _parse_atom_feed_entry_fast (item , atom_ns )
1341+ return _parse_atom_feed_entry_fast (item , atom_ns , has_media_ns )
13021342
13031343 # RDF path uses the generic field machinery
13041344 # Check if this is Atom 0.3 to use different date field names
@@ -1412,9 +1452,10 @@ def _parse_feed_entry(
14121452
14131453 _populate_entry_content (entry , item , feed_type , atom_ns )
14141454
1415- media_contents = _parse_media_content (item )
1416- if media_contents :
1417- entry ["media_content" ] = media_contents
1455+ if has_media_ns :
1456+ media_contents = _parse_media_content (item )
1457+ if media_contents :
1458+ entry ["media_content" ] = media_contents
14181459
14191460 enclosures = _parse_enclosures (item )
14201461 if enclosures :
@@ -1616,8 +1657,54 @@ def _ensure_utc(dt: datetime.datetime) -> Optional[datetime.datetime]:
16161657 return None
16171658
16181659
1660+ def _fast_rfc822_to_iso (value : str ) -> Optional [str ]:
1661+ """Fast RFC-822 date to ISO string, bypassing datetime objects for UTC dates."""
1662+ m = _RE_RFC822 .match (value )
1663+ if not m :
1664+ return None
1665+ day , mon_str , year , hour , minute , second , tz = m .groups ()
1666+ month = _MONTHS_RFC822 .get (mon_str .lower ())
1667+ if month is None :
1668+ return None
1669+ if tz [0 ] in "+-" :
1670+ tz_offset_seconds = (int (tz [1 :3 ]) * 3600 + int (tz [3 :5 ]) * 60 ) * (
1671+ 1 if tz [0 ] == "+" else - 1
1672+ )
1673+ else :
1674+ tz_offset_seconds = _TZ_OFFSETS_RFC822 .get (tz )
1675+ if tz_offset_seconds is None :
1676+ return None # Unknown tz name, fall through to full parser
1677+ # Python requires offset strictly between -24h and +24h
1678+ if not (- 86400 < tz_offset_seconds < 86400 ):
1679+ return None
1680+ d = int (day )
1681+ h = int (hour )
1682+ mi = int (minute )
1683+ s = int (second )
1684+ # Hour 24 is invalid (even ISO only allows 24:00:00); roll to next day at 00:mm:ss
1685+ if h == 24 :
1686+ base = datetime .date (int (year ), month , d ) + datetime .timedelta (days = 1 )
1687+ h = 0
1688+ if tz_offset_seconds == 0 :
1689+ return f"{ base .year :04d} -{ base .month :02d} -{ base .day :02d} T{ h :02d} :{ mi :02d} :{ s :02d} +00:00"
1690+ dt = datetime .datetime (
1691+ base .year , base .month , base .day , h , mi , s ,
1692+ tzinfo = datetime .timezone (datetime .timedelta (seconds = tz_offset_seconds )),
1693+ )
1694+ utc = dt .astimezone (_UTC )
1695+ return f"{ utc .year :04d} -{ utc .month :02d} -{ utc .day :02d} T{ utc .hour :02d} :{ utc .minute :02d} :{ utc .second :02d} +00:00"
1696+ if tz_offset_seconds == 0 :
1697+ return f"{ year } -{ month :02d} -{ d :02d} T{ hour } :{ minute } :{ second } +00:00"
1698+ dt = datetime .datetime (
1699+ int (year ), month , d , h , mi , s ,
1700+ tzinfo = datetime .timezone (datetime .timedelta (seconds = tz_offset_seconds )),
1701+ )
1702+ utc = dt .astimezone (_UTC )
1703+ return f"{ utc .year :04d} -{ utc .month :02d} -{ utc .day :02d} T{ utc .hour :02d} :{ utc .minute :02d} :{ utc .second :02d} +00:00"
1704+
1705+
16191706def _parsedate_to_utc (value : str ) -> Optional [datetime .datetime ]:
1620- """Fast RFC-822 / RFC-2822 parsing via email.utils."""
1707+ """RFC-822 / RFC-2822 parsing via email.utils (fallback) ."""
16211708 try :
16221709 parsed = parsedate_to_datetime (value )
16231710 except (TypeError , ValueError , IndexError ):
@@ -1717,16 +1804,19 @@ def _parse_date(date_str: str) -> Optional[str]:
17171804 last = candidate [- 1 ]
17181805 # Most common: ends with 'Z' (e.g., 2024-01-15T10:30:00Z)
17191806 if last in ("Z" , "z" ):
1807+ iso = candidate [:- 1 ] + "+00:00"
17201808 try :
1721- dt = datetime .datetime .fromisoformat (candidate [: - 1 ] + "+00:00" )
1809+ dt = datetime .datetime .fromisoformat (iso )
17221810 return dt .isoformat ()
17231811 except ValueError :
17241812 pass # Fall through to full parsing
17251813 # Second most common: ends with +HH:MM (e.g., 2024-01-15T10:30:00+00:00)
17261814 elif clen > 6 and candidate [- 6 ] in ("+" , "-" ) and candidate [- 3 ] == ":" :
17271815 try :
17281816 dt = datetime .datetime .fromisoformat (candidate )
1729- utc_dt = dt .replace (tzinfo = _UTC ) if dt .tzinfo is None else dt .astimezone (_UTC )
1817+ if dt .tzinfo is _UTC :
1818+ return dt .isoformat ()
1819+ utc_dt = dt .astimezone (_UTC )
17301820 return utc_dt .isoformat ()
17311821 except (ValueError , OverflowError ):
17321822 pass # Fall through to full parsing
@@ -1743,10 +1833,13 @@ def _parse_date(date_str: str) -> Optional[str]:
17431833 if not ((year % 4 == 0 and year % 100 != 0 ) or (year % 400 == 0 )):
17441834 candidate = candidate .replace (f"{ year } -02-29" , f"{ year } -02-28" )
17451835
1746- if "24:00" in candidate :
1747- candidate = candidate .replace ("24:00:00" , "00:00:00" ).replace (
1748- " 24:00" , " 00:00"
1749- )
1836+ if "T24:" in candidate or " 24:" in candidate :
1837+ m24 = re .search (r"(\d{4}-\d{2}-\d{2})[T ]24:(\d{2}):(\d{2})" , candidate )
1838+ if m24 :
1839+ base = datetime .date .fromisoformat (m24 .group (1 ))
1840+ mins , secs = int (m24 .group (2 )), int (m24 .group (3 ))
1841+ next_day = base + datetime .timedelta (days = 1 )
1842+ candidate = candidate [:m24 .start ()] + f"{ next_day } T00:{ mins :02d} :{ secs :02d} " + candidate [m24 .end ():]
17501843
17511844 dt : Optional [datetime .datetime ] = None
17521845
@@ -1762,6 +1855,10 @@ def _parse_date(date_str: str) -> Optional[str]:
17621855 if utc_dt is not None :
17631856 return utc_dt .isoformat ()
17641857
1858+ rfc822_result = _fast_rfc822_to_iso (candidate )
1859+ if rfc822_result is not None :
1860+ return rfc822_result
1861+
17651862 dt = _parsedate_to_utc (candidate )
17661863 if dt is not None :
17671864 return dt .isoformat ()
0 commit comments