33import datetime
44from email .utils import parsedate_to_datetime
55import gzip
6+ import html as _html_mod
67import json
78import re
89import zlib
5960 br"<link([^>]*[^/])>\s*(?=\n\s*<(?!/link\s*>))" , re .MULTILINE
6061)
6162_RE_FEB29 = re .compile (r"(\d{4})-02-29" )
63+ _RE_HTML_TAGS = re .compile (r"<[^>]+>" )
6264_RE_WHITESPACE = re .compile (r"\s+" )
63- _RE_ISO_LIKE = re .compile (r"^\d{4}-\d{2}-\d{2}" )
6465_RE_ISO_TZ_NO_COLON = re .compile (r"([+-]\d{2})(\d{2})$" )
6566_RE_ISO_TZ_HOUR_ONLY = re .compile (r"([+-]\d{2})$" )
6667_RE_ISO_FRACTION = re .compile (r"\.(\d{7,})(?=(?:[+-]\d{2}:?\d{2}|Z|$))" , re .IGNORECASE )
@@ -595,7 +596,6 @@ def _raise_for_non_feed_root(
595596 raise ValueError (
596597 "Received XML sitemap instead of feed (sitemap is for search engines, not a feed)"
597598 )
598- raise ValueError (f"Not a valid feed: { root_tag_local } element found - { error_msg [:100 ]} " )
599599
600600
601601_RE_META_REFRESH_URL = re .compile (
@@ -1123,16 +1123,9 @@ def _populate_entry_content(
11231123 content_value = entry ["content" ][0 ]["value" ]
11241124 if content_value :
11251125 if "<" in content_value :
1126- try :
1127- html_content = etree .HTML (content_value )
1128- if html_content is not None :
1129- content_text = html_content .xpath ("string()" )
1130- if isinstance (content_text , str ):
1131- content_value = _RE_WHITESPACE .sub (" " , content_text )
1132- except etree .ParserError :
1133- pass
1134- else :
1135- content_value = _RE_WHITESPACE .sub (" " , content_value )
1126+ content_value = _RE_HTML_TAGS .sub (" " , content_value [:2048 ])
1127+ content_value = _html_mod .unescape (content_value )
1128+ content_value = _RE_WHITESPACE .sub (" " , content_value ).strip ()
11361129 entry ["description" ] = content_value [:512 ]
11371130
11381131
@@ -1223,13 +1216,6 @@ def _parse_enclosures(item: _Element) -> list[dict[str, Any]] | None:
12231216 return enclosures or None
12241217
12251218
1226- def _normalize_local_tag_name (tag : str ) -> str :
1227- local = tag .rsplit ("}" , 1 )[- 1 ].lower ()
1228- if ":" in local :
1229- local = local .split (":" , 1 )[1 ]
1230- return local
1231-
1232-
12331219def _build_rss_item_text_maps (item : _Element ) -> tuple [dict [str , Optional [str ]], dict [str , Optional [str ]]]:
12341220 by_local : dict [str , Optional [str ]] = {}
12351221 by_full : dict [str , Optional [str ]] = {}
@@ -1240,7 +1226,9 @@ def _build_rss_item_text_maps(item: _Element) -> tuple[dict[str, Optional[str]],
12401226 text_value = child .text .strip () if child .text else None
12411227 if tag not in by_full :
12421228 by_full [tag ] = text_value
1243- local = _normalize_local_tag_name (tag )
1229+ local = tag .rsplit ("}" , 1 )[- 1 ].lower ()
1230+ if ":" in local :
1231+ local = local .split (":" , 1 )[1 ]
12441232 if local not in by_local :
12451233 by_local [local ] = text_value
12461234 return by_local , by_full
@@ -1477,22 +1465,14 @@ def _parse_feed_entry(
14771465 if enclosures :
14781466 entry ["enclosures" ] = enclosures
14791467
1480- author = (
1481- get_field_value (
1482- "author" ,
1483- f"{{{ atom_ns } }}author/{{{ atom_ns } }}name" ,
1484- "{http://purl.org/dc/elements/1.1/}creator" ,
1485- False ,
1486- )
1487- or get_field_value (
1488- "{http://purl.org/dc/elements/1.1/}creator" ,
1489- "{http://purl.org/dc/elements/1.1/}creator" ,
1490- "{http://purl.org/dc/elements/1.1/}creator" ,
1491- False ,
1492- )
1493- or element_get ("{http://purl.org/dc/elements/1.1/}creator" )
1494- or element_get ("author" )
1468+ author = get_field_value (
1469+ "author" ,
1470+ f"{{{ atom_ns } }}author/{{{ atom_ns } }}name" ,
1471+ "{http://purl.org/dc/elements/1.1/}creator" ,
1472+ False ,
14951473 )
1474+ if not author :
1475+ author = element_get ("{http://purl.org/dc/elements/1.1/}creator" ) or element_get ("author" )
14961476 if author :
14971477 entry ["author" ] = author
14981478
@@ -1648,7 +1628,7 @@ def _normalize_iso_datetime_string(value: str) -> str:
16481628 if cleaned .endswith (("Z" , "z" )):
16491629 cleaned = cleaned [:- 1 ] + "+00:00"
16501630
1651- if " " in cleaned and "T" not in cleaned [:11 ] and _RE_ISO_LIKE . match (cleaned ):
1631+ if " " in cleaned and "T" not in cleaned [:11 ] and len (cleaned ) >= 10 and cleaned [ 4 ] == "-" and cleaned [ 0 : 4 ]. isdigit ( ):
16521632 date_part , rest = cleaned .split (" " , 1 )
16531633 if rest and rest [0 ].isdigit ():
16541634 cleaned = f"{ date_part } T{ rest } "
@@ -1772,12 +1752,12 @@ def _parse_date(date_str: str) -> Optional[str]:
17721752
17731753 # Fix invalid leap year dates (Feb 29 in non-leap years)
17741754 # This handles feeds with incorrect dates like "2023-02-29"
1775- year_match = _RE_FEB29 . match ( candidate )
1776- if year_match :
1777- year = int ( year_match . group ( 1 ))
1778- if not (( year % 4 == 0 and year % 100 != 0 ) or ( year % 400 == 0 )):
1779- # Not a leap year, change Feb 29 to Feb 28
1780- candidate = candidate .replace (f"{ year } -02-29" , f"{ year } -02-28" )
1755+ if "-02-29" in candidate :
1756+ year_match = _RE_FEB29 . match ( candidate )
1757+ if year_match :
1758+ year = int ( year_match . group ( 1 ))
1759+ if not (( year % 4 == 0 and year % 100 != 0 ) or ( year % 400 == 0 )):
1760+ candidate = candidate .replace (f"{ year } -02-29" , f"{ year } -02-28" )
17811761
17821762 if "24:00" in candidate :
17831763 candidate = candidate .replace ("24:00:00" , "00:00:00" ).replace (
@@ -1786,7 +1766,7 @@ def _parse_date(date_str: str) -> Optional[str]:
17861766
17871767 dt : Optional [datetime .datetime ] = None
17881768
1789- is_iso_like = _RE_ISO_LIKE . match (candidate ) is not None
1769+ is_iso_like = len (candidate ) >= 10 and candidate [ 4 ] == "-" and candidate [ 0 : 4 ]. isdigit ()
17901770 if is_iso_like :
17911771 iso_candidate = _normalize_iso_datetime_string (candidate )
17921772 try :
0 commit comments