Skip to content

Commit d85f4d4

Browse files
committed
0.5.0
1 parent b90c8b8 commit d85f4d4

6 files changed

Lines changed: 65 additions & 85 deletions

File tree

benchmark.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"https://glineq.blogspot.com/feeds/posts/default",
1313
"https://stml.tumblr.com/rss",
1414
"http://feeds.feedburner.com/mishadoff",
15-
"https://lisacharlottemuth.com/atom.xml",
15+
"https://www.speakingbody.com/rss/",
1616
"https://emacsninja.com/feed.atom",
1717
"http://causality.cs.ucla.edu/blog/index.php/feed/",
1818
"https://blog.railsapps.org/rss",
@@ -29,21 +29,21 @@
2929
"https://fanf.dreamwidth.org/data/rss",
3030
"https://bernsteinbear.com/feed.xml",
3131
"https://feeds.kottke.org/main",
32-
"https://alefesouza.com/feed/",
33-
"https://amitg.blog/feed.atom",
34-
"https://www.alwaystwisted.com/rss.php",
32+
"https://dkg.fifthhorseman.net/blog/feeds/all.atom.xml",
33+
"https://zhubert.com/index.xml",
34+
"https://lovergne.dev/rss.xml",
3535
"https://blog.kagi.com/rss.xml",
36-
"https://aaronfrancis.com/feed",
36+
"https://battlefieldanomalies.com/home/feed/",
3737
"http://davidbau.com/index.rdf",
38-
"https://jesperbylund.com/rss",
39-
"https://aarvik.dk/rss/index.html",
38+
"https://blog.kagamino.dev/index.xml",
39+
"https://feeds.transistor.fm/fallthrough",
4040
"http://dontcodetired.com/blog/syndication.axd",
4141
"https://aivarsk.com/atom.xml",
4242
"http://markcoddington.com/feed/",
43-
"https://andresb.net/blog/feed/",
44-
"http://feeds.d15.biz/Daniel15",
43+
"https://bendauphinee.com/writing/feed/",
44+
"https://www.oscardom.dev/index.xml",
4545
"https://alwaystwisted.com/feed.xml",
46-
"https://aly.arriqaaq.com/rss/",
46+
"https://killjoy.bearblog.dev/rss.xml",
4747
"https://nithinbekal.com/feed.xml",
4848
"https://blog.emacsen.net/atom.xml",
4949
"https://therecouldhavebeensnakes.wordpress.com/feed/",
@@ -58,23 +58,23 @@
5858
"https://herman.bearblog.dev/feed/",
5959
"https://dylanharris.org/feed-me.rss",
6060
"https://eliot-jones.com/rss",
61-
"https://www.byjp.me/index.xml",
61+
"https://blog.kroy.io/feed/",
6262
"https://jfg-mysql.blogspot.com/feeds/posts/default",
6363
"https://dzidas.com/atom.xml",
6464
"https://ariannasimpson.com/blog/feed/",
6565
"https://www.everydayislikewednesday.com/atom.xml",
6666
"https://www.bastibl.net/atom.xml",
6767
"https://yuxi.ml/feeds.xml",
6868
"https://bugramming.dev/index.xml",
69-
"https://blog.iangilman.com/rss.xml",
69+
"https://evanfields.github.io/feed.xml",
7070
"https://raahel.bearblog.dev/atom/",
7171
"https://mahdytech.com/rss.xml",
7272
"https://fogblog-hermansheephouse.blogspot.com/feeds/posts/default",
7373
"https://ctoomey.com/atom.xml",
7474
"https://blog.lasheen.dev/index.xml",
7575
"https://markheath.net/feed/rss",
7676
"https://stancarney.co/rss/",
77-
"https://bigmachine.io/feed.xml",
77+
"https://thecretefleet.com/blog/f.atom",
7878
"https://anteru.net/rss.xml",
7979
"https://blog.drewolson.org/index.xml",
8080
"https://blog.noredink.com/rss",
@@ -83,7 +83,7 @@
8383
"https://abcnews.go.com/abcnews/internationalheadlines",
8484
"https://aljazeera.com/xml/rss/all.xml",
8585
"https://allafrica.com/tools/headlines/rdf/latest/headlines.rdf",
86-
"https://api.axios.com/feed/world",
86+
"https://remimercier.com/feed.xml",
8787
"https://en.mercopress.com/rss/",
8888
"https://feeds.a.dj.com/rss/RSSWorldNews.xml",
8989
"https://feeds.bbci.co.uk/news/world/rss.xml",

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = fastfeedparser
3-
version = 0.4.9
3+
version = 0.5.0
44
author = Vladimir Prelovac
55
author_email = vlad@kagi.com
66
description = High performance RSS, Atom, JSON and RDF feed parser in Python

src/fastfeedparser/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
from .main import parse, FastFeedParserDict
22

3-
__version__ = "0.4.9"
3+
__version__ = "0.5.0"
44
__all__ = ["parse", "FastFeedParserDict"]

src/fastfeedparser/main.py

Lines changed: 23 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import datetime
44
from email.utils import parsedate_to_datetime
55
import gzip
6+
import html as _html_mod
67
import json
78
import re
89
import zlib
@@ -59,8 +60,8 @@
5960
br"<link([^>]*[^/])>\s*(?=\n\s*<(?!/link\s*>))", re.MULTILINE
6061
)
6162
_RE_FEB29 = re.compile(r"(\d{4})-02-29")
63+
_RE_HTML_TAGS = re.compile(r"<[^>]+>")
6264
_RE_WHITESPACE = re.compile(r"\s+")
63-
_RE_ISO_LIKE = re.compile(r"^\d{4}-\d{2}-\d{2}")
6465
_RE_ISO_TZ_NO_COLON = re.compile(r"([+-]\d{2})(\d{2})$")
6566
_RE_ISO_TZ_HOUR_ONLY = re.compile(r"([+-]\d{2})$")
6667
_RE_ISO_FRACTION = re.compile(r"\.(\d{7,})(?=(?:[+-]\d{2}:?\d{2}|Z|$))", re.IGNORECASE)
@@ -595,7 +596,6 @@ def _raise_for_non_feed_root(
595596
raise ValueError(
596597
"Received XML sitemap instead of feed (sitemap is for search engines, not a feed)"
597598
)
598-
raise ValueError(f"Not a valid feed: {root_tag_local} element found - {error_msg[:100]}")
599599

600600

601601
_RE_META_REFRESH_URL = re.compile(
@@ -1123,16 +1123,9 @@ def _populate_entry_content(
11231123
content_value = entry["content"][0]["value"]
11241124
if content_value:
11251125
if "<" in content_value:
1126-
try:
1127-
html_content = etree.HTML(content_value)
1128-
if html_content is not None:
1129-
content_text = html_content.xpath("string()")
1130-
if isinstance(content_text, str):
1131-
content_value = _RE_WHITESPACE.sub(" ", content_text)
1132-
except etree.ParserError:
1133-
pass
1134-
else:
1135-
content_value = _RE_WHITESPACE.sub(" ", content_value)
1126+
content_value = _RE_HTML_TAGS.sub(" ", content_value[:2048])
1127+
content_value = _html_mod.unescape(content_value)
1128+
content_value = _RE_WHITESPACE.sub(" ", content_value).strip()
11361129
entry["description"] = content_value[:512]
11371130

11381131

@@ -1223,13 +1216,6 @@ def _parse_enclosures(item: _Element) -> list[dict[str, Any]] | None:
12231216
return enclosures or None
12241217

12251218

1226-
def _normalize_local_tag_name(tag: str) -> str:
1227-
local = tag.rsplit("}", 1)[-1].lower()
1228-
if ":" in local:
1229-
local = local.split(":", 1)[1]
1230-
return local
1231-
1232-
12331219
def _build_rss_item_text_maps(item: _Element) -> tuple[dict[str, Optional[str]], dict[str, Optional[str]]]:
12341220
by_local: dict[str, Optional[str]] = {}
12351221
by_full: dict[str, Optional[str]] = {}
@@ -1240,7 +1226,9 @@ def _build_rss_item_text_maps(item: _Element) -> tuple[dict[str, Optional[str]],
12401226
text_value = child.text.strip() if child.text else None
12411227
if tag not in by_full:
12421228
by_full[tag] = text_value
1243-
local = _normalize_local_tag_name(tag)
1229+
local = tag.rsplit("}", 1)[-1].lower()
1230+
if ":" in local:
1231+
local = local.split(":", 1)[1]
12441232
if local not in by_local:
12451233
by_local[local] = text_value
12461234
return by_local, by_full
@@ -1477,22 +1465,14 @@ def _parse_feed_entry(
14771465
if enclosures:
14781466
entry["enclosures"] = enclosures
14791467

1480-
author = (
1481-
get_field_value(
1482-
"author",
1483-
f"{{{atom_ns}}}author/{{{atom_ns}}}name",
1484-
"{http://purl.org/dc/elements/1.1/}creator",
1485-
False,
1486-
)
1487-
or get_field_value(
1488-
"{http://purl.org/dc/elements/1.1/}creator",
1489-
"{http://purl.org/dc/elements/1.1/}creator",
1490-
"{http://purl.org/dc/elements/1.1/}creator",
1491-
False,
1492-
)
1493-
or element_get("{http://purl.org/dc/elements/1.1/}creator")
1494-
or element_get("author")
1468+
author = get_field_value(
1469+
"author",
1470+
f"{{{atom_ns}}}author/{{{atom_ns}}}name",
1471+
"{http://purl.org/dc/elements/1.1/}creator",
1472+
False,
14951473
)
1474+
if not author:
1475+
author = element_get("{http://purl.org/dc/elements/1.1/}creator") or element_get("author")
14961476
if author:
14971477
entry["author"] = author
14981478

@@ -1648,7 +1628,7 @@ def _normalize_iso_datetime_string(value: str) -> str:
16481628
if cleaned.endswith(("Z", "z")):
16491629
cleaned = cleaned[:-1] + "+00:00"
16501630

1651-
if " " in cleaned and "T" not in cleaned[:11] and _RE_ISO_LIKE.match(cleaned):
1631+
if " " in cleaned and "T" not in cleaned[:11] and len(cleaned) >= 10 and cleaned[4] == "-" and cleaned[0:4].isdigit():
16521632
date_part, rest = cleaned.split(" ", 1)
16531633
if rest and rest[0].isdigit():
16541634
cleaned = f"{date_part}T{rest}"
@@ -1772,12 +1752,12 @@ def _parse_date(date_str: str) -> Optional[str]:
17721752

17731753
# Fix invalid leap year dates (Feb 29 in non-leap years)
17741754
# This handles feeds with incorrect dates like "2023-02-29"
1775-
year_match = _RE_FEB29.match(candidate)
1776-
if year_match:
1777-
year = int(year_match.group(1))
1778-
if not ((year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)):
1779-
# Not a leap year, change Feb 29 to Feb 28
1780-
candidate = candidate.replace(f"{year}-02-29", f"{year}-02-28")
1755+
if "-02-29" in candidate:
1756+
year_match = _RE_FEB29.match(candidate)
1757+
if year_match:
1758+
year = int(year_match.group(1))
1759+
if not ((year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)):
1760+
candidate = candidate.replace(f"{year}-02-29", f"{year}-02-28")
17811761

17821762
if "24:00" in candidate:
17831763
candidate = candidate.replace("24:00:00", "00:00:00").replace(
@@ -1786,7 +1766,7 @@ def _parse_date(date_str: str) -> Optional[str]:
17861766

17871767
dt: Optional[datetime.datetime] = None
17881768

1789-
is_iso_like = _RE_ISO_LIKE.match(candidate) is not None
1769+
is_iso_like = len(candidate) >= 10 and candidate[4] == "-" and candidate[0:4].isdigit()
17901770
if is_iso_like:
17911771
iso_candidate = _normalize_iso_datetime_string(candidate)
17921772
try:

0 commit comments

Comments (0)