Skip to content

Commit d85f4d4

Browse files
committed
0.5.0
1 parent b90c8b8 commit d85f4d4

6 files changed

Lines changed: 65 additions & 85 deletions

File tree

benchmark.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"https://glineq.blogspot.com/feeds/posts/default",
1313
"https://stml.tumblr.com/rss",
1414
"http://feeds.feedburner.com/mishadoff",
15-
"https://lisacharlottemuth.com/atom.xml",
15+
"https://www.speakingbody.com/rss/",
1616
"https://emacsninja.com/feed.atom",
1717
"http://causality.cs.ucla.edu/blog/index.php/feed/",
1818
"https://blog.railsapps.org/rss",
@@ -29,21 +29,21 @@
2929
"https://fanf.dreamwidth.org/data/rss",
3030
"https://bernsteinbear.com/feed.xml",
3131
"https://feeds.kottke.org/main",
32-
"https://alefesouza.com/feed/",
33-
"https://amitg.blog/feed.atom",
34-
"https://www.alwaystwisted.com/rss.php",
32+
"https://dkg.fifthhorseman.net/blog/feeds/all.atom.xml",
33+
"https://zhubert.com/index.xml",
34+
"https://lovergne.dev/rss.xml",
3535
"https://blog.kagi.com/rss.xml",
36-
"https://aaronfrancis.com/feed",
36+
"https://battlefieldanomalies.com/home/feed/",
3737
"http://davidbau.com/index.rdf",
38-
"https://jesperbylund.com/rss",
39-
"https://aarvik.dk/rss/index.html",
38+
"https://blog.kagamino.dev/index.xml",
39+
"https://feeds.transistor.fm/fallthrough",
4040
"http://dontcodetired.com/blog/syndication.axd",
4141
"https://aivarsk.com/atom.xml",
4242
"http://markcoddington.com/feed/",
43-
"https://andresb.net/blog/feed/",
44-
"http://feeds.d15.biz/Daniel15",
43+
"https://bendauphinee.com/writing/feed/",
44+
"https://www.oscardom.dev/index.xml",
4545
"https://alwaystwisted.com/feed.xml",
46-
"https://aly.arriqaaq.com/rss/",
46+
"https://killjoy.bearblog.dev/rss.xml",
4747
"https://nithinbekal.com/feed.xml",
4848
"https://blog.emacsen.net/atom.xml",
4949
"https://therecouldhavebeensnakes.wordpress.com/feed/",
@@ -58,23 +58,23 @@
5858
"https://herman.bearblog.dev/feed/",
5959
"https://dylanharris.org/feed-me.rss",
6060
"https://eliot-jones.com/rss",
61-
"https://www.byjp.me/index.xml",
61+
"https://blog.kroy.io/feed/",
6262
"https://jfg-mysql.blogspot.com/feeds/posts/default",
6363
"https://dzidas.com/atom.xml",
6464
"https://ariannasimpson.com/blog/feed/",
6565
"https://www.everydayislikewednesday.com/atom.xml",
6666
"https://www.bastibl.net/atom.xml",
6767
"https://yuxi.ml/feeds.xml",
6868
"https://bugramming.dev/index.xml",
69-
"https://blog.iangilman.com/rss.xml",
69+
"https://evanfields.github.io/feed.xml",
7070
"https://raahel.bearblog.dev/atom/",
7171
"https://mahdytech.com/rss.xml",
7272
"https://fogblog-hermansheephouse.blogspot.com/feeds/posts/default",
7373
"https://ctoomey.com/atom.xml",
7474
"https://blog.lasheen.dev/index.xml",
7575
"https://markheath.net/feed/rss",
7676
"https://stancarney.co/rss/",
77-
"https://bigmachine.io/feed.xml",
77+
"https://thecretefleet.com/blog/f.atom",
7878
"https://anteru.net/rss.xml",
7979
"https://blog.drewolson.org/index.xml",
8080
"https://blog.noredink.com/rss",
@@ -83,7 +83,7 @@
8383
"https://abcnews.go.com/abcnews/internationalheadlines",
8484
"https://aljazeera.com/xml/rss/all.xml",
8585
"https://allafrica.com/tools/headlines/rdf/latest/headlines.rdf",
86-
"https://api.axios.com/feed/world",
86+
"https://remimercier.com/feed.xml",
8787
"https://en.mercopress.com/rss/",
8888
"https://feeds.a.dj.com/rss/RSSWorldNews.xml",
8989
"https://feeds.bbci.co.uk/news/world/rss.xml",

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = fastfeedparser
3-
version = 0.4.9
3+
version = 0.5.0
44
author = Vladimir Prelovac
55
author_email = vlad@kagi.com
66
description = High performance RSS, Atom, JSON and RDF feed parser in Python

src/fastfeedparser/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
from .main import parse, FastFeedParserDict
22

3-
__version__ = "0.4.9"
3+
__version__ = "0.5.0"
44
__all__ = ["parse", "FastFeedParserDict"]

src/fastfeedparser/main.py

Lines changed: 23 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import datetime
44
from email.utils import parsedate_to_datetime
55
import gzip
6+
import html as _html_mod
67
import json
78
import re
89
import zlib
@@ -59,8 +60,8 @@
5960
br"<link([^>]*[^/])>\s*(?=\n\s*<(?!/link\s*>))", re.MULTILINE
6061
)
6162
_RE_FEB29 = re.compile(r"(\d{4})-02-29")
63+
_RE_HTML_TAGS = re.compile(r"<[^>]+>")
6264
_RE_WHITESPACE = re.compile(r"\s+")
63-
_RE_ISO_LIKE = re.compile(r"^\d{4}-\d{2}-\d{2}")
6465
_RE_ISO_TZ_NO_COLON = re.compile(r"([+-]\d{2})(\d{2})$")
6566
_RE_ISO_TZ_HOUR_ONLY = re.compile(r"([+-]\d{2})$")
6667
_RE_ISO_FRACTION = re.compile(r"\.(\d{7,})(?=(?:[+-]\d{2}:?\d{2}|Z|$))", re.IGNORECASE)
@@ -595,7 +596,6 @@ def _raise_for_non_feed_root(
595596
raise ValueError(
596597
"Received XML sitemap instead of feed (sitemap is for search engines, not a feed)"
597598
)
598-
raise ValueError(f"Not a valid feed: {root_tag_local} element found - {error_msg[:100]}")
599599

600600

601601
_RE_META_REFRESH_URL = re.compile(
@@ -1123,16 +1123,9 @@ def _populate_entry_content(
11231123
content_value = entry["content"][0]["value"]
11241124
if content_value:
11251125
if "<" in content_value:
1126-
try:
1127-
html_content = etree.HTML(content_value)
1128-
if html_content is not None:
1129-
content_text = html_content.xpath("string()")
1130-
if isinstance(content_text, str):
1131-
content_value = _RE_WHITESPACE.sub(" ", content_text)
1132-
except etree.ParserError:
1133-
pass
1134-
else:
1135-
content_value = _RE_WHITESPACE.sub(" ", content_value)
1126+
content_value = _RE_HTML_TAGS.sub(" ", content_value[:2048])
1127+
content_value = _html_mod.unescape(content_value)
1128+
content_value = _RE_WHITESPACE.sub(" ", content_value).strip()
11361129
entry["description"] = content_value[:512]
11371130

11381131

@@ -1223,13 +1216,6 @@ def _parse_enclosures(item: _Element) -> list[dict[str, Any]] | None:
12231216
return enclosures or None
12241217

12251218

1226-
def _normalize_local_tag_name(tag: str) -> str:
1227-
local = tag.rsplit("}", 1)[-1].lower()
1228-
if ":" in local:
1229-
local = local.split(":", 1)[1]
1230-
return local
1231-
1232-
12331219
def _build_rss_item_text_maps(item: _Element) -> tuple[dict[str, Optional[str]], dict[str, Optional[str]]]:
12341220
by_local: dict[str, Optional[str]] = {}
12351221
by_full: dict[str, Optional[str]] = {}
@@ -1240,7 +1226,9 @@ def _build_rss_item_text_maps(item: _Element) -> tuple[dict[str, Optional[str]],
12401226
text_value = child.text.strip() if child.text else None
12411227
if tag not in by_full:
12421228
by_full[tag] = text_value
1243-
local = _normalize_local_tag_name(tag)
1229+
local = tag.rsplit("}", 1)[-1].lower()
1230+
if ":" in local:
1231+
local = local.split(":", 1)[1]
12441232
if local not in by_local:
12451233
by_local[local] = text_value
12461234
return by_local, by_full
@@ -1477,22 +1465,14 @@ def _parse_feed_entry(
14771465
if enclosures:
14781466
entry["enclosures"] = enclosures
14791467

1480-
author = (
1481-
get_field_value(
1482-
"author",
1483-
f"{{{atom_ns}}}author/{{{atom_ns}}}name",
1484-
"{http://purl.org/dc/elements/1.1/}creator",
1485-
False,
1486-
)
1487-
or get_field_value(
1488-
"{http://purl.org/dc/elements/1.1/}creator",
1489-
"{http://purl.org/dc/elements/1.1/}creator",
1490-
"{http://purl.org/dc/elements/1.1/}creator",
1491-
False,
1492-
)
1493-
or element_get("{http://purl.org/dc/elements/1.1/}creator")
1494-
or element_get("author")
1468+
author = get_field_value(
1469+
"author",
1470+
f"{{{atom_ns}}}author/{{{atom_ns}}}name",
1471+
"{http://purl.org/dc/elements/1.1/}creator",
1472+
False,
14951473
)
1474+
if not author:
1475+
author = element_get("{http://purl.org/dc/elements/1.1/}creator") or element_get("author")
14961476
if author:
14971477
entry["author"] = author
14981478

@@ -1648,7 +1628,7 @@ def _normalize_iso_datetime_string(value: str) -> str:
16481628
if cleaned.endswith(("Z", "z")):
16491629
cleaned = cleaned[:-1] + "+00:00"
16501630

1651-
if " " in cleaned and "T" not in cleaned[:11] and _RE_ISO_LIKE.match(cleaned):
1631+
if " " in cleaned and "T" not in cleaned[:11] and len(cleaned) >= 10 and cleaned[4] == "-" and cleaned[0:4].isdigit():
16521632
date_part, rest = cleaned.split(" ", 1)
16531633
if rest and rest[0].isdigit():
16541634
cleaned = f"{date_part}T{rest}"
@@ -1772,12 +1752,12 @@ def _parse_date(date_str: str) -> Optional[str]:
17721752

17731753
# Fix invalid leap year dates (Feb 29 in non-leap years)
17741754
# This handles feeds with incorrect dates like "2023-02-29"
1775-
year_match = _RE_FEB29.match(candidate)
1776-
if year_match:
1777-
year = int(year_match.group(1))
1778-
if not ((year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)):
1779-
# Not a leap year, change Feb 29 to Feb 28
1780-
candidate = candidate.replace(f"{year}-02-29", f"{year}-02-28")
1755+
if "-02-29" in candidate:
1756+
year_match = _RE_FEB29.match(candidate)
1757+
if year_match:
1758+
year = int(year_match.group(1))
1759+
if not ((year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)):
1760+
candidate = candidate.replace(f"{year}-02-29", f"{year}-02-28")
17811761

17821762
if "24:00" in candidate:
17831763
candidate = candidate.replace("24:00:00", "00:00:00").replace(
@@ -1786,7 +1766,7 @@ def _parse_date(date_str: str) -> Optional[str]:
17861766

17871767
dt: Optional[datetime.datetime] = None
17881768

1789-
is_iso_like = _RE_ISO_LIKE.match(candidate) is not None
1769+
is_iso_like = len(candidate) >= 10 and candidate[4] == "-" and candidate[0:4].isdigit()
17901770
if is_iso_like:
17911771
iso_candidate = _normalize_iso_datetime_string(candidate)
17921772
try:

0 commit comments

Comments (0)