Commit c1ae3dc ("updated benchmark")
1 parent: c655dbc
2 files changed: 188 additions & 44 deletions

File tree:
    benchmark.py
    src/fastfeedparser/main.py

benchmark.py (185 additions & 43 deletions)
@@ -1,3 +1,4 @@
+import argparse
 import time
 
 import fastfeedparser
@@ -78,6 +79,34 @@
     "https://blog.drewolson.org/index.xml",
     "https://blog.noredink.com/rss",
     "https://glasspetalsmoke.blogspot.com/feeds/posts/default",
+    "https://feeds.washingtonpost.com/rss/world",
+    "https://abcnews.go.com/abcnews/internationalheadlines",
+    "https://aljazeera.com/xml/rss/all.xml",
+    "https://allafrica.com/tools/headlines/rdf/latest/headlines.rdf",
+    "https://api.axios.com/feed/world",
+    "https://en.mercopress.com/rss/",
+    "https://feeds.a.dj.com/rss/RSSWorldNews.xml",
+    "https://feeds.bbci.co.uk/news/world/rss.xml",
+    "https://feeds.elpais.com/mrss-s/pages/ep/site/english.elpais.com/portada",
+    "https://feeds.feedburner.com/ndtvnews-world-news",
+    "https://feeds.npr.org/1004/rss.xml",
+    "https://foreignpolicy.com/feed/",
+    "https://japantoday.com/category/world/feed",
+    "https://restofworld.org/feed/latest",
+    "https://rss.csmonitor.com/feeds/all",
+    "https://rss.dw.com/rdf/rss-en-all",
+    "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
+    "https://theweek.com/feeds.xml",
+    "https://time.com/world/feed",
+    "https://www.abc.net.au/news/feed/45910/rss.xml",
+    "https://www.al-monitor.com/rss",
+    "https://www.boston.com/tag/world-news/feed/",
+    "https://www.cbsnews.com/latest/rss/world",
+    "https://www.dawn.com/feeds/world/",
+    "https://www.economist.com/asia/rss.xml",
+    "https://time.com/tech/feed/",
+    "http://stratechery.com/feed/",
+    "https://www.404media.co/rss/",
 ]
 
 
@@ -87,68 +116,181 @@
     "Connection": "close",
 }
 
-client = httpx.Client(verify=False)
 
+def process_feed(url, skip_feedparser=False, iterations=3):
+    """Process a single feed and return timing results."""
+    result = {
+        "url": url,
+        "ffp_time": 0,
+        "fp_time": 0,
+        "ffp_entries": 0,
+        "fp_entries": 0,
+        "success": False,
+    }
 
-def test_parsers():
-    print("Testing feed parsers...")
-    print("-" * 50)
-
-    total_ffp_time = 0
-    total_fp_time = 0
-    total_ffp_entries = 0
-    total_fp_entries = 0
-    fp_time = 0
-
-    successful_feeds = 0
-
-    for url in feeds:
-        print(f"\nTesting {url}")
-        try:
+    try:
+        # Create client per request for ProcessPoolExecutor compatibility
+        with httpx.Client(verify=False) as client:
             resp = client.get(url, timeout=20.0, follow_redirects=True, headers=headers)
             content = resp.content
 
             # Test fastfeedparser
             try:
                 start_time = time.perf_counter()
-                feed = fastfeedparser.parse(content)
-                ffp_time = time.perf_counter() - start_time
-                total_ffp_entries += len(feed.entries)
-                print(f"FastFeedParser: {len(feed.entries)} entries in {ffp_time:.3f}s")
+                for _ in range(iterations):
+                    feed = fastfeedparser.parse(content)
+                result["ffp_time"] = (time.perf_counter() - start_time) / iterations
+                result["ffp_entries"] = len(feed.entries)
+                print(
+                    f"[{url}] FastFeedParser: {len(feed.entries)} entries in {result['ffp_time']:.3f}s (avg of {iterations} runs)"
+                )
             except Exception as e:
-                ffp_time = time.perf_counter() - start_time
-                print(f"FastFeedParser failed: {e}")
+                result["ffp_time"] = (time.perf_counter() - start_time) / iterations
+                print(f"[{url}] FastFeedParser failed: {e}")
 
             # Test feedparser
-            try:
-                start_time = time.perf_counter()
-                feed = feedparser.parse(content)
-                fp_time = time.perf_counter() - start_time
-                total_fp_entries += len(feed.entries)
-                print(f"Feedparser: {len(feed.entries)} entries in {fp_time:.3f}s")
-            except Exception as e:
-                fp_time = time.perf_counter() - start_time
-                print(f"Feedparser failed: {e}")
+            if not skip_feedparser:
+                try:
+                    start_time = time.perf_counter()
+                    for _ in range(iterations):
+                        feed = feedparser.parse(content)
+                    result["fp_time"] = (time.perf_counter() - start_time) / iterations
+                    result["fp_entries"] = len(feed.entries)
+                    print(
+                        f"[{url}] Feedparser: {len(feed.entries)} entries in {result['fp_time']:.3f}s (avg of {iterations} runs)"
+                    )
+                except Exception as e:
+                    result["fp_time"] = (time.perf_counter() - start_time) / iterations
+                    print(f"[{url}] Feedparser failed: {e}")
+
+            if skip_feedparser:
+                if result["ffp_time"] > 0:
+                    result["success"] = True
+            else:
+                if result["ffp_time"] > 0 and result["fp_time"] > 0:
+                    result["success"] = True
+                    print(f"[{url}] Speedup: {result['fp_time']/result['ffp_time']:.1f}x")
+
+    except Exception as e:
+        print(f"[{url}] Failed to fetch feed: {e}")
+
+    return result
+
 
-        total_ffp_time += ffp_time
-        total_fp_time += fp_time
+def test_parsers(skip_feedparser=False, iterations=3):
+    print("Testing feed parsers...")
+    if skip_feedparser:
+        print("Running in FastFeedParser-only mode (-s)")
+    print(f"Processing feeds sequentially (no parallelization)...")
+    print(f"Each feed will be parsed {iterations} times for accurate timing")
+    print("-" * 50)
 
-        print(f"Speedup: {fp_time/ffp_time:.1f}x")
-        if ffp_time > 0 and fp_time > 0:
-            successful_feeds += 1
+    results = []
+    overall_start_time = time.perf_counter()
 
+    # Simple sequential loop - no parallelization
+    for url in feeds:
+        try:
+            result = process_feed(url, skip_feedparser, iterations)
+            results.append(result)
         except Exception as e:
-            print(f"Failed to fetch feed: {e}")
+            print(f"Exception processing {url}: {e}")
+
+    overall_time = time.perf_counter() - overall_start_time
+
+    # Calculate totals
+    total_ffp_time = sum(r["ffp_time"] for r in results)
+    total_fp_time = sum(r["fp_time"] for r in results)
+    total_ffp_entries = sum(r["ffp_entries"] for r in results)
+    total_fp_entries = sum(r["fp_entries"] for r in results)
+    successful_feeds = sum(1 for r in results if r["success"])
+
+    # Find outliers
+    entry_mismatches = []
+    slow_feeds = []
+
+    for r in results:
+        if not r["success"]:
+            continue
+
+        # Check for entry count mismatches
+        if not skip_feedparser and r["ffp_entries"] != r["fp_entries"]:
+            entry_mismatches.append({
+                "url": r["url"],
+                "ffp_entries": r["ffp_entries"],
+                "fp_entries": r["fp_entries"],
+                "diff": r["ffp_entries"] - r["fp_entries"]
+            })
+
+        # Check for slow performance (less than 1.1x speedup)
+        if not skip_feedparser and r["fp_time"] > 0 and r["ffp_time"] > 0:
+            speedup = r["fp_time"] / r["ffp_time"]
+            if speedup < 1.1:
+                slow_feeds.append({
+                    "url": r["url"],
+                    "speedup": speedup,
+                    "ffp_time": r["ffp_time"],
+                    "fp_time": r["fp_time"]
+                })
 
     print("\nSummary:")
     print("-" * 50)
-    print(f"Successfully tested {successful_feeds} feeds")
+    print(f"Total wall-clock time: {overall_time:.2f}s (sequential execution)")
+    print(f"Successfully tested {successful_feeds}/{len(feeds)} feeds")
     if successful_feeds > 0:
-        print(f"Entries FFP: {total_ffp_entries} Entries: FP {total_fp_entries}")
-        print(f"Average FastFeedParser time: {total_ffp_time/successful_feeds:.3f}s")
-        print(f"Average Feedparser time: {total_fp_time/successful_feeds:.3f}s")
-        print(f"FastFeedParser is {(total_fp_time/total_ffp_time):.1f}x faster")
+        print(f"\nFastFeedParser:")
+        print(f"  Total entries: {total_ffp_entries}")
+        print(f"  Total parsing time: {total_ffp_time:.2f}s")
+        print(f"  Average per feed: {total_ffp_time/successful_feeds:.3f}s")
+
+        if not skip_feedparser:
+            print(f"\nFeedparser:")
+            print(f"  Total entries: {total_fp_entries}")
+            print(f"  Total parsing time: {total_fp_time:.2f}s")
+            print(f"  Average per feed: {total_fp_time/successful_feeds:.3f}s")
+            print(
+                f"\nSpeedup: FastFeedParser is {(total_fp_time/total_ffp_time):.1f}x faster"
+            )
+
+    # Report outliers
+    if entry_mismatches:
+        print(f"\n⚠️ OUTLIERS: Entry Count Mismatches ({len(entry_mismatches)} feeds)")
+        print("-" * 50)
+        for m in entry_mismatches:
+            print(f"  {m['url']}")
+            print(f"    FastFeedParser: {m['ffp_entries']} entries")
+            print(f"    Feedparser: {m['fp_entries']} entries")
+            print(f"    Difference: {m['diff']:+d}")
+
+    if slow_feeds:
+        print(f"\n⚠️ OUTLIERS: Slow Performance (<1.1x speedup, {len(slow_feeds)} feeds)")
+        print("-" * 50)
+        slow_feeds.sort(key=lambda x: x["speedup"])
+        for s in slow_feeds:
+            print(f"  {s['url']}")
+            print(f"    Speedup: {s['speedup']:.2f}x")
+            print(f"    FastFeedParser: {s['ffp_time']*1000:.2f}ms")
+            print(f"    Feedparser: {s['fp_time']*1000:.2f}ms")
 
 
 if __name__ == "__main__":
-    test_parsers()
+    parser = argparse.ArgumentParser(description="Benchmark feed parsers")
+    parser.add_argument(
+        "-s",
+        "--skip-feedparser",
+        action="store_true",
+        help="Skip feedparser and run only fastfeedparser",
+    )
+    parser.add_argument(
+        "-i",
+        "--iterations",
+        type=int,
+        default=3,
+        help="Number of iterations to run for each feed (default: 3)",
+    )
+    args = parser.parse_args()
+
+    test_parsers(
+        skip_feedparser=args.skip_feedparser,
+        iterations=args.iterations,
+    )
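
The new flags make two modes available. The following invocations are illustrative, assuming the script is run as benchmark.py from the repository root:

    python benchmark.py           # compare fastfeedparser against feedparser, 3 parses per feed
    python benchmark.py -s        # fastfeedparser only; no baseline comparison
    python benchmark.py -s -i 10  # single-parser mode, averaging each parse over 10 iterations

Averaging over several iterations (-i) smooths out one-off effects such as cold caches, at the cost of a proportionally longer run.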

src/fastfeedparser/main.py (3 additions & 1 deletion)

@@ -514,7 +514,9 @@ def parse(source: str | bytes) -> FastFeedParserDict:
     for tag in ["message", "title", "h1", "h2", "p", "code"]:
         try:
             # Try with and without namespace
-            elem = root.find(f".//{tag}") or root.find(tag)
+            elem = root.find(f".//{tag}")
+            if elem is None:
+                elem = root.find(tag)
             if elem is not None and elem.text:
                 error_msg = elem.text
                 break
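
The one-line change above fixes a subtle truthiness bug: in ElementTree (and lxml), an Element with no child elements evaluates as false, so "find(...) or find(...)" can discard an element that was actually found. A minimal sketch of the pitfall, using the standard-library ElementTree for illustration:

import xml.etree.ElementTree as ET

root = ET.fromstring(
    "<error><detail><message>feed not found</message></detail></error>"
)

# Old pattern: .//message locates the nested element, but it has no child
# elements, so it is falsy and the `or` falls through to find("message"),
# which searches only direct children of <error> and returns None.
elem = root.find(".//message") or root.find("message")
print(elem)  # None - the error text is silently lost

# New pattern from the commit: test for None explicitly.
elem = root.find(".//message")
if elem is None:
    elem = root.find("message")
print(elem.text)  # feed not found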
