@@ ... @@
+import argparse
 import time
 
 import fastfeedparser
@@ ... @@
     "https://blog.drewolson.org/index.xml",
     "https://blog.noredink.com/rss",
     "https://glasspetalsmoke.blogspot.com/feeds/posts/default",
+    "https://feeds.washingtonpost.com/rss/world",
+    "https://abcnews.go.com/abcnews/internationalheadlines",
+    "https://aljazeera.com/xml/rss/all.xml",
+    "https://allafrica.com/tools/headlines/rdf/latest/headlines.rdf",
+    "https://api.axios.com/feed/world",
+    "https://en.mercopress.com/rss/",
+    "https://feeds.a.dj.com/rss/RSSWorldNews.xml",
+    "https://feeds.bbci.co.uk/news/world/rss.xml",
+    "https://feeds.elpais.com/mrss-s/pages/ep/site/english.elpais.com/portada",
+    "https://feeds.feedburner.com/ndtvnews-world-news",
+    "https://feeds.npr.org/1004/rss.xml",
+    "https://foreignpolicy.com/feed/",
+    "https://japantoday.com/category/world/feed",
+    "https://restofworld.org/feed/latest",
+    "https://rss.csmonitor.com/feeds/all",
+    "https://rss.dw.com/rdf/rss-en-all",
+    "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
+    "https://theweek.com/feeds.xml",
+    "https://time.com/world/feed",
+    "https://www.abc.net.au/news/feed/45910/rss.xml",
+    "https://www.al-monitor.com/rss",
+    "https://www.boston.com/tag/world-news/feed/",
+    "https://www.cbsnews.com/latest/rss/world",
+    "https://www.dawn.com/feeds/world/",
+    "https://www.economist.com/asia/rss.xml",
+    "https://time.com/tech/feed/",
+    "http://stratechery.com/feed/",
+    "https://www.404media.co/rss/",
 ]
 
 
@@ ... @@
     "Connection": "close",
 }
 
-client = httpx.Client(verify=False)
 
+def process_feed(url, skip_feedparser=False, iterations=3):
+    """Process a single feed and return timing results."""
+    # Times are per-parse averages in seconds; 0 means failed or skipped,
+    # which also keeps the feed out of the summary totals.
+    result = {
+        "url": url,
+        "ffp_time": 0,
+        "fp_time": 0,
+        "ffp_entries": 0,
+        "fp_entries": 0,
+        "success": False,
+    }
 
-def test_parsers():
-    print("Testing feed parsers...")
-    print("-" * 50)
-
-    total_ffp_time = 0
-    total_fp_time = 0
-    total_ffp_entries = 0
-    total_fp_entries = 0
-    fp_time = 0
-
-    successful_feeds = 0
-
-    for url in feeds:
-        print(f"\nTesting {url}")
-        try:
+    try:
+        # One client per call keeps process_feed self-contained, so it could
+        # later be handed to a worker pool even though this script is sequential.
+        with httpx.Client(verify=False) as client:
             resp = client.get(url, timeout=20.0, follow_redirects=True, headers=headers)
             content = resp.content
 
             # Test fastfeedparser
             try:
                 start_time = time.perf_counter()
-                feed = fastfeedparser.parse(content)
-                ffp_time = time.perf_counter() - start_time
-                total_ffp_entries += len(feed.entries)
-                print(f"FastFeedParser: {len(feed.entries)} entries in {ffp_time:.3f}s")
+                for _ in range(iterations):
+                    feed = fastfeedparser.parse(content)
+                result["ffp_time"] = (time.perf_counter() - start_time) / iterations
+                result["ffp_entries"] = len(feed.entries)
+                print(
+                    f"[{url}] FastFeedParser: {len(feed.entries)} entries in {result['ffp_time']:.3f}s (avg of {iterations} runs)"
+                )
             except Exception as e:
-                ffp_time = time.perf_counter() - start_time
-                print(f"FastFeedParser failed: {e}")
+                # Leave ffp_time at 0 so a failed parse is not counted as a
+                # success and does not skew the timing totals.
+                print(f"[{url}] FastFeedParser failed: {e}")
 
             # Test feedparser
-            try:
-                start_time = time.perf_counter()
-                feed = feedparser.parse(content)
-                fp_time = time.perf_counter() - start_time
-                total_fp_entries += len(feed.entries)
-                print(f"Feedparser: {len(feed.entries)} entries in {fp_time:.3f}s")
-            except Exception as e:
-                fp_time = time.perf_counter() - start_time
-                print(f"Feedparser failed: {e}")
+            if not skip_feedparser:
+                try:
+                    start_time = time.perf_counter()
+                    for _ in range(iterations):
+                        feed = feedparser.parse(content)
+                    result["fp_time"] = (time.perf_counter() - start_time) / iterations
+                    result["fp_entries"] = len(feed.entries)
+                    print(
+                        f"[{url}] Feedparser: {len(feed.entries)} entries in {result['fp_time']:.3f}s (avg of {iterations} runs)"
+                    )
+                except Exception as e:
+                    # Leave fp_time at 0 so the failure shows up in the summary.
+                    print(f"[{url}] Feedparser failed: {e}")
+
+            # A feed counts as successful only if every parser that ran got a time.
+            if skip_feedparser:
+                result["success"] = result["ffp_time"] > 0
+            elif result["ffp_time"] > 0 and result["fp_time"] > 0:
+                result["success"] = True
+                print(f"[{url}] Speedup: {result['fp_time']/result['ffp_time']:.1f}x")
+
+    except Exception as e:
+        print(f"[{url}] Failed to fetch feed: {e}")
+
+    return result
+
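Note on the timing loop above: it reports total elapsed time divided by `iterations`, so one cold-cache run can skew the average. A variant worth considering (not part of this commit) is the stdlib `timeit`, which yields one measurement per run so a median can be taken; `time_parse` below is a hypothetical helper, a minimal sketch only:

```python
import statistics
import timeit

def time_parse(parse_fn, content, iterations=3):
    # timeit.repeat returns one wall-clock measurement per repeat;
    # the median is less sensitive than the mean to a single slow run.
    runs = timeit.repeat(lambda: parse_fn(content), number=1, repeat=iterations)
    return statistics.median(runs)
```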
 
-            total_ffp_time += ffp_time
-            total_fp_time += fp_time
+def test_parsers(skip_feedparser=False, iterations=3):
+    print("Testing feed parsers...")
+    if skip_feedparser:
+        print("Running in FastFeedParser-only mode (-s)")
+    print("Processing feeds sequentially (no parallelization)...")
+    print(f"Each feed will be parsed {iterations} times for accurate timing")
+    print("-" * 50)
 
-            print(f"Speedup: {fp_time/ffp_time:.1f}x")
-            if ffp_time > 0 and fp_time > 0:
-                successful_feeds += 1
+    results = []
+    overall_start_time = time.perf_counter()
 
+    # Simple sequential loop - no parallelization
+    for url in feeds:
+        try:
+            result = process_feed(url, skip_feedparser, iterations)
+            results.append(result)
         except Exception as e:
-            print(f"Failed to fetch feed: {e}")
+            print(f"Exception processing {url}: {e}")
+
+    overall_time = time.perf_counter() - overall_start_time
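Because each `process_feed` call now owns its `httpx.Client`, the sequential loop above could later be swapped for a worker pool. A minimal sketch, assuming `concurrent.futures` (not part of this commit); parallel fetching overlaps network waits, though contended CPU could skew the parse timings, which is presumably why the commit keeps the loop sequential:

```python
from concurrent.futures import ThreadPoolExecutor
from functools import partial

# Each worker calls process_feed, which creates its own client,
# so no connection state is shared across threads.
run = partial(process_feed, skip_feedparser=skip_feedparser, iterations=iterations)
with ThreadPoolExecutor(max_workers=8) as pool:
    results = list(pool.map(run, feeds))
```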
+
+    # Calculate totals
+    total_ffp_time = sum(r["ffp_time"] for r in results)
+    total_fp_time = sum(r["fp_time"] for r in results)
+    total_ffp_entries = sum(r["ffp_entries"] for r in results)
+    total_fp_entries = sum(r["fp_entries"] for r in results)
+    successful_feeds = sum(1 for r in results if r["success"])
+
+    # Find outliers
+    entry_mismatches = []
+    slow_feeds = []
+
+    for r in results:
+        if not r["success"]:
+            continue
+
+        # Check for entry count mismatches
+        if not skip_feedparser and r["ffp_entries"] != r["fp_entries"]:
+            entry_mismatches.append({
+                "url": r["url"],
+                "ffp_entries": r["ffp_entries"],
+                "fp_entries": r["fp_entries"],
+                "diff": r["ffp_entries"] - r["fp_entries"],
+            })
+
+        # Check for slow performance (less than 1.1x speedup)
+        if not skip_feedparser and r["fp_time"] > 0 and r["ffp_time"] > 0:
+            speedup = r["fp_time"] / r["ffp_time"]
+            if speedup < 1.1:
+                slow_feeds.append({
+                    "url": r["url"],
+                    "speedup": speedup,
+                    "ffp_time": r["ffp_time"],
+                    "fp_time": r["fp_time"],
+                })
 
     print("\nSummary:")
     print("-" * 50)
-    print(f"Successfully tested {successful_feeds} feeds")
+    print(f"Total wall-clock time: {overall_time:.2f}s")
+    print(f"Successfully tested {successful_feeds}/{len(feeds)} feeds")
     if successful_feeds > 0:
-        print(f"Entries FFP: {total_ffp_entries} Entries: FP {total_fp_entries}")
-        print(f"Average FastFeedParser time: {total_ffp_time/successful_feeds:.3f}s")
-        print(f"Average Feedparser time: {total_fp_time/successful_feeds:.3f}s")
-        print(f"FastFeedParser is {(total_fp_time/total_ffp_time):.1f}x faster")
+        print("\nFastFeedParser:")
+        print(f"  Total entries: {total_ffp_entries}")
+        print(f"  Total parsing time: {total_ffp_time:.2f}s")
+        print(f"  Average per feed: {total_ffp_time/successful_feeds:.3f}s")
+
+        if not skip_feedparser:
+            print("\nFeedparser:")
+            print(f"  Total entries: {total_fp_entries}")
+            print(f"  Total parsing time: {total_fp_time:.2f}s")
+            print(f"  Average per feed: {total_fp_time/successful_feeds:.3f}s")
+            print(
+                f"\nSpeedup: FastFeedParser is {(total_fp_time/total_ffp_time):.1f}x faster"
+            )
+
+    # Report outliers
+    if entry_mismatches:
+        print(f"\n⚠️ OUTLIERS: Entry Count Mismatches ({len(entry_mismatches)} feeds)")
+        print("-" * 50)
+        for m in entry_mismatches:
+            print(f"  {m['url']}")
+            print(f"    FastFeedParser: {m['ffp_entries']} entries")
+            print(f"    Feedparser: {m['fp_entries']} entries")
+            print(f"    Difference: {m['diff']:+d}")
+
+    if slow_feeds:
+        print(f"\n⚠️ OUTLIERS: Slow Performance (<1.1x speedup, {len(slow_feeds)} feeds)")
+        print("-" * 50)
+        slow_feeds.sort(key=lambda x: x["speedup"])
+        for s in slow_feeds:
+            print(f"  {s['url']}")
+            print(f"    Speedup: {s['speedup']:.2f}x")
+            print(f"    FastFeedParser: {s['ffp_time']*1000:.2f}ms")
+            print(f"    Feedparser: {s['fp_time']*1000:.2f}ms")
 
 
 if __name__ == "__main__":
-    test_parsers()
+    parser = argparse.ArgumentParser(description="Benchmark feed parsers")
+    parser.add_argument(
+        "-s",
+        "--skip-feedparser",
+        action="store_true",
+        help="Skip feedparser and run only fastfeedparser",
+    )
+    parser.add_argument(
+        "-i",
+        "--iterations",
+        type=int,
+        default=3,
+        help="Number of iterations to run for each feed (default: 3)",
+    )
+    args = parser.parse_args()
+
+    test_parsers(
+        skip_feedparser=args.skip_feedparser,
+        iterations=args.iterations,
+    )
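With the new flags, typical invocations would look like the following; the script's filename isn't shown in this diff, so `benchmark.py` is a placeholder:

```
python benchmark.py           # compare both parsers, 3 parses per feed
python benchmark.py -s        # fastfeedparser only
python benchmark.py -s -i 10  # fastfeedparser only, 10 parses per feed
```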