Skip to content

Commit 689c760

Browse files
committed
fix: harden feed timestamp parsing
Preserve item dates from updated timestamps and ignore malformed parsed tuples so feed ingestion keeps working across mixed RSS and Atom metadata.
1 parent ee3b02a commit 689c760

2 files changed

Lines changed: 114 additions & 14 deletions

File tree

tests/infrastructure/test_feed_source.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
from datetime import datetime, timezone
2+
from types import SimpleNamespace
23

34
import pytest
45

56
from video_rss_aggregator.application.ports import FetchedFeed, FetchedFeedEntry
7+
from video_rss_aggregator.infrastructure import feed_source as feed_source_module
68
from video_rss_aggregator.infrastructure.feed_source import HttpFeedSource
79

810

@@ -71,3 +73,75 @@ async def test_http_feed_source_fetches_and_maps_entries() -> None:
7173
),
7274
),
7375
)
76+
77+
78+
@pytest.mark.anyio
async def test_http_feed_source_falls_back_to_updated_timestamp() -> None:
    """An Atom entry that only carries ``<updated>`` still gets a UTC ``published_at``."""
    # The entry deliberately has no <published> element.
    atom_document = """
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example atom feed</title>
<link href="https://example.com" />
<entry>
<title>First</title>
<id>first-guid</id>
<updated>2024-01-02T03:04:05Z</updated>
<link href="https://example.com/watch?v=1" />
</entry>
</feed>
"""
    adapter = HttpFeedSource(FakeAsyncClient(FakeResponse(atom_document)))

    fetched = await adapter.fetch("https://example.com/feed.xml")

    expected_entry = FetchedFeedEntry(
        source_url="https://example.com/watch?v=1",
        title="First",
        guid="first-guid",
        published_at=datetime(2024, 1, 2, 3, 4, 5, tzinfo=timezone.utc),
    )
    assert fetched == FetchedFeed(
        title="Example atom feed",
        site_url="https://example.com",
        entries=(expected_entry,),
    )
112+
113+
114+
@pytest.mark.anyio
async def test_http_feed_source_ignores_malformed_parsed_dates(monkeypatch) -> None:
    """A two-item ``published_parsed`` tuple must map to ``published_at=None``."""
    adapter = HttpFeedSource(FakeAsyncClient(FakeResponse("ignored")))

    def fake_parse(_text: str) -> SimpleNamespace:
        # Stand-in for feedparser.parse: a single entry whose pre-parsed
        # date tuple is too short to build a datetime from.
        broken_entry = {
            "title": "First",
            "id": "first-guid",
            "link": "https://example.com/watch?v=1",
            "published_parsed": (2024, 1),
        }
        feed_meta = {"title": "Example feed", "link": "https://example.com"}
        return SimpleNamespace(feed=feed_meta, entries=[broken_entry])

    monkeypatch.setattr(feed_source_module.feedparser, "parse", fake_parse)

    fetched = await adapter.fetch("https://example.com/feed.xml")

    expected_entry = FetchedFeedEntry(
        source_url="https://example.com/watch?v=1",
        title="First",
        guid="first-guid",
        published_at=None,
    )
    assert fetched == FetchedFeed(
        title="Example feed",
        site_url="https://example.com",
        entries=(expected_entry,),
    )

video_rss_aggregator/infrastructure/feed_source.py

Lines changed: 40 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -31,22 +31,48 @@ def _map_entry(entry: Any) -> FetchedFeedEntry:
3131

3232

3333
def _pick_published_at(entry: Any) -> datetime | None:
    """Pick the best available timestamp for a feed entry.

    Publication data always outranks update data: the ``published`` string
    and then the ``published_parsed`` tuple are both tried before anything
    from ``updated`` / ``updated_parsed`` is considered. Otherwise an entry
    whose raw ``published`` string cannot be parsed here, but whose
    ``published_parsed`` tuple is valid, would wrongly report its update
    time as its publication time.

    Args:
        entry: Mapping-like feed entry; only ``entry.get(key)`` is used.

    Returns:
        The first timestamp that parses successfully, or ``None`` when no
        candidate field yields one.
    """
    for prefix in ("published", "updated"):
        raw = entry.get(prefix)
        if raw:
            parsed = _parse_datetime_value(raw)
            if parsed is not None:
                return parsed

        # Fall back to the pre-parsed tuple for the same field before
        # moving on to the next, lower-priority field.
        parts = entry.get(f"{prefix}_parsed")
        if parts is not None:
            parsed = _parse_datetime_tuple(parts)
            if parsed is not None:
                return parsed

    return None
49+
50+
51+
def _parse_datetime_value(value: Any) -> datetime | None:
52+
if not isinstance(value, str):
4753
return None
4854

49-
return datetime(*published_parsed[:6], tzinfo=timezone.utc)
55+
try:
56+
parsed = parsedate_to_datetime(value)
57+
except (TypeError, ValueError, IndexError, OverflowError):
58+
parsed = None
59+
60+
if parsed is None:
61+
try:
62+
parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
63+
except ValueError:
64+
return None
65+
66+
if parsed.tzinfo is None:
67+
return parsed.replace(tzinfo=timezone.utc)
68+
return parsed.astimezone(timezone.utc)
69+
70+
71+
def _parse_datetime_tuple(value: Any) -> datetime | None:
72+
try:
73+
return datetime(*value[:6], tzinfo=timezone.utc)
74+
except (TypeError, ValueError, IndexError, OverflowError):
75+
return None
5076

5177

5278
@dataclass(frozen=True)

0 commit comments

Comments
 (0)