Skip to content

Commit 66f8d6f

Browse files
authored
Handle extremely large feeds more efficiently when viewing articles (#433)
1 parent b50975b commit 66f8d6f

11 files changed

Lines changed: 443 additions & 129 deletions

File tree

services/user-feeds-next/src/articles/parser/article-parser.ts

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,22 @@ export function flattenArticle(
6060
formatOptions?: UserFeedFormatOptions;
6161
useParserRules?: PostProcessParserRule[];
6262
}
63+
): FlattenedArticleWithoutId {
64+
const base = flattenArticleLightweight(input, options);
65+
66+
return enrichFlattenedArticle(base, options);
67+
}
68+
69+
/**
70+
* Lightweight flatten: same as flattenArticle but skips extractExtraInfo and runPostProcessRules.
71+
* Used for pagination/search where extracted::/processed:: fields aren't needed.
72+
*/
73+
export function flattenArticleLightweight(
74+
input: Record<string, unknown>,
75+
options: {
76+
formatOptions?: UserFeedFormatOptions;
77+
useParserRules?: PostProcessParserRule[];
78+
}
6379
): FlattenedArticleWithoutId {
6480
const flattened = flatten(input, {
6581
delimiter: ARTICLE_FIELD_DELIMITER,
@@ -89,7 +105,6 @@ export function flattenArticle(
89105
try {
90106
// eslint-disable-next-line no-control-regex
91107
if (/[^\x00-\x7F]/.test(requestedTimezone)) {
92-
// Non-ASCII characters (e.g., Unicode minus) aren't valid in timezones
93108
throw new Error("Invalid timezone");
94109
}
95110
dateVal = dayjs(value).tz(requestedTimezone);
@@ -139,8 +154,20 @@ export function flattenArticle(
139154
newRecord[key] = String(value);
140155
});
141156

142-
// Extract images and anchors from HTML content
143-
const entries = Object.entries(newRecord);
157+
return newRecord;
158+
}
159+
160+
/**
161+
* Enrich a lightweight-flattened article with extracted:: and processed:: fields.
162+
* Runs the expensive operations that flattenArticleLightweight skips.
163+
*/
164+
export function enrichFlattenedArticle(
165+
flattened: FlattenedArticleWithoutId,
166+
options: { useParserRules?: PostProcessParserRule[] }
167+
): FlattenedArticleWithoutId {
168+
const enriched = { ...flattened };
169+
170+
const entries = Object.entries(flattened);
144171

145172
for (let i = 0; i < entries.length; i++) {
146173
const [key, value] = entries[i]!;
@@ -149,20 +176,18 @@ export function flattenArticle(
149176

150177
if (imageList.length) {
151178
for (let j = 0; j < imageList.length; j++) {
152-
const image = imageList[j];
153-
newRecord[`extracted::${key}::image${j + 1}`] = image!;
179+
enriched[`extracted::${key}::image${j + 1}`] = imageList[j]!;
154180
}
155181
}
156182

157183
if (anchorList.length) {
158184
for (let j = 0; j < anchorList.length; j++) {
159-
const anchor = anchorList[j];
160-
newRecord[`extracted::${key}::anchor${j + 1}`] = anchor!;
185+
enriched[`extracted::${key}::anchor${j + 1}`] = anchorList[j]!;
161186
}
162187
}
163188
}
164189

165-
return runPostProcessRules(newRecord, options.useParserRules);
190+
return runPostProcessRules(enriched, options.useParserRules);
166191
}
167192

168193
/**
@@ -176,6 +201,7 @@ export async function parseArticlesFromXml(
176201
useParserRules?: PostProcessParserRule[];
177202
externalFeedProperties?: ExternalFeedProperty[];
178203
externalFetchFn?: ExternalFetchFn;
204+
lightweight?: boolean;
179205
} = {}
180206
): Promise<ParseArticlesResult> {
181207
const feedparser = new FeedParser({});
@@ -240,7 +266,10 @@ export async function parseArticlesFromXml(
240266
idType
241267
);
242268

243-
const flattened = flattenArticle(rawArticle as never, {
269+
const flattenFn = options.lightweight
270+
? flattenArticleLightweight
271+
: flattenArticle;
272+
const flattened = flattenFn(rawArticle as never, {
244273
formatOptions: options.formatOptions,
245274
useParserRules: options.useParserRules,
246275
});
@@ -287,8 +316,9 @@ export async function parseArticlesFromXml(
287316
idHashes.add(idHash);
288317
}
289318

290-
// Inject external content if configured
319+
// Inject external content if configured (skip in lightweight mode)
291320
if (
321+
!options.lightweight &&
292322
options.externalFeedProperties?.length &&
293323
options.externalFetchFn &&
294324
mappedArticles.length <= MAX_ARTICLE_INJECTION_ARTICLE_COUNT

services/user-feeds-next/src/articles/parser/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
export {
22
parseArticlesFromXml,
33
flattenArticle,
4+
flattenArticleLightweight,
5+
enrichFlattenedArticle,
46
FeedParseTimeoutException,
57
InvalidFeedException,
68
} from "./article-parser";

services/user-feeds-next/src/articles/parser/worker/feed-parser-pool.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ export interface ParseArticlesOptions {
4040
useParserRules?: PostProcessParserRule[];
4141
externalFeedProperties?: ExternalFeedProperty[];
4242
externalFetchFn?: ExternalFetchFn;
43+
lightweight?: boolean;
4344
}
4445

4546
/**

services/user-feeds-next/src/feed-fetcher/feed-fetcher.ts

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,29 @@ import {
1818

1919
const API_KEY = process.env.USER_FEEDS_FEED_REQUESTS_API_KEY || "";
2020

21+
/**
 * Build a single-line description of an error and its chain of causes.
 *
 * Walks `err` and each nested `cause` for as long as the current value is an
 * `Error`, collecting every message (suffixed with ` [code]` when the error
 * carries a truthy `code` property). If the chain ends in a defined value
 * that is not an `Error`, its string form is appended as the final segment.
 * Segments are joined with " -> ".
 *
 * @param err - The caught value; may be an Error, any other value, or undefined.
 * @returns The joined description, or an empty string when `err` is undefined.
 */
function getErrorDetails(err: unknown): string {
  const parts: string[] = [];
  // Errors already visited, so a circular `cause` chain cannot loop forever.
  const seen = new Set<Error>();
  let current: unknown = err;

  while (current instanceof Error && !seen.has(current)) {
    seen.add(current);

    const e = current as Error & { code?: string; cause?: unknown };

    parts.push(e.code ? `${e.message} [${e.code}]` : e.message);
    current = e.cause;
  }

  // A trailing non-Error cause (e.g. a string) is still worth reporting.
  // If the loop stopped on an already-seen Error (a cycle), add nothing more.
  if (current !== undefined && !(current instanceof Error)) {
    parts.push(String(current));
  }

  return parts.join(" -> ");
}
43+
2144
export async function fetchFeed(
2245
url: string,
2346
options: {
@@ -54,15 +77,15 @@ export async function fetchFeed(
5477
accept: "application/json",
5578
"api-key": API_KEY,
5679
},
57-
}),
80+
} as RequestInit),
5881
{
5982
retries: options?.retries ?? 2,
6083
randomize: true,
6184
}
6285
);
6386
} catch (err) {
6487
throw new FeedRequestNetworkException(
65-
`Failed to execute request to feed requests API: ${(err as Error).message}`
88+
`Failed to execute request to feed requests API: ${getErrorDetails(err)}`
6689
);
6790
}
6891

@@ -194,15 +217,15 @@ export async function fetchFeedForDeliveryPreview(
194217
accept: "application/json",
195218
"api-key": API_KEY,
196219
},
197-
}),
220+
} as RequestInit),
198221
{
199222
retries: 2,
200223
randomize: true,
201224
}
202225
);
203226
} catch (err) {
204227
throw new FeedRequestNetworkException(
205-
`Failed to execute request to feed requests API: ${(err as Error).message}`
228+
`Failed to execute request to feed requests API: ${getErrorDetails(err)}`
206229
);
207230
}
208231

services/user-feeds-next/src/feeds/services/articles.service.ts

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ export interface FindOrFetchFeedArticlesOptions extends FetchFeedArticleOptions
4444
executeFetch?: boolean;
4545
executeFetchIfStale?: boolean;
4646
parsedArticlesCacheStore?: ParsedArticlesCacheStore;
47+
lightweight?: boolean;
4748
}
4849

4950
export interface FetchFeedArticlesResult {
@@ -123,8 +124,11 @@ export async function findOrFetchFeedArticles(
123124
dateTimezone: options.formatOptions?.dateTimezone,
124125
dateLocale: options.formatOptions?.dateLocale,
125126
},
126-
externalFeedProperties: options.externalFeedProperties,
127+
externalFeedProperties: options.lightweight
128+
? undefined
129+
: options.externalFeedProperties,
127130
requestLookupDetails: options.requestLookupDetails ?? undefined,
131+
lightweight: options.lightweight,
128132
};
129133

130134
// Check cache first
@@ -177,6 +181,7 @@ export async function findOrFetchFeedArticles(
177181
try {
178182
const parsed = await parseArticlesFromXml(result.body, {
179183
formatOptions: options.formatOptions,
184+
lightweight: options.lightweight,
180185
});
181186
articles = parsed.articles;
182187
feed = parsed.feed;
@@ -212,9 +217,9 @@ export async function findOrFetchFeedArticles(
212217
throw err;
213218
}
214219

215-
// Inject external content if external properties are specified
220+
// Inject external content if external properties are specified (skip in lightweight mode)
216221
let externalContentErrors: ExternalContentError[] = [];
217-
if (options.externalFeedProperties?.length) {
222+
if (!options.lightweight && options.externalFeedProperties?.length) {
218223
externalContentErrors = await injectExternalContent(
219224
articles,
220225
options.externalFeedProperties,

services/user-feeds-next/src/feeds/services/feeds.service.ts

Lines changed: 12 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -4,56 +4,48 @@
44
*/
55

66
import type { Article } from "../../articles/parser";
7-
import {
8-
evaluateExpression,
9-
buildFilterReferences,
10-
type LogicalExpression,
11-
} from "../../articles/filters";
127
import { INJECTED_ARTICLE_PLACEHOLDER_PREFIX } from "../../shared/constants";
138
import {
14-
GetUserFeedArticlesFilterReturnType,
159
SelectPropertyType,
1610
type CustomPlaceholder,
1711
} from "../../http/schemas";
1812
import { getNumbersInRange } from "./utils";
1913

20-
export interface QueryForArticlesInput {
14+
export interface PaginateArticlesInput {
2115
articles: Article[];
2216
limit: number;
2317
skip: number;
2418
random?: boolean;
2519
selectProperties?: string[];
2620
selectPropertyTypes?: SelectPropertyType[];
21+
customPlaceholders?: CustomPlaceholder[] | null;
2722
filters?: {
28-
returnType?: GetUserFeedArticlesFilterReturnType;
29-
expression?: Record<string, unknown>;
3023
articleId?: string;
3124
articleIdHashes?: string[];
3225
search?: string;
3326
};
34-
customPlaceholders?: CustomPlaceholder[] | null;
3527
}
3628

37-
export interface QueryForArticlesOutput {
29+
export interface PaginateArticlesOutput {
3830
articles: Article[];
39-
properties: string[];
4031
totalArticles: number;
41-
filterEvalResults?: Array<{ passed: boolean }>;
32+
properties: string[];
4233
}
4334

4435
/**
45-
* Query for articles with filtering, sorting, and pagination.
36+
* Paginate articles: sort, filter by ID/search, and apply skip/limit.
37+
* Does NOT trim properties or evaluate filter expressions — those happen after formatting.
4638
*/
47-
export async function queryForArticles({
39+
export function paginateArticles({
4840
articles,
4941
limit,
5042
skip,
5143
random,
5244
selectProperties,
5345
selectPropertyTypes,
54-
filters,
5546
customPlaceholders,
56-
}: QueryForArticlesInput): Promise<QueryForArticlesOutput> {
47+
filters,
48+
}: PaginateArticlesInput): PaginateArticlesOutput {
5749
const placeholdersFromCustomPlaceholders =
5850
customPlaceholders?.map((c) => c.sourcePlaceholder) || [];
5951
const properties = queryForArticleProperties(
@@ -63,12 +55,7 @@ export async function queryForArticles({
6355
);
6456

6557
if (articles.length === 0) {
66-
return {
67-
articles: [],
68-
properties,
69-
totalArticles: 0,
70-
filterEvalResults: [],
71-
};
58+
return { articles: [], properties, totalArticles: 0 };
7259
}
7360

7461
// Sort by date, latest first
@@ -106,7 +93,7 @@ export async function queryForArticles({
10693
const filtersSearch = filters?.search;
10794

10895
if (filtersSearch && typeof filtersSearch === "string") {
109-
matchedArticles = articles.filter((article) => {
96+
matchedArticles = matchedArticles.filter((article) => {
11097
return properties.some((property) =>
11198
article.flattened[property]
11299
?.toLowerCase()
@@ -134,54 +121,10 @@ export async function queryForArticles({
134121
}
135122
}
136123

137-
// Trim articles to only include selected properties
138-
const matchedArticlesWithProperties = matchedArticles.map((article) => {
139-
const trimmed: Article = {
140-
...article,
141-
flattened: {
142-
id: article.flattened.id,
143-
idHash: article.flattened.idHash,
144-
},
145-
raw: article.raw,
146-
};
147-
148-
properties.forEach((property) => {
149-
trimmed.flattened[property] = article.flattened[property] || "";
150-
});
151-
152-
return trimmed;
153-
});
154-
155-
// Evaluate filter expressions if requested
156-
let filterEvalResults: Array<{ passed: boolean }> | undefined;
157-
158-
if (
159-
filters?.returnType ===
160-
GetUserFeedArticlesFilterReturnType.IncludeEvaluationResults
161-
) {
162-
if (filters.expression) {
163-
filterEvalResults = await Promise.all(
164-
matchedArticles.map(async (article) => {
165-
const { result: passed } = evaluateExpression(
166-
filters.expression as unknown as LogicalExpression,
167-
buildFilterReferences(article)
168-
);
169-
170-
return {
171-
passed,
172-
};
173-
})
174-
);
175-
} else {
176-
filterEvalResults = matchedArticles.map(() => ({ passed: true }));
177-
}
178-
}
179-
180124
return {
181-
articles: matchedArticlesWithProperties,
125+
articles: matchedArticles,
182126
totalArticles: totalMatchedArticles,
183127
properties,
184-
filterEvalResults,
185128
};
186129
}
187130

0 commit comments

Comments
 (0)