
Commit a7b9ce7

fix: enhance snapshot file handling and browser management (#39)
* fix: improved browser management in the crawler
* fix: enhance snapshot handling and file deletion logic
* chore: 1.5.3
1 parent 674bc5f commit a7b9ce7

11 files changed

Lines changed: 189 additions & 146 deletions


CHANGELOG.md

Lines changed: 6 additions & 0 deletions
@@ -1,3 +1,9 @@
+## 1.5.3 (2026-1-7)
+
+- fix: enhance snapshot handling and file deletion logic
+- fix: improved browser management in the crawler
+- fix: add ignoreRobots param to skip robots.txt detection
+
 ## 1.5.2 (2026-1-5)
 
 - chore: update deps

blocklets/snap-kit/api/src/routes/index.ts

Lines changed: 2 additions & 0 deletions
@@ -19,6 +19,7 @@ const crawlSchema = Joi.object({
   cookies: Joi.array().items(Joi.object({ name: Joi.string().required(), value: Joi.string().required() })),
   localStorage: Joi.array().items(Joi.object({ key: Joi.string().required(), value: Joi.string().required() })),
   sync: Joi.boolean().default(false),
+  ignoreRobots: Joi.boolean().default(true),
 });
 router.post('/crawl', session({ accessKey: true }), auth({ methods: ['accessKey'] }), async (req, res) => {
   const params = await crawlSchema.validateAsync(req.body);
@@ -98,6 +99,7 @@ const snapSchema = Joi.object({
   cookies: Joi.array().items(Joi.object({ name: Joi.string().required(), value: Joi.string().required() })),
   localStorage: Joi.array().items(Joi.object({ key: Joi.string().required(), value: Joi.string().required() })),
   sync: Joi.boolean().default(false),
+  ignoreRobots: Joi.boolean().default(true),
 });
 router.post('/snap', session({ accessKey: true }), auth({ methods: ['accessKey'] }), async (req, res) => {
   const params = await snapSchema.validateAsync(req.body);
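
Both the /crawl and /snap schemas now accept an ignoreRobots flag that defaults to true, so robots.txt checks are skipped unless a caller explicitly opts back in. A minimal sketch of a request exercising the new flag; the base URL, the url field, and the Authorization header are illustrative assumptions, not taken from this commit:

// Hypothetical client call against the /crawl route; the base URL, auth
// header, and url field are assumptions for illustration.
const res = await fetch('http://localhost:3030/api/crawl', {
  method: 'POST',
  headers: {
    'Content-Type': 'application/json',
    Authorization: 'Bearer <access-key>', // assumed accessKey transport
  },
  body: JSON.stringify({
    url: 'https://example.com', // assumed field; not shown in these hunks
    sync: true,
    ignoreRobots: false, // opt back in to robots.txt checking
  }),
});
console.log(res.status, await res.json());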

blocklets/snap-kit/blocklet.yml

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ repository:
   type: git
   url: git+https://github.com/blocklet/create-blocklet.git
 specVersion: 1.2.8
-version: 1.5.2
+version: 1.5.3
 logo: logo.png
 files:
   - dist

package.json

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 {
   "name": "crawler",
   "private": true,
-  "version": "1.5.2",
+  "version": "1.5.3",
   "scripts": {
     "dev": "pnpm run --filter @arcblock/crawler dev & pnpm run --filter @arcblock/crawler-middleware dev & pnpm run --filter @blocklet/snap-kit dev",
     "build:packages": "pnpm -r build",
@@ -61,4 +61,4 @@
   "simple-git-hooks": {
     "pre-commit": "npx lint-staged"
   }
-}
+}

packages/crawler/package.json

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 {
   "name": "@arcblock/crawler",
-  "version": "1.5.2",
+  "version": "1.5.3",
   "main": "lib/cjs/index.js",
   "module": "lib/esm/index.js",
   "types": "lib/cjs/index.d.ts",

packages/crawler/src/crawler.ts

Lines changed: 28 additions & 3 deletions
@@ -8,7 +8,7 @@ import path from 'path';
 
 import { config, logger } from './config';
 import { jobDurationSeconds, jobTotalLatencySeconds, jobsEnqueuedTotal, jobsTotal } from './metrics';
-import { initPage } from './puppeteer';
+import { closeBrowser, initPage, isBrowserConnectionError } from './puppeteer';
 import { createCarbonImage } from './services/carbon';
 import { convertJobToSnapshot, deleteSnapshots, formatSnapshot } from './services/snapshot';
 import { Job, JobState, Snapshot, SnapshotModel, sequelize } from './store';
@@ -45,6 +45,7 @@ export function createCrawlQueue(queue: string, handler?: PageHandler) {
     options: {
       concurrency: config.concurrency,
       enableScheduledJob: true,
+      maxRetries: 3,
     },
     onJob: async (job: JobState) => {
       const startTime = Date.now();
@@ -260,6 +261,21 @@ export const getPageContent = async (
   let screenshot: Uint8Array | null = null;
   const meta: { title?: string; description?: string } = {};
 
+  const closePageSafely = async () => {
+    try {
+      await page.close();
+    } catch (error) {
+      if (isBrowserConnectionError(error)) {
+        try {
+          await closeBrowser({ trimCache: false });
+        } catch (closeError) {
+          logger.warn('Failed to close browser after page close error', { error: closeError });
+        }
+      }
+      logger.warn('Failed to close page:', { error });
+    }
+  };
+
   try {
     const response = await page.goto(url, { timeout });
 
@@ -359,11 +375,20 @@ export const getPageContent = async (
       logger.error('Failed to get html:', err);
       throw err;
     }
+
+    await closePageSafely();
   } catch (error) {
+    if (isBrowserConnectionError(error)) {
+      try {
+        await closeBrowser({ trimCache: false });
+      } catch (closeError) {
+        logger.warn('Failed to close browser after page error', { error: closeError });
+      }
+    } else {
+      await closePageSafely();
+    }
     logger.error('Failed to get page content:', error);
     throw error;
-  } finally {
-    await page.close();
   }
 
   return {
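
The rewritten cleanup path distinguishes browser-connection failures from ordinary page errors: the old unconditional finally { await page.close(); } could itself throw once the browser process had died, whereas the new code tears the whole browser down via closeBrowser and lets the next job relaunch it. Together with maxRetries: 3 on the queue, a job that fails because the browser disconnected can presumably be retried against a fresh browser. The commit does not show how isBrowserConnectionError is implemented in ./puppeteer; a plausible sketch, assuming it matches on the disconnect messages Puppeteer typically surfaces:

// Sketch only: the real helper lives in ./puppeteer and is not part of this
// diff, so the matched messages below are assumptions.
export function isBrowserConnectionError(error: unknown): boolean {
  if (!(error instanceof Error)) return false;
  const message = error.message.toLowerCase();
  // Messages Puppeteer commonly raises once the browser process or its
  // DevTools-protocol connection is gone.
  return (
    message.includes('browser has disconnected') ||
    message.includes('target closed') ||
    message.includes('session closed') ||
    message.includes('connection closed') ||
    message.includes('protocol error')
  );
}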

packages/crawler/src/metrics.ts

Lines changed: 2 additions & 0 deletions
@@ -54,6 +54,8 @@ export async function collectMetrics() {
   try {
     // Collect queue sizes
     const jobStats = await Job.stats();
+    // Reset first to clear queues that no longer have jobs
+    queueSize.reset();
     jobStats.queues.forEach((q) => {
       queueSize.set({ queue: q.queue }, q.count);
     });
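
prom-client gauges remember every label combination they have ever been set with, so a queue that drains and stops appearing in Job.stats() would otherwise keep reporting its stale last value on every scrape; reset() clears all existing series before the fresh counts are written. A self-contained illustration, assuming queueSize is a prom-client Gauge (consistent with the reset() and set() calls above); the metric name and sample data are illustrative:

import { Gauge } from 'prom-client';

// Assumed gauge definition; the metric name and help text are illustrative.
const queueSize = new Gauge({
  name: 'crawler_queue_size',
  help: 'Number of pending jobs per queue',
  labelNames: ['queue'],
});

// Suppose an earlier collection cycle saw two queues...
queueSize.set({ queue: 'crawl' }, 5);
queueSize.set({ queue: 'snap' }, 2);

// ...but the current stats report only one. Resetting first removes the
// stale 'snap' series instead of freezing it at 2.
const stats = [{ queue: 'crawl', count: 3 }];
queueSize.reset();
stats.forEach((q) => queueSize.set({ queue: q.queue }, q.count));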
