fix: Implement retry mechanism and improve error handling for scraping function

Harsh Gupta (aider) 2024-08-14 15:46:41 +05:30
parent ddbf0030b4
commit a3a299fb38
2 changed files with 23 additions and 10 deletions

View File

@@ -35,10 +35,13 @@ export const crawler = https.onRequest(async (req, res) => {
 // registry.title = 'reader';
 // registry.version = '0.1.0';
-process.on('unhandledRejection', (_err) => `Somehow is false alarm in firebase`);
+process.on('unhandledRejection', (reason, promise) => {
+    console.error('Unhandled Rejection at:', promise, 'reason:', reason);
+    // Application specific logging, throwing an error, or other logic here
+});
 process.on('uncaughtException', (err) => {
-    console.log('Uncaught exception', err);
+    console.error('Uncaught Exception:', err);
     // Looks like Firebase runtime does not handle error properly.
     // Make sure to quit the process.

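This hunk replaces a no-op unhandledRejection handler with one that actually logs the failure, and upgrades the uncaughtException log from console.log to console.error. For context, the full pattern the trailing comment alludes to ("Make sure to quit the process") looks like the following minimal sketch; the process.exit call is an assumption drawn from that comment, not shown in the hunk itself:

process.on('unhandledRejection', (reason, promise) => {
    // A rejection with no .catch() handler: log it and keep serving.
    console.error('Unhandled Rejection at:', promise, 'reason:', reason);
});

process.on('uncaughtException', (err) => {
    // After a synchronous throw the process state is undefined:
    // log, then exit so the runtime can start a fresh instance.
    console.error('Uncaught Exception:', err);
    process.exit(1);
});
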
View File

@@ -1,7 +1,7 @@
 import os from 'os';
 import fs from 'fs';
 import { container, singleton } from 'tsyringe';
-import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
+import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, retry } from 'civkit';
 import { Logger } from '../shared/index';
 import type { Browser, CookieParam, Page } from 'puppeteer';
@@ -461,6 +461,7 @@ document.addEventListener('load', handlePageLoad);
         });
     }

+    @retry({ times: 3, delay: 1000 })
     async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
         // parsedUrl.search = '';
         const url = parsedUrl.toString();
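
The scrap generator is now wrapped with civkit's retry decorator, so a failed scrape is re-attempted up to 3 times with a 1000 ms pause between attempts. As an illustration only (civkit's actual implementation is not shown in this commit, and retrying an async generator has subtleties a plain-method sketch glosses over), a retry decorator of this shape can be written as:

// Hypothetical sketch, not civkit's code: retry an async method
// up to `times` attempts, sleeping `delay` ms between failures.
function retry(opts: { times: number; delay: number }) {
    return (_target: unknown, _key: string, descriptor: PropertyDescriptor) => {
        const original = descriptor.value;
        descriptor.value = async function (...args: unknown[]) {
            let lastError: unknown;
            for (let attempt = 1; attempt <= opts.times; attempt++) {
                try {
                    return await original.apply(this, args);
                } catch (err) {
                    lastError = err;
                    if (attempt < opts.times) {
                        await new Promise((resolve) => setTimeout(resolve, opts.delay));
                    }
                }
            }
            throw lastError;
        };
        return descriptor;
    };
}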
@@ -468,9 +469,13 @@ document.addEventListener('load', handlePageLoad);
         let snapshot: PageSnapshot | undefined;
         let screenshot: Buffer | undefined;
         let pageshot: Buffer | undefined;
-        const page = await this.getNextPage();
-        const sn = this.snMap.get(page);
-        this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
+        let page: Page | null = null;
+        let sn: number | undefined;
+        try {
+            page = await this.getNextPage();
+            sn = this.snMap.get(page);
+            this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
         if (options?.proxyUrl) {
             await page.useProxy(options.proxyUrl);
         }
@ -653,11 +658,16 @@ document.addEventListener('load', handlePageLoad);
throw error;
}
}
} catch (error) {
this.logger.error(`Error scraping ${url}`, { error: marshalErrorLike(error) });
throw error;
} finally {
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
page.off('snapshot', hdl);
this.ditchPage(page);
});
if (page) {
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
page!.off('snapshot', hdl);
this.ditchPage(page!);
});
}
nextSnapshotDeferred.resolve();
}
}
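
Taken together, the last two hunks restructure scrap so the page is acquired inside the try block rather than before it: page starts as null, the catch block logs and rethrows (which is what lets the @retry decorator above observe the failure and re-attempt), and the finally block only detaches the snapshot handler and returns the page to the pool when acquisition actually succeeded. Stripped of the snapshot plumbing, the control flow is the following sketch (getNextPage, ditchPage, and this.logger are the class's own members from the diff; everything else is illustrative):

// Condensed sketch of the new acquire-in-try / guarded-cleanup shape.
let page: Page | null = null;
try {
    page = await this.getNextPage();     // may itself throw
    // ... navigate and yield snapshots ...
} catch (error) {
    this.logger.error(`Error scraping ${url}`, { error: marshalErrorLike(error) });
    throw error;                         // rethrow so the retry wrapper sees it
} finally {
    if (page) {
        this.ditchPage(page);            // release only if acquisition succeeded
    }
}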