mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 11:42:32 +08:00
fix: Implement retry mechanism and improve error handling for scraping function
This commit is contained in:
parent
ddbf0030b4
commit
a3a299fb38
|
@ -35,10 +35,13 @@ export const crawler = https.onRequest(async (req, res) => {
|
|||
// registry.title = 'reader';
|
||||
// registry.version = '0.1.0';
|
||||
|
||||
process.on('unhandledRejection', (_err) => `Somehow is false alarm in firebase`);
|
||||
process.on('unhandledRejection', (reason, promise) => {
|
||||
console.error('Unhandled Rejection at:', promise, 'reason:', reason);
|
||||
// Application-specific logging, throwing an error, or other logic here
|
||||
});
|
||||
|
||||
process.on('uncaughtException', (err) => {
|
||||
console.log('Uncaught exception', err);
|
||||
console.error('Uncaught Exception:', err);
|
||||
|
||||
// It looks like the Firebase runtime does not handle errors properly.
|
||||
// Make sure to quit the process.
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import os from 'os';
|
||||
import fs from 'fs';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
|
||||
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, retry } from 'civkit';
|
||||
import { Logger } from '../shared/index';
|
||||
|
||||
import type { Browser, CookieParam, Page } from 'puppeteer';
|
||||
|
@ -461,6 +461,7 @@ document.addEventListener('load', handlePageLoad);
|
|||
});
|
||||
}
|
||||
|
||||
@retry({ times: 3, delay: 1000 })
|
||||
async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
|
||||
// parsedUrl.search = '';
|
||||
const url = parsedUrl.toString();
|
||||
|
@ -468,9 +469,13 @@ document.addEventListener('load', handlePageLoad);
|
|||
let snapshot: PageSnapshot | undefined;
|
||||
let screenshot: Buffer | undefined;
|
||||
let pageshot: Buffer | undefined;
|
||||
const page = await this.getNextPage();
|
||||
const sn = this.snMap.get(page);
|
||||
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
||||
let page: Page | null = null;
|
||||
let sn: number | undefined;
|
||||
|
||||
try {
|
||||
page = await this.getNextPage();
|
||||
sn = this.snMap.get(page);
|
||||
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
||||
if (options?.proxyUrl) {
|
||||
await page.useProxy(options.proxyUrl);
|
||||
}
|
||||
|
@ -653,11 +658,16 @@ document.addEventListener('load', handlePageLoad);
|
|||
throw error;
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
this.logger.error(`Error scraping ${url}`, { error: marshalErrorLike(error) });
|
||||
throw error;
|
||||
} finally {
|
||||
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
|
||||
page.off('snapshot', hdl);
|
||||
this.ditchPage(page);
|
||||
});
|
||||
if (page) {
|
||||
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
|
||||
page!.off('snapshot', hdl);
|
||||
this.ditchPage(page!);
|
||||
});
|
||||
}
|
||||
nextSnapshotDeferred.resolve();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user