From a3a299fb382e5a78612ca16af46a173d08d03f4d Mon Sep 17 00:00:00 2001
From: "Harsh Gupta (aider)"
Date: Wed, 14 Aug 2024 15:46:41 +0530
Subject: [PATCH] fix: Implement retry mechanism and improve error handling
 for scraping function

---
 backend/functions/src/index.ts              |  7 ++++--
 backend/functions/src/services/puppeteer.ts | 26 ++++++++++++++-------
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/backend/functions/src/index.ts b/backend/functions/src/index.ts
index 700f790..2e6b661 100644
--- a/backend/functions/src/index.ts
+++ b/backend/functions/src/index.ts
@@ -35,10 +35,13 @@ export const crawler = https.onRequest(async (req, res) => {
 // registry.title = 'reader';
 // registry.version = '0.1.0';
 
-process.on('unhandledRejection', (_err) => `Somehow is false alarm in firebase`);
+process.on('unhandledRejection', (reason, promise) => {
+    console.error('Unhandled Rejection at:', promise, 'reason:', reason);
+    // Application specific logging, throwing an error, or other logic here
+});
 
 process.on('uncaughtException', (err) => {
-    console.log('Uncaught exception', err);
+    console.error('Uncaught Exception:', err);
 
     // Looks like Firebase runtime does not handle error properly.
     // Make sure to quit the process.
diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts
index 8d53311..0247569 100644
--- a/backend/functions/src/services/puppeteer.ts
+++ b/backend/functions/src/services/puppeteer.ts
@@ -1,7 +1,7 @@
 import os from 'os';
 import fs from 'fs';
 import { container, singleton } from 'tsyringe';
-import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
+import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, retry } from 'civkit';
 import { Logger } from '../shared/index';
 
 import type { Browser, CookieParam, Page } from 'puppeteer';
@@ -461,6 +461,7 @@ document.addEventListener('load', handlePageLoad);
         });
     }
 
+    @retry({ times: 3, delay: 1000 })
     async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
         // parsedUrl.search = '';
         const url = parsedUrl.toString();
@@ -468,9 +469,13 @@ document.addEventListener('load', handlePageLoad);
         let snapshot: PageSnapshot | undefined;
         let screenshot: Buffer | undefined;
         let pageshot: Buffer | undefined;
-        const page = await this.getNextPage();
-        const sn = this.snMap.get(page);
-        this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
+        let page: Page | null = null;
+        let sn: number | undefined;
+
+        try {
+            page = await this.getNextPage();
+            sn = this.snMap.get(page);
+            this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
 
         if (options?.proxyUrl) {
             await page.useProxy(options.proxyUrl);
         }
@@ -653,11 +658,16 @@ document.addEventListener('load', handlePageLoad);
                     throw error;
                 }
             }
+        } catch (error) {
+            this.logger.error(`Error scraping ${url}`, { error: marshalErrorLike(error) });
+            throw error;
         } finally {
-            (waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
-                page.off('snapshot', hdl);
-                this.ditchPage(page);
-            });
+            if (page) {
+                (waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
+                    page!.off('snapshot', hdl);
+                    this.ditchPage(page!);
+                });
+            }
             nextSnapshotDeferred.resolve();
         }
     }
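
For context on the first half of the patch: it decorates `scrap` with a `retry` imported from civkit. The sketch below is a minimal illustration of what such a method decorator could do, assuming TypeScript's experimental decorators and an options object with `times` (max attempts) and `delay` (ms between attempts) as used above; it is not civkit's actual implementation, whose behavior and signature may differ.

// Illustrative retry decorator sketch; civkit's real `retry` may differ.
function retry(opts: { times: number; delay: number }) {
    return function (_target: object, _key: string, descriptor: PropertyDescriptor) {
        const original = descriptor.value as (...args: unknown[]) => Promise<unknown>;
        descriptor.value = async function (this: unknown, ...args: unknown[]) {
            let lastError: unknown;
            for (let attempt = 1; attempt <= opts.times; attempt++) {
                try {
                    return await original.apply(this, args); // success: stop retrying
                } catch (err) {
                    lastError = err;
                    if (attempt < opts.times) {
                        // wait before the next attempt
                        await new Promise((resolve) => setTimeout(resolve, opts.delay));
                    }
                }
            }
            throw lastError; // all attempts exhausted
        };
        return descriptor;
    };
}

One caveat: `scrap` is an async generator, so a promise-oriented wrapper like this one would only retry the creation of the generator object; an error thrown while the caller iterates would still surface after a single attempt unless the decorator is generator-aware.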
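
The second half of the patch follows an acquire-in-try, release-in-finally pattern: the page variable starts out null, and cleanup runs only if acquisition succeeded. The generic sketch below shows the same pattern in isolation; `Pool`, `acquire`, and `release` are hypothetical stand-ins for the browser-page pool the patch manages via `getNextPage` and `ditchPage`.

// Generic resource-scoping helper; `Pool` is a hypothetical interface.
interface Pool<R> {
    acquire(): Promise<R>;
    release(resource: R): void;
}

async function withResource<R, T>(pool: Pool<R>, fn: (resource: R) => Promise<T>): Promise<T> {
    let resource: R | null = null;
    try {
        resource = await pool.acquire(); // may throw: nothing to release yet
        return await fn(resource);
    } finally {
        if (resource) {
            pool.release(resource); // release only what was actually acquired
        }
    }
}

Guarding the release on a null check is what keeps a failed `acquire` (or, in the patch, a failed `getNextPage`) from triggering cleanup on a page that was never obtained.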