Fix PuppeteerControl: remove the @retry decorator and the extra try/catch wrappers from scrap() and salvage()

This commit is contained in:
Harsh Gupta 2024-08-14 16:39:59 +05:30
parent a72373f815
commit 54aae972ae

View File

@ -1,7 +1,7 @@
import os from 'os'; import os from 'os';
import fs from 'fs'; import fs from 'fs';
import { container, singleton } from 'tsyringe'; import { container, singleton } from 'tsyringe';
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, retry } from 'civkit'; import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
import { Logger } from '../shared/index'; import { Logger } from '../shared/index';
import type { Browser, CookieParam, Page } from 'puppeteer'; import type { Browser, CookieParam, Page } from 'puppeteer';
@ -203,7 +203,7 @@ export class PuppeteerControl extends AsyncService {
_sn = 0; _sn = 0;
browser!: Browser; browser!: Browser;
logger = new Logger('PuppeteerControl') logger = new Logger('CHANGE_LOGGER_NAME')
private __healthCheckInterval?: NodeJS.Timeout; private __healthCheckInterval?: NodeJS.Timeout;
@ -461,7 +461,6 @@ document.addEventListener('load', handlePageLoad);
}); });
} }
@retry({ times: 3, delay: 1000 })
async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> { async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
// parsedUrl.search = ''; // parsedUrl.search = '';
const url = parsedUrl.toString(); const url = parsedUrl.toString();
@ -469,12 +468,8 @@ document.addEventListener('load', handlePageLoad);
let snapshot: PageSnapshot | undefined; let snapshot: PageSnapshot | undefined;
let screenshot: Buffer | undefined; let screenshot: Buffer | undefined;
let pageshot: Buffer | undefined; let pageshot: Buffer | undefined;
let page: Page | null = null; const page = await this.getNextPage();
let sn: number | undefined; const sn = this.snMap.get(page);
try {
page = await this.getNextPage();
sn = this.snMap.get(page);
this.logger.info(`Page ${sn}: Scraping ${url}`, { url }); this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
if (options?.proxyUrl) { if (options?.proxyUrl) {
await page.useProxy(options.proxyUrl); await page.useProxy(options.proxyUrl);
@ -658,22 +653,16 @@ document.addEventListener('load', handlePageLoad);
throw error; throw error;
} }
} }
} catch (error) {
this.logger.error(`Error scraping ${url}`, { error: marshalErrorLike(error) });
throw error;
} finally { } finally {
if (page) {
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => { (waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
page!.off('snapshot', hdl); page.off('snapshot', hdl);
this.ditchPage(page!); this.ditchPage(page);
}); });
}
nextSnapshotDeferred.resolve(); nextSnapshotDeferred.resolve();
} }
} }
private async salvage(url: string, page: Page) { async salvage(url: string, page: Page) {
try {
this.logger.info(`Salvaging ${url}`); this.logger.info(`Salvaging ${url}`);
const googleArchiveUrl = `https://webcache.googleusercontent.com/search?q=cache:${encodeURIComponent(url)}`; const googleArchiveUrl = `https://webcache.googleusercontent.com/search?q=cache:${encodeURIComponent(url)}`;
const resp = await fetch(googleArchiveUrl, { const resp = await fetch(googleArchiveUrl, {
@ -694,13 +683,9 @@ document.addEventListener('load', handlePageLoad);
this.logger.info(`Salvation completed.`); this.logger.info(`Salvation completed.`);
return true; return true;
} catch (error) {
this.logger.error(`Error during salvage operation for ${url}`, { error: marshalErrorLike(error) });
return null;
}
} }
private async snapshotChildFrames(page: Page): Promise<PageSnapshot[]> { async snapshotChildFrames(page: Page): Promise<PageSnapshot[]> {
const childFrames = page.mainFrame().childFrames(); const childFrames = page.mainFrame().childFrames();
const r = await Promise.all(childFrames.map(async (x) => { const r = await Promise.all(childFrames.map(async (x) => {
const thisUrl = x.url(); const thisUrl = x.url();
@ -722,4 +707,6 @@ document.addEventListener('load', handlePageLoad);
} }
export const puppeteerControl = container.resolve(PuppeteerControl); const puppeteerControl = container.resolve(PuppeteerControl);
export default puppeteerControl;