Fix Puppeteer page handling in scrap(): drop the @retry decorator, simplify page acquisition and cleanup, and switch PuppeteerControl to a default export

This commit is contained in:
Harsh Gupta 2024-08-14 16:39:59 +05:30
parent a72373f815
commit 54aae972ae

View File

@ -1,7 +1,7 @@
import os from 'os';
import fs from 'fs';
import { container, singleton } from 'tsyringe';
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, retry } from 'civkit';
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
import { Logger } from '../shared/index';
import type { Browser, CookieParam, Page } from 'puppeteer';
@ -203,7 +203,7 @@ export class PuppeteerControl extends AsyncService {
_sn = 0;
browser!: Browser;
logger = new Logger('PuppeteerControl')
logger = new Logger('CHANGE_LOGGER_NAME')
private __healthCheckInterval?: NodeJS.Timeout;
@ -461,7 +461,6 @@ document.addEventListener('load', handlePageLoad);
});
}
@retry({ times: 3, delay: 1000 })
async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
// parsedUrl.search = '';
const url = parsedUrl.toString();
@ -469,12 +468,8 @@ document.addEventListener('load', handlePageLoad);
let snapshot: PageSnapshot | undefined;
let screenshot: Buffer | undefined;
let pageshot: Buffer | undefined;
let page: Page | null = null;
let sn: number | undefined;
try {
page = await this.getNextPage();
sn = this.snMap.get(page);
const page = await this.getNextPage();
const sn = this.snMap.get(page);
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
if (options?.proxyUrl) {
await page.useProxy(options.proxyUrl);
@ -658,22 +653,16 @@ document.addEventListener('load', handlePageLoad);
throw error;
}
}
} catch (error) {
this.logger.error(`Error scraping ${url}`, { error: marshalErrorLike(error) });
throw error;
} finally {
if (page) {
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
page!.off('snapshot', hdl);
this.ditchPage(page!);
page.off('snapshot', hdl);
this.ditchPage(page);
});
}
nextSnapshotDeferred.resolve();
}
}
private async salvage(url: string, page: Page) {
try {
async salvage(url: string, page: Page) {
this.logger.info(`Salvaging ${url}`);
const googleArchiveUrl = `https://webcache.googleusercontent.com/search?q=cache:${encodeURIComponent(url)}`;
const resp = await fetch(googleArchiveUrl, {
@ -694,13 +683,9 @@ document.addEventListener('load', handlePageLoad);
this.logger.info(`Salvation completed.`);
return true;
} catch (error) {
this.logger.error(`Error during salvage operation for ${url}`, { error: marshalErrorLike(error) });
return null;
}
}
private async snapshotChildFrames(page: Page): Promise<PageSnapshot[]> {
async snapshotChildFrames(page: Page): Promise<PageSnapshot[]> {
const childFrames = page.mainFrame().childFrames();
const r = await Promise.all(childFrames.map(async (x) => {
const thisUrl = x.url();
@ -722,4 +707,6 @@ document.addEventListener('load', handlePageLoad);
}
export const puppeteerControl = container.resolve(PuppeteerControl);
const puppeteerControl = container.resolve(PuppeteerControl);
export default puppeteerControl;