mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 11:42:32 +08:00
fix the puppeteer thingy
This commit is contained in:
parent
a72373f815
commit
54aae972ae
|
@ -1,7 +1,7 @@
|
||||||
import os from 'os';
|
import os from 'os';
|
||||||
import fs from 'fs';
|
import fs from 'fs';
|
||||||
import { container, singleton } from 'tsyringe';
|
import { container, singleton } from 'tsyringe';
|
||||||
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, retry } from 'civkit';
|
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
|
||||||
import { Logger } from '../shared/index';
|
import { Logger } from '../shared/index';
|
||||||
|
|
||||||
import type { Browser, CookieParam, Page } from 'puppeteer';
|
import type { Browser, CookieParam, Page } from 'puppeteer';
|
||||||
|
@ -203,7 +203,7 @@ export class PuppeteerControl extends AsyncService {
|
||||||
|
|
||||||
_sn = 0;
|
_sn = 0;
|
||||||
browser!: Browser;
|
browser!: Browser;
|
||||||
logger = new Logger('PuppeteerControl')
|
logger = new Logger('CHANGE_LOGGER_NAME')
|
||||||
|
|
||||||
private __healthCheckInterval?: NodeJS.Timeout;
|
private __healthCheckInterval?: NodeJS.Timeout;
|
||||||
|
|
||||||
|
@ -461,7 +461,6 @@ document.addEventListener('load', handlePageLoad);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@retry({ times: 3, delay: 1000 })
|
|
||||||
async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
|
async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
|
||||||
// parsedUrl.search = '';
|
// parsedUrl.search = '';
|
||||||
const url = parsedUrl.toString();
|
const url = parsedUrl.toString();
|
||||||
|
@ -469,12 +468,8 @@ document.addEventListener('load', handlePageLoad);
|
||||||
let snapshot: PageSnapshot | undefined;
|
let snapshot: PageSnapshot | undefined;
|
||||||
let screenshot: Buffer | undefined;
|
let screenshot: Buffer | undefined;
|
||||||
let pageshot: Buffer | undefined;
|
let pageshot: Buffer | undefined;
|
||||||
let page: Page | null = null;
|
const page = await this.getNextPage();
|
||||||
let sn: number | undefined;
|
const sn = this.snMap.get(page);
|
||||||
|
|
||||||
try {
|
|
||||||
page = await this.getNextPage();
|
|
||||||
sn = this.snMap.get(page);
|
|
||||||
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
||||||
if (options?.proxyUrl) {
|
if (options?.proxyUrl) {
|
||||||
await page.useProxy(options.proxyUrl);
|
await page.useProxy(options.proxyUrl);
|
||||||
|
@ -658,22 +653,16 @@ document.addEventListener('load', handlePageLoad);
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
|
||||||
this.logger.error(`Error scraping ${url}`, { error: marshalErrorLike(error) });
|
|
||||||
throw error;
|
|
||||||
} finally {
|
} finally {
|
||||||
if (page) {
|
|
||||||
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
|
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
|
||||||
page!.off('snapshot', hdl);
|
page.off('snapshot', hdl);
|
||||||
this.ditchPage(page!);
|
this.ditchPage(page);
|
||||||
});
|
});
|
||||||
}
|
|
||||||
nextSnapshotDeferred.resolve();
|
nextSnapshotDeferred.resolve();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private async salvage(url: string, page: Page) {
|
async salvage(url: string, page: Page) {
|
||||||
try {
|
|
||||||
this.logger.info(`Salvaging ${url}`);
|
this.logger.info(`Salvaging ${url}`);
|
||||||
const googleArchiveUrl = `https://webcache.googleusercontent.com/search?q=cache:${encodeURIComponent(url)}`;
|
const googleArchiveUrl = `https://webcache.googleusercontent.com/search?q=cache:${encodeURIComponent(url)}`;
|
||||||
const resp = await fetch(googleArchiveUrl, {
|
const resp = await fetch(googleArchiveUrl, {
|
||||||
|
@ -694,13 +683,9 @@ document.addEventListener('load', handlePageLoad);
|
||||||
this.logger.info(`Salvation completed.`);
|
this.logger.info(`Salvation completed.`);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
} catch (error) {
|
|
||||||
this.logger.error(`Error during salvage operation for ${url}`, { error: marshalErrorLike(error) });
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private async snapshotChildFrames(page: Page): Promise<PageSnapshot[]> {
|
async snapshotChildFrames(page: Page): Promise<PageSnapshot[]> {
|
||||||
const childFrames = page.mainFrame().childFrames();
|
const childFrames = page.mainFrame().childFrames();
|
||||||
const r = await Promise.all(childFrames.map(async (x) => {
|
const r = await Promise.all(childFrames.map(async (x) => {
|
||||||
const thisUrl = x.url();
|
const thisUrl = x.url();
|
||||||
|
@ -722,4 +707,6 @@ document.addEventListener('load', handlePageLoad);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export const puppeteerControl = container.resolve(PuppeteerControl);
|
const puppeteerControl = container.resolve(PuppeteerControl);
|
||||||
|
|
||||||
|
export default puppeteerControl;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user