mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 11:42:32 +08:00
parent
12850d79c7
commit
953429218a
|
@ -9,11 +9,6 @@ import { AsyncContext, CloudHTTPv2, FirebaseStorageBucketControl, Logger, Output
|
|||
import _ from 'lodash';
|
||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||
import { Request, Response } from 'express';
|
||||
import { CookieParam as PuppeteerCookieParam } from 'puppeteer';
|
||||
|
||||
/**
 * Cookie shape used by this module: Puppeteer's cookie parameter with its
 * `expires` field replaced by an optional value that may be either a number
 * or a string.
 */
type CookieParam = Omit<PuppeteerCookieParam, 'expires'> & {
|
||||
expires?: number | string;
|
||||
};
|
||||
const pNormalizeUrl = import("@esm2cjs/normalize-url");
|
||||
// import { AltTextService } from '../services/alt-text';
|
||||
import TurndownService from 'turndown';
|
||||
|
@ -884,7 +879,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||
|
||||
const crawlOpts: ExtraScrappingOptions = {
|
||||
proxyUrl: opts.proxyUrl,
|
||||
cookies: this.validateCookies(opts.setCookies),
|
||||
cookies: opts.setCookies,
|
||||
favorScreenshot: ['screenshot', 'pageshot'].includes(opts.respondWith),
|
||||
removeSelector: opts.removeSelector,
|
||||
targetSelector: opts.targetSelector,
|
||||
|
@ -897,56 +892,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||
return crawlOpts;
|
||||
}
|
||||
|
||||
/**
 * Validates and normalizes caller-supplied cookies into Puppeteer cookie params.
 *
 * Every cookie is checked on its own. A cookie that fails a check is reported
 * through `this.logger.warn` and dropped; the remaining cookies are still
 * returned.
 *
 * The previous implementation used `Array.prototype.filter` and returned the
 * normalized object from the callback. `filter` only tests that value for
 * truthiness and keeps the ORIGINAL element, so the normalized `expires` was
 * silently thrown away. The normalized objects are now collected explicitly.
 *
 * @param cookies - Cookies to validate; `expires` may be a number or a string.
 * @returns `undefined` when `cookies` is falsy, otherwise a new array holding
 *   the normalized form of each cookie that passed every check.
 */
validateCookies(cookies?: CookieParam[]): PuppeteerCookieParam[] | undefined {
    if (!cookies) return undefined;

    const validated: PuppeteerCookieParam[] = [];

    for (const cookie of cookies) {
        if (!cookie.name || !cookie.value) {
            this.logger.warn(`Invalid cookie: missing name or value`, { cookie });
            continue;
        }

        // `Number(new Date(x))` yields epoch milliseconds; dividing by 1000 gives seconds.
        // NOTE(review): a numeric `expires` is therefore interpreted as milliseconds — confirm with callers.
        const expires = cookie.expires ? Number(new Date(cookie.expires)) / 1000 : undefined;

        // `expires` is only defined when the cookie carried a truthy `expires` value.
        if (expires !== undefined && Number.isNaN(expires)) {
            this.logger.warn(`Invalid cookie: invalid expires date`, { cookie });
            continue;
        }

        if (cookie.domain && typeof cookie.domain !== 'string') {
            this.logger.warn(`Invalid cookie: domain must be a string`, { cookie });
            continue;
        }
        if (cookie.path && typeof cookie.path !== 'string') {
            this.logger.warn(`Invalid cookie: path must be a string`, { cookie });
            continue;
        }
        if (cookie.secure !== undefined && typeof cookie.secure !== 'boolean') {
            this.logger.warn(`Invalid cookie: secure must be a boolean`, { cookie });
            continue;
        }
        if (cookie.httpOnly !== undefined && typeof cookie.httpOnly !== 'boolean') {
            this.logger.warn(`Invalid cookie: httpOnly must be a boolean`, { cookie });
            continue;
        }
        if (cookie.sameSite && !['Strict', 'Lax', 'None'].includes(cookie.sameSite)) {
            this.logger.warn(`Invalid cookie: sameSite must be 'Strict', 'Lax', or 'None'`, { cookie });
            continue;
        }

        validated.push({
            name: cookie.name,
            value: cookie.value,
            domain: cookie.domain,
            path: cookie.path,
            expires,
            httpOnly: cookie.httpOnly,
            secure: cookie.secure,
            sameSite: cookie.sameSite as 'Strict' | 'Lax' | 'None' | undefined
        });
    }

    return validated;
}
|
||||
|
||||
async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
|
||||
const it = this.scrap(url, { ...opts, minIntervalMs: 500 });
|
||||
|
||||
|
|
|
@ -203,92 +203,74 @@ export class CrawlerOptions extends AutoCastable implements AutoCastableMetaClas
|
|||
const customMode = getHeader('X-Respond-With') || getHeader('X-Return-Format');
|
||||
if (customMode) {
|
||||
instance.respondWith = customMode;
|
||||
console.log('Set respondWith:', instance.respondWith);
|
||||
}
|
||||
|
||||
const withGeneratedAlt = getHeader('X-With-Generated-Alt');
|
||||
if (withGeneratedAlt !== undefined) {
|
||||
instance.withGeneratedAlt = withGeneratedAlt.toLowerCase() === 'true';
|
||||
console.log('Set withGeneratedAlt:', instance.withGeneratedAlt);
|
||||
}
|
||||
|
||||
const withLinksSummary = getHeader('x-with-links-summary');
|
||||
if (withLinksSummary !== undefined) {
|
||||
instance.withLinksSummary = Boolean(withLinksSummary);
|
||||
console.log('Set withLinksSummary:', instance.withLinksSummary);
|
||||
}
|
||||
|
||||
const withImagesSummary = getHeader('x-with-images-summary');
|
||||
if (withImagesSummary !== undefined) {
|
||||
instance.withImagesSummary = Boolean(withImagesSummary);
|
||||
console.log('Set withImagesSummary:', instance.withImagesSummary);
|
||||
}
|
||||
|
||||
const noCache = getHeader('x-no-cache');
|
||||
if (noCache !== undefined) {
|
||||
instance.noCache = Boolean(noCache);
|
||||
console.log('Set noCache:', instance.noCache);
|
||||
}
|
||||
|
||||
if (instance.noCache && instance.cacheTolerance === undefined) {
|
||||
instance.cacheTolerance = 0;
|
||||
console.log('Set cacheTolerance to 0 due to noCache');
|
||||
}
|
||||
|
||||
let cacheTolerance = parseInt(getHeader('x-cache-tolerance') || '');
|
||||
if (!isNaN(cacheTolerance)) {
|
||||
instance.cacheTolerance = cacheTolerance;
|
||||
console.log('Set cacheTolerance:', instance.cacheTolerance);
|
||||
}
|
||||
|
||||
let timeoutSeconds = parseInt(getHeader('x-timeout') || '');
|
||||
if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) {
|
||||
instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180;
|
||||
console.log('Set timeout:', instance.timeout);
|
||||
} else if (getHeader('x-timeout')) {
|
||||
instance.timeout = null;
|
||||
console.log('Set timeout to null');
|
||||
}
|
||||
|
||||
const removeSelector = getHeader('x-remove-selector')?.split(', ');
|
||||
instance.removeSelector ??= removeSelector;
|
||||
console.log('Set removeSelector:', instance.removeSelector);
|
||||
|
||||
const targetSelector = getHeader('x-target-selector')?.split(', ');
|
||||
instance.targetSelector ??= targetSelector;
|
||||
console.log('Set targetSelector:', instance.targetSelector);
|
||||
|
||||
const waitForSelector = getHeader('x-wait-for-selector')?.split(', ');
|
||||
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
|
||||
console.log('Set waitForSelector:', instance.waitForSelector);
|
||||
|
||||
instance.targetSelector = filterSelector(instance.targetSelector);
|
||||
console.log('Filtered targetSelector:', instance.targetSelector);
|
||||
|
||||
const overrideUserAgent = getHeader('x-user-agent');
|
||||
instance.userAgent ??= overrideUserAgent;
|
||||
console.log('Set userAgent:', instance.userAgent);
|
||||
|
||||
const keepImgDataUrl = getHeader('x-keep-img-data-url');
|
||||
if (keepImgDataUrl !== undefined) {
|
||||
instance.keepImgDataUrl = Boolean(keepImgDataUrl);
|
||||
console.log('Set keepImgDataUrl:', instance.keepImgDataUrl);
|
||||
}
|
||||
|
||||
const withIframe = getHeader('x-with-iframe');
|
||||
if (withIframe !== undefined) {
|
||||
instance.withIframe = Boolean(withIframe);
|
||||
console.log('Set withIframe:', instance.withIframe);
|
||||
}
|
||||
|
||||
if (instance.withIframe) {
|
||||
instance.timeout ??= null;
|
||||
console.log('Set timeout to null due to withIframe');
|
||||
}
|
||||
|
||||
const cookies: CookieParam[] = [];
|
||||
const setCookieHeaders = getHeader('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
||||
console.log('SetCookieHeaders:', setCookieHeaders);
|
||||
if (Array.isArray(setCookieHeaders)) {
|
||||
for (const setCookie of setCookieHeaders) {
|
||||
cookies.push({
|
||||
|
@ -300,20 +282,12 @@ export class CrawlerOptions extends AutoCastable implements AutoCastableMetaClas
|
|||
...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
|
||||
});
|
||||
}
|
||||
console.log('Set cookies:', cookies); // Cool
|
||||
|
||||
if (cookies.length > 0) {
|
||||
instance.setCookies = cookies;
|
||||
console.log('Set setCookies:', instance.setCookies);
|
||||
}
|
||||
|
||||
const proxyUrl = getHeader('x-proxy-url');
|
||||
instance.proxyUrl ??= proxyUrl;
|
||||
console.log('Set proxyUrl:', instance.proxyUrl);
|
||||
|
||||
if (instance.cacheTolerance) {
|
||||
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
||||
console.log('Adjusted cacheTolerance:', instance.cacheTolerance);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user