Revert "WIP: Cookie fixes"

This reverts commit 12850d79c7.
This commit is contained in:
Harsh Gupta 2024-08-15 21:31:17 +05:30
parent 12850d79c7
commit 953429218a
2 changed files with 2 additions and 83 deletions

View File

@ -9,11 +9,6 @@ import { AsyncContext, CloudHTTPv2, FirebaseStorageBucketControl, Logger, Output
import _ from 'lodash'; import _ from 'lodash';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer'; import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
import { Request, Response } from 'express'; import { Request, Response } from 'express';
import { CookieParam as PuppeteerCookieParam } from 'puppeteer';
/**
 * Cookie description accepted by this module.
 *
 * Carries every field of Puppeteer's cookie parameter except `expires`,
 * which is re-declared here so that it may be given either as a number
 * or as a string.
 */
interface CookieParam extends Omit<PuppeteerCookieParam, 'expires'> {
    expires?: number | string;
}
const pNormalizeUrl = import("@esm2cjs/normalize-url"); const pNormalizeUrl = import("@esm2cjs/normalize-url");
// import { AltTextService } from '../services/alt-text'; // import { AltTextService } from '../services/alt-text';
import TurndownService from 'turndown'; import TurndownService from 'turndown';
@ -884,7 +879,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
const crawlOpts: ExtraScrappingOptions = { const crawlOpts: ExtraScrappingOptions = {
proxyUrl: opts.proxyUrl, proxyUrl: opts.proxyUrl,
cookies: this.validateCookies(opts.setCookies), cookies: opts.setCookies,
favorScreenshot: ['screenshot', 'pageshot'].includes(opts.respondWith), favorScreenshot: ['screenshot', 'pageshot'].includes(opts.respondWith),
removeSelector: opts.removeSelector, removeSelector: opts.removeSelector,
targetSelector: opts.targetSelector, targetSelector: opts.targetSelector,
@ -897,56 +892,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
return crawlOpts; return crawlOpts;
} }
/**
 * Validates caller-supplied cookies and converts them to Puppeteer cookie params.
 *
 * A cookie that fails a check is dropped and a warning is logged; the
 * remaining cookies are still returned. Checks run in a fixed order and only
 * the first failure for a given cookie is reported.
 *
 * The returned array contains the normalized objects (with `expires`
 * converted to a number), not the input objects. An earlier version used
 * `Array.prototype.filter` with a callback that returned the normalized
 * object; `filter` only tests the callback result for truthiness, so the
 * untouched inputs were returned and the `expires` conversion was lost.
 *
 * @param cookies Cookies to validate; may be omitted.
 * @returns `undefined` when `cookies` is falsy, otherwise the normalized
 *          cookies that passed every check (possibly an empty array).
 */
validateCookies(cookies?: CookieParam[]): PuppeteerCookieParam[] | undefined {
    if (!cookies) {
        return undefined;
    }

    const accepted: PuppeteerCookieParam[] = [];

    for (const cookie of cookies) {
        // Date parsing yields milliseconds; dividing by 1000 gives seconds.
        // NOTE(review): a numeric `expires` is also routed through `new Date(...)`,
        // i.e. it is treated as milliseconds. Confirm callers never pass a value
        // that is already in seconds, otherwise it ends up 1000x too small.
        const expires = cookie.expires ? Number(new Date(cookie.expires)) / 1000 : undefined;

        let problem: string | undefined;
        if (!cookie.name || !cookie.value) {
            problem = 'missing name or value';
        } else if (expires !== undefined && Number.isNaN(expires)) {
            problem = 'invalid expires date';
        } else if (cookie.domain && typeof cookie.domain !== 'string') {
            problem = 'domain must be a string';
        } else if (cookie.path && typeof cookie.path !== 'string') {
            problem = 'path must be a string';
        } else if (cookie.secure !== undefined && typeof cookie.secure !== 'boolean') {
            problem = 'secure must be a boolean';
        } else if (cookie.httpOnly !== undefined && typeof cookie.httpOnly !== 'boolean') {
            problem = 'httpOnly must be a boolean';
        } else if (cookie.sameSite && !['Strict', 'Lax', 'None'].includes(cookie.sameSite)) {
            problem = "sameSite must be 'Strict', 'Lax', or 'None'";
        }

        if (problem !== undefined) {
            this.logger.warn(`Invalid cookie: ${problem}`, { cookie });
            continue;
        }

        accepted.push({
            name: cookie.name,
            value: cookie.value,
            domain: cookie.domain,
            path: cookie.path,
            expires,
            httpOnly: cookie.httpOnly,
            secure: cookie.secure,
            // Safe to narrow: membership in the allowed set was checked above.
            sameSite: cookie.sameSite as 'Strict' | 'Lax' | 'None' | undefined,
        });
    }

    return accepted;
}
async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) { async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
const it = this.scrap(url, { ...opts, minIntervalMs: 500 }); const it = this.scrap(url, { ...opts, minIntervalMs: 500 });

View File

@ -191,7 +191,7 @@ export class CrawlerOptions extends AutoCastable implements AutoCastableMetaClas
static override from<T extends CrawlerOptions>(this: Constructor<T>, input: any, ...args: any[]): T { static override from<T extends CrawlerOptions>(this: Constructor<T>, input: any, ...args: any[]): T {
const instance = super.from(input, ...args) as T; const instance = super.from(input, ...args) as T;
const req = args[0] as Request | undefined; const req = args[0] as Request | undefined;
if (req) { if (req) {
console.log('Request headers:', req.headers); console.log('Request headers:', req.headers);
@ -203,92 +203,74 @@ export class CrawlerOptions extends AutoCastable implements AutoCastableMetaClas
const customMode = getHeader('X-Respond-With') || getHeader('X-Return-Format'); const customMode = getHeader('X-Respond-With') || getHeader('X-Return-Format');
if (customMode) { if (customMode) {
instance.respondWith = customMode; instance.respondWith = customMode;
console.log('Set respondWith:', instance.respondWith);
} }
const withGeneratedAlt = getHeader('X-With-Generated-Alt'); const withGeneratedAlt = getHeader('X-With-Generated-Alt');
if (withGeneratedAlt !== undefined) { if (withGeneratedAlt !== undefined) {
instance.withGeneratedAlt = withGeneratedAlt.toLowerCase() === 'true'; instance.withGeneratedAlt = withGeneratedAlt.toLowerCase() === 'true';
console.log('Set withGeneratedAlt:', instance.withGeneratedAlt);
} }
const withLinksSummary = getHeader('x-with-links-summary'); const withLinksSummary = getHeader('x-with-links-summary');
if (withLinksSummary !== undefined) { if (withLinksSummary !== undefined) {
instance.withLinksSummary = Boolean(withLinksSummary); instance.withLinksSummary = Boolean(withLinksSummary);
console.log('Set withLinksSummary:', instance.withLinksSummary);
} }
const withImagesSummary = getHeader('x-with-images-summary'); const withImagesSummary = getHeader('x-with-images-summary');
if (withImagesSummary !== undefined) { if (withImagesSummary !== undefined) {
instance.withImagesSummary = Boolean(withImagesSummary); instance.withImagesSummary = Boolean(withImagesSummary);
console.log('Set withImagesSummary:', instance.withImagesSummary);
} }
const noCache = getHeader('x-no-cache'); const noCache = getHeader('x-no-cache');
if (noCache !== undefined) { if (noCache !== undefined) {
instance.noCache = Boolean(noCache); instance.noCache = Boolean(noCache);
console.log('Set noCache:', instance.noCache);
} }
if (instance.noCache && instance.cacheTolerance === undefined) { if (instance.noCache && instance.cacheTolerance === undefined) {
instance.cacheTolerance = 0; instance.cacheTolerance = 0;
console.log('Set cacheTolerance to 0 due to noCache');
} }
let cacheTolerance = parseInt(getHeader('x-cache-tolerance') || ''); let cacheTolerance = parseInt(getHeader('x-cache-tolerance') || '');
if (!isNaN(cacheTolerance)) { if (!isNaN(cacheTolerance)) {
instance.cacheTolerance = cacheTolerance; instance.cacheTolerance = cacheTolerance;
console.log('Set cacheTolerance:', instance.cacheTolerance);
} }
let timeoutSeconds = parseInt(getHeader('x-timeout') || ''); let timeoutSeconds = parseInt(getHeader('x-timeout') || '');
if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) { if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) {
instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180; instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180;
console.log('Set timeout:', instance.timeout);
} else if (getHeader('x-timeout')) { } else if (getHeader('x-timeout')) {
instance.timeout = null; instance.timeout = null;
console.log('Set timeout to null');
} }
const removeSelector = getHeader('x-remove-selector')?.split(', '); const removeSelector = getHeader('x-remove-selector')?.split(', ');
instance.removeSelector ??= removeSelector; instance.removeSelector ??= removeSelector;
console.log('Set removeSelector:', instance.removeSelector);
const targetSelector = getHeader('x-target-selector')?.split(', '); const targetSelector = getHeader('x-target-selector')?.split(', ');
instance.targetSelector ??= targetSelector; instance.targetSelector ??= targetSelector;
console.log('Set targetSelector:', instance.targetSelector);
const waitForSelector = getHeader('x-wait-for-selector')?.split(', '); const waitForSelector = getHeader('x-wait-for-selector')?.split(', ');
instance.waitForSelector ??= waitForSelector || instance.targetSelector; instance.waitForSelector ??= waitForSelector || instance.targetSelector;
console.log('Set waitForSelector:', instance.waitForSelector);
instance.targetSelector = filterSelector(instance.targetSelector); instance.targetSelector = filterSelector(instance.targetSelector);
console.log('Filtered targetSelector:', instance.targetSelector);
const overrideUserAgent = getHeader('x-user-agent'); const overrideUserAgent = getHeader('x-user-agent');
instance.userAgent ??= overrideUserAgent; instance.userAgent ??= overrideUserAgent;
console.log('Set userAgent:', instance.userAgent);
const keepImgDataUrl = getHeader('x-keep-img-data-url'); const keepImgDataUrl = getHeader('x-keep-img-data-url');
if (keepImgDataUrl !== undefined) { if (keepImgDataUrl !== undefined) {
instance.keepImgDataUrl = Boolean(keepImgDataUrl); instance.keepImgDataUrl = Boolean(keepImgDataUrl);
console.log('Set keepImgDataUrl:', instance.keepImgDataUrl);
} }
const withIframe = getHeader('x-with-iframe'); const withIframe = getHeader('x-with-iframe');
if (withIframe !== undefined) { if (withIframe !== undefined) {
instance.withIframe = Boolean(withIframe); instance.withIframe = Boolean(withIframe);
console.log('Set withIframe:', instance.withIframe);
} }
if (instance.withIframe) { if (instance.withIframe) {
instance.timeout ??= null; instance.timeout ??= null;
console.log('Set timeout to null due to withIframe');
} }
const cookies: CookieParam[] = []; const cookies: CookieParam[] = [];
const setCookieHeaders = getHeader('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]); const setCookieHeaders = getHeader('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
console.log('SetCookieHeaders:', setCookieHeaders);
if (Array.isArray(setCookieHeaders)) { if (Array.isArray(setCookieHeaders)) {
for (const setCookie of setCookieHeaders) { for (const setCookie of setCookieHeaders) {
cookies.push({ cookies.push({
@ -300,20 +282,12 @@ export class CrawlerOptions extends AutoCastable implements AutoCastableMetaClas
...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam, ...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
}); });
} }
console.log('Set cookies:', cookies); // Cool
if (cookies.length > 0) {
instance.setCookies = cookies;
console.log('Set setCookies:', instance.setCookies);
}
const proxyUrl = getHeader('x-proxy-url'); const proxyUrl = getHeader('x-proxy-url');
instance.proxyUrl ??= proxyUrl; instance.proxyUrl ??= proxyUrl;
console.log('Set proxyUrl:', instance.proxyUrl);
if (instance.cacheTolerance) { if (instance.cacheTolerance) {
instance.cacheTolerance = instance.cacheTolerance * 1000; instance.cacheTolerance = instance.cacheTolerance * 1000;
console.log('Adjusted cacheTolerance:', instance.cacheTolerance);
} }
} }