From 1a5f2eb408aff21e3c864cb18db35ac67a176ac2 Mon Sep 17 00:00:00 2001 From: "Harsh Gupta (aider)" Date: Thu, 15 Aug 2024 22:40:49 +0530 Subject: [PATCH] feat: Read cookies from the x-set-cookie header and set them in crawlOpts; the cookie URL is read from the request parameters --- .../functions/src/cloud-functions/crawler.ts | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index bd180ea..a4a6cd0 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -674,7 +674,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; }); } - const crawlOpts = this.configure(crawlerOptions); + const crawlOpts = this.configure(crawlerOptions, req); console.log('Configured crawl options:', crawlOpts); if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) { @@ -865,7 +865,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; } } - configure(opts: CrawlerOptions) { + configure(opts: CrawlerOptions, req: Request) { this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt); this.threadLocal.set('withLinksSummary', opts.withLinksSummary); @@ -877,15 +877,17 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; this.threadLocal.set('timeout', opts.timeout * 1000); } - const randomCookies = [ - { name: 'session_id', value: Math.random().toString(36).substring(7), url: 'https://hargup-ripeharlequincephalopod.web.val.run/' }, - { name: 'user_pref', value: 'dark_mode', url: 'https://hargup-ripeharlequincephalopod.web.val.run/' }, - { name: 'visit_count', value: Math.floor(Math.random() * 10).toString(), url: 'https://hargup-ripeharlequincephalopod.web.val.run/' } - ]; + const cookies = req.headers['x-set-cookie'] ? + (Array.isArray(req.headers['x-set-cookie']) ? 
req.headers['x-set-cookie'] : [req.headers['x-set-cookie']]) + .map(cookie => { + const [name, value] = cookie.split('='); + return { name, value, url: opts.url || req.query.url as string }; + }) + : []; const crawlOpts: ExtraScrappingOptions = { proxyUrl: opts.proxyUrl, - cookies: randomCookies, + cookies: cookies, favorScreenshot: ['screenshot', 'pageshot'].includes(opts.respondWith), removeSelector: opts.removeSelector, targetSelector: opts.targetSelector,