From acde353e5652bc64318ff989fe7002c7d798a763 Mon Sep 17 00:00:00 2001
From: Thomas Kosmas
Date: Wed, 23 Oct 2024 01:07:03 +0300
Subject: [PATCH] skipTlsVerification on robots.txt scraping

---
 apps/api/src/controllers/v1/crawl.ts       |  2 +-
 apps/api/src/scraper/WebScraper/crawler.ts | 12 +++++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts
index c8e449f0..0000b6fe 100644
--- a/apps/api/src/controllers/v1/crawl.ts
+++ b/apps/api/src/controllers/v1/crawl.ts
@@ -78,7 +78,7 @@ export async function crawlController(
   const crawler = crawlToCrawler(id, sc);
 
   try {
-    sc.robots = await crawler.getRobotsTxt();
+    sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
   } catch (e) {
     Logger.debug(
       `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 009a5933..72a49fd8 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -9,7 +9,7 @@ import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../../src/lib/timeout";
 import { Logger } from "../../../src/lib/logger";
-
+import https from "https";
 export class WebCrawler {
   private jobId: string;
   private initialUrl: string;
@@ -145,8 +145,14 @@ export class WebCrawler {
       .slice(0, limit);
   }
 
-  public async getRobotsTxt(): Promise<any> {
-    const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
+  public async getRobotsTxt(skipTlsVerification = false): Promise<any> {
+    let extraArgs = {};
+    if(skipTlsVerification) {
+      extraArgs["httpsAgent"] = new https.Agent({
+        rejectUnauthorized: false
+      });
+    }
+    const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout, ...extraArgs });
     return response.data;
   }
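
Note (not part of the patch above): the technique this patch applies is handing axios an https.Agent with rejectUnauthorized: false, so that Node skips TLS certificate validation for that one request only. Below is a minimal standalone sketch of the same pattern, assuming only axios and Node's built-in https module; the fetchRobotsTxt name, the hard-coded 10-second timeout, and the example URL are illustrative and not taken from the Firecrawl codebase:

    import axios from "axios";
    import https from "https";

    // Fetch a robots.txt, optionally skipping TLS certificate verification
    // (e.g. for hosts with self-signed or expired certificates).
    async function fetchRobotsTxt(
      robotsTxtUrl: string,
      skipTlsVerification = false
    ): Promise<string> {
      // Only attach the permissive agent when explicitly requested, so the
      // default path keeps full certificate validation.
      const extraArgs = skipTlsVerification
        ? { httpsAgent: new https.Agent({ rejectUnauthorized: false }) }
        : {};
      const response = await axios.get(robotsTxtUrl, { timeout: 10000, ...extraArgs });
      return response.data;
    }

    // Example usage:
    //   const robots = await fetchRobotsTxt("https://example.com/robots.txt", true);

One design point worth noting: scoping the agent to the single axios.get call (rather than setting NODE_TLS_REJECT_UNAUTHORIZED or a global axios default) keeps the relaxed verification limited to the robots.txt fetch the caller explicitly opted into.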