skipTlsVerification on robots.txt scraping

This commit is contained in:
Thomas Kosmas 2024-10-23 01:07:03 +03:00
parent bd55464b52
commit acde353e56
2 changed files with 10 additions and 4 deletions

View File

@ -78,7 +78,7 @@ export async function crawlController(
const crawler = crawlToCrawler(id, sc); const crawler = crawlToCrawler(id, sc);
try { try {
sc.robots = await crawler.getRobotsTxt(); sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
} catch (e) { } catch (e) {
Logger.debug( Logger.debug(
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify( `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(

View File

@ -9,7 +9,7 @@ import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils"; import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout"; import { axiosTimeout } from "../../../src/lib/timeout";
import { Logger } from "../../../src/lib/logger"; import { Logger } from "../../../src/lib/logger";
import https from "https";
export class WebCrawler { export class WebCrawler {
private jobId: string; private jobId: string;
private initialUrl: string; private initialUrl: string;
@ -145,8 +145,14 @@ export class WebCrawler {
.slice(0, limit); .slice(0, limit);
} }
/**
 * Downloads the robots.txt file for this crawler.
 *
 * Issues a GET request to `this.robotsTxtUrl`, bounded by `axiosTimeout`,
 * and returns the response body.
 *
 * @param skipTlsVerification - When `true`, the request is made with an
 *   HTTPS agent that does not reject invalid or self-signed certificates.
 *   Defaults to `false`, which keeps normal certificate validation.
 * @returns The body of the robots.txt response.
 * @throws Propagates whatever error `axios.get` rejects with; callers are
 *   expected to handle a missing or unreachable robots.txt themselves.
 */
public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
  // SECURITY: rejectUnauthorized=false disables certificate checks for this
  // single request only. It is strictly opt-in and the agent is never shared.
  const tlsOptions = skipTlsVerification
    ? { httpsAgent: new https.Agent({ rejectUnauthorized: false }) }
    : {};

  // The config is built immutably so `httpsAgent` is only present when
  // requested, instead of being patched onto an untyped `{}` after the fact.
  const response = await axios.get(this.robotsTxtUrl, {
    timeout: axiosTimeout,
    ...tlsOptions,
  });
  return response.data;
}