mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
skipTlsVerification on robots.txt scraping
This commit is contained in:
parent
bd55464b52
commit
acde353e56
|
@ -78,7 +78,7 @@ export async function crawlController(
|
||||||
const crawler = crawlToCrawler(id, sc);
|
const crawler = crawlToCrawler(id, sc);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
sc.robots = await crawler.getRobotsTxt();
|
sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
Logger.debug(
|
Logger.debug(
|
||||||
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
|
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
|
||||||
|
|
|
@ -9,7 +9,7 @@ import robotsParser from "robots-parser";
|
||||||
import { getURLDepth } from "./utils/maxDepthUtils";
|
import { getURLDepth } from "./utils/maxDepthUtils";
|
||||||
import { axiosTimeout } from "../../../src/lib/timeout";
|
import { axiosTimeout } from "../../../src/lib/timeout";
|
||||||
import { Logger } from "../../../src/lib/logger";
|
import { Logger } from "../../../src/lib/logger";
|
||||||
|
import https from "https";
|
||||||
export class WebCrawler {
|
export class WebCrawler {
|
||||||
private jobId: string;
|
private jobId: string;
|
||||||
private initialUrl: string;
|
private initialUrl: string;
|
||||||
|
@ -145,8 +145,14 @@ export class WebCrawler {
|
||||||
.slice(0, limit);
|
.slice(0, limit);
|
||||||
}
|
}
|
||||||
|
|
||||||
public async getRobotsTxt(): Promise<string> {
|
public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
|
||||||
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
|
let extraArgs = {};
|
||||||
|
if(skipTlsVerification) {
|
||||||
|
extraArgs["httpsAgent"] = new https.Agent({
|
||||||
|
rejectUnauthorized: false
|
||||||
|
});
|
||||||
|
}
|
||||||
|
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout, ...extraArgs });
|
||||||
return response.data;
|
return response.data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user