mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 03:32:22 +08:00
skipTlsVerification on robots.txt scraping
This commit is contained in:
parent
bd55464b52
commit
acde353e56
|
@ -78,7 +78,7 @@ export async function crawlController(
|
|||
const crawler = crawlToCrawler(id, sc);
|
||||
|
||||
try {
|
||||
sc.robots = await crawler.getRobotsTxt();
|
||||
sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
|
||||
} catch (e) {
|
||||
Logger.debug(
|
||||
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
|
||||
|
|
|
@ -9,7 +9,7 @@ import robotsParser from "robots-parser";
|
|||
import { getURLDepth } from "./utils/maxDepthUtils";
|
||||
import { axiosTimeout } from "../../../src/lib/timeout";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
|
||||
import https from "https";
|
||||
export class WebCrawler {
|
||||
private jobId: string;
|
||||
private initialUrl: string;
|
||||
|
@ -145,8 +145,14 @@ export class WebCrawler {
|
|||
.slice(0, limit);
|
||||
}
|
||||
|
||||
public async getRobotsTxt(): Promise<string> {
|
||||
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
|
||||
/**
 * Fetches the robots.txt located at `this.robotsTxtUrl` and returns the response body.
 *
 * @param skipTlsVerification - When true, the request is sent through an HTTPS agent
 *   created with `rejectUnauthorized: false`, so certificate validation failures do
 *   not abort the request. Defaults to false.
 * @returns The `data` field of the axios response.
 * @throws Whatever `axios.get` rejects with; nothing is caught here, so callers
 *   decide how to handle a missing or unreachable robots.txt.
 */
public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
  // Build the optional TLS override as a typed literal instead of mutating an
  // untyped `{}` by string key; the custom agent is only attached on request,
  // so the default path keeps Node's normal certificate checks.
  const tlsOptions = skipTlsVerification
    ? { httpsAgent: new https.Agent({ rejectUnauthorized: false }) }
    : {};

  const response = await axios.get(this.robotsTxtUrl, {
    timeout: axiosTimeout,
    ...tlsOptions,
  });
  return response.data;
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user