Merge remote-tracking branch 'origin/v1/python-sdk' into v1-webscraper

rafaelsideguide 2024-08-22 15:18:05 -03:00
commit 30e809966f
25 changed files with 1575 additions and 452 deletions

.gitignore

@@ -19,3 +19,4 @@ apps/test-suite/load-test-results/test-run-report.json
 apps/playwright-service-ts/node_modules/
 apps/playwright-service-ts/package-lock.json
+*.pyc

@@ -538,7 +538,7 @@ describe("E2E Tests for v0 API Routes", () => {
       const urls = completedResponse.body.data.map(
         (item: any) => item.metadata?.sourceURL
       );
-      expect(urls.length).toBeGreaterThan(1);
+      expect(urls.length).toBeGreaterThanOrEqual(1);

       // Check if all URLs have a maximum depth of 1
       urls.forEach((url: string) => {
@@ -762,11 +762,11 @@ describe("E2E Tests for v0 API Routes", () => {
       .post("/v0/crawl")
       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
       .set("Content-Type", "application/json")
-      .send({ url: "https://jestjs.io" });
+      .send({ url: "https://docs.tatum.io", crawlerOptions: { limit: 200 } });

     expect(crawlResponse.statusCode).toBe(200);

-    await new Promise((r) => setTimeout(r, 20000));
+    await new Promise((r) => setTimeout(r, 10000));

     const responseCancel = await request(TEST_URL)
       .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
@@ -869,96 +869,4 @@
       60000
     ); // 60 secs
   });
-
-  describe("POST /v0/map", () => {
-    it.concurrent(
-      "should return a list of links for mendable.ai without subdomains included",
-      async () => {
-        const response = await request(TEST_URL)
-          .post("/v1/map")
-          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-          .set("Content-Type", "application/json")
-          .send({
-            url: "https://mendable.ai",
-          });
-        expect(response.statusCode).toBe(200);
-        expect(response.body).toHaveProperty("success", true);
-        expect(response.body).toHaveProperty("links");
-        expect(response.body.links).not.toContain("https://docs.mendable.ai");
-        expect(Array.isArray(response.body.links)).toBe(true);
-        expect(response.body.links.length).toBeGreaterThan(0);
-      },
-      60000
-    ); // 60 secs
-
-    it.concurrent(
-      "should return a list of links for a given URL with subdomains included",
-      async () => {
-        const response = await request(TEST_URL)
-          .post("/v1/map")
-          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-          .set("Content-Type", "application/json")
-          .send({
-            url: "https://python.langchain.com",
-            includeSubdomains: true,
-          });
-        expect(response.statusCode).toBe(200);
-        expect(response.body).toHaveProperty("success", true);
-        expect(response.body).toHaveProperty("links");
-        expect(Array.isArray(response.body.links)).toBe(true);
-        expect(response.body.links.length).toBeGreaterThan(0);
-      },
-      60000
-    ); // 60 secs
-
-    it.concurrent(
-      "should return a list of links for a given URL with subdomains and search",
-      async () => {
-        const response = await request(TEST_URL)
-          .post("/v1/map")
-          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-          .set("Content-Type", "application/json")
-          .send({
-            url: "https://python.langchain.com",
-            includeSubdomains: true,
-            search: "agents",
-          });
-        expect(response.statusCode).toBe(200);
-        expect(response.body).toHaveProperty("success", true);
-        expect(response.body).toHaveProperty("links");
-        expect(response.body.links).toContain(
-          "https://api.python.langchain.com/en/latest/_modules/langchain/agents/openai_functions_agent/base.html"
-        );
-        expect(Array.isArray(response.body.links)).toBe(true);
-        expect(response.body.links.length).toBeGreaterThan(0);
-        response.body.links.forEach((link) => {
-          expect(link).toContain("python.langchain.com");
-        });
-      },
-      60000
-    ); // 60 secs
-
-    it.concurrent(
-      "should handle invalid URL input gracefully",
-      async () => {
-        const response = await request(TEST_URL)
-          .post("/v1/map")
-          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-          .set("Content-Type", "application/json")
-          .send({
-            url: "invalid-url",
-            includeSubdomains: true,
-            search: "agents",
-          });
-        expect(response.statusCode).toBe(400);
-        expect(response.body).toHaveProperty("success", false);
-        expect(response.body).toHaveProperty("details");
-      },
-      60000
-    ); // 60 secs
-  });
 });

@@ -44,7 +44,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
   }

   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: true, removeTags: [] };
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };

   // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
   //   try {

@@ -74,7 +74,15 @@ export async function scrapeHelper(
   // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
   if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
-    delete doc.rawHtml;
+    if (doc.rawHtml) {
+      delete doc.rawHtml;
+    }
+  }
+
+  if (!pageOptions.includeHtml) {
+    if (doc.html) {
+      delete doc.html;
+    }
   }

   return {

@@ -132,11 +132,11 @@ export async function searchController(req: Request, res: Response) {
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
   const pageOptions = req.body.pageOptions ?? {
-    includeHtml: true,
-    onlyMainContent: true,
-    fetchPageContent: true,
-    removeTags: [],
-    fallback: false,
+    includeHtml: req.body.pageOptions?.includeHtml ?? false,
+    onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
+    fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
+    removeTags: req.body.pageOptions?.removeTags ?? [],
+    fallback: req.body.pageOptions?.fallback ?? false,
  };
   const origin = req.body.origin ?? "api";
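A note on the fallback pattern above, with a small illustrative sketch (hypothetical values, not part of the commit): the object literal after `??` is only evaluated when req.body.pageOptions is null or undefined, in which case every inner `req.body.pageOptions?.x` is also undefined and the per-field default wins; a caller-supplied partial object bypasses the literal entirely.

// Sketch: how the `??` fallback resolves for the two request shapes.
type SearchBody = { pageOptions?: { includeHtml?: boolean } };

const emptyBody: SearchBody = {};
const resolved = emptyBody.pageOptions ?? {
  includeHtml: emptyBody.pageOptions?.includeHtml ?? false, // always false on this path
};
console.log(resolved.includeHtml); // false

const partialBody: SearchBody = { pageOptions: { includeHtml: true } };
console.log((partialBody.pageOptions ?? { includeHtml: false }).includeHtml); // true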


@@ -4,7 +4,7 @@ export const defaultTimeout = 45000; // 45 seconds

 export const defaultPageOptions = {
   onlyMainContent: false,
-  includeHtml: true,
+  includeHtml: false,
   waitFor: 0,
   screenshot: false,
   fullPageScreenshot: false,
@@ -17,7 +17,7 @@ export const defaultCrawlerOptions = {

 export const defaultCrawlPageOptions = {
   onlyMainContent: false,
-  includeHtml: true,
+  includeHtml: false,
   removeTags: [],
   parsePDF: true
 }
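With includeHtml now defaulting to false in both defaultPageOptions and defaultCrawlPageOptions, clients that relied on HTML being returned have to opt in. A minimal sketch (assuming the v0 scrape route and the TEST_API_KEY used elsewhere in this diff):

// Opt in explicitly now that the default is false.
const res = await fetch("http://127.0.0.1:3002/v0/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.TEST_API_KEY}`,
  },
  body: JSON.stringify({
    url: "https://firecrawl.dev",
    pageOptions: { includeHtml: true }, // default is now false
  }),
});
const { data } = await res.json();
console.log(data.html); // present only because includeHtml was requested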


@@ -296,6 +296,12 @@ export class WebScraperDataProvider {
     if (this.pageOptions.includeMarkdown) {
       documents = this.applyPathReplacements(documents);
     }

+    if (!this.pageOptions.includeHtml) {
+      for (let document of documents) {
+        delete document.html;
+      }
+    }
+
     // documents = await this.applyImgAltText(documents);

     if (
@@ -572,12 +578,19 @@ export class WebScraperDataProvider {
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? {
-      onlyMainContent: false,
-      includeHtml: true,
-      replaceAllPathsWithAbsolutePaths: false,
-      parsePDF: true,
-      removeTags: [],
+    this.pageOptions = {
+      onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
+      includeHtml: options.pageOptions?.includeHtml ?? false,
+      replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false,
+      parsePDF: options.pageOptions?.parsePDF ?? true,
+      removeTags: options.pageOptions?.removeTags ?? [],
+      includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
+      includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
+      waitFor: options.pageOptions?.waitFor ?? undefined,
+      headers: options.pageOptions?.headers ?? undefined,
+      includeLinks: options.pageOptions?.includeLinks ?? true,
+      fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
+      screenshot: options.pageOptions?.screenshot ?? false,
     };
     this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
     this.replaceAllPathsWithAbsolutePaths =
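The switch from `options.pageOptions ?? { ...defaults }` to per-field `??` defaults matters for partial inputs: previously, any non-null pageOptions suppressed every default; now unspecified fields still fall back. A small sketch of the difference (stand-in types, not the repo's):

// Whole-object fallback vs. per-field fallback for a partial pageOptions.
type PageOpts = { onlyMainContent?: boolean; includeHtml?: boolean; parsePDF?: boolean };
const partial: PageOpts | undefined = { onlyMainContent: true };

// Old style: the literal is skipped because `partial` is non-null,
// so includeHtml and parsePDF end up undefined.
const oldStyle = partial ?? { onlyMainContent: false, includeHtml: false, parsePDF: true };

// New style: caller-supplied fields win, everything else gets a default.
const newStyle: Required<PageOpts> = {
  onlyMainContent: partial?.onlyMainContent ?? false, // true, from the caller
  includeHtml: partial?.includeHtml ?? false,         // false, defaulted
  parsePDF: partial?.parsePDF ?? true,                // true, defaulted
};
console.log(oldStyle.parsePDF, newStyle.parsePDF); // undefined true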


@@ -122,23 +122,36 @@ function getScrapingFallbackOrder(

 export async function scrapSingleUrl(
   jobId: string,
   urlToScrap: string,
-  pageOptions: PageOptions = {
-    includeMarkdown: true,
-    onlyMainContent: true,
-    includeHtml: true,
-    includeRawHtml: false,
-    waitFor: 0,
-    screenshot: false,
-    fullPageScreenshot: false,
-    headers: undefined,
-    includeLinks: true
-  },
-  extractorOptions: ExtractorOptions = {
-    mode: "llm-extraction-from-markdown",
-  },
-  existingHtml: string = "",
+  pageOptions: PageOptions,
+  extractorOptions?: ExtractorOptions,
+  existingHtml?: string,
   priority?: number,
 ): Promise<Document> {
+  pageOptions = {
+    includeMarkdown: pageOptions.includeMarkdown ?? true,
+    onlyMainContent: pageOptions.onlyMainContent ?? false,
+    includeHtml: pageOptions.includeHtml ?? false,
+    includeRawHtml: pageOptions.includeRawHtml ?? false,
+    waitFor: pageOptions.waitFor ?? undefined,
+    screenshot: pageOptions.screenshot ?? false,
+    fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
+    headers: pageOptions.headers ?? undefined,
+    includeLinks: pageOptions.includeLinks ?? true,
+    replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? false,
+    parsePDF: pageOptions.parsePDF ?? true,
+    removeTags: pageOptions.removeTags ?? [],
+  }
+  if (extractorOptions) {
+    extractorOptions = {
+      mode: extractorOptions.mode ?? "llm-extraction-from-markdown",
+    }
+  }
+  if (!existingHtml) {
+    existingHtml = "";
+  }
+
   urlToScrap = urlToScrap.trim();

   const attemptScraping = async (

@@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
   description = soup('meta[name="description"]').attr("content") || null;

   // Assuming the language is part of the URL as per the regex pattern
-  const pattern = /([a-zA-Z]+-[A-Z]{2})/;
-  const match = pattern.exec(url);
-  language = match ? match[1] : null;
+  language = soup('html').attr('lang') || null;

   keywords = soup('meta[name="keywords"]').attr("content") || null;
   robots = soup('meta[name="robots"]').attr("content") || null;
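The language is now read from the page's own <html lang> attribute instead of being guessed from a locale-looking substring of the URL, which rarely exists. A quick cheerio sketch of the two behaviours (illustrative inputs, not from the commit):

import { load } from "cheerio";

// URL-regex guess vs. reading the declared document language.
const soup = load('<html lang="en-US"><head></head><body></body></html>');

const match = /([a-zA-Z]+-[A-Z]{2})/.exec("https://example.com/pricing");
console.log(match ? match[1] : null); // null: most URLs carry no locale

console.log(soup("html").attr("lang") || null); // "en-US"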


@@ -130,10 +130,12 @@ async function processJob(job: Job, token: string) {
     const end = Date.now();
     const timeTakenInSeconds = (end - start) / 1000;

-    const rawHtml = docs[0].rawHtml;
+    const rawHtml = docs[0] ? docs[0].rawHtml : "";

     if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
-      delete docs[0].rawHtml;
+      if (docs[0] && docs[0].rawHtml) {
+        delete docs[0].rawHtml;
+      }
     }

     const data = {
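The added guards avoid a crash when a job yields no documents. Optional chaining would express the same thing more tersely; a sketch with stand-in types, not the committed code:

type Doc = { rawHtml?: string };
const docs: Doc[] = [];

const rawHtml = docs[0]?.rawHtml ?? ""; // "" when the job produced nothing
delete docs[0]?.rawHtml;                // legal ES2020; a no-op when docs is empty
console.log(rawHtml); // ""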

@@ -1,16 +1,16 @@
-import { v4 as uuidv4 } from 'uuid';
-import FirecrawlApp from '@mendable/firecrawl-js';
-import { z } from "zod";
+import FirecrawlApp from './firecrawl/src/index'; //'@mendable/firecrawl-js';

 const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});

 // Scrape a website:
 const scrapeResult = await app.scrapeUrl('firecrawl.dev');
-console.log(scrapeResult.data.content)
+
+if (scrapeResult.data) {
+  console.log(scrapeResult.data.markdown)
+}

 // Crawl a website:
-const idempotencyKey = uuidv4(); // optional
-const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false, 2, idempotencyKey);
+const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
 console.log(crawlResult)

 const jobId = await crawlResult['jobId'];
@@ -19,67 +19,15 @@ console.log(jobId);
 let job;
 while (true) {
   job = await app.checkCrawlStatus(jobId);
-  if (job.status == 'completed') {
+  if (job.status === 'completed') {
     break;
   }
   await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
 }

-console.log(job.data[0].content);
-
-// Search for a query:
-const query = 'what is mendable?'
-const searchResult = await app.search(query)
-console.log(searchResult)
-
-// LLM Extraction:
-//  Define schema to extract contents into using zod schema
-const zodSchema = z.object({
-  top: z
-    .array(
-      z.object({
-        title: z.string(),
-        points: z.number(),
-        by: z.string(),
-        commentsURL: z.string(),
-      })
-    )
-    .length(5)
-    .describe("Top 5 stories on Hacker News"),
-});
-
-let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
-  extractorOptions: { extractionSchema: zodSchema },
-});
-
-console.log(llmExtractionResult.data.llm_extraction);
-
-// Define schema to extract contents into using json schema
-const jsonSchema = {
-  "type": "object",
-  "properties": {
-    "top": {
-      "type": "array",
-      "items": {
-        "type": "object",
-        "properties": {
-          "title": {"type": "string"},
-          "points": {"type": "number"},
-          "by": {"type": "string"},
-          "commentsURL": {"type": "string"}
-        },
-        "required": ["title", "points", "by", "commentsURL"]
-      },
-      "minItems": 5,
-      "maxItems": 5,
-      "description": "Top 5 stories on Hacker News"
-    }
-  },
-  "required": ["top"]
-}
-
-llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
-  extractorOptions: { extractionSchema: jsonSchema },
-});
-
-console.log(llmExtractionResult.data.llm_extraction);
+if (job.data) {
+  console.log(job.data[0].markdown);
+}
+
+const mapResult = await app.map('https://firecrawl.dev');
+console.log(mapResult)

@@ -1,5 +1,5 @@
-import FirecrawlApp, { JobStatusResponse } from './firecrawl/src/index' //'@mendable/firecrawl-js';
-import { z } from "zod";
+import FirecrawlApp from './firecrawl/src/index' //'@mendable/firecrawl-js';
+import { CrawlStatusResponse } from './firecrawl/src/index';

 const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
@@ -7,7 +7,7 @@ const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
 const scrapeResult = await app.scrapeUrl('firecrawl.dev');

 if (scrapeResult.data) {
-  console.log(scrapeResult.data.content)
+  console.log(scrapeResult.data.markdown)
 }

 // Crawl a website:
@@ -17,9 +17,9 @@ console.log(crawlResult)
 const jobId: string = await crawlResult['jobId'];
 console.log(jobId);

-let job: JobStatusResponse;
+let job: CrawlStatusResponse;
 while (true) {
-  job = await app.checkCrawlStatus(jobId);
+  job = await app.checkCrawlStatus(jobId) as CrawlStatusResponse;
   if (job.status === 'completed') {
     break;
   }
@@ -27,66 +27,8 @@ while (true) {
 }

 if (job.data) {
-  console.log(job.data[0].content);
+  console.log(job.data[0].markdown);
 }
-
-// Search for a query:
-const query = 'what is mendable?'
-const searchResult = await app.search(query)
-
-// LLM Extraction:
-//  Define schema to extract contents into using zod schema
-const zodSchema = z.object({
-  top: z
-    .array(
-      z.object({
-        title: z.string(),
-        points: z.number(),
-        by: z.string(),
-        commentsURL: z.string(),
-      })
-    )
-    .length(5)
-    .describe("Top 5 stories on Hacker News"),
-});
-
-let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
-  extractorOptions: { extractionSchema: zodSchema },
-});
-
-if (llmExtractionResult.data) {
-  console.log(llmExtractionResult.data.llm_extraction);
-}
-
-// Define schema to extract contents into using json schema
-const jsonSchema = {
-  "type": "object",
-  "properties": {
-    "top": {
-      "type": "array",
-      "items": {
-        "type": "object",
-        "properties": {
-          "title": {"type": "string"},
-          "points": {"type": "number"},
-          "by": {"type": "string"},
-          "commentsURL": {"type": "string"}
-        },
-        "required": ["title", "points", "by", "commentsURL"]
-      },
-      "minItems": 5,
-      "maxItems": 5,
-      "description": "Top 5 stories on Hacker News"
-    }
-  },
-  "required": ["top"]
-}
-
-llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
-  extractorOptions: { extractionSchema: jsonSchema },
-});
-
-if (llmExtractionResult.data) {
-  console.log(llmExtractionResult.data.llm_extraction);
-}
+
+const mapResult = await app.map('https://firecrawl.dev');
+console.log(mapResult)

apps/js-sdk/exampleV0.js (new file, +85)

@@ -0,0 +1,85 @@
+import { v4 as uuidv4 } from 'uuid';
+import FirecrawlApp from '@mendable/firecrawl-js';
+import { z } from "zod";
+
+const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
+
+// Scrape a website:
+const scrapeResult = await app.scrapeUrl('firecrawl.dev');
+console.log(scrapeResult.data.content)
+
+// Crawl a website:
+const idempotencyKey = uuidv4(); // optional
+const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false, 2, idempotencyKey);
+console.log(crawlResult)
+
+const jobId = await crawlResult['jobId'];
+console.log(jobId);
+
+let job;
+while (true) {
+  job = await app.checkCrawlStatus(jobId);
+  if (job.status == 'completed') {
+    break;
+  }
+  await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
+}
+
+console.log(job.data[0].content);
+
+// Search for a query:
+const query = 'what is mendable?'
+const searchResult = await app.search(query)
+console.log(searchResult)
+
+// LLM Extraction:
+//  Define schema to extract contents into using zod schema
+const zodSchema = z.object({
+  top: z
+    .array(
+      z.object({
+        title: z.string(),
+        points: z.number(),
+        by: z.string(),
+        commentsURL: z.string(),
+      })
+    )
+    .length(5)
+    .describe("Top 5 stories on Hacker News"),
+});
+
+let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: zodSchema },
+});
+
+console.log(llmExtractionResult.data.llm_extraction);
+
+// Define schema to extract contents into using json schema
+const jsonSchema = {
+  "type": "object",
+  "properties": {
+    "top": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "title": {"type": "string"},
+          "points": {"type": "number"},
+          "by": {"type": "string"},
+          "commentsURL": {"type": "string"}
+        },
+        "required": ["title", "points", "by", "commentsURL"]
+      },
+      "minItems": 5,
+      "maxItems": 5,
+      "description": "Top 5 stories on Hacker News"
+    }
+  },
+  "required": ["top"]
+}
+
+llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: jsonSchema },
+});
+
+console.log(llmExtractionResult.data.llm_extraction);

apps/js-sdk/exampleV0.ts (new file, +95)

@@ -0,0 +1,95 @@
+import FirecrawlApp, { ScrapeResponseV0, CrawlStatusResponseV0, SearchResponseV0 } from './firecrawl/src/index' //'@mendable/firecrawl-js';
+import { z } from "zod";
+
+const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY", version: "v0"});
+
+// Scrape a website:
+const scrapeResult = await app.scrapeUrl('firecrawl.dev') as ScrapeResponseV0;
+
+if (scrapeResult.data) {
+  console.log(scrapeResult.data.content)
+}
+
+// Crawl a website:
+const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
+console.log(crawlResult)
+
+const jobId: string = await crawlResult['jobId'];
+console.log(jobId);
+
+let job: CrawlStatusResponseV0;
+while (true) {
+  job = await app.checkCrawlStatus(jobId) as CrawlStatusResponseV0;
+  if (job.status === 'completed') {
+    break;
+  }
+  await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
+}
+
+if (job.data) {
+  console.log(job.data[0].content);
+}
+
+// Search for a query:
+const query = 'what is mendable?'
+const searchResult = await app.search(query) as SearchResponseV0;
+if (searchResult.data) {
+  console.log(searchResult.data[0].content)
+}
+
+// LLM Extraction:
+//  Define schema to extract contents into using zod schema
+const zodSchema = z.object({
+  top: z
+    .array(
+      z.object({
+        title: z.string(),
+        points: z.number(),
+        by: z.string(),
+        commentsURL: z.string(),
+      })
+    )
+    .length(5)
+    .describe("Top 5 stories on Hacker News"),
+});
+
+let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: zodSchema },
+});
+
+if (llmExtractionResult.data) {
+  console.log(llmExtractionResult.data[0].llm_extraction);
+}
+
+// Define schema to extract contents into using json schema
+const jsonSchema = {
+  "type": "object",
+  "properties": {
+    "top": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "title": {"type": "string"},
+          "points": {"type": "number"},
+          "by": {"type": "string"},
+          "commentsURL": {"type": "string"}
+        },
+        "required": ["title", "points", "by", "commentsURL"]
+      },
+      "minItems": 5,
+      "maxItems": 5,
+      "description": "Top 5 stories on Hacker News"
+    }
+  },
+  "required": ["top"]
+}
+
+llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: jsonSchema },
+});
+
+if (llmExtractionResult.data) {
+  console.log(llmExtractionResult.data[0].llm_extraction);
+}

@@ -1,4 +1,4 @@
-import FirecrawlApp from '../../index';
+import FirecrawlApp, { CrawlResponseV0, FirecrawlDocumentV0, JobStatusResponseV0, ScrapeResponseV0, SearchResponseV0 } from '../../index';
 import { v4 as uuidv4 } from 'uuid';
 import dotenv from 'dotenv';
 import { describe, test, expect } from '@jest/globals';
@@ -11,31 +11,31 @@ const API_URL = "http://127.0.0.1:3002";
 describe('FirecrawlApp E2E Tests', () => {
   test.concurrent('should throw error for no API key', async () => {
     expect(() => {
-      new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
+      new FirecrawlApp({ apiKey: null, apiUrl: API_URL, version: "v0" });
     }).toThrow("No API key provided");
   });

   test.concurrent('should throw error for invalid API key on scrape', async () => {
-    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL, version: "v0" });
     await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
   });

   test.concurrent('should throw error for blocklisted URL on scrape', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
     const blocklistedUrl = "https://facebook.com/fake-test";
     await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
   });

   test.concurrent('should return successful response with valid preview token', async () => {
-    const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
-    const response = await app.scrapeUrl('https://roastmywebsite.ai');
+    const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL, version: "v0" });
+    const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponseV0;
     expect(response).not.toBeNull();
     expect(response.data?.content).toContain("_Roast_");
   }, 30000); // 30 seconds timeout

   test.concurrent('should return successful response for valid scrape', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.scrapeUrl('https://roastmywebsite.ai');
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
+    const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponseV0;
     expect(response).not.toBeNull();
     expect(response.data?.content).toContain("_Roast_");
     expect(response.data).toHaveProperty('markdown');
@@ -44,8 +44,8 @@ describe('FirecrawlApp E2E Tests', () => {
   }, 30000); // 30 seconds timeout

   test.concurrent('should return successful response with valid API key and include HTML', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } });
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
+    const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } }) as ScrapeResponseV0;
     expect(response).not.toBeNull();
     expect(response.data?.content).toContain("_Roast_");
     expect(response.data?.markdown).toContain("_Roast_");
@@ -53,41 +53,41 @@ describe('FirecrawlApp E2E Tests', () => {
   }, 30000); // 30 seconds timeout

   test.concurrent('should return successful response for valid scrape with PDF file', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf');
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
+    const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf') as ScrapeResponseV0;
     expect(response).not.toBeNull();
     expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
   }, 30000); // 30 seconds timeout

   test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001');
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
+    const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001') as ScrapeResponseV0;
     expect(response).not.toBeNull();
     expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
   }, 30000); // 30 seconds timeout

   test.concurrent('should throw error for invalid API key on crawl', async () => {
-    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL, version: "v0" });
     await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
   });

   test.concurrent('should throw error for blocklisted URL on crawl', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
     const blocklistedUrl = "https://twitter.com/fake-test";
     await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
   });

   test.concurrent('should return successful response for crawl and wait for completion', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30);
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
+    const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30) as CrawlResponseV0;
     expect(response).not.toBeNull();
     expect(response[0].content).toContain("_Roast_");
   }, 60000); // 60 seconds timeout

   test.concurrent('should handle idempotency key for crawl', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
     const uniqueIdempotencyKey = uuidv4();
-    const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey);
+    const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey) as CrawlResponseV0;
     expect(response).not.toBeNull();
     expect(response.jobId).toBeDefined();
@@ -95,12 +95,12 @@ describe('FirecrawlApp E2E Tests', () => {
   });

   test.concurrent('should check crawl status', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false);
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
+    const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as JobStatusResponseV0;
     expect(response).not.toBeNull();
     expect(response.jobId).toBeDefined();

-    let statusResponse = await app.checkCrawlStatus(response.jobId);
+    let statusResponse: any = await app.checkCrawlStatus(response.jobId);
     const maxChecks = 15;
     let checks = 0;
@@ -108,7 +108,7 @@ describe('FirecrawlApp E2E Tests', () => {
       await new Promise(resolve => setTimeout(resolve, 1000));
       expect(statusResponse.partial_data).not.toBeNull();
       expect(statusResponse.current).toBeGreaterThanOrEqual(1);
-      statusResponse = await app.checkCrawlStatus(response.jobId);
+      statusResponse = await app.checkCrawlStatus(response.jobId) as CrawlResponseV0;
       checks++;
     }
@@ -121,20 +121,20 @@ describe('FirecrawlApp E2E Tests', () => {
   }, 35000); // 35 seconds timeout

   test.concurrent('should return successful response for search', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.search("test query");
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
+    const response = await app.search("test query") as SearchResponseV0;
     expect(response).not.toBeNull();
     expect(response?.data?.[0]?.content).toBeDefined();
     expect(response?.data?.length).toBeGreaterThan(2);
   }, 30000); // 30 seconds timeout

   test.concurrent('should throw error for invalid API key on search', async () => {
-    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL, version: "v0" });
     await expect(invalidApp.search("test query")).rejects.toThrow("Request failed with status code 401");
   });

   test.concurrent('should perform LLM extraction', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
     const response = await app.scrapeUrl("https://mendable.ai", {
       extractorOptions: {
         mode: 'llm-extraction',
@@ -149,7 +149,7 @@ describe('FirecrawlApp E2E Tests', () => {
         required: ['company_mission', 'supports_sso', 'is_open_source']
       }
     }
-    });
+    }) as ScrapeResponseV0;
     expect(response).not.toBeNull();
     expect(response.data?.llm_extraction).toBeDefined();
     const llmExtraction = response.data?.llm_extraction;

@@ -0,0 +1,312 @@
+import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, MapResponse, ScrapeParams, ScrapeResponse } from '../../../index';
+import { v4 as uuidv4 } from 'uuid';
+import dotenv from 'dotenv';
+import { describe, test, expect } from '@jest/globals';
+
+dotenv.config();
+
+const TEST_API_KEY = process.env.TEST_API_KEY;
+const API_URL = "http://127.0.0.1:3002";
+
+describe('FirecrawlApp E2E Tests', () => {
+  test.concurrent('should throw error for no API key', async () => {
+    expect(() => {
+      new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
+    }).toThrow("No API key provided");
+  });
+
+  test.concurrent('should throw error for invalid API key on scrape', async () => {
+    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+    await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
+  });
+
+  test.concurrent('should throw error for blocklisted URL on scrape', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const blocklistedUrl = "https://facebook.com/fake-test";
+    await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
+  });
+
+  test.concurrent('should return successful response with valid preview token', async () => {
+    const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
+    const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse;
+    expect(response).not.toBeNull();
+    expect(response?.markdown).toContain("_Roast_");
+  }, 30000); // 30 seconds timeout
+
+  test.concurrent('should return successful response for valid scrape', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse;
+    expect(response).not.toBeNull();
+    expect(response).not.toHaveProperty('content'); // v0
+    expect(response).not.toHaveProperty('html');
+    expect(response).not.toHaveProperty('rawHtml');
+    expect(response).not.toHaveProperty('screenshot');
+    expect(response).not.toHaveProperty('links');
+    expect(response).toHaveProperty('markdown');
+    expect(response).toHaveProperty('metadata');
+  }, 30000); // 30 seconds timeout
+
+  test.concurrent('should return successful response with valid API key and options', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response = await app.scrapeUrl(
+      'https://roastmywebsite.ai', {
+        formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
+        headers: { "x-key": "test" },
+        includeTags: ['h1'],
+        excludeTags: ['h2'],
+        onlyMainContent: true,
+        timeout: 30000,
+        waitFor: 1000
+    }) as ScrapeResponse;
+    expect(response).not.toBeNull();
+    expect(response).not.toHaveProperty('content'); // v0
+    expect(response.markdown).toContain("_Roast_");
+    expect(response.html).toContain("<h1");
+    expect(response.rawHtml).toContain("<h1");
+    expect(response.screenshot).not.toBeUndefined();
+    expect(response.screenshot).not.toBeNull();
+    expect(response.screenshot).toContain("https://");
+    expect(response.links).not.toBeNull();
+    expect(response.links?.length).toBeGreaterThan(0);
+    expect(response.links?.[0]).toContain("https://");
+    expect(response.metadata).not.toBeNull();
+    expect(response.metadata).toHaveProperty("title");
+    expect(response.metadata).toHaveProperty("description");
+    expect(response.metadata).toHaveProperty("keywords");
+    expect(response.metadata).toHaveProperty("robots");
+    expect(response.metadata).toHaveProperty("ogTitle");
+    expect(response.metadata).toHaveProperty("ogDescription");
+    expect(response.metadata).toHaveProperty("ogUrl");
+    expect(response.metadata).toHaveProperty("ogImage");
+    expect(response.metadata).toHaveProperty("ogLocaleAlternate");
+    expect(response.metadata).toHaveProperty("ogSiteName");
+    expect(response.metadata).toHaveProperty("sourceURL");
+    expect(response.metadata).not.toHaveProperty("pageStatusCode");
+    expect(response.metadata).toHaveProperty("statusCode");
+    expect(response.metadata).not.toHaveProperty("pageError");
+    expect(response.metadata.error).toBeUndefined();
+    expect(response.metadata.title).toBe("Roast My Website");
+    expect(response.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
+    expect(response.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl");
+    expect(response.metadata.robots).toBe("follow, index");
+    expect(response.metadata.ogTitle).toBe("Roast My Website");
+    expect(response.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
+    expect(response.metadata.ogUrl).toBe("https://www.roastmywebsite.ai");
+    expect(response.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png");
+    expect(response.metadata.ogLocaleAlternate).toStrictEqual([]);
+    expect(response.metadata.ogSiteName).toBe("Roast My Website");
+    expect(response.metadata.sourceURL).toBe("https://roastmywebsite.ai");
+    expect(response.metadata.statusCode).toBe(200);
+  }, 30000); // 30 seconds timeout
+
+  test.concurrent('should return successful response for valid scrape with PDF file', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf') as ScrapeResponse;
+    expect(response).not.toBeNull();
+    expect(response?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+  }, 30000); // 30 seconds timeout
+
+  test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001') as ScrapeResponse;
+    expect(response).not.toBeNull();
+    expect(response?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+  }, 30000); // 30 seconds timeout
+
+  test.concurrent('should throw error for invalid API key on crawl', async () => {
+    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+    await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
+  });
+
+  test.concurrent('should throw error for blocklisted URL on crawl', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const blocklistedUrl = "https://twitter.com/fake-test";
+    await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
+  });
+
+  test.concurrent('should return successful response for crawl and wait for completion', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response = await app.crawlUrl('https://roastmywebsite.ai', {}, true, 30) as CrawlStatusResponse;
+    expect(response).not.toBeNull();
+    expect(response).toHaveProperty("totalCount");
+    expect(response.totalCount).toBeGreaterThan(0);
+    expect(response).toHaveProperty("creditsUsed");
+    expect(response.creditsUsed).toBeGreaterThan(0);
+    expect(response).toHaveProperty("expiresAt");
+    expect(new Date(response.expiresAt).getTime()).toBeGreaterThan(Date.now());
+    expect(response).toHaveProperty("status");
+    expect(response.status).toBe("completed");
+    expect(response).not.toHaveProperty("next"); // wait until done
+    expect(response.data?.length).toBeGreaterThan(0);
+    expect(response.data?.[0]).toHaveProperty("markdown");
+    expect(response.data?.[0].markdown).toContain("_Roast_");
+    expect(response.data?.[0]).not.toHaveProperty('content'); // v0
+    expect(response.data?.[0]).not.toHaveProperty("html");
+    expect(response.data?.[0]).not.toHaveProperty("rawHtml");
+    expect(response.data?.[0]).not.toHaveProperty("screenshot");
+    expect(response.data?.[0]).not.toHaveProperty("links");
+    expect(response.data?.[0]).toHaveProperty("metadata");
+    expect(response.data?.[0].metadata).toHaveProperty("title");
+    expect(response.data?.[0].metadata).toHaveProperty("description");
+    expect(response.data?.[0].metadata).toHaveProperty("language");
+    expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
+    expect(response.data?.[0].metadata).toHaveProperty("statusCode");
+    expect(response.data?.[0].metadata).not.toHaveProperty("error");
+  }, 60000); // 60 seconds timeout
+
+  test.concurrent('should return successful response for crawl with options and wait for completion', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response = await app.crawlUrl('https://roastmywebsite.ai', {
+      excludePaths: ['blog/*'],
+      includePaths: ['/'],
+      maxDepth: 2,
+      ignoreSitemap: true,
+      limit: 10,
+      allowBackwardLinks: true,
+      allowExternalLinks: true,
+      scrapeOptions: {
+        formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
+        headers: { "x-key": "test" },
+        includeTags: ['h1'],
+        excludeTags: ['h2'],
+        onlyMainContent: true,
+        waitFor: 1000
+      }
+    } as CrawlParams, true, 30) as CrawlStatusResponse;
+    expect(response).not.toBeNull();
+    expect(response).toHaveProperty("totalCount");
+    expect(response.totalCount).toBeGreaterThan(0);
+    expect(response).toHaveProperty("creditsUsed");
+    expect(response.creditsUsed).toBeGreaterThan(0);
+    expect(response).toHaveProperty("expiresAt");
+    expect(new Date(response.expiresAt).getTime()).toBeGreaterThan(Date.now());
+    expect(response).toHaveProperty("status");
+    expect(response.status).toBe("completed");
+    expect(response).not.toHaveProperty("next");
+    expect(response.data?.length).toBeGreaterThan(0);
+    expect(response.data?.[0]).toHaveProperty("markdown");
+    expect(response.data?.[0].markdown).toContain("_Roast_");
+    expect(response.data?.[0]).not.toHaveProperty('content'); // v0
+    expect(response.data?.[0]).toHaveProperty("html");
+    expect(response.data?.[0].html).toContain("<h1");
+    expect(response.data?.[0]).toHaveProperty("rawHtml");
+    expect(response.data?.[0].rawHtml).toContain("<h1");
+    expect(response.data?.[0]).toHaveProperty("screenshot");
+    expect(response.data?.[0].screenshot).toContain("https://");
+    expect(response.data?.[0]).toHaveProperty("links");
+    expect(response.data?.[0].links).not.toBeNull();
+    expect(response.data?.[0].links?.length).toBeGreaterThan(0);
+    expect(response.data?.[0]).toHaveProperty("metadata");
+    expect(response.data?.[0].metadata).toHaveProperty("title");
+    expect(response.data?.[0].metadata).toHaveProperty("description");
+    expect(response.data?.[0].metadata).toHaveProperty("language");
+    expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
+    expect(response.data?.[0].metadata).toHaveProperty("statusCode");
+    expect(response.data?.[0].metadata).not.toHaveProperty("error");
+  }, 60000); // 60 seconds timeout
+
+  test.concurrent('should handle idempotency key for crawl', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const uniqueIdempotencyKey = uuidv4();
+    const response = await app.crawlUrl('https://roastmywebsite.ai', {}, false, 2, uniqueIdempotencyKey) as CrawlResponse;
+    expect(response).not.toBeNull();
+    expect(response.id).toBeDefined();
+
+    await expect(app.crawlUrl('https://roastmywebsite.ai', {}, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
+  });
+
+  test.concurrent('should check crawl status', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams, false) as CrawlResponse;
+    expect(response).not.toBeNull();
+    expect(response.id).toBeDefined();
+
+    let statusResponse: any = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
+    const maxChecks = 15;
+    let checks = 0;
+
+    while (statusResponse.status === 'scraping' && checks < maxChecks) {
+      await new Promise(resolve => setTimeout(resolve, 5000));
+      expect(statusResponse).not.toHaveProperty("partial_data"); // v0
+      expect(statusResponse).not.toHaveProperty("current"); // v0
+      expect(statusResponse).toHaveProperty("data");
+      expect(statusResponse).toHaveProperty("totalCount");
+      expect(statusResponse).toHaveProperty("creditsUsed");
+      expect(statusResponse).toHaveProperty("expiresAt");
+      expect(statusResponse).toHaveProperty("status");
+      expect(statusResponse).toHaveProperty("next");
+      expect(statusResponse.totalCount).toBeGreaterThan(0);
+      expect(statusResponse.creditsUsed).toBeGreaterThan(0);
+      expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
+      expect(statusResponse.status).toBe("scraping");
+      expect(statusResponse.next).toContain("/v1/crawl/");
+      statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
+      checks++;
+    }
+
+    expect(statusResponse).not.toBeNull();
+    expect(statusResponse).toHaveProperty("totalCount");
+    expect(statusResponse.totalCount).toBeGreaterThan(0);
+    expect(statusResponse).toHaveProperty("creditsUsed");
+    expect(statusResponse.creditsUsed).toBeGreaterThan(0);
+    expect(statusResponse).toHaveProperty("expiresAt");
+    expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
+    expect(statusResponse).toHaveProperty("status");
+    expect(statusResponse.status).toBe("completed");
+    expect(statusResponse.data?.length).toBeGreaterThan(0);
+    expect(statusResponse.data?.[0]).toHaveProperty("markdown");
+    expect(statusResponse.data?.[0].markdown?.length).toBeGreaterThan(10);
+    expect(statusResponse.data?.[0]).not.toHaveProperty('content'); // v0
+    expect(statusResponse.data?.[0]).toHaveProperty("html");
+    expect(statusResponse.data?.[0].html).toContain("<div");
+    expect(statusResponse.data?.[0]).toHaveProperty("rawHtml");
+    expect(statusResponse.data?.[0].rawHtml).toContain("<div");
+    expect(statusResponse.data?.[0]).toHaveProperty("screenshot");
+    expect(statusResponse.data?.[0].screenshot).toContain("https://");
+    expect(statusResponse.data?.[0]).toHaveProperty("links");
+    expect(statusResponse.data?.[0].links).not.toBeNull();
+    expect(statusResponse.data?.[0].links?.length).toBeGreaterThan(0);
+    expect(statusResponse.data?.[0]).toHaveProperty("metadata");
+    expect(statusResponse.data?.[0].metadata).toHaveProperty("title");
+    expect(statusResponse.data?.[0].metadata).toHaveProperty("description");
+    expect(statusResponse.data?.[0].metadata).toHaveProperty("language");
+    expect(statusResponse.data?.[0].metadata).toHaveProperty("sourceURL");
+    expect(statusResponse.data?.[0].metadata).toHaveProperty("statusCode");
+    expect(statusResponse.data?.[0].metadata).not.toHaveProperty("error");
+  }, 60000); // 60 seconds timeout
+
+  test.concurrent('should throw error for invalid API key on map', async () => {
+    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+    await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
+  });
+
+  test.concurrent('should throw error for blocklisted URL on map', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const blocklistedUrl = "https://facebook.com/fake-test";
+    await expect(app.mapUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
+  });
+
+  test.concurrent('should return successful response with valid preview token', async () => {
+    const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
+    const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
+    expect(response).not.toBeNull();
+    expect(response.links?.length).toBeGreaterThan(0);
+  }, 30000); // 30 seconds timeout
+
+  test.concurrent('should return successful response for valid map', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
+    expect(response).not.toBeNull();
+    expect(response.links?.length).toBeGreaterThan(0);
+    expect(response.links?.[0]).toContain("https://");
+    const filteredLinks = response.links?.filter((link: string) => link.includes("roastmywebsite.ai"));
+    expect(filteredLinks?.length).toBeGreaterThan(0);
+  }, 30000); // 30 seconds timeout
+
+  test('should throw NotImplementedError for search on v1', async () => {
+    const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
+    await expect(app.search("test query")).rejects.toThrow("Search is not supported in v1");
+  });
+});
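Taken together, the v1 surface these tests exercise is small: scrapeUrl with a formats list, crawlUrl/checkCrawlStatus, and mapUrl. A minimal usage sketch against a local API, using only names introduced in this diff (not an official example):

import FirecrawlApp, { ScrapeResponse, MapResponse } from './firecrawl/src/index';

const app = new FirecrawlApp({ apiKey: process.env.TEST_API_KEY, apiUrl: "http://127.0.0.1:3002" });

// v1 returns document fields at the top level (no data.content wrapper as in v0).
const scraped = await app.scrapeUrl('https://firecrawl.dev', { formats: ['markdown', 'links'] }) as ScrapeResponse;
console.log(scraped.markdown);
console.log(scraped.links?.length);

const mapped = await app.mapUrl('https://firecrawl.dev') as MapResponse;
console.log(mapped.links?.[0]);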

View File

@ -1,16 +1,22 @@
import axios, { AxiosResponse, AxiosRequestHeaders } from "axios"; import axios, { AxiosResponse, AxiosRequestHeaders } from "axios";
import { z } from "zod"; import { z } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema"; import { zodToJsonSchema } from "zod-to-json-schema";
/** /**
* Configuration interface for FirecrawlApp. * Configuration interface for FirecrawlApp.
* @param apiKey - Optional API key for authentication.
* @param apiUrl - Optional base URL of the API; defaults to 'https://api.firecrawl.dev'.
* @param version - API version, either 'v0' or 'v1'.
*/ */
export interface FirecrawlAppConfig { export interface FirecrawlAppConfig {
apiKey?: string | null; apiKey?: string | null;
apiUrl?: string | null; apiUrl?: string | null;
version?: "v0" | "v1";
} }
/** /**
* Metadata for a Firecrawl document. * Metadata for a Firecrawl document.
* Includes various optional properties for document metadata.
*/ */
export interface FirecrawlDocumentMetadata { export interface FirecrawlDocumentMetadata {
title?: string; title?: string;
@ -43,6 +49,17 @@ export interface FirecrawlDocumentMetadata {
articleTag?: string; articleTag?: string;
articleSection?: string; articleSection?: string;
sourceURL?: string; sourceURL?: string;
statusCode?: number;
error?: string;
[key: string]: any; // Allows for additional metadata properties not explicitly defined.
}
/**
* Metadata for a Firecrawl document on v0.
* Similar to FirecrawlDocumentMetadata but includes properties specific to API version v0.
*/
export interface FirecrawlDocumentMetadataV0 {
// Similar properties as FirecrawlDocumentMetadata with additional v0 specific adjustments
pageStatusCode?: number; pageStatusCode?: number;
pageError?: string; pageError?: string;
[key: string]: any; [key: string]: any;
@ -50,8 +67,23 @@ export interface FirecrawlDocumentMetadata {
/** /**
* Document interface for Firecrawl. * Document interface for Firecrawl.
* Represents a document retrieved or processed by Firecrawl.
*/ */
export interface FirecrawlDocument { export interface FirecrawlDocument {
url?: string;
markdown?: string;
html?: string;
rawHtml?: string;
links?: string[];
screenshot?: string;
metadata: FirecrawlDocumentMetadata;
}
/**
* Document interface for Firecrawl on v0.
* Represents a document specifically for API version v0 with additional properties.
*/
export interface FirecrawlDocumentV0 {
id?: string; id?: string;
url?: string; url?: string;
content: string; content: string;
@ -61,79 +93,240 @@ export interface FirecrawlDocument {
createdAt?: Date; createdAt?: Date;
updatedAt?: Date; updatedAt?: Date;
type?: string; type?: string;
metadata: FirecrawlDocumentMetadata; metadata: FirecrawlDocumentMetadataV0;
childrenLinks?: string[]; childrenLinks?: string[];
provider?: string; provider?: string;
warning?: string; warning?: string;
index?: number; index?: number;
} }
/** /**
* Response interface for scraping operations. * Parameters for scraping operations.
* Defines the options and configurations available for scraping web content.
*/ */
export interface ScrapeResponse { export interface ScrapeParams {
success: boolean; formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot")[];
data?: FirecrawlDocument; headers?: Record<string, string>;
error?: string; includeTags?: string[];
excludeTags?: string[];
onlyMainContent?: boolean;
screenshotMode?: "desktop" | "full-desktop" | "mobile" | "full-mobile";
waitFor?: number;
timeout?: number;
} }
/** /**
* Response interface for searching operations. * Parameters for scraping operations on v0.
* Includes page and extractor options specific to API version v0.
*/ */
export interface SearchResponse { export interface ScrapeParamsV0 {
pageOptions?: {
headers?: Record<string, string>;
includeHtml?: boolean;
includeRawHtml?: boolean;
onlyIncludeTags?: string[];
onlyMainContent?: boolean;
removeTags?: string[];
replaceAllPathsWithAbsolutePaths?: boolean;
screenshot?: boolean;
fullPageScreenshot?: boolean;
waitFor?: number;
};
extractorOptions?: {
mode?: "markdown" | "llm-extraction" | "llm-extraction-from-raw-html" | "llm-extraction-from-markdown";
extractionPrompt?: string;
extractionSchema?: Record<string, any> | z.ZodSchema | any;
};
timeout?: number;
}
/**
* Response interface for scraping operations.
* Defines the structure of the response received after a scraping operation.
*/
export interface ScrapeResponse extends FirecrawlDocument {
success: boolean;
warning?: string;
error?: string;
}
/**
* Response interface for scraping operations on v0.
* Similar to ScrapeResponse but tailored for responses from API version v0.
*/
export interface ScrapeResponseV0 {
success: boolean;
data?: FirecrawlDocumentV0;
error?: string;
}
/**
* Parameters for crawling operations.
* Includes options for both scraping and mapping during a crawl.
*/
export interface CrawlParams {
scrapeOptions?: ScrapeParams;
crawlerOptions?: {
includePaths?: string[]
excludePaths?: string[]
maxDepth?: number
limit?: number
allowBackwardLinks?: boolean
allowExternalLinks?: boolean
ignoreSitemap?: boolean
};
}
/**
* Parameters for crawling operations on v0.
* Tailored for API version v0, includes specific options for crawling.
*/
export interface CrawlParamsV0 {
crawlerOptions?: {
includes?: string[];
excludes?: string[];
generateImgAltText?: boolean;
returnOnlyUrls?: boolean;
maxDepth?: number;
mode?: "default" | "fast";
ignoreSitemap?: boolean;
limit?: number;
allowBackwardCrawling?: boolean;
allowExternalContentLinks?: boolean;
};
pageOptions?: {
headers?: Record<string, string>;
includeHtml?: boolean;
includeRawHtml?: boolean;
onlyIncludeTags?: string[];
onlyMainContent?: boolean;
removeTags?: string[];
replaceAllPathsWithAbsolutePaths?: boolean;
screenshot?: boolean;
fullPageScreenshot?: boolean;
waitFor?: number;
};
}
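A minimal sketch of a v1 crawl configuration, combining crawler limits with per-page scrape options (paths and limits are placeholders):

const crawlParams: CrawlParams = {
  crawlerOptions: {
    includePaths: ["/blog/*"],
    maxDepth: 2,
    limit: 100,
  },
  scrapeOptions: {
    formats: ["markdown"],
    onlyMainContent: true,
  },
};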
/**
* Response interface for crawling operations.
* Defines the structure of the response received after initiating a crawl.
*/
export interface CrawlResponse {
id?: string;
url?: string;
success: boolean;
error?: string;
}
/**
* Response interface for crawling operations on v0.
* Similar to CrawlResponse but tailored for responses from API version v0.
*/
export interface CrawlResponseV0 {
jobId?: string;
success: boolean;
error?: string;
}
/**
* Response interface for job status checks.
* Provides detailed status of a crawl job including progress and results.
*/
export interface CrawlStatusResponse {
success: boolean;
totalCount: number;
creditsUsed: number;
expiresAt: Date;
status: "scraping" | "completed" | "failed";
next: string;
data?: FirecrawlDocument[];
error?: string;
}
/**
* Response interface for job status checks on v0.
* Tailored for API version v0, provides status and partial data of a crawl job.
*/
export interface CrawlStatusResponseV0 {
success: boolean;
status: string;
current?: number;
current_url?: string;
current_step?: string;
total?: number;
data?: FirecrawlDocumentV0[];
partial_data?: FirecrawlDocumentV0[];
error?: string;
}
/**
* Parameters for mapping operations.
* Defines options for mapping URLs during a crawl.
*/
export interface MapParams {
includePaths?: string[]
excludePaths?: string[]
maxDepth?: number
limit?: number
allowBackwardLinks?: boolean
allowExternalLinks?: boolean
ignoreSitemap?: boolean
}
/**
* Response interface for mapping operations.
* Defines the structure of the response received after a mapping operation.
*/
export interface MapResponse {
success: boolean;
links?: string[];
error?: string;
}
/**
* Parameters for searching operations on v0.
* Tailored for API version v0, includes specific options for searching content.
*/
export interface SearchParamsV0 {
pageOptions?: {
onlyMainContent?: boolean;
fetchPageContent?: boolean;
includeHtml?: boolean;
includeRawHtml?: boolean;
};
searchOptions?: {
limit?: number;
};
}
/**
* Response interface for searching operations on v0.
* Defines the structure of the response received after a search operation on v0.
*/
export interface SearchResponseV0 {
success: boolean;
data?: FirecrawlDocumentV0[];
error?: string;
}
/**
* Main class for interacting with the Firecrawl API.
* Provides methods for scraping, searching, crawling, and mapping web content.
*/
export default class FirecrawlApp {
private apiKey: string;
private apiUrl: string;
private version: "v0" | "v1";
/**
* Initializes a new instance of the FirecrawlApp class.
* @param config - Configuration options for the FirecrawlApp instance.
*/
constructor({ apiKey = null, apiUrl = null, version = "v1" }: FirecrawlAppConfig) {
this.apiKey = apiKey || "";
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
this.version = version;
if (!this.apiKey) {
throw new Error("No API key provided");
}
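Version selection happens once, at construction time. A minimal sketch, assuming FirecrawlAppConfig carries the new version field (the API key is a placeholder):

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" }); // v1 by default
const v0App = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY", version: "v0" });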
@ -141,21 +334,21 @@ export default class FirecrawlApp {
/**
* Scrapes a URL using the Firecrawl API.
* @param url - The URL to scrape.
* @param params - Additional parameters for the scrape request.
* @returns The response from the scrape operation.
*/
async scrapeUrl(
url: string,
params?: ScrapeParams | ScrapeParamsV0
): Promise<ScrapeResponse | ScrapeResponseV0> {
const headers: AxiosRequestHeaders = {
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
} as AxiosRequestHeaders;
let jsonData: any = { url, ...params };
if (jsonData?.extractorOptions?.extractionSchema) {
let schema = jsonData.extractorOptions.extractionSchema;
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
if (schema instanceof z.ZodSchema) {
schema = zodToJsonSchema(schema);
@ -163,22 +356,31 @@ export default class FirecrawlApp {
jsonData = {
...jsonData,
extractorOptions: {
...jsonData.extractorOptions,
extractionSchema: schema,
mode: jsonData.extractorOptions.mode || "llm-extraction",
},
};
}
try {
const response: AxiosResponse = await axios.post(
this.apiUrl + `/${this.version}/scrape`,
jsonData,
{ headers }
);
if (response.status === 200) {
const responseData = response.data;
if (responseData.success) {
if (this.version == 'v0') {
return responseData as ScrapeResponseV0;
} else {
return {
success: true,
warning: responseData.warning,
error: responseData.error,
...responseData.data
} as ScrapeResponse;
}
} else {
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
}
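A usage sketch for the v1 path, where the document fields arrive flattened onto the response object itself; the URL is a placeholder and the async wrapper is assumed:

async function scrapeExample(app: FirecrawlApp) {
  const result = await app.scrapeUrl("https://firecrawl.dev", {
    formats: ["markdown", "html"],
  });
  // `success` exists on both response shapes; the `in` check narrows to v1.
  if (result.success && "markdown" in result) {
    console.log(result.markdown);
  }
}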
@ -193,19 +395,23 @@ export default class FirecrawlApp {
/**
* Searches for a query using the Firecrawl API.
* @param query - The query to search for.
* @param params - Additional parameters for the search request.
* @returns The response from the search operation.
*/
async search(
query: string,
params?: SearchParamsV0
): Promise<SearchResponseV0> {
if (this.version === "v1") {
throw new Error("Search is not supported in v1");
}
const headers: AxiosRequestHeaders = {
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
} as AxiosRequestHeaders;
let jsonData: any = { query };
if (params) {
jsonData = { ...jsonData, ...params };
}
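Because search() now throws on v1, callers have to construct a v0 client first. A minimal sketch with placeholder credentials and query:

async function searchExample() {
  const v0App = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY", version: "v0" });
  const results = await v0App.search("web scraping", {
    searchOptions: { limit: 5 },
  });
  if (results.success) {
    console.log(results.data?.length);
  }
}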
@ -233,93 +439,157 @@ export default class FirecrawlApp {
/**
* Initiates a crawl job for a URL using the Firecrawl API.
* @param url - The URL to crawl.
* @param params - Additional parameters for the crawl request.
* @param waitUntilDone - Whether to wait for the crawl job to complete.
* @param pollInterval - Time in seconds for job status checks.
* @param idempotencyKey - Optional idempotency key for the request.
* @returns The response from the crawl operation.
*/
async crawlUrl(
url: string,
params?: CrawlParams | CrawlParamsV0,
waitUntilDone: boolean = true,
pollInterval: number = 2,
idempotencyKey?: string
): Promise<CrawlResponse | CrawlResponseV0 | CrawlStatusResponse | CrawlStatusResponseV0> {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData: any = { url, ...params };
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/${this.version}/crawl`,
jsonData,
headers
);
if (response.status === 200) {
const id: string = this.version == 'v0' ? response.data.jobId : response.data.id;
let checkUrl: string | undefined = undefined;
if (waitUntilDone) {
if (this.version == 'v1') { checkUrl = response.data.url }
return this.monitorJobStatus(id, headers, pollInterval, checkUrl);
} else {
if (this.version == 'v0') {
return {
success: true,
jobId: id
} as CrawlResponseV0;
} else {
return {
success: true,
id: id
} as CrawlResponse;
}
}
} else {
this.handleError(response, "start crawl job");
}
} catch (error: any) {
if (error.response.data.error) {
throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
} else {
throw new Error(error.message);
}
}
return { success: false, error: "Internal server error." };
}
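A sketch of the non-blocking path: start the crawl, keep the id, and poll separately (URL and limit are placeholders):

async function startCrawl(app: FirecrawlApp) {
  const started = await app.crawlUrl(
    "https://firecrawl.dev",
    { crawlerOptions: { limit: 10 } },
    false // do not block on completion
  );
  // Narrow the union: a v1 kick-off response carries `id`.
  if ("id" in started && started.id) {
    console.log(`crawl queued: ${started.id}`);
  }
}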
/**
* Checks the status of a crawl job using the Firecrawl API.
* @param id - The ID of the crawl operation.
* @returns The response containing the job status.
*/
async checkCrawlStatus(id?: string): Promise<CrawlStatusResponse | CrawlStatusResponseV0> {
if (!id) {
throw new Error("No crawl ID provided");
}
const headers: AxiosRequestHeaders = this.prepareHeaders();
try {
const response: AxiosResponse = await this.getRequest(
this.version == 'v1' ?
this.apiUrl + `/${this.version}/crawl/${id}` :
this.apiUrl + `/${this.version}/crawl/status/${id}`,
headers
);
if (response.status === 200) {
if (this.version == 'v0') {
return {
success: true,
status: response.data.status,
current: response.data.current,
current_url: response.data.current_url,
current_step: response.data.current_step,
total: response.data.total,
data: response.data.data,
partial_data: !response.data.data
? response.data.partial_data
: undefined,
} as CrawlStatusResponseV0;
} else if (this.version == 'v1') {
return {
success: true,
status: response.data.status,
totalCount: response.data.totalCount,
creditsUsed: response.data.creditsUsed,
expiresAt: new Date(response.data.expiresAt),
next: response.data.next,
data: response.data.data,
error: response.data.error
} as CrawlStatusResponse;
}
} else {
this.handleError(response, "check crawl status");
}
} catch (error: any) {
throw new Error(error.message);
}
if (this.version == 'v0') {
return {
success: false,
status: "unknown",
current: 0,
current_url: "",
current_step: "",
total: 0,
error: "Internal server error.",
} as CrawlStatusResponseV0;
} else {
return {
success: false,
error: "Internal server error.",
} as CrawlStatusResponse;
}
}
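For callers that skip waitUntilDone, a hand-rolled poll along the lines of what monitorJobStatus does internally might look like this sketch (the 2-second interval is arbitrary):

async function waitForCrawl(app: FirecrawlApp, id: string) {
  while (true) {
    const status = await app.checkCrawlStatus(id);
    // Both v0 and v1 status payloads expose a `status` string.
    if (status.status === "completed" || status.status === "failed") {
      return status;
    }
    await new Promise((resolve) => setTimeout(resolve, 2000));
  }
}

/**
* Maps the links of a website using the Firecrawl API (v1 only).
* @param url - The URL to map.
* @param params - Additional parameters for the map request.
* @returns The response from the map operation.
*/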
async mapUrl(url: string, params?: MapParams): Promise<MapResponse> {
if (this.version == 'v0') {
throw new Error("Map is not supported in v0");
}
const headers = this.prepareHeaders();
let jsonData: { url: string } & MapParams = { url, ...params };
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/${this.version}/map`,
jsonData,
headers
);
if (response.status === 200) {
return response.data as MapResponse;
} else {
this.handleError(response, "map");
}
} catch (error: any) {
throw new Error(error.message);
}
return { success: false, error: "Internal server error." } as MapResponse;
}
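A map usage sketch (URL and limit are placeholders); map is v1-only and returns links without scraping them:

async function mapExample(app: FirecrawlApp) {
  const map = await app.mapUrl("https://firecrawl.dev", { limit: 100 });
  if (map.success) {
    console.log(map.links?.slice(0, 5));
  }
}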
/**
* Prepares the headers for an API request.
* @param idempotencyKey - Optional key to ensure idempotency.
* @returns The prepared headers.
*/
prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
return {
@ -331,14 +601,14 @@ export default class FirecrawlApp {
/**
* Sends a POST request to the specified URL.
* @param url - The URL to send the request to.
* @param data - The data to send in the request.
* @param headers - The headers for the request.
* @returns The response from the POST request.
*/
postRequest(
url: string,
data: any,
headers: AxiosRequestHeaders
): Promise<AxiosResponse> {
return axios.post(url, data, { headers });
@ -346,9 +616,9 @@ export default class FirecrawlApp {
/**
* Sends a GET request to the specified URL.
* @param url - The URL to send the request to.
* @param headers - The headers for the request.
* @returns The response from the GET request.
*/
getRequest(
url: string,
@ -359,31 +629,38 @@ export default class FirecrawlApp {
/**
* Monitors the status of a crawl job until completion or failure.
* @param id - The ID of the crawl operation.
* @param headers - The headers for the request.
* @param checkInterval - Interval in seconds for job status checks.
* @returns The final job status or data.
*/
async monitorJobStatus(
id: string,
headers: AxiosRequestHeaders,
checkInterval: number,
checkUrl?: string
): Promise<CrawlStatusResponse | CrawlStatusResponseV0> {
let apiUrl: string = '';
while (true) {
if (this.version == 'v1') {
apiUrl = checkUrl ?? this.apiUrl + `/v1/crawl/${id}`;
} else if (this.version == 'v0') {
apiUrl = checkUrl ?? this.apiUrl + `/v0/crawl/status/${id}`;
}
const statusResponse: AxiosResponse = await this.getRequest(
apiUrl,
headers
);
if (statusResponse.status === 200) {
const statusData = statusResponse.data;
if (statusData.status === "completed") {
if ("data" in statusData) {
return this.version == 'v0' ? statusData.data : statusData;
} else {
throw new Error("Crawl job completed but no data was returned");
}
} else if (
["active", "paused", "pending", "queued", "scraping"].includes(statusData.status)
) {
if (checkInterval < 2) {
checkInterval = 2;

View File

@ -11,7 +11,7 @@
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
/* Language and Environment */ /* Language and Environment */
"target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ "target": "es2020", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
// "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
// "jsx": "preserve", /* Specify what JSX code is generated. */ // "jsx": "preserve", /* Specify what JSX code is generated. */
// "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */ // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
@ -25,9 +25,9 @@
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */ // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
/* Modules */ /* Modules */
"module": "NodeNext", /* Specify what module code is generated. */ "module": "commonjs", /* Specify what module code is generated. */
"rootDir": "./src", /* Specify the root folder within your source files. */ "rootDir": "./src", /* Specify the root folder within your source files. */
"moduleResolution": "nodenext", /* Specify how TypeScript looks up a file from a given module specifier. */ "moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */
// "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */ // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */

View File

@ -0,0 +1,75 @@
import uuid
from firecrawl.firecrawl import FirecrawlApp
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
# Scrape a website:
scrape_result = app.scrape_url('firecrawl.dev')
print(scrape_result['markdown'])
# Crawl a website:
idempotency_key = str(uuid.uuid4()) # optional idempotency key
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
print(crawl_result)
# LLM Extraction:
# Define schema to extract contents into using pydantic
from pydantic import BaseModel, Field
from typing import List
class ArticleSchema(BaseModel):
title: str
points: int
by: str
commentsURL: str
class TopArticlesSchema(BaseModel):
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
'extractorOptions': {
'extractionSchema': TopArticlesSchema.model_json_schema(),
'mode': 'llm-extraction'
},
'pageOptions':{
'onlyMainContent': True
}
})
print(llm_extraction_result['llm_extraction'])
# Define schema to extract contents into using json schema
json_schema = {
"type": "object",
"properties": {
"top": {
"type": "array",
"items": {
"type": "object",
"properties": {
"title": {"type": "string"},
"points": {"type": "number"},
"by": {"type": "string"},
"commentsURL": {"type": "string"}
},
"required": ["title", "points", "by", "commentsURL"]
},
"minItems": 5,
"maxItems": 5,
"description": "Top 5 stories on Hacker News"
}
},
"required": ["top"]
}
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
'extractorOptions': {
'extractionSchema': json_schema,
'mode': 'llm-extraction'
},
'pageOptions':{
'onlyMainContent': True
}
})
print(llm_extraction_result['llm_extraction'])

View File

@ -7,7 +7,7 @@ from dotenv import load_dotenv
load_dotenv()
API_URL = "http://127.0.0.1:3002"
ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
TEST_API_KEY = os.getenv('TEST_API_KEY')
@ -20,32 +20,34 @@ FirecrawlApp = firecrawl.FirecrawlApp
def test_no_api_key():
with pytest.raises(Exception) as excinfo:
invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
assert "No API key provided" in str(excinfo.value)
def test_scrape_url_invalid_api_key():
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
with pytest.raises(Exception) as excinfo:
invalid_app.scrape_url('https://firecrawl.dev')
assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
def test_blocklisted_url():
blocklisted_url = "https://facebook.com/fake-test"
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
with pytest.raises(Exception) as excinfo:
app.scrape_url(blocklisted_url)
assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
def test_successful_response_with_valid_preview_token():
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0')
response = app.scrape_url('https://roastmywebsite.ai')
assert response is not None
assert 'content' in response
assert "_Roast_" in response['content']
def test_scrape_url_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
response = app.scrape_url('https://roastmywebsite.ai')
print(response)
assert response is not None
assert 'content' in response
assert 'markdown' in response
@ -54,7 +56,7 @@ def test_scrape_url_e2e():
assert "_Roast_" in response['content'] assert "_Roast_" in response['content']
def test_successful_response_with_valid_api_key_and_include_html(): def test_successful_response_with_valid_api_key_and_include_html():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}}) response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
assert response is not None assert response is not None
assert 'content' in response assert 'content' in response
@ -66,7 +68,7 @@ def test_successful_response_with_valid_api_key_and_include_html():
assert "<h1" in response['html'] assert "<h1" in response['html']
def test_successful_response_for_valid_scrape_with_pdf_file(): def test_successful_response_for_valid_scrape_with_pdf_file():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf') response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
assert response is not None assert response is not None
assert 'content' in response assert 'content' in response
@ -74,7 +76,7 @@ def test_successful_response_for_valid_scrape_with_pdf_file():
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
time.sleep(6) # wait for 6 seconds
assert response is not None
@ -83,20 +85,20 @@ def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_ext
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
def test_crawl_url_invalid_api_key():
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
with pytest.raises(Exception) as excinfo:
invalid_app.crawl_url('https://firecrawl.dev')
assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
def test_should_return_error_for_blocklisted_url():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
blocklisted_url = "https://twitter.com/fake-test"
with pytest.raises(Exception) as excinfo:
app.crawl_url(blocklisted_url)
assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
def test_crawl_url_wait_for_completion_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
assert response is not None
assert len(response) > 0
@ -104,7 +106,7 @@ def test_crawl_url_wait_for_completion_e2e():
assert "_Roast_" in response[0]['content'] assert "_Roast_" in response[0]['content']
def test_crawl_url_with_idempotency_key_e2e(): def test_crawl_url_with_idempotency_key_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
uniqueIdempotencyKey = str(uuid4()) uniqueIdempotencyKey = str(uuid4())
response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey) response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
assert response is not None assert response is not None
@ -117,7 +119,7 @@ def test_crawl_url_with_idempotency_key_e2e():
assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value) assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
def test_check_crawl_status_e2e(): def test_check_crawl_status_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False) response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
assert response is not None assert response is not None
assert 'jobId' in response assert 'jobId' in response
@ -131,21 +133,21 @@ def test_check_crawl_status_e2e():
assert len(status_response['data']) > 0
def test_search_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
response = app.search("test query")
assert response is not None
assert 'content' in response[0]
assert len(response) > 2
def test_search_invalid_api_key():
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
with pytest.raises(Exception) as excinfo:
invalid_app.search("test query")
assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
def test_llm_extraction():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
response = app.scrape_url("https://firecrawl.dev", {
'extractorOptions': {
'mode': 'llm-extraction',
'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",

View File

@ -0,0 +1,3 @@
API_URL=http://localhost:3002
ABSOLUTE_FIRECRAWL_PATH=/Users/user/firecrawl/apps/python-sdk/firecrawl/firecrawl.py
TEST_API_KEY=fc-YOUR_API_KEY

View File

@ -0,0 +1,352 @@
import importlib.util
import pytest
import time
import os
from uuid import uuid4
from dotenv import load_dotenv
from datetime import datetime
load_dotenv()
API_URL = "http://127.0.0.1:3002"
ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
TEST_API_KEY = os.getenv('TEST_API_KEY')
print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")
spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
firecrawl = importlib.util.module_from_spec(spec)
spec.loader.exec_module(firecrawl)
FirecrawlApp = firecrawl.FirecrawlApp
def test_no_api_key():
with pytest.raises(Exception) as excinfo:
invalid_app = FirecrawlApp(api_url=API_URL)
assert "No API key provided" in str(excinfo.value)
def test_scrape_url_invalid_api_key():
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
with pytest.raises(Exception) as excinfo:
invalid_app.scrape_url('https://firecrawl.dev')
assert "Unauthorized: Invalid token" in str(excinfo.value)
def test_blocklisted_url():
blocklisted_url = "https://facebook.com/fake-test"
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
with pytest.raises(Exception) as excinfo:
app.scrape_url(blocklisted_url)
assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
def test_successful_response_with_valid_preview_token():
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
response = app.scrape_url('https://roastmywebsite.ai')
assert response is not None
assert "_Roast_" in response['markdown']
assert "content" not in response
assert "html" not in response
assert "metadata" in response
assert "links" not in response
assert "rawHtml" not in response
def test_successful_response_for_valid_scrape():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
response = app.scrape_url('https://roastmywebsite.ai')
assert response is not None
assert 'markdown' in response
assert "_Roast_" in response['markdown']
assert 'metadata' in response
assert 'content' not in response
assert 'html' not in response
assert 'rawHtml' not in response
assert 'screenshot' not in response
assert 'links' not in response
def test_successful_response_with_valid_api_key_and_options():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
params = {
'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
'headers': {'x-key': 'test'},
'includeTags': ['h1'],
'excludeTags': ['h2'],
'onlyMainContent': True,
'timeout': 30000,
'waitFor': 1000
}
response = app.scrape_url('https://roastmywebsite.ai', params)
assert response is not None
assert 'content' not in response
assert 'markdown' in response
assert 'html' in response
assert 'rawHtml' in response
assert 'screenshot' in response
assert 'links' in response
assert "_Roast_" in response['markdown']
assert "<h1" in response['html']
assert "<h1" in response['rawHtml']
assert "https://" in response['screenshot']
assert len(response['links']) > 0
assert "https://" in response['links'][0]
assert 'metadata' in response
assert 'title' in response['metadata']
assert 'description' in response['metadata']
assert 'keywords' in response['metadata']
assert 'robots' in response['metadata']
assert 'ogTitle' in response['metadata']
assert 'ogDescription' in response['metadata']
assert 'ogUrl' in response['metadata']
assert 'ogImage' in response['metadata']
assert 'ogLocaleAlternate' in response['metadata']
assert 'ogSiteName' in response['metadata']
assert 'sourceURL' in response['metadata']
assert 'statusCode' in response['metadata']
assert 'pageStatusCode' not in response['metadata']
assert 'pageError' not in response['metadata']
assert 'error' not in response['metadata']
assert response['metadata']['title'] == "Roast My Website"
assert response['metadata']['description'] == "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
assert response['metadata']['keywords'] == "Roast My Website,Roast,Website,GitHub,Firecrawl"
assert response['metadata']['robots'] == "follow, index"
assert response['metadata']['ogTitle'] == "Roast My Website"
assert response['metadata']['ogDescription'] == "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
assert response['metadata']['ogUrl'] == "https://www.roastmywebsite.ai"
assert response['metadata']['ogImage'] == "https://www.roastmywebsite.ai/og.png"
assert response['metadata']['ogLocaleAlternate'] == []
assert response['metadata']['ogSiteName'] == "Roast My Website"
assert response['metadata']['sourceURL'] == "https://roastmywebsite.ai"
assert response['metadata']['statusCode'] == 200
def test_successful_response_for_valid_scrape_with_pdf_file():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
assert response is not None
assert 'content' not in response
assert 'metadata' in response
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
time.sleep(1) # wait for 1 second
assert response is not None
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
def test_crawl_url_invalid_api_key():
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
with pytest.raises(Exception) as excinfo:
invalid_app.crawl_url('https://firecrawl.dev')
assert "Unauthorized: Invalid token" in str(excinfo.value)
def test_should_return_error_for_blocklisted_url():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
blocklisted_url = "https://twitter.com/fake-test"
with pytest.raises(Exception) as excinfo:
app.crawl_url(blocklisted_url)
assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
def test_crawl_url_wait_for_completion_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, True, 30)
assert response is not None
assert 'totalCount' in response
assert response['totalCount'] > 0
assert 'creditsUsed' in response
assert response['creditsUsed'] > 0
assert 'expiresAt' in response
assert datetime.strptime(response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
assert 'status' in response
assert response['status'] == 'completed'
assert 'next' not in response
assert len(response['data']) > 0
assert 'markdown' in response['data'][0]
assert "_Roast_" in response['data'][0]['markdown']
assert 'content' not in response['data'][0]
assert 'html' not in response['data'][0]
assert 'rawHtml' not in response['data'][0]
assert 'screenshot' not in response['data'][0]
assert 'links' not in response['data'][0]
assert 'metadata' in response['data'][0]
assert 'title' in response['data'][0]['metadata']
assert 'description' in response['data'][0]['metadata']
assert 'language' in response['data'][0]['metadata']
assert 'sourceURL' in response['data'][0]['metadata']
assert 'statusCode' in response['data'][0]['metadata']
assert 'error' not in response['data'][0]['metadata']
def test_crawl_url_with_options_and_wait_for_completion():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
response = app.crawl_url('https://roastmywebsite.ai', {
'excludePaths': ['blog/*'],
'includePaths': ['/'],
'maxDepth': 2,
'ignoreSitemap': True,
'limit': 10,
'allowBackwardLinks': True,
'allowExternalLinks': True,
'scrapeOptions': {
'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
'headers': {"x-key": "test"},
'includeTags': ['h1'],
'excludeTags': ['h2'],
'onlyMainContent': True,
'waitFor': 1000
}
}, True, 30)
assert response is not None
assert 'totalCount' in response
assert response['totalCount'] > 0
assert 'creditsUsed' in response
assert response['creditsUsed'] > 0
assert 'expiresAt' in response
assert datetime.strptime(response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
assert 'status' in response
assert response['status'] == 'completed'
assert 'next' not in response
assert len(response['data']) > 0
assert 'markdown' in response['data'][0]
assert "_Roast_" in response['data'][0]['markdown']
assert 'content' not in response['data'][0]
assert 'html' in response['data'][0]
assert "<h1" in response['data'][0]['html']
assert 'rawHtml' in response['data'][0]
assert "<h1" in response['data'][0]['rawHtml']
assert 'screenshot' in response['data'][0]
assert "https://" in response['data'][0]['screenshot']
assert 'links' in response['data'][0]
assert len(response['data'][0]['links']) > 0
assert 'metadata' in response['data'][0]
assert 'title' in response['data'][0]['metadata']
assert 'description' in response['data'][0]['metadata']
assert 'language' in response['data'][0]['metadata']
assert 'sourceURL' in response['data'][0]['metadata']
assert 'statusCode' in response['data'][0]['metadata']
assert 'error' not in response['data'][0]['metadata']
def test_crawl_url_with_idempotency_key_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
uniqueIdempotencyKey = str(uuid4())
response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, False, 2, uniqueIdempotencyKey)
assert response is not None
assert 'id' in response
with pytest.raises(Exception) as excinfo:
app.crawl_url('https://firecrawl.dev', {'excludePaths': ['blog/*']}, True, 2, uniqueIdempotencyKey)
assert "Idempotency key already used" in str(excinfo.value)
def test_check_crawl_status_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
response = app.crawl_url('https://firecrawl.dev', {'scrapeOptions': {'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}}, False)
assert response is not None
assert 'id' in response
max_checks = 15
checks = 0
status_response = app.check_crawl_status(response['id'])
while status_response['status'] == 'scraping' and checks < max_checks:
time.sleep(1) # wait for 1 second
assert 'partial_data' not in status_response
assert 'current' not in status_response
assert 'data' in status_response
assert 'totalCount' in status_response
assert 'creditsUsed' in status_response
assert 'expiresAt' in status_response
assert 'status' in status_response
assert 'next' in status_response
assert status_response['totalCount'] > 0
assert status_response['creditsUsed'] > 0
assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
assert status_response['status'] == 'scraping'
assert '/v1/crawl/' in status_response['next']
status_response = app.check_crawl_status(response['id'])
checks += 1
assert status_response is not None
assert 'totalCount' in status_response
assert status_response['totalCount'] > 0
assert 'creditsUsed' in status_response
assert status_response['creditsUsed'] > 0
assert 'expiresAt' in status_response
assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
assert 'status' in status_response
assert status_response['status'] == 'completed'
assert len(status_response['data']) > 0
assert 'markdown' in status_response['data'][0]
assert len(status_response['data'][0]['markdown']) > 10
assert 'content' not in status_response['data'][0]
assert 'html' in status_response['data'][0]
assert "<div" in status_response['data'][0]['html']
assert 'rawHtml' in status_response['data'][0]
assert "<div" in status_response['data'][0]['rawHtml']
assert 'screenshot' in status_response['data'][0]
assert "https://" in status_response['data'][0]['screenshot']
assert 'links' in status_response['data'][0]
assert status_response['data'][0]['links'] is not None
assert len(status_response['data'][0]['links']) > 0
assert 'metadata' in status_response['data'][0]
assert 'title' in status_response['data'][0]['metadata']
assert 'description' in status_response['data'][0]['metadata']
assert 'language' in status_response['data'][0]['metadata']
assert 'sourceURL' in status_response['data'][0]['metadata']
assert 'statusCode' in status_response['data'][0]['metadata']
assert 'error' not in status_response['data'][0]['metadata']
def test_invalid_api_key_on_map():
invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
with pytest.raises(Exception) as excinfo:
invalid_app.map_url('https://roastmywebsite.ai')
assert "Unauthorized: Invalid token" in str(excinfo.value)
def test_blocklisted_url_on_map():
app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
blocklisted_url = "https://facebook.com/fake-test"
with pytest.raises(Exception) as excinfo:
app.map_url(blocklisted_url)
assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
def test_successful_response_with_valid_preview_token_on_map():
app = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL)
response = app.map_url('https://roastmywebsite.ai')
assert response is not None
assert len(response) > 0
def test_successful_response_for_valid_map():
app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
response = app.map_url('https://roastmywebsite.ai')
assert response is not None
assert len(response) > 0
assert any("https://" in link for link in response)
filtered_links = [link for link in response if "roastmywebsite.ai" in link]
assert len(filtered_links) > 0
def test_search_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
with pytest.raises(NotImplementedError) as excinfo:
app.search("test query")
assert "Search is not supported in v1" in str(excinfo.value)
# def test_llm_extraction():
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
# response = app.scrape_url("https://mendable.ai", {
# 'extractorOptions': {
# 'mode': 'llm-extraction',
# 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
# 'extractionSchema': {
# 'type': 'object',
# 'properties': {
# 'company_mission': {'type': 'string'},
# 'supports_sso': {'type': 'boolean'},
# 'is_open_source': {'type': 'boolean'}
# },
# 'required': ['company_mission', 'supports_sso', 'is_open_source']
# }
# }
# })
# assert response is not None
# assert 'llm_extraction' in response
# llm_extraction = response['llm_extraction']
# assert 'company_mission' in llm_extraction
# assert isinstance(llm_extraction['supports_sso'], bool)
# assert isinstance(llm_extraction['is_open_source'], bool)

View File

@ -19,24 +19,22 @@ import requests
logger : logging.Logger = logging.getLogger("firecrawl")
class FirecrawlApp:
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None, version: str = 'v1') -> None:
"""
Initialize the FirecrawlApp instance with API key, API URL, and version.
Args:
api_key (Optional[str]): API key for authenticating with the Firecrawl API.
api_url (Optional[str]): Base URL for the Firecrawl API.
version (str): API version, either 'v0' or 'v1'.
"""
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
self.version = version
if self.api_key is None:
logger.warning("No API key provided")
raise ValueError('No API key provided')
logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key} and version: {self.version}")
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
"""
@ -75,9 +73,11 @@ class FirecrawlApp:
for key, value in params.items():
if key != 'extractorOptions':
scrape_params[key] = value
endpoint = f'/{self.version}/scrape'
# Make the POST request with the prepared headers and JSON data
response = requests.post(
f'{self.api_url}{endpoint}',
headers=headers,
json=scrape_params,
)
@ -104,6 +104,9 @@ class FirecrawlApp:
Raises:
Exception: If the search request fails.
"""
if self.version == 'v1':
raise NotImplementedError("Search is not supported in v1")
headers = self._prepare_headers()
json_data = {'query': query}
if params:
@ -145,26 +148,37 @@ class FirecrawlApp:
Raises:
Exception: If the crawl job initiation or monitoring fails.
"""
endpoint = f'/{self.version}/crawl'
headers = self._prepare_headers(idempotency_key)
json_data = {'url': url}
if params:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
if response.status_code == 200:
if self.version == 'v0':
id = response.json().get('jobId')
else:
id = response.json().get('id')
if wait_until_done:
check_url = None
if self.version == 'v1':
check_url = response.json().get('url')
return self._monitor_job_status(id, headers, poll_interval, check_url)
else:
if self.version == 'v0':
return {'jobId': id}
else:
return {'id': id}
else:
self._handle_error(response, 'start crawl job')
def check_crawl_status(self, id: str) -> Any:
"""
Check the status of a crawl job using the Firecrawl API.
Args:
id (str): The ID of the crawl job.
Returns:
Any: The status of the crawl job.
@ -172,13 +186,72 @@ class FirecrawlApp:
Raises:
Exception: If the status check request fails.
"""
if self.version == 'v0':
endpoint = f'/{self.version}/crawl/status/{id}'
else:
endpoint = f'/{self.version}/crawl/{id}'
headers = self._prepare_headers()
response = self._get_request(f'{self.api_url}{endpoint}', headers)
if response.status_code == 200:
data = response.json()
if self.version == 'v0':
return {
'success': True,
'status': data.get('status'),
'current': data.get('current'),
'current_url': data.get('current_url'),
'current_step': data.get('current_step'),
'total': data.get('total'),
'data': data.get('data'),
'partial_data': data.get('partial_data') if not data.get('data') else None,
}
elif self.version == 'v1':
return {
'success': True,
'status': data.get('status'),
'totalCount': data.get('totalCount'),
'creditsUsed': data.get('creditsUsed'),
'expiresAt': data.get('expiresAt'),
'next': data.get('next'),
'data': data.get('data'),
'error': data.get('error')
}
else:
self._handle_error(response, 'check crawl status')
+    def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
+        """
+        Perform a map search using the Firecrawl API and return the discovered links.
+        """
+        if self.version == 'v0':
+            raise NotImplementedError("Map is not supported in v0")
+        endpoint = f'/{self.version}/map'
+        headers = self._prepare_headers()
+        # Prepare the base scrape parameters with the URL
+        json_data = {'url': url}
+        if params:
+            json_data.update(params)
+        # Make the POST request with the prepared headers and JSON data
+        response = requests.post(
+            f'{self.api_url}{endpoint}',
+            headers=headers,
+            json=json_data,
+        )
+        if response.status_code == 200:
+            body = response.json()
+            if body.get('success') and 'links' in body:
+                return body['links']
+            else:
+                raise Exception(f'Failed to map URL. Error: {body.get("error")}')
+        else:
+            self._handle_error(response, 'map')
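On success, map_url returns the bare list of links rather than the full response envelope. A short sketch (the 'search' key inside params is an assumption, not something this diff defines):

    links = app.map_url("https://firecrawl.dev", params={"search": "docs"})
    for link in links:
        print(link)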
     def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
         """
         Prepare the headers for API requests.
@@ -257,15 +330,15 @@ class FirecrawlApp:
                 return response
         return response

-    def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any:
+    def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int, check_url: Optional[str] = None) -> Any:
         """
         Monitor the status of a crawl job until completion.

         Args:
-            job_id (str): The ID of the crawl job.
+            id (str): The ID of the crawl job.
             headers (Dict[str, str]): The headers to include in the status check requests.
             poll_interval (int): Seconds between status checks.
+            check_url (Optional[str]): The URL to check for the crawl job.

         Returns:
             Any: The crawl results if the job is completed successfully.
@@ -273,15 +346,30 @@ class FirecrawlApp:
             Exception: If the job fails or an error occurs during status checks.
         """
         while True:
-            status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
+            api_url = ''
+            if self.version == 'v0':
+                if check_url:
+                    api_url = check_url
+                else:
+                    api_url = f'{self.api_url}/v0/crawl/status/{id}'
+            else:
+                if check_url:
+                    api_url = check_url
+                else:
+                    api_url = f'{self.api_url}/v1/crawl/{id}'
+            status_response = self._get_request(api_url, headers)
             if status_response.status_code == 200:
                 status_data = status_response.json()
                 if status_data['status'] == 'completed':
                     if 'data' in status_data:
-                        return status_data['data']
+                        if self.version == 'v0':
+                            return status_data['data']
+                        else:
+                            return status_data
                     else:
                         raise Exception('Crawl job completed but no data was returned')
-                elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']:
+                elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
                     poll_interval = max(poll_interval, 2)
                     time.sleep(poll_interval)  # Wait for the specified interval before checking again
                 else:
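The monitor now prefers the check_url handed back by a v1 crawl kickoff, clamps poll_interval to at least 2 seconds while the job is in flight, and treats the new 'scraping' state as in-progress. In practice this loop is driven through the blocking form of crawl_url; a sketch, with the crawler option key assumed:

    result = app.crawl_url(
        "https://example.com",
        params={"limit": 10},  # crawler option key assumed
        wait_until_done=True,
        poll_interval=5,
    )
    # v0 resolves to just the documents; v1 resolves to the full status object
    docs = result if isinstance(result, list) else result.get("data")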
@@ -300,18 +388,19 @@ class FirecrawlApp:
         Raises:
             Exception: An exception with a message containing the status code and error details from the response.
         """
-        error_message = response.json().get('error', 'No additional error details provided.')
+        error_message = response.json().get('error', 'No error message provided.')
+        error_details = response.json().get('details', 'No additional error details provided.')

         if response.status_code == 402:
-            message = f"Payment Required: Failed to {action}. {error_message}"
+            message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"
         elif response.status_code == 408:
-            message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}"
+            message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
         elif response.status_code == 409:
-            message = f"Conflict: Failed to {action} due to a conflict. {error_message}"
+            message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
         elif response.status_code == 500:
-            message = f"Internal Server Error: Failed to {action}. {error_message}"
+            message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
         else:
-            message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}"
+            message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message} - {error_details}"

         # Raise an HTTPError with the custom message and attach the response
         raise requests.exceptions.HTTPError(message, response=response)
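Since _handle_error raises requests.exceptions.HTTPError with the response object attached, callers can branch on the status code without parsing the message string. A minimal sketch:

    import requests

    try:
        app.crawl_url("https://example.com")
    except requests.exceptions.HTTPError as err:
        print(err)  # e.g. "Payment Required: Failed to start crawl job. ..."
        if err.response is not None and err.response.status_code == 402:
            pass  # top up credits and retry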