Mirror of https://github.com/mendableai/firecrawl.git, synced 2024-11-16 11:42:24 +08:00

Merge remote-tracking branch 'origin/v1/python-sdk' into v1-webscraper

Commit 30e809966f
.gitignore (vendored)
@@ -19,3 +19,4 @@ apps/test-suite/load-test-results/test-run-report.json
apps/playwright-service-ts/node_modules/
apps/playwright-service-ts/package-lock.json
*.pyc
@@ -538,7 +538,7 @@ describe("E2E Tests for v0 API Routes", () => {
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
);
expect(urls.length).toBeGreaterThan(1);
expect(urls.length).toBeGreaterThanOrEqual(1);

// Check if all URLs have a maximum depth of 1
urls.forEach((url: string) => {
@@ -762,11 +762,11 @@ describe("E2E Tests for v0 API Routes", () => {
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://jestjs.io" });
.send({ url: "https://docs.tatum.io", crawlerOptions: { limit: 200 } });

expect(crawlResponse.statusCode).toBe(200);

await new Promise((r) => setTimeout(r, 20000));
await new Promise((r) => setTimeout(r, 10000));

const responseCancel = await request(TEST_URL)
.delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
@@ -869,96 +869,4 @@ describe("E2E Tests for v0 API Routes", () => {
60000
); // 60 secs
});

describe("POST /v0/map", () => {
it.concurrent(
"should return a list of links for mendable.ai without subdomains included",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://mendable.ai",
});

expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("success", true);
expect(response.body).toHaveProperty("links");
expect(response.body.links).not.toContain("https://docs.mendable.ai");
expect(Array.isArray(response.body.links)).toBe(true);
expect(response.body.links.length).toBeGreaterThan(0);
},
60000
); // 60 secs

it.concurrent(
"should return a list of links for a given URL with subdomains included",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://python.langchain.com",
includeSubdomains: true,
});

expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("success", true);
expect(response.body).toHaveProperty("links");
expect(Array.isArray(response.body.links)).toBe(true);
expect(response.body.links.length).toBeGreaterThan(0);
},
60000
); // 60 secs

it.concurrent(
"should return a list of links for a given URL with subdomains and search",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://python.langchain.com",
includeSubdomains: true,
search: "agents",
});

expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("success", true);
expect(response.body).toHaveProperty("links");
expect(response.body.links).toContain(
"https://api.python.langchain.com/en/latest/_modules/langchain/agents/openai_functions_agent/base.html"
);
expect(Array.isArray(response.body.links)).toBe(true);
expect(response.body.links.length).toBeGreaterThan(0);
response.body.links.forEach((link) => {
expect(link).toContain("python.langchain.com");
});
},
60000
); // 60 secs

it.concurrent(
"should handle invalid URL input gracefully",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "invalid-url",
includeSubdomains: true,
search: "agents",
});

expect(response.statusCode).toBe(400);
expect(response.body).toHaveProperty("success", false);
expect(response.body).toHaveProperty("details");
},
60000
); // 60 secs
});
});
@@ -44,7 +44,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
}

const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: true, removeTags: [] };
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };

// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
// try {
@@ -74,7 +74,15 @@ export async function scrapeHelper(

// Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
delete doc.rawHtml;
if (doc.rawHtml) {
delete doc.rawHtml;
}
}

if (!pageOptions.includeHtml) {
if (doc.html) {
delete doc.html;
}
}

return {
@@ -132,11 +132,11 @@ export async function searchController(req: Request, res: Response) {
}
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? {
includeHtml: true,
onlyMainContent: true,
fetchPageContent: true,
removeTags: [],
fallback: false,
includeHtml: req.body.pageOptions?.includeHtml ?? false,
onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
removeTags: req.body.pageOptions?.removeTags ?? [],
fallback: req.body.pageOptions?.fallback ?? false,
};
const origin = req.body.origin ?? "api";
@@ -4,7 +4,7 @@ export const defaultTimeout = 45000; // 45 seconds

export const defaultPageOptions = {
onlyMainContent: false,
includeHtml: true,
includeHtml: false,
waitFor: 0,
screenshot: false,
fullPageScreenshot: false,
@@ -17,7 +17,7 @@ export const defaultCrawlerOptions = {

export const defaultCrawlPageOptions = {
onlyMainContent: false,
includeHtml: true,
includeHtml: false,
removeTags: [],
parsePDF: true
}
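Both default option objects now ship with includeHtml: false, so HTML is no longer returned unless a caller asks for it. A minimal sketch of opting back in through pageOptions, assuming a locally running API and the v0 scrape endpoint exercised by the tests above (URL and key values are illustrative):

const res = await fetch("http://127.0.0.1:3002/v0/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.TEST_API_KEY}`,
  },
  // includeHtml must now be requested explicitly; it is no longer part of the defaults.
  body: JSON.stringify({
    url: "https://firecrawl.dev",
    pageOptions: { includeHtml: true },
  }),
});
const { data } = await res.json();
console.log(data?.html);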
@@ -296,6 +296,12 @@ export class WebScraperDataProvider {
if (this.pageOptions.includeMarkdown) {
documents = this.applyPathReplacements(documents);
}

if (!this.pageOptions.includeHtml) {
for (let document of documents) {
delete document.html;
}
}

// documents = await this.applyImgAltText(documents);
if (
@@ -572,12 +578,19 @@ export class WebScraperDataProvider {
this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? {
onlyMainContent: false,
includeHtml: true,
replaceAllPathsWithAbsolutePaths: false,
parsePDF: true,
removeTags: [],
this.pageOptions = {
onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
includeHtml: options.pageOptions?.includeHtml ?? false,
replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false,
parsePDF: options.pageOptions?.parsePDF ?? true,
removeTags: options.pageOptions?.removeTags ?? [],
includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
waitFor: options.pageOptions?.waitFor ?? undefined,
headers: options.pageOptions?.headers ?? undefined,
includeLinks: options.pageOptions?.includeLinks ?? true,
fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
screenshot: options.pageOptions?.screenshot ?? false,
};
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
this.replaceAllPathsWithAbsolutePaths =
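The constructor now merges options field by field with ?? instead of replacing the whole pageOptions object, so a partial object keeps the remaining defaults. A standalone sketch of the difference between the two styles (generic types, not the actual class):

type PageOptions = { onlyMainContent?: boolean; includeHtml?: boolean };

const input: PageOptions = { onlyMainContent: true };

// Old style: `??` on the whole object, so any field the caller omits is simply undefined.
const wholeObject = input ?? { onlyMainContent: false, includeHtml: false };
// -> { onlyMainContent: true }  (includeHtml is now undefined)

// New style: per-field `??`, so omitted fields fall back to their defaults.
const perField = {
  onlyMainContent: input?.onlyMainContent ?? false,
  includeHtml: input?.includeHtml ?? false,
};
// -> { onlyMainContent: true, includeHtml: false }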
@@ -122,23 +122,36 @@ function getScrapingFallbackOrder(
export async function scrapSingleUrl(
jobId: string,
urlToScrap: string,
pageOptions: PageOptions = {
includeMarkdown: true,
onlyMainContent: true,
includeHtml: true,
includeRawHtml: false,
waitFor: 0,
screenshot: false,
fullPageScreenshot: false,
headers: undefined,
includeLinks: true
},
extractorOptions: ExtractorOptions = {
mode: "llm-extraction-from-markdown",
},
existingHtml: string = "",
pageOptions: PageOptions,
extractorOptions?: ExtractorOptions,
existingHtml?: string,
priority?: number,
): Promise<Document> {
pageOptions = {
includeMarkdown: pageOptions.includeMarkdown ?? true,
onlyMainContent: pageOptions.onlyMainContent ?? false,
includeHtml: pageOptions.includeHtml ?? false,
includeRawHtml: pageOptions.includeRawHtml ?? false,
waitFor: pageOptions.waitFor ?? undefined,
screenshot: pageOptions.screenshot ?? false,
fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
headers: pageOptions.headers ?? undefined,
includeLinks: pageOptions.includeLinks ?? true,
replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? false,
parsePDF: pageOptions.parsePDF ?? true,
removeTags: pageOptions.removeTags ?? [],
}

if (extractorOptions) {
extractorOptions = {
mode: extractorOptions.mode ?? "llm-extraction-from-markdown",
}
}

if (!existingHtml) {
existingHtml = "";
}

urlToScrap = urlToScrap.trim();

const attemptScraping = async (
@@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
description = soup('meta[name="description"]').attr("content") || null;

// Assuming the language is part of the URL as per the regex pattern
const pattern = /([a-zA-Z]+-[A-Z]{2})/;
const match = pattern.exec(url);
language = match ? match[1] : null;
language = soup('html').attr('lang') || null;

keywords = soup('meta[name="keywords"]').attr("content") || null;
robots = soup('meta[name="robots"]').attr("content") || null;
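Language detection switches from a locale-looking pattern in the URL to the lang attribute on the html element. A small standalone cheerio sketch of the new lookup (cheerio.load here stands in for the soup instance the controller already has):

import * as cheerio from "cheerio";

const $ = cheerio.load('<html lang="en-US"><head></head><body></body></html>');
// New behaviour: read the declared document language, fall back to null.
const language = $("html").attr("lang") || null; // "en-US"
console.log(language);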
@@ -130,10 +130,12 @@ async function processJob(job: Job, token: string) {
const end = Date.now();
const timeTakenInSeconds = (end - start) / 1000;

const rawHtml = docs[0].rawHtml;
const rawHtml = docs[0] ? docs[0].rawHtml : "";

if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
delete docs[0].rawHtml;
if (docs[0] && docs[0].rawHtml) {
delete docs[0].rawHtml;
}
}

const data = {
@@ -1,16 +1,16 @@
import { v4 as uuidv4 } from 'uuid';
import FirecrawlApp from '@mendable/firecrawl-js';
import { z } from "zod";
import FirecrawlApp from './firecrawl/src/index'; //'@mendable/firecrawl-js';

const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});

// Scrape a website:
const scrapeResult = await app.scrapeUrl('firecrawl.dev');
console.log(scrapeResult.data.content)

if (scrapeResult.data) {
console.log(scrapeResult.data.markdown)
}

// Crawl a website:
const idempotencyKey = uuidv4(); // optional
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false, 2, idempotencyKey);
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
console.log(crawlResult)

const jobId = await crawlResult['jobId'];
@@ -19,67 +19,15 @@ console.log(jobId);
let job;
while (true) {
job = await app.checkCrawlStatus(jobId);
if (job.status == 'completed') {
if (job.status === 'completed') {
break;
}
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
}

console.log(job.data[0].content);

// Search for a query:
const query = 'what is mendable?'
const searchResult = await app.search(query)
console.log(searchResult)

// LLM Extraction:
// Define schema to extract contents into using zod schema
const zodSchema = z.object({
top: z
.array(
z.object({
title: z.string(),
points: z.number(),
by: z.string(),
commentsURL: z.string(),
})
)
.length(5)
.describe("Top 5 stories on Hacker News"),
});

let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
extractorOptions: { extractionSchema: zodSchema },
});

console.log(llmExtractionResult.data.llm_extraction);

// Define schema to extract contents into using json schema
const jsonSchema = {
"type": "object",
"properties": {
"top": {
"type": "array",
"items": {
"type": "object",
"properties": {
"title": {"type": "string"},
"points": {"type": "number"},
"by": {"type": "string"},
"commentsURL": {"type": "string"}
},
"required": ["title", "points", "by", "commentsURL"]
},
"minItems": 5,
"maxItems": 5,
"description": "Top 5 stories on Hacker News"
}
},
"required": ["top"]
if (job.data) {
console.log(job.data[0].markdown);
}

llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
extractorOptions: { extractionSchema: jsonSchema },
});

console.log(llmExtractionResult.data.llm_extraction);
const mapResult = await app.map('https://firecrawl.dev');
console.log(mapResult)
@@ -1,5 +1,5 @@
import FirecrawlApp, { JobStatusResponse } from './firecrawl/src/index' //'@mendable/firecrawl-js';
import { z } from "zod";
import FirecrawlApp from './firecrawl/src/index' //'@mendable/firecrawl-js';
import { CrawlStatusResponse } from './firecrawl/src/index';

const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
@@ -7,7 +7,7 @@ const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
const scrapeResult = await app.scrapeUrl('firecrawl.dev');

if (scrapeResult.data) {
console.log(scrapeResult.data.content)
console.log(scrapeResult.data.markdown)
}

// Crawl a website:
@@ -17,9 +17,9 @@ console.log(crawlResult)
const jobId: string = await crawlResult['jobId'];
console.log(jobId);

let job: JobStatusResponse;
let job: CrawlStatusResponse;
while (true) {
job = await app.checkCrawlStatus(jobId);
job = await app.checkCrawlStatus(jobId) as CrawlStatusResponse;
if (job.status === 'completed') {
break;
}
@@ -27,66 +27,8 @@ while (true) {
}

if (job.data) {
console.log(job.data[0].content);
}

// Search for a query:
const query = 'what is mendable?'
const searchResult = await app.search(query)

// LLM Extraction:
// Define schema to extract contents into using zod schema
const zodSchema = z.object({
top: z
.array(
z.object({
title: z.string(),
points: z.number(),
by: z.string(),
commentsURL: z.string(),
})
)
.length(5)
.describe("Top 5 stories on Hacker News"),
});

let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
extractorOptions: { extractionSchema: zodSchema },
});

if (llmExtractionResult.data) {
console.log(llmExtractionResult.data.llm_extraction);
}

// Define schema to extract contents into using json schema
const jsonSchema = {
"type": "object",
"properties": {
"top": {
"type": "array",
"items": {
"type": "object",
"properties": {
"title": {"type": "string"},
"points": {"type": "number"},
"by": {"type": "string"},
"commentsURL": {"type": "string"}
},
"required": ["title", "points", "by", "commentsURL"]
},
"minItems": 5,
"maxItems": 5,
"description": "Top 5 stories on Hacker News"
}
},
"required": ["top"]
}

llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
extractorOptions: { extractionSchema: jsonSchema },
});

if (llmExtractionResult.data) {
console.log(llmExtractionResult.data.llm_extraction);
console.log(job.data[0].markdown);
}

const mapResult = await app.map('https://firecrawl.dev');
console.log(mapResult)
apps/js-sdk/exampleV0.js (new file, 85 lines)
@ -0,0 +1,85 @@
|
|||
import { v4 as uuidv4 } from 'uuid';
|
||||
import FirecrawlApp from '@mendable/firecrawl-js';
|
||||
import { z } from "zod";
|
||||
|
||||
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
|
||||
|
||||
// Scrape a website:
|
||||
const scrapeResult = await app.scrapeUrl('firecrawl.dev');
|
||||
console.log(scrapeResult.data.content)
|
||||
|
||||
// Crawl a website:
|
||||
const idempotencyKey = uuidv4(); // optional
|
||||
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false, 2, idempotencyKey);
|
||||
console.log(crawlResult)
|
||||
|
||||
const jobId = await crawlResult['jobId'];
|
||||
console.log(jobId);
|
||||
|
||||
let job;
|
||||
while (true) {
|
||||
job = await app.checkCrawlStatus(jobId);
|
||||
if (job.status == 'completed') {
|
||||
break;
|
||||
}
|
||||
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
|
||||
}
|
||||
|
||||
console.log(job.data[0].content);
|
||||
|
||||
// Search for a query:
|
||||
const query = 'what is mendable?'
|
||||
const searchResult = await app.search(query)
|
||||
console.log(searchResult)
|
||||
|
||||
// LLM Extraction:
|
||||
// Define schema to extract contents into using zod schema
|
||||
const zodSchema = z.object({
|
||||
top: z
|
||||
.array(
|
||||
z.object({
|
||||
title: z.string(),
|
||||
points: z.number(),
|
||||
by: z.string(),
|
||||
commentsURL: z.string(),
|
||||
})
|
||||
)
|
||||
.length(5)
|
||||
.describe("Top 5 stories on Hacker News"),
|
||||
});
|
||||
|
||||
let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: zodSchema },
|
||||
});
|
||||
|
||||
console.log(llmExtractionResult.data.llm_extraction);
|
||||
|
||||
// Define schema to extract contents into using json schema
|
||||
const jsonSchema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"top": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"points": {"type": "number"},
|
||||
"by": {"type": "string"},
|
||||
"commentsURL": {"type": "string"}
|
||||
},
|
||||
"required": ["title", "points", "by", "commentsURL"]
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News"
|
||||
}
|
||||
},
|
||||
"required": ["top"]
|
||||
}
|
||||
|
||||
llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: jsonSchema },
|
||||
});
|
||||
|
||||
console.log(llmExtractionResult.data.llm_extraction);
|
apps/js-sdk/exampleV0.ts (new file, 95 lines)
@ -0,0 +1,95 @@
|
|||
import FirecrawlApp, { ScrapeResponseV0, CrawlStatusResponseV0, SearchResponseV0 } from './firecrawl/src/index' //'@mendable/firecrawl-js';
|
||||
import { z } from "zod";
|
||||
|
||||
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY", version: "v0"});
|
||||
|
||||
// Scrape a website:
|
||||
const scrapeResult = await app.scrapeUrl('firecrawl.dev') as ScrapeResponseV0;
|
||||
|
||||
if (scrapeResult.data) {
|
||||
console.log(scrapeResult.data.content)
|
||||
}
|
||||
|
||||
// Crawl a website:
|
||||
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
|
||||
console.log(crawlResult)
|
||||
|
||||
const jobId: string = await crawlResult['jobId'];
|
||||
console.log(jobId);
|
||||
|
||||
let job: CrawlStatusResponseV0;
|
||||
while (true) {
|
||||
job = await app.checkCrawlStatus(jobId) as CrawlStatusResponseV0;
|
||||
if (job.status === 'completed') {
|
||||
break;
|
||||
}
|
||||
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
|
||||
}
|
||||
|
||||
if (job.data) {
|
||||
console.log(job.data[0].content);
|
||||
}
|
||||
|
||||
// Search for a query:
|
||||
const query = 'what is mendable?'
|
||||
const searchResult = await app.search(query) as SearchResponseV0;
|
||||
if (searchResult.data) {
|
||||
console.log(searchResult.data[0].content)
|
||||
}
|
||||
|
||||
// LLM Extraction:
|
||||
// Define schema to extract contents into using zod schema
|
||||
const zodSchema = z.object({
|
||||
top: z
|
||||
.array(
|
||||
z.object({
|
||||
title: z.string(),
|
||||
points: z.number(),
|
||||
by: z.string(),
|
||||
commentsURL: z.string(),
|
||||
})
|
||||
)
|
||||
.length(5)
|
||||
.describe("Top 5 stories on Hacker News"),
|
||||
});
|
||||
|
||||
let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: zodSchema },
|
||||
});
|
||||
|
||||
if (llmExtractionResult.data) {
|
||||
console.log(llmExtractionResult.data[0].llm_extraction);
|
||||
}
|
||||
|
||||
// Define schema to extract contents into using json schema
|
||||
const jsonSchema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"top": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"points": {"type": "number"},
|
||||
"by": {"type": "string"},
|
||||
"commentsURL": {"type": "string"}
|
||||
},
|
||||
"required": ["title", "points", "by", "commentsURL"]
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News"
|
||||
}
|
||||
},
|
||||
"required": ["top"]
|
||||
}
|
||||
|
||||
llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: jsonSchema },
|
||||
});
|
||||
|
||||
if (llmExtractionResult.data) {
|
||||
console.log(llmExtractionResult.data[0].llm_extraction);
|
||||
}
|
||||
|
|
@@ -1,4 +1,4 @@
import FirecrawlApp from '../../index';
import FirecrawlApp, { CrawlResponseV0, FirecrawlDocumentV0, JobStatusResponseV0, ScrapeResponseV0, SearchResponseV0 } from '../../index';
import { v4 as uuidv4 } from 'uuid';
import dotenv from 'dotenv';
import { describe, test, expect } from '@jest/globals';
@@ -11,31 +11,31 @@ const API_URL = "http://127.0.0.1:3002";
describe('FirecrawlApp E2E Tests', () => {
test.concurrent('should throw error for no API key', async () => {
expect(() => {
new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
new FirecrawlApp({ apiKey: null, apiUrl: API_URL, version: "v0" });
}).toThrow("No API key provided");
});

test.concurrent('should throw error for invalid API key on scrape', async () => {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL, version: "v0" });
await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
});

test.concurrent('should throw error for blocklisted URL on scrape', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
const blocklistedUrl = "https://facebook.com/fake-test";
await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
});

test.concurrent('should return successful response with valid preview token', async () => {
const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
const response = await app.scrapeUrl('https://roastmywebsite.ai');
const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL, version: "v0" });
const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponseV0;
expect(response).not.toBeNull();
expect(response.data?.content).toContain("_Roast_");
}, 30000); // 30 seconds timeout

test.concurrent('should return successful response for valid scrape', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const response = await app.scrapeUrl('https://roastmywebsite.ai');
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponseV0;
expect(response).not.toBeNull();
expect(response.data?.content).toContain("_Roast_");
expect(response.data).toHaveProperty('markdown');
@@ -44,8 +44,8 @@ describe('FirecrawlApp E2E Tests', () => {
}, 30000); // 30 seconds timeout

test.concurrent('should return successful response with valid API key and include HTML', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } });
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } }) as ScrapeResponseV0;
expect(response).not.toBeNull();
expect(response.data?.content).toContain("_Roast_");
expect(response.data?.markdown).toContain("_Roast_");
@@ -53,41 +53,41 @@ describe('FirecrawlApp E2E Tests', () => {
}, 30000); // 30 seconds timeout

test.concurrent('should return successful response for valid scrape with PDF file', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf');
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf') as ScrapeResponseV0;
expect(response).not.toBeNull();
expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
}, 30000); // 30 seconds timeout

test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001');
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001') as ScrapeResponseV0;
expect(response).not.toBeNull();
expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
}, 30000); // 30 seconds timeout

test.concurrent('should throw error for invalid API key on crawl', async () => {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL, version: "v0" });
await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
});

test.concurrent('should throw error for blocklisted URL on crawl', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
const blocklistedUrl = "https://twitter.com/fake-test";
await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
});

test.concurrent('should return successful response for crawl and wait for completion', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30);
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30) as CrawlResponseV0;
expect(response).not.toBeNull();
expect(response[0].content).toContain("_Roast_");
}, 60000); // 60 seconds timeout

test.concurrent('should handle idempotency key for crawl', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
const uniqueIdempotencyKey = uuidv4();
const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey);
const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey) as CrawlResponseV0;
expect(response).not.toBeNull();
expect(response.jobId).toBeDefined();
@@ -95,12 +95,12 @@ describe('FirecrawlApp E2E Tests', () => {
});

test.concurrent('should check crawl status', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false);
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as JobStatusResponseV0;
expect(response).not.toBeNull();
expect(response.jobId).toBeDefined();

let statusResponse = await app.checkCrawlStatus(response.jobId);
let statusResponse: any = await app.checkCrawlStatus(response.jobId);
const maxChecks = 15;
let checks = 0;
@@ -108,7 +108,7 @@ describe('FirecrawlApp E2E Tests', () => {
await new Promise(resolve => setTimeout(resolve, 1000));
expect(statusResponse.partial_data).not.toBeNull();
expect(statusResponse.current).toBeGreaterThanOrEqual(1);
statusResponse = await app.checkCrawlStatus(response.jobId);
statusResponse = await app.checkCrawlStatus(response.jobId) as CrawlResponseV0;
checks++;
}
@@ -121,20 +121,20 @@ describe('FirecrawlApp E2E Tests', () => {
}, 35000); // 35 seconds timeout

test.concurrent('should return successful response for search', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const response = await app.search("test query");
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
const response = await app.search("test query") as SearchResponseV0;
expect(response).not.toBeNull();
expect(response?.data?.[0]?.content).toBeDefined();
expect(response?.data?.length).toBeGreaterThan(2);
}, 30000); // 30 seconds timeout

test.concurrent('should throw error for invalid API key on search', async () => {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL, version: "v0" });
await expect(invalidApp.search("test query")).rejects.toThrow("Request failed with status code 401");
});

test.concurrent('should perform LLM extraction', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
const response = await app.scrapeUrl("https://mendable.ai", {
extractorOptions: {
mode: 'llm-extraction',
@@ -149,7 +149,7 @@ describe('FirecrawlApp E2E Tests', () => {
required: ['company_mission', 'supports_sso', 'is_open_source']
}
}
});
}) as ScrapeResponseV0;
expect(response).not.toBeNull();
expect(response.data?.llm_extraction).toBeDefined();
const llmExtraction = response.data?.llm_extraction;
@ -0,0 +1,312 @@
|
|||
import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, MapResponse, ScrapeParams, ScrapeResponse } from '../../../index';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
import dotenv from 'dotenv';
|
||||
import { describe, test, expect } from '@jest/globals';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
const TEST_API_KEY = process.env.TEST_API_KEY;
|
||||
const API_URL = "http://127.0.0.1:3002";
|
||||
|
||||
describe('FirecrawlApp E2E Tests', () => {
|
||||
test.concurrent('should throw error for no API key', async () => {
|
||||
expect(() => {
|
||||
new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
|
||||
}).toThrow("No API key provided");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for invalid API key on scrape', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for blocklisted URL on scrape', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const blocklistedUrl = "https://facebook.com/fake-test";
|
||||
await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
|
||||
});
|
||||
|
||||
test.concurrent('should return successful response with valid preview token', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response?.markdown).toContain("_Roast_");
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid scrape', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response).not.toHaveProperty('content'); // v0
|
||||
expect(response).not.toHaveProperty('html');
|
||||
expect(response).not.toHaveProperty('rawHtml');
|
||||
expect(response).not.toHaveProperty('screenshot');
|
||||
expect(response).not.toHaveProperty('links');
|
||||
|
||||
expect(response).toHaveProperty('markdown');
|
||||
expect(response).toHaveProperty('metadata');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response with valid API key and options', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl(
|
||||
'https://roastmywebsite.ai', {
|
||||
formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
|
||||
headers: { "x-key": "test" },
|
||||
includeTags: ['h1'],
|
||||
excludeTags: ['h2'],
|
||||
onlyMainContent: true,
|
||||
timeout: 30000,
|
||||
waitFor: 1000
|
||||
}) as ScrapeResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response).not.toHaveProperty('content'); // v0
|
||||
expect(response.markdown).toContain("_Roast_");
|
||||
expect(response.html).toContain("<h1");
|
||||
expect(response.rawHtml).toContain("<h1");
|
||||
expect(response.screenshot).not.toBeUndefined();
|
||||
expect(response.screenshot).not.toBeNull();
|
||||
expect(response.screenshot).toContain("https://");
|
||||
expect(response.links).not.toBeNull();
|
||||
expect(response.links?.length).toBeGreaterThan(0);
|
||||
expect(response.links?.[0]).toContain("https://");
|
||||
expect(response.metadata).not.toBeNull();
|
||||
expect(response.metadata).toHaveProperty("title");
|
||||
expect(response.metadata).toHaveProperty("description");
|
||||
expect(response.metadata).toHaveProperty("keywords");
|
||||
expect(response.metadata).toHaveProperty("robots");
|
||||
expect(response.metadata).toHaveProperty("ogTitle");
|
||||
expect(response.metadata).toHaveProperty("ogDescription");
|
||||
expect(response.metadata).toHaveProperty("ogUrl");
|
||||
expect(response.metadata).toHaveProperty("ogImage");
|
||||
expect(response.metadata).toHaveProperty("ogLocaleAlternate");
|
||||
expect(response.metadata).toHaveProperty("ogSiteName");
|
||||
expect(response.metadata).toHaveProperty("sourceURL");
|
||||
expect(response.metadata).not.toHaveProperty("pageStatusCode");
|
||||
expect(response.metadata).toHaveProperty("statusCode");
|
||||
expect(response.metadata).not.toHaveProperty("pageError");
|
||||
expect(response.metadata.error).toBeUndefined();
|
||||
expect(response.metadata.title).toBe("Roast My Website");
|
||||
expect(response.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
|
||||
expect(response.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl");
|
||||
expect(response.metadata.robots).toBe("follow, index");
|
||||
expect(response.metadata.ogTitle).toBe("Roast My Website");
|
||||
expect(response.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
|
||||
expect(response.metadata.ogUrl).toBe("https://www.roastmywebsite.ai");
|
||||
expect(response.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png");
|
||||
expect(response.metadata.ogLocaleAlternate).toStrictEqual([]);
|
||||
expect(response.metadata.ogSiteName).toBe("Roast My Website");
|
||||
expect(response.metadata.sourceURL).toBe("https://roastmywebsite.ai");
|
||||
expect(response.metadata.statusCode).toBe(200);
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid scrape with PDF file', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf') as ScrapeResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001') as ScrapeResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should throw error for invalid API key on crawl', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for blocklisted URL on crawl', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const blocklistedUrl = "https://twitter.com/fake-test";
|
||||
await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
|
||||
});
|
||||
|
||||
test.concurrent('should return successful response for crawl and wait for completion', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.crawlUrl('https://roastmywebsite.ai', {}, true, 30) as CrawlStatusResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response).toHaveProperty("totalCount");
|
||||
expect(response.totalCount).toBeGreaterThan(0);
|
||||
expect(response).toHaveProperty("creditsUsed");
|
||||
expect(response.creditsUsed).toBeGreaterThan(0);
|
||||
expect(response).toHaveProperty("expiresAt");
|
||||
expect(new Date(response.expiresAt).getTime()).toBeGreaterThan(Date.now());
|
||||
expect(response).toHaveProperty("status");
|
||||
expect(response.status).toBe("completed");
|
||||
expect(response).not.toHaveProperty("next"); // wait until done
|
||||
expect(response.data?.length).toBeGreaterThan(0);
|
||||
expect(response.data?.[0]).toHaveProperty("markdown");
|
||||
expect(response.data?.[0].markdown).toContain("_Roast_");
|
||||
expect(response.data?.[0]).not.toHaveProperty('content'); // v0
|
||||
expect(response.data?.[0]).not.toHaveProperty("html");
|
||||
expect(response.data?.[0]).not.toHaveProperty("rawHtml");
|
||||
expect(response.data?.[0]).not.toHaveProperty("screenshot");
|
||||
expect(response.data?.[0]).not.toHaveProperty("links");
|
||||
expect(response.data?.[0]).toHaveProperty("metadata");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("title");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("description");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("language");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("statusCode");
|
||||
expect(response.data?.[0].metadata).not.toHaveProperty("error");
|
||||
}, 60000); // 60 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for crawl with options and wait for completion', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.crawlUrl('https://roastmywebsite.ai', {
|
||||
excludePaths: ['blog/*'],
|
||||
includePaths: ['/'],
|
||||
maxDepth: 2,
|
||||
ignoreSitemap: true,
|
||||
limit: 10,
|
||||
allowBackwardLinks: true,
|
||||
allowExternalLinks: true,
|
||||
scrapeOptions: {
|
||||
formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
|
||||
headers: { "x-key": "test" },
|
||||
includeTags: ['h1'],
|
||||
excludeTags: ['h2'],
|
||||
onlyMainContent: true,
|
||||
waitFor: 1000
|
||||
}
|
||||
} as CrawlParams, true, 30) as CrawlStatusResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response).toHaveProperty("totalCount");
|
||||
expect(response.totalCount).toBeGreaterThan(0);
|
||||
expect(response).toHaveProperty("creditsUsed");
|
||||
expect(response.creditsUsed).toBeGreaterThan(0);
|
||||
expect(response).toHaveProperty("expiresAt");
|
||||
expect(new Date(response.expiresAt).getTime()).toBeGreaterThan(Date.now());
|
||||
expect(response).toHaveProperty("status");
|
||||
expect(response.status).toBe("completed");
|
||||
expect(response).not.toHaveProperty("next");
|
||||
expect(response.data?.length).toBeGreaterThan(0);
|
||||
expect(response.data?.[0]).toHaveProperty("markdown");
|
||||
expect(response.data?.[0].markdown).toContain("_Roast_");
|
||||
expect(response.data?.[0]).not.toHaveProperty('content'); // v0
|
||||
expect(response.data?.[0]).toHaveProperty("html");
|
||||
expect(response.data?.[0].html).toContain("<h1");
|
||||
expect(response.data?.[0]).toHaveProperty("rawHtml");
|
||||
expect(response.data?.[0].rawHtml).toContain("<h1");
|
||||
expect(response.data?.[0]).toHaveProperty("screenshot");
|
||||
expect(response.data?.[0].screenshot).toContain("https://");
|
||||
expect(response.data?.[0]).toHaveProperty("links");
|
||||
expect(response.data?.[0].links).not.toBeNull();
|
||||
expect(response.data?.[0].links?.length).toBeGreaterThan(0);
|
||||
expect(response.data?.[0]).toHaveProperty("metadata");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("title");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("description");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("language");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("statusCode");
|
||||
expect(response.data?.[0].metadata).not.toHaveProperty("error");
|
||||
}, 60000); // 60 seconds timeout
|
||||
|
||||
test.concurrent('should handle idempotency key for crawl', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const uniqueIdempotencyKey = uuidv4();
|
||||
const response = await app.crawlUrl('https://roastmywebsite.ai', {}, false, 2, uniqueIdempotencyKey) as CrawlResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.id).toBeDefined();
|
||||
|
||||
await expect(app.crawlUrl('https://roastmywebsite.ai', {}, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
|
||||
});
|
||||
|
||||
test.concurrent('should check crawl status', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams, false) as CrawlResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.id).toBeDefined();
|
||||
|
||||
let statusResponse: any = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
|
||||
const maxChecks = 15;
|
||||
let checks = 0;
|
||||
|
||||
while (statusResponse.status === 'scraping' && checks < maxChecks) {
|
||||
await new Promise(resolve => setTimeout(resolve, 5000));
|
||||
expect(statusResponse).not.toHaveProperty("partial_data"); // v0
|
||||
expect(statusResponse).not.toHaveProperty("current"); // v0
|
||||
expect(statusResponse).toHaveProperty("data");
|
||||
expect(statusResponse).toHaveProperty("totalCount");
|
||||
expect(statusResponse).toHaveProperty("creditsUsed");
|
||||
expect(statusResponse).toHaveProperty("expiresAt");
|
||||
expect(statusResponse).toHaveProperty("status");
|
||||
expect(statusResponse).toHaveProperty("next");
|
||||
expect(statusResponse.totalCount).toBeGreaterThan(0);
|
||||
expect(statusResponse.creditsUsed).toBeGreaterThan(0);
|
||||
expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
|
||||
expect(statusResponse.status).toBe("scraping");
|
||||
expect(statusResponse.next).toContain("/v1/crawl/");
|
||||
statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
|
||||
checks++;
|
||||
}
|
||||
|
||||
expect(statusResponse).not.toBeNull();
|
||||
expect(statusResponse).toHaveProperty("totalCount");
|
||||
expect(statusResponse.totalCount).toBeGreaterThan(0);
|
||||
expect(statusResponse).toHaveProperty("creditsUsed");
|
||||
expect(statusResponse.creditsUsed).toBeGreaterThan(0);
|
||||
expect(statusResponse).toHaveProperty("expiresAt");
|
||||
expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
|
||||
expect(statusResponse).toHaveProperty("status");
|
||||
expect(statusResponse.status).toBe("completed");
|
||||
expect(statusResponse.data?.length).toBeGreaterThan(0);
|
||||
expect(statusResponse.data?.[0]).toHaveProperty("markdown");
|
||||
expect(statusResponse.data?.[0].markdown?.length).toBeGreaterThan(10);
|
||||
expect(statusResponse.data?.[0]).not.toHaveProperty('content'); // v0
|
||||
expect(statusResponse.data?.[0]).toHaveProperty("html");
|
||||
expect(statusResponse.data?.[0].html).toContain("<div");
|
||||
expect(statusResponse.data?.[0]).toHaveProperty("rawHtml");
|
||||
expect(statusResponse.data?.[0].rawHtml).toContain("<div");
|
||||
expect(statusResponse.data?.[0]).toHaveProperty("screenshot");
|
||||
expect(statusResponse.data?.[0].screenshot).toContain("https://");
|
||||
expect(statusResponse.data?.[0]).toHaveProperty("links");
|
||||
expect(statusResponse.data?.[0].links).not.toBeNull();
|
||||
expect(statusResponse.data?.[0].links?.length).toBeGreaterThan(0);
|
||||
expect(statusResponse.data?.[0]).toHaveProperty("metadata");
|
||||
expect(statusResponse.data?.[0].metadata).toHaveProperty("title");
|
||||
expect(statusResponse.data?.[0].metadata).toHaveProperty("description");
|
||||
expect(statusResponse.data?.[0].metadata).toHaveProperty("language");
|
||||
expect(statusResponse.data?.[0].metadata).toHaveProperty("sourceURL");
|
||||
expect(statusResponse.data?.[0].metadata).toHaveProperty("statusCode");
|
||||
expect(statusResponse.data?.[0].metadata).not.toHaveProperty("error");
|
||||
}, 60000); // 60 seconds timeout
|
||||
|
||||
test.concurrent('should throw error for invalid API key on map', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for blocklisted URL on map', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const blocklistedUrl = "https://facebook.com/fake-test";
|
||||
await expect(app.mapUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
|
||||
});
|
||||
|
||||
test.concurrent('should return successful response with valid preview token', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
|
||||
const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.links?.length).toBeGreaterThan(0);
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid map', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
|
||||
expect(response).not.toBeNull();
|
||||
|
||||
expect(response.links?.length).toBeGreaterThan(0);
|
||||
expect(response.links?.[0]).toContain("https://");
|
||||
const filteredLinks = response.links?.filter((link: string) => link.includes("roastmywebsite.ai"));
|
||||
expect(filteredLinks?.length).toBeGreaterThan(0);
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test('should throw NotImplementedError for search on v1', async () => {
|
||||
const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
|
||||
await expect(app.search("test query")).rejects.toThrow("Search is not supported in v1");
|
||||
});
|
||||
});
|
|
@@ -1,16 +1,22 @@
import axios, { AxiosResponse, AxiosRequestHeaders } from "axios";
import { z } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";

/**
* Configuration interface for FirecrawlApp.
* @param apiKey - Optional API key for authentication.
* @param apiUrl - Optional base URL of the API; defaults to 'https://api.firecrawl.dev'.
* @param version - API version, either 'v0' or 'v1'.
*/
export interface FirecrawlAppConfig {
apiKey?: string | null;
apiUrl?: string | null;
version?: "v0" | "v1";
}

/**
* Metadata for a Firecrawl document.
* Includes various optional properties for document metadata.
*/
export interface FirecrawlDocumentMetadata {
title?: string;
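FirecrawlAppConfig gains a version field, which is how the tests above pin the old behaviour while a plain construction targets the new API. A short usage sketch (the API key value is a placeholder):

import FirecrawlApp from "@mendable/firecrawl-js";

// Default construction targets the v1 API.
const v1App = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

// Passing version: "v0" keeps the legacy response shapes (ScrapeResponseV0, etc.).
const v0App = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY", version: "v0" });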
@@ -43,6 +49,17 @@ export interface FirecrawlDocumentMetadata {
articleTag?: string;
articleSection?: string;
sourceURL?: string;
statusCode?: number;
error?: string;
[key: string]: any; // Allows for additional metadata properties not explicitly defined.
}

/**
* Metadata for a Firecrawl document on v0.
* Similar to FirecrawlDocumentMetadata but includes properties specific to API version v0.
*/
export interface FirecrawlDocumentMetadataV0 {
// Similar properties as FirecrawlDocumentMetadata with additional v0 specific adjustments
pageStatusCode?: number;
pageError?: string;
[key: string]: any;
@@ -50,8 +67,23 @@ export interface FirecrawlDocumentMetadata {

/**
* Document interface for Firecrawl.
* Represents a document retrieved or processed by Firecrawl.
*/
export interface FirecrawlDocument {
url?: string;
markdown?: string;
html?: string;
rawHtml?: string;
links?: string[];
screenshot?: string;
metadata: FirecrawlDocumentMetadata;
}

/**
* Document interface for Firecrawl on v0.
* Represents a document specifically for API version v0 with additional properties.
*/
export interface FirecrawlDocumentV0 {
id?: string;
url?: string;
content: string;
@ -61,79 +93,240 @@ export interface FirecrawlDocument {
|
|||
createdAt?: Date;
|
||||
updatedAt?: Date;
|
||||
type?: string;
|
||||
metadata: FirecrawlDocumentMetadata;
|
||||
metadata: FirecrawlDocumentMetadataV0;
|
||||
childrenLinks?: string[];
|
||||
provider?: string;
|
||||
warning?: string;
|
||||
|
||||
index?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for scraping operations.
|
||||
* Parameters for scraping operations.
|
||||
* Defines the options and configurations available for scraping web content.
|
||||
*/
|
||||
export interface ScrapeResponse {
|
||||
success: boolean;
|
||||
data?: FirecrawlDocument;
|
||||
error?: string;
|
||||
export interface ScrapeParams {
|
||||
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot")[];
|
||||
headers?: Record<string, string>;
|
||||
includeTags?: string[];
|
||||
excludeTags?: string[];
|
||||
onlyMainContent?: boolean;
|
||||
screenshotMode?: "desktop" | "full-desktop" | "mobile" | "full-mobile";
|
||||
waitFor?: number;
|
||||
timeout?: number;
|
||||
}
|
||||
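A minimal v1 scrape sketch using only fields declared in ScrapeParams above, reusing the appV1 instance from the earlier sketch (URL and option values are illustrative):

const scraped = (await appV1.scrapeUrl("https://firecrawl.dev", {
  formats: ["markdown", "links"],
  onlyMainContent: true,
  waitFor: 1000,
})) as ScrapeResponse;
if (scraped.success) {
  console.log(scraped.markdown?.slice(0, 200));
  console.log(`found ${scraped.links?.length ?? 0} links`);
}

In v1 the scraped document fields arrive at the top level of the response, which is why ScrapeResponse extends FirecrawlDocument.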
|
||||
/**
|
||||
* Response interface for searching operations.
|
||||
* Parameters for scraping operations on v0.
|
||||
* Includes page and extractor options specific to API version v0.
|
||||
*/
|
||||
export interface SearchResponse {
|
||||
export interface ScrapeParamsV0 {
|
||||
pageOptions?: {
|
||||
headers?: Record<string, string>;
|
||||
includeHtml?: boolean;
|
||||
includeRawHtml?: boolean;
|
||||
onlyIncludeTags?: string[];
|
||||
onlyMainContent?: boolean;
|
||||
removeTags?: string[];
|
||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||
screenshot?: boolean;
|
||||
fullPageScreenshot?: boolean;
|
||||
waitFor?: number;
|
||||
};
|
||||
extractorOptions?: {
|
||||
mode?: "markdown" | "llm-extraction" | "llm-extraction-from-raw-html" | "llm-extraction-from-markdown";
|
||||
extractionPrompt?: string;
|
||||
extractionSchema?: Record<string, any> | z.ZodSchema | any;
|
||||
};
|
||||
timeout?: number;
|
||||
}
|
||||
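For comparison, a v0-style scrape sketch built from ScrapeParamsV0, reusing appV0 from the earlier sketch (values illustrative):

const scrapedV0 = (await appV0.scrapeUrl("https://firecrawl.dev", {
  pageOptions: {
    onlyMainContent: true,
    includeHtml: true,
    waitFor: 1000,
  },
  timeout: 30000,
})) as ScrapeResponseV0;
if (scrapedV0.success) {
  console.log(scrapedV0.data?.content.slice(0, 200));
}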
|
||||
/**
|
||||
* Response interface for scraping operations.
|
||||
* Defines the structure of the response received after a scraping operation.
|
||||
*/
|
||||
export interface ScrapeResponse extends FirecrawlDocument {
|
||||
success: boolean;
|
||||
data?: FirecrawlDocument[];
|
||||
warning?: string;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for scraping operations on v0.
|
||||
* Similar to ScrapeResponse but tailored for responses from API version v0.
|
||||
*/
|
||||
export interface ScrapeResponseV0 {
|
||||
success: boolean;
|
||||
data?: FirecrawlDocumentV0;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parameters for crawling operations.
|
||||
* Includes options for both scraping and mapping during a crawl.
|
||||
*/
|
||||
export interface CrawlParams {
|
||||
scrapeOptions?: ScrapeParams;
|
||||
crawlerOptions?: {
|
||||
includePaths?: string[]
|
||||
excludePaths?: string[]
|
||||
maxDepth?: number
|
||||
limit?: number
|
||||
allowBackwardLinks?: boolean
|
||||
allowExternalLinks?: boolean
|
||||
ignoreSitemap?: boolean
|
||||
};
|
||||
}
|
||||
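A sketch of a v1 crawl that waits for completion, using only options declared in CrawlParams (limits and paths are placeholders). Because waitUntilDone defaults to true, the resolved value is the final crawl status rather than the job id:

const crawl = (await appV1.crawlUrl("https://firecrawl.dev", {
  crawlerOptions: { limit: 10, maxDepth: 2, excludePaths: ["blog/*"] },
  scrapeOptions: { formats: ["markdown", "links"], onlyMainContent: true },
})) as CrawlStatusResponse;
console.log(crawl.status, crawl.totalCount, crawl.creditsUsed);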
|
||||
/**
|
||||
* Parameters for crawling operations on v0.
|
||||
* Tailored for API version v0, includes specific options for crawling.
|
||||
*/
|
||||
export interface CrawlParamsV0 {
|
||||
crawlerOptions?: {
|
||||
includes?: string[];
|
||||
excludes?: string[];
|
||||
generateImgAltText?: boolean;
|
||||
returnOnlyUrls?: boolean;
|
||||
maxDepth?: number;
|
||||
mode?: "default" | "fast";
|
||||
ignoreSitemap?: boolean;
|
||||
limit?: number;
|
||||
allowBackwardCrawling?: boolean;
|
||||
allowExternalContentLinks?: boolean;
|
||||
};
|
||||
pageOptions?: {
|
||||
headers?: Record<string, string>;
|
||||
includeHtml?: boolean;
|
||||
includeRawHtml?: boolean;
|
||||
onlyIncludeTags?: string[];
|
||||
onlyMainContent?: boolean;
|
||||
removeTags?: string[];
|
||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||
screenshot?: boolean;
|
||||
fullPageScreenshot?: boolean;
|
||||
waitFor?: number;
|
||||
};
|
||||
}
|
||||
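And the v0 equivalent as a sketch, matching CrawlParamsV0 (values illustrative); with the default waitUntilDone the v0 path resolves to the scraped documents themselves:

const docsV0 = await appV0.crawlUrl("https://firecrawl.dev", {
  crawlerOptions: { excludes: ["blog/*"], limit: 5, mode: "fast" },
  pageOptions: { onlyMainContent: true },
});
console.log(docsV0);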
|
||||
/**
|
||||
* Response interface for crawling operations.
|
||||
* Defines the structure of the response received after initiating a crawl.
|
||||
*/
|
||||
export interface CrawlResponse {
|
||||
id?: string;
|
||||
url?: string;
|
||||
success: boolean;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for crawling operations on v0.
|
||||
* Similar to CrawlResponse but tailored for responses from API version v0.
|
||||
*/
|
||||
export interface CrawlResponseV0 {
|
||||
jobId?: string;
|
||||
success: boolean;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for job status checks.
|
||||
* Provides detailed status of a crawl job including progress and results.
|
||||
*/
|
||||
export interface CrawlStatusResponse {
|
||||
success: boolean;
|
||||
totalCount: number;
|
||||
creditsUsed: number;
|
||||
expiresAt: Date;
|
||||
status: "scraping" | "completed" | "failed";
|
||||
next: string;
|
||||
data?: FirecrawlDocument[];
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for job status checks.
|
||||
* Response interface for job status checks on v0.
|
||||
* Tailored for API version v0, provides status and partial data of a crawl job.
|
||||
*/
|
||||
export interface JobStatusResponse {
|
||||
export interface CrawlStatusResponseV0 {
|
||||
success: boolean;
|
||||
status: string;
|
||||
current?: number;
|
||||
current_url?: string;
|
||||
current_step?: string;
|
||||
total?: number;
|
||||
jobId?: string;
|
||||
data?: FirecrawlDocument[];
|
||||
partial_data?: FirecrawlDocument[];
|
||||
data?: FirecrawlDocumentV0[];
|
||||
partial_data?: FirecrawlDocumentV0[];
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generic parameter interface.
|
||||
* Parameters for mapping operations.
|
||||
* Defines options for mapping URLs during a crawl.
|
||||
*/
|
||||
export interface Params {
|
||||
[key: string]: any;
|
||||
extractorOptions?: {
|
||||
extractionSchema: z.ZodSchema | any;
|
||||
mode?: "llm-extraction";
|
||||
extractionPrompt?: string;
|
||||
export interface MapParams {
|
||||
includePaths?: string[]
|
||||
excludePaths?: string[]
|
||||
maxDepth?: number
|
||||
limit?: number
|
||||
allowBackwardLinks?: boolean
|
||||
allowExternalLinks?: boolean
|
||||
ignoreSitemap?: boolean
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for mapping operations.
|
||||
* Defines the structure of the response received after a mapping operation.
|
||||
*/
|
||||
export interface MapResponse {
|
||||
success: boolean;
|
||||
links?: string[];
|
||||
error?: string;
|
||||
}
|
||||
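A short sketch of a v1 map call built from MapParams and MapResponse above:

const mapped = await appV1.mapUrl("https://firecrawl.dev", {
  limit: 100,
  ignoreSitemap: false,
});
if (mapped.success) {
  console.log(`mapped ${mapped.links?.length ?? 0} links`);
}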
|
||||
/**
|
||||
* Parameters for searching operations on v0.
|
||||
* Tailored for API version v0, includes specific options for searching content.
|
||||
*/
|
||||
export interface SearchParamsV0 {
|
||||
pageOptions?: {
|
||||
onlyMainContent?: boolean;
|
||||
fetchPageContent?: boolean;
|
||||
includeHtml?: boolean;
|
||||
includeRawHtml?: boolean;
|
||||
};
|
||||
searchOptions?: {
|
||||
limit?: number;
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for searching operations on v0.
|
||||
* Defines the structure of the response received after a search operation on v0.
|
||||
*/
|
||||
export interface SearchResponseV0 {
|
||||
success: boolean;
|
||||
data?: FirecrawlDocumentV0[];
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Main class for interacting with the Firecrawl API.
|
||||
* Provides methods for scraping, searching, crawling, and mapping web content.
|
||||
*/
|
||||
export default class FirecrawlApp {
|
||||
private apiKey: string;
|
||||
private apiUrl: string;
|
||||
private version: "v0" | "v1";
|
||||
|
||||
/**
|
||||
* Initializes a new instance of the FirecrawlApp class.
|
||||
* @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
|
||||
* @param config - Configuration options for the FirecrawlApp instance.
|
||||
*/
|
||||
constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
|
||||
constructor({ apiKey = null, apiUrl = null, version = "v1" }: FirecrawlAppConfig) {
|
||||
this.apiKey = apiKey || "";
|
||||
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
|
||||
this.version = version;
|
||||
if (!this.apiKey) {
|
||||
throw new Error("No API key provided");
|
||||
}
|
||||
|
@ -141,21 +334,21 @@ export default class FirecrawlApp {
|
|||
|
||||
/**
|
||||
* Scrapes a URL using the Firecrawl API.
|
||||
* @param {string} url - The URL to scrape.
|
||||
* @param {Params | null} params - Additional parameters for the scrape request.
|
||||
* @returns {Promise<ScrapeResponse>} The response from the scrape operation.
|
||||
* @param url - The URL to scrape.
|
||||
* @param params - Additional parameters for the scrape request.
|
||||
* @returns The response from the scrape operation.
|
||||
*/
|
||||
async scrapeUrl(
|
||||
url: string,
|
||||
params: Params | null = null
|
||||
): Promise<ScrapeResponse> {
|
||||
params?: ScrapeParams | ScrapeParamsV0
|
||||
): Promise<ScrapeResponse | ScrapeResponseV0> {
|
||||
const headers: AxiosRequestHeaders = {
|
||||
"Content-Type": "application/json",
|
||||
Authorization: `Bearer ${this.apiKey}`,
|
||||
} as AxiosRequestHeaders;
|
||||
let jsonData: Params = { url, ...params };
|
||||
if (params?.extractorOptions?.extractionSchema) {
|
||||
let schema = params.extractorOptions.extractionSchema;
|
||||
let jsonData: any = { url, ...params };
|
||||
if (jsonData?.extractorOptions?.extractionSchema) {
|
||||
let schema = jsonData.extractorOptions.extractionSchema;
|
||||
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
|
||||
if (schema instanceof z.ZodSchema) {
|
||||
schema = zodToJsonSchema(schema);
|
||||
|
@ -163,22 +356,31 @@ export default class FirecrawlApp {
|
|||
jsonData = {
|
||||
...jsonData,
|
||||
extractorOptions: {
|
||||
...params.extractorOptions,
|
||||
...jsonData.extractorOptions,
|
||||
extractionSchema: schema,
|
||||
mode: params.extractorOptions.mode || "llm-extraction",
|
||||
mode: jsonData.extractorOptions.mode || "llm-extraction",
|
||||
},
|
||||
};
|
||||
}
|
||||
try {
|
||||
const response: AxiosResponse = await axios.post(
|
||||
this.apiUrl + "/v0/scrape",
|
||||
this.apiUrl + `/${this.version}/scrape`,
|
||||
jsonData,
|
||||
{ headers }
|
||||
);
|
||||
if (response.status === 200) {
|
||||
const responseData = response.data;
|
||||
if (responseData.success) {
|
||||
return responseData;
|
||||
if (this.version == 'v0') {
|
||||
return responseData as ScrapeResponseV0;
|
||||
} else {
|
||||
return {
|
||||
success: true,
|
||||
warning: responseData.warning,
|
||||
error: responseData.error,
|
||||
...responseData.data
|
||||
} as ScrapeResponse;
|
||||
}
|
||||
} else {
|
||||
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
|
||||
}
|
||||
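To illustrate the schema handling above: when extractorOptions.extractionSchema is a Zod schema, scrapeUrl converts it with zodToJsonSchema before posting the request. A hedged v0 sketch (the schema fields are illustrative):

import { z } from "zod";

const articleSchema = z.object({
  title: z.string(),
  points: z.number(),
});

const extracted = await appV0.scrapeUrl("https://news.ycombinator.com", {
  extractorOptions: {
    mode: "llm-extraction",
    extractionSchema: articleSchema, // converted to JSON Schema internally
    extractionPrompt: "Extract the top story title and its points",
  },
  pageOptions: { onlyMainContent: true },
});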
|
@ -193,19 +395,23 @@ export default class FirecrawlApp {
|
|||
|
||||
/**
|
||||
* Searches for a query using the Firecrawl API.
|
||||
* @param {string} query - The query to search for.
|
||||
* @param {Params | null} params - Additional parameters for the search request.
|
||||
* @returns {Promise<SearchResponse>} The response from the search operation.
|
||||
* @param query - The query to search for.
|
||||
* @param params - Additional parameters for the search request.
|
||||
* @returns The response from the search operation.
|
||||
*/
|
||||
async search(
|
||||
query: string,
|
||||
params: Params | null = null
|
||||
): Promise<SearchResponse> {
|
||||
params?: SearchParamsV0
|
||||
): Promise<SearchResponseV0> {
|
||||
if (this.version === "v1") {
|
||||
throw new Error("Search is not supported in v1");
|
||||
}
|
||||
|
||||
const headers: AxiosRequestHeaders = {
|
||||
"Content-Type": "application/json",
|
||||
Authorization: `Bearer ${this.apiKey}`,
|
||||
} as AxiosRequestHeaders;
|
||||
let jsonData: Params = { query };
|
||||
let jsonData: any = { query };
|
||||
if (params) {
|
||||
jsonData = { ...jsonData, ...params };
|
||||
}
|
||||
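Search stays a v0-only operation, as the guard above enforces; a brief sketch reusing appV0 (query and limits are illustrative):

const results = await appV0.search("firecrawl web scraping", {
  searchOptions: { limit: 3 },
  pageOptions: { fetchPageContent: true },
});
if (results.success) {
  console.log(results.data?.length);
}

// On a v1 client the same call rejects with "Search is not supported in v1".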
|
@ -233,93 +439,157 @@ export default class FirecrawlApp {
|
|||
|
||||
/**
|
||||
* Initiates a crawl job for a URL using the Firecrawl API.
|
||||
* @param {string} url - The URL to crawl.
|
||||
* @param {Params | null} params - Additional parameters for the crawl request.
|
||||
* @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
|
||||
* @param {number} pollInterval - Time in seconds for job status checks.
|
||||
* @param {string} idempotencyKey - Optional idempotency key for the request.
|
||||
* @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
|
||||
* @param url - The URL to crawl.
|
||||
* @param params - Additional parameters for the crawl request.
|
||||
* @param waitUntilDone - Whether to wait for the crawl job to complete.
|
||||
* @param pollInterval - Time in seconds for job status checks.
|
||||
* @param idempotencyKey - Optional idempotency key for the request.
|
||||
* @returns The response from the crawl operation.
|
||||
*/
|
||||
async crawlUrl(
|
||||
url: string,
|
||||
params: Params | null = null,
|
||||
params?: CrawlParams | CrawlParamsV0,
|
||||
waitUntilDone: boolean = true,
|
||||
pollInterval: number = 2,
|
||||
idempotencyKey?: string
|
||||
): Promise<CrawlResponse | any> {
|
||||
): Promise<CrawlResponse | CrawlResponseV0 | CrawlStatusResponse | CrawlStatusResponseV0> {
|
||||
const headers = this.prepareHeaders(idempotencyKey);
|
||||
let jsonData: Params = { url };
|
||||
if (params) {
|
||||
jsonData = { ...jsonData, ...params };
|
||||
}
|
||||
let jsonData: any = { url, ...params };
|
||||
try {
|
||||
const response: AxiosResponse = await this.postRequest(
|
||||
this.apiUrl + "/v0/crawl",
|
||||
this.apiUrl + `/${this.version}/crawl`,
|
||||
jsonData,
|
||||
headers
|
||||
);
|
||||
if (response.status === 200) {
|
||||
const jobId: string = response.data.jobId;
|
||||
const id: string = this.version == 'v0' ? response.data.jobId : response.data.id;
|
||||
let checkUrl: string | undefined = undefined;
|
||||
if (waitUntilDone) {
|
||||
return this.monitorJobStatus(jobId, headers, pollInterval);
|
||||
if (this.version == 'v1') { checkUrl = response.data.url }
|
||||
return this.monitorJobStatus(id, headers, pollInterval, checkUrl);
|
||||
} else {
|
||||
return { success: true, jobId };
|
||||
if (this.version == 'v0') {
|
||||
return {
|
||||
success: true,
|
||||
jobId: id
|
||||
} as CrawlResponseV0;
|
||||
} else {
|
||||
return {
|
||||
success: true,
|
||||
id: id
|
||||
} as CrawlResponse;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
this.handleError(response, "start crawl job");
|
||||
}
|
||||
} catch (error: any) {
|
||||
console.log(error);
|
||||
throw new Error(error.message);
|
||||
if (error.response?.data?.error) {
|
||||
throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
|
||||
} else {
|
||||
throw new Error(error.message);
|
||||
}
|
||||
}
|
||||
return { success: false, error: "Internal server error." };
|
||||
}
|
||||
|
||||
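When waitUntilDone is false, only the job identifier comes back and the caller polls for status itself; a sketch of that flow for v1:

const started = (await appV1.crawlUrl(
  "https://firecrawl.dev",
  { crawlerOptions: { limit: 5 } },
  false // do not block; just start the job
)) as CrawlResponse;

if (started.success && started.id) {
  const status = (await appV1.checkCrawlStatus(started.id)) as CrawlStatusResponse;
  console.log(status.status, status.totalCount, status.data?.length ?? 0);
}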
/**
|
||||
* Checks the status of a crawl job using the Firecrawl API.
|
||||
* @param {string} jobId - The job ID of the crawl operation.
|
||||
* @returns {Promise<JobStatusResponse>} The response containing the job status.
|
||||
* @param id - The ID of the crawl operation.
|
||||
* @returns The response containing the job status.
|
||||
*/
|
||||
async checkCrawlStatus(jobId: string): Promise<JobStatusResponse> {
|
||||
async checkCrawlStatus(id?: string): Promise<CrawlStatusResponse | CrawlStatusResponseV0> {
|
||||
if (!id) {
|
||||
throw new Error("No crawl ID provided");
|
||||
}
|
||||
|
||||
const headers: AxiosRequestHeaders = this.prepareHeaders();
|
||||
try {
|
||||
const response: AxiosResponse = await this.getRequest(
|
||||
this.apiUrl + `/v0/crawl/status/${jobId}`,
|
||||
this.version == 'v1' ?
|
||||
this.apiUrl + `/${this.version}/crawl/${id}` :
|
||||
this.apiUrl + `/${this.version}/crawl/status/${id}`,
|
||||
headers
|
||||
);
|
||||
if (response.status === 200) {
|
||||
return {
|
||||
success: true,
|
||||
status: response.data.status,
|
||||
current: response.data.current,
|
||||
current_url: response.data.current_url,
|
||||
current_step: response.data.current_step,
|
||||
total: response.data.total,
|
||||
data: response.data.data,
|
||||
partial_data: !response.data.data
|
||||
? response.data.partial_data
|
||||
: undefined,
|
||||
};
|
||||
if (this.version == 'v0') {
|
||||
return {
|
||||
success: true,
|
||||
status: response.data.status,
|
||||
current: response.data.current,
|
||||
current_url: response.data.current_url,
|
||||
current_step: response.data.current_step,
|
||||
total: response.data.total,
|
||||
data: response.data.data,
|
||||
partial_data: !response.data.data
|
||||
? response.data.partial_data
|
||||
: undefined,
|
||||
} as CrawlStatusResponseV0;
|
||||
} else if (this.version == 'v1') {
|
||||
return {
|
||||
success: true,
|
||||
status: response.data.status,
|
||||
totalCount: response.data.totalCount,
|
||||
creditsUsed: response.data.creditsUsed,
|
||||
expiresAt: new Date(response.data.expiresAt),
|
||||
next: response.data.next,
|
||||
data: response.data.data,
|
||||
error: response.data.error
|
||||
} as CrawlStatusResponse;
|
||||
}
|
||||
} else {
|
||||
this.handleError(response, "check crawl status");
|
||||
}
|
||||
} catch (error: any) {
|
||||
throw new Error(error.message);
|
||||
}
|
||||
return {
|
||||
success: false,
|
||||
status: "unknown",
|
||||
current: 0,
|
||||
current_url: "",
|
||||
current_step: "",
|
||||
total: 0,
|
||||
error: "Internal server error.",
|
||||
};
|
||||
|
||||
if (this.version == 'v0') {
|
||||
return {
|
||||
success: false,
|
||||
status: "unknown",
|
||||
current: 0,
|
||||
current_url: "",
|
||||
current_step: "",
|
||||
total: 0,
|
||||
error: "Internal server error.",
|
||||
} as CrawlStatusResponseV0;
|
||||
} else {
|
||||
return {
|
||||
success: false,
|
||||
error: "Internal server error.",
|
||||
} as CrawlStatusResponse;
|
||||
}
|
||||
}
|
||||
|
||||
async mapUrl(url: string, params?: MapParams): Promise<MapResponse> {
|
||||
if (this.version == 'v0') {
|
||||
throw new Error("Map is not supported in v0");
|
||||
}
|
||||
const headers = this.prepareHeaders();
|
||||
let jsonData: { url: string } & MapParams = { url, ...params };
|
||||
|
||||
try {
|
||||
const response: AxiosResponse = await this.postRequest(
|
||||
this.apiUrl + `/${this.version}/map`,
|
||||
jsonData,
|
||||
headers
|
||||
);
|
||||
if (response.status === 200) {
|
||||
return response.data as MapResponse;
|
||||
} else {
|
||||
this.handleError(response, "map");
|
||||
}
|
||||
} catch (error: any) {
|
||||
throw new Error(error.message);
|
||||
}
|
||||
return { success: false, error: "Internal server error." } as MapResponse;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepares the headers for an API request.
|
||||
* @returns {AxiosRequestHeaders} The prepared headers.
|
||||
* @param idempotencyKey - Optional key to ensure idempotency.
|
||||
* @returns The prepared headers.
|
||||
*/
|
||||
prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
|
||||
return {
|
||||
|
@ -331,14 +601,14 @@ export default class FirecrawlApp {
|
|||
|
||||
/**
|
||||
* Sends a POST request to the specified URL.
|
||||
* @param {string} url - The URL to send the request to.
|
||||
* @param {Params} data - The data to send in the request.
|
||||
* @param {AxiosRequestHeaders} headers - The headers for the request.
|
||||
* @returns {Promise<AxiosResponse>} The response from the POST request.
|
||||
* @param url - The URL to send the request to.
|
||||
* @param data - The data to send in the request.
|
||||
* @param headers - The headers for the request.
|
||||
* @returns The response from the POST request.
|
||||
*/
|
||||
postRequest(
|
||||
url: string,
|
||||
data: Params,
|
||||
data: any,
|
||||
headers: AxiosRequestHeaders
|
||||
): Promise<AxiosResponse> {
|
||||
return axios.post(url, data, { headers });
|
||||
|
@ -346,9 +616,9 @@ export default class FirecrawlApp {
|
|||
|
||||
/**
|
||||
* Sends a GET request to the specified URL.
|
||||
* @param {string} url - The URL to send the request to.
|
||||
* @param {AxiosRequestHeaders} headers - The headers for the request.
|
||||
* @returns {Promise<AxiosResponse>} The response from the GET request.
|
||||
* @param url - The URL to send the request to.
|
||||
* @param headers - The headers for the request.
|
||||
* @returns The response from the GET request.
|
||||
*/
|
||||
getRequest(
|
||||
url: string,
|
||||
|
@ -359,31 +629,38 @@ export default class FirecrawlApp {
|
|||
|
||||
/**
|
||||
* Monitors the status of a crawl job until completion or failure.
|
||||
* @param {string} jobId - The job ID of the crawl operation.
|
||||
* @param {AxiosRequestHeaders} headers - The headers for the request.
|
||||
* @param {number} timeout - Timeout in seconds for job status checks.
|
||||
* @returns {Promise<any>} The final job status or data.
|
||||
* @param id - The ID of the crawl operation.
|
||||
* @param headers - The headers for the request.
|
||||
* @param checkInterval - Interval in seconds for job status checks.
|
||||
* @returns The final job status or data.
|
||||
*/
|
||||
async monitorJobStatus(
|
||||
jobId: string,
|
||||
id: string,
|
||||
headers: AxiosRequestHeaders,
|
||||
checkInterval: number
|
||||
): Promise<any> {
|
||||
checkInterval: number,
|
||||
checkUrl?: string
|
||||
): Promise<CrawlStatusResponse | CrawlStatusResponseV0> {
|
||||
let apiUrl: string = '';
|
||||
while (true) {
|
||||
if (this.version == 'v1') {
|
||||
apiUrl = checkUrl ?? this.apiUrl + `/v1/crawl/${id}`;
|
||||
} else if (this.version == 'v0') {
|
||||
apiUrl = checkUrl ?? this.apiUrl + `/v0/crawl/status/${id}`;
|
||||
}
|
||||
const statusResponse: AxiosResponse = await this.getRequest(
|
||||
this.apiUrl + `/v0/crawl/status/${jobId}`,
|
||||
apiUrl,
|
||||
headers
|
||||
);
|
||||
if (statusResponse.status === 200) {
|
||||
const statusData = statusResponse.data;
|
||||
if (statusData.status === "completed") {
|
||||
if ("data" in statusData) {
|
||||
return statusData.data;
|
||||
return this.version == 'v0' ? statusData.data : statusData;
|
||||
} else {
|
||||
throw new Error("Crawl job completed but no data was returned");
|
||||
}
|
||||
} else if (
|
||||
["active", "paused", "pending", "queued"].includes(statusData.status)
|
||||
["active", "paused", "pending", "queued", "scraping"].includes(statusData.status)
|
||||
) {
|
||||
if (checkInterval < 2) {
|
||||
checkInterval = 2;
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
|
||||
|
||||
/* Language and Environment */
|
||||
"target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
|
||||
"target": "es2020", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
|
||||
// "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
|
||||
// "jsx": "preserve", /* Specify what JSX code is generated. */
|
||||
// "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
|
||||
|
@ -25,9 +25,9 @@
|
|||
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
|
||||
|
||||
/* Modules */
|
||||
"module": "NodeNext", /* Specify what module code is generated. */
|
||||
"module": "commonjs", /* Specify what module code is generated. */
|
||||
"rootDir": "./src", /* Specify the root folder within your source files. */
|
||||
"moduleResolution": "nodenext", /* Specify how TypeScript looks up a file from a given module specifier. */
|
||||
"moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */
|
||||
// "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
|
||||
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
|
||||
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
|
||||
|
|
75
apps/python-sdk/examplev0.py
Normal file
|
@ -0,0 +1,75 @@
|
|||
import uuid
|
||||
from firecrawl.firecrawl import FirecrawlApp
|
||||
|
||||
app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version='v0')  # pin to v0 so the v0-style options below apply
|
||||
|
||||
# Scrape a website:
|
||||
scrape_result = app.scrape_url('firecrawl.dev')
|
||||
print(scrape_result['markdown'])
|
||||
|
||||
# Crawl a website:
|
||||
idempotency_key = str(uuid.uuid4()) # optional idempotency key
|
||||
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
|
||||
print(crawl_result)
|
||||
|
||||
# LLM Extraction:
|
||||
# Define schema to extract contents into using pydantic
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List
|
||||
|
||||
class ArticleSchema(BaseModel):
|
||||
title: str
|
||||
points: int
|
||||
by: str
|
||||
commentsURL: str
|
||||
|
||||
class TopArticlesSchema(BaseModel):
|
||||
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
|
||||
|
||||
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
|
||||
'extractorOptions': {
|
||||
'extractionSchema': TopArticlesSchema.model_json_schema(),
|
||||
'mode': 'llm-extraction'
|
||||
},
|
||||
'pageOptions':{
|
||||
'onlyMainContent': True
|
||||
}
|
||||
})
|
||||
|
||||
print(llm_extraction_result['llm_extraction'])
|
||||
|
||||
# Define schema to extract contents into using json schema
|
||||
json_schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"top": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"points": {"type": "number"},
|
||||
"by": {"type": "string"},
|
||||
"commentsURL": {"type": "string"}
|
||||
},
|
||||
"required": ["title", "points", "by", "commentsURL"]
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News"
|
||||
}
|
||||
},
|
||||
"required": ["top"]
|
||||
}
|
||||
|
||||
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
|
||||
'extractorOptions': {
|
||||
'extractionSchema': json_schema,
|
||||
'mode': 'llm-extraction'
|
||||
},
|
||||
'pageOptions':{
|
||||
'onlyMainContent': True
|
||||
}
|
||||
})
|
||||
|
||||
print(llm_extraction_result['llm_extraction'])
|
|
@ -7,7 +7,7 @@ from dotenv import load_dotenv
|
|||
|
||||
load_dotenv()
|
||||
|
||||
API_URL = "http://127.0.0.1:3002";
|
||||
API_URL = "http://127.0.0.1:3002"
|
||||
ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
|
||||
TEST_API_KEY = os.getenv('TEST_API_KEY')
|
||||
|
||||
|
@ -20,32 +20,34 @@ FirecrawlApp = firecrawl.FirecrawlApp
|
|||
|
||||
def test_no_api_key():
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app = FirecrawlApp(api_url=API_URL)
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
|
||||
assert "No API key provided" in str(excinfo.value)
|
||||
|
||||
def test_scrape_url_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.scrape_url('https://firecrawl.dev')
|
||||
assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_blocklisted_url():
|
||||
blocklisted_url = "https://facebook.com/fake-test"
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.scrape_url(blocklisted_url)
|
||||
assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
|
||||
|
||||
def test_successful_response_with_valid_preview_token():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
|
||||
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0')
|
||||
response = app.scrape_url('https://roastmywebsite.ai')
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
assert "_Roast_" in response['content']
|
||||
|
||||
def test_scrape_url_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url('https://roastmywebsite.ai')
|
||||
print(response)
|
||||
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
assert 'markdown' in response
|
||||
|
@ -54,7 +56,7 @@ def test_scrape_url_e2e():
|
|||
assert "_Roast_" in response['content']
|
||||
|
||||
def test_successful_response_with_valid_api_key_and_include_html():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
|
@ -66,7 +68,7 @@ def test_successful_response_with_valid_api_key_and_include_html():
|
|||
assert "<h1" in response['html']
|
||||
|
||||
def test_successful_response_for_valid_scrape_with_pdf_file():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
|
@ -74,7 +76,7 @@ def test_successful_response_for_valid_scrape_with_pdf_file():
|
|||
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
|
||||
|
||||
def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
|
||||
time.sleep(6) # wait for 6 seconds
|
||||
assert response is not None
|
||||
|
@ -83,20 +85,20 @@ def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_ext
|
|||
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
|
||||
|
||||
def test_crawl_url_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.crawl_url('https://firecrawl.dev')
|
||||
assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_should_return_error_for_blocklisted_url():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
blocklisted_url = "https://twitter.com/fake-test"
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.crawl_url(blocklisted_url)
|
||||
assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
|
||||
|
||||
def test_crawl_url_wait_for_completion_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
|
||||
assert response is not None
|
||||
assert len(response) > 0
|
||||
|
@ -104,7 +106,7 @@ def test_crawl_url_wait_for_completion_e2e():
|
|||
assert "_Roast_" in response[0]['content']
|
||||
|
||||
def test_crawl_url_with_idempotency_key_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
uniqueIdempotencyKey = str(uuid4())
|
||||
response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
|
||||
assert response is not None
|
||||
|
@ -117,7 +119,7 @@ def test_crawl_url_with_idempotency_key_e2e():
|
|||
assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
|
||||
|
||||
def test_check_crawl_status_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
|
||||
assert response is not None
|
||||
assert 'jobId' in response
|
||||
|
@ -131,21 +133,21 @@ def test_check_crawl_status_e2e():
|
|||
assert len(status_response['data']) > 0
|
||||
|
||||
def test_search_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.search("test query")
|
||||
assert response is not None
|
||||
assert 'content' in response[0]
|
||||
assert len(response) > 2
|
||||
|
||||
def test_search_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.search("test query")
|
||||
assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_llm_extraction():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
response = app.scrape_url("https://mendable.ai", {
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url("https://firecrawl.dev", {
|
||||
'extractorOptions': {
|
||||
'mode': 'llm-extraction',
|
||||
'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
API_URL=http://localhost:3002
|
||||
ABSOLUTE_FIRECRAWL_PATH=/Users/user/firecrawl/apps/python-sdk/firecrawl/firecrawl.py
|
||||
TEST_API_KEY=fc-YOUR_API_KEY
|
Binary file not shown.
352
apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py
Normal file
|
@ -0,0 +1,352 @@
|
|||
import importlib.util
|
||||
import pytest
|
||||
import time
|
||||
import os
|
||||
from uuid import uuid4
|
||||
from dotenv import load_dotenv
|
||||
from datetime import datetime
|
||||
|
||||
load_dotenv()
|
||||
|
||||
API_URL = "http://127.0.0.1:3002"
|
||||
ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
|
||||
TEST_API_KEY = os.getenv('TEST_API_KEY')
|
||||
|
||||
print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")
|
||||
|
||||
spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
|
||||
firecrawl = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(firecrawl)
|
||||
FirecrawlApp = firecrawl.FirecrawlApp
|
||||
|
||||
def test_no_api_key():
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app = FirecrawlApp(api_url=API_URL)
|
||||
assert "No API key provided" in str(excinfo.value)
|
||||
|
||||
def test_scrape_url_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.scrape_url('https://firecrawl.dev')
|
||||
assert "Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_blocklisted_url():
|
||||
blocklisted_url = "https://facebook.com/fake-test"
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.scrape_url(blocklisted_url)
|
||||
assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
|
||||
|
||||
def test_successful_response_with_valid_preview_token():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
|
||||
response = app.scrape_url('https://roastmywebsite.ai')
|
||||
assert response is not None
|
||||
assert "_Roast_" in response['markdown']
|
||||
assert "content" not in response
|
||||
assert "html" not in response
|
||||
assert "metadata" in response
|
||||
assert "links" not in response
|
||||
assert "rawHtml" not in response
|
||||
|
||||
def test_successful_response_for_valid_scrape():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
response = app.scrape_url('https://roastmywebsite.ai')
|
||||
assert response is not None
|
||||
assert 'markdown' in response
|
||||
assert "_Roast_" in response['markdown']
|
||||
assert 'metadata' in response
|
||||
assert 'content' not in response
|
||||
assert 'html' not in response
|
||||
assert 'rawHtml' not in response
|
||||
assert 'screenshot' not in response
|
||||
assert 'links' not in response
|
||||
|
||||
def test_successful_response_with_valid_api_key_and_options():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
params = {
|
||||
'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
|
||||
'headers': {'x-key': 'test'},
|
||||
'includeTags': ['h1'],
|
||||
'excludeTags': ['h2'],
|
||||
'onlyMainContent': True,
|
||||
'timeout': 30000,
|
||||
'waitFor': 1000
|
||||
}
|
||||
response = app.scrape_url('https://roastmywebsite.ai', params)
|
||||
assert response is not None
|
||||
assert 'content' not in response
|
||||
assert 'markdown' in response
|
||||
assert 'html' in response
|
||||
assert 'rawHtml' in response
|
||||
assert 'screenshot' in response
|
||||
assert 'links' in response
|
||||
assert "_Roast_" in response['markdown']
|
||||
assert "<h1" in response['html']
|
||||
assert "<h1" in response['rawHtml']
|
||||
assert "https://" in response['screenshot']
|
||||
assert len(response['links']) > 0
|
||||
assert "https://" in response['links'][0]
|
||||
assert 'metadata' in response
|
||||
assert 'title' in response['metadata']
|
||||
assert 'description' in response['metadata']
|
||||
assert 'keywords' in response['metadata']
|
||||
assert 'robots' in response['metadata']
|
||||
assert 'ogTitle' in response['metadata']
|
||||
assert 'ogDescription' in response['metadata']
|
||||
assert 'ogUrl' in response['metadata']
|
||||
assert 'ogImage' in response['metadata']
|
||||
assert 'ogLocaleAlternate' in response['metadata']
|
||||
assert 'ogSiteName' in response['metadata']
|
||||
assert 'sourceURL' in response['metadata']
|
||||
assert 'statusCode' in response['metadata']
|
||||
assert 'pageStatusCode' not in response['metadata']
|
||||
assert 'pageError' not in response['metadata']
|
||||
assert 'error' not in response['metadata']
|
||||
assert response['metadata']['title'] == "Roast My Website"
|
||||
assert response['metadata']['description'] == "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
||||
assert response['metadata']['keywords'] == "Roast My Website,Roast,Website,GitHub,Firecrawl"
|
||||
assert response['metadata']['robots'] == "follow, index"
|
||||
assert response['metadata']['ogTitle'] == "Roast My Website"
|
||||
assert response['metadata']['ogDescription'] == "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
||||
assert response['metadata']['ogUrl'] == "https://www.roastmywebsite.ai"
|
||||
assert response['metadata']['ogImage'] == "https://www.roastmywebsite.ai/og.png"
|
||||
assert response['metadata']['ogLocaleAlternate'] == []
|
||||
assert response['metadata']['ogSiteName'] == "Roast My Website"
|
||||
assert response['metadata']['sourceURL'] == "https://roastmywebsite.ai"
|
||||
assert response['metadata']['statusCode'] == 200
|
||||
|
||||
def test_successful_response_for_valid_scrape_with_pdf_file():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
|
||||
assert response is not None
|
||||
assert 'content' not in response
|
||||
assert 'metadata' in response
|
||||
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
|
||||
|
||||
def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
|
||||
time.sleep(1) # wait for 1 second
|
||||
assert response is not None
|
||||
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
|
||||
|
||||
def test_crawl_url_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.crawl_url('https://firecrawl.dev')
|
||||
assert "Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_should_return_error_for_blocklisted_url():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
blocklisted_url = "https://twitter.com/fake-test"
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.crawl_url(blocklisted_url)
|
||||
assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
|
||||
|
||||
def test_crawl_url_wait_for_completion_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, True, 30)
|
||||
assert response is not None
|
||||
assert 'totalCount' in response
|
||||
assert response['totalCount'] > 0
|
||||
assert 'creditsUsed' in response
|
||||
assert response['creditsUsed'] > 0
|
||||
assert 'expiresAt' in response
|
||||
assert datetime.strptime(response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
|
||||
assert 'status' in response
|
||||
assert response['status'] == 'completed'
|
||||
assert 'next' not in response
|
||||
assert len(response['data']) > 0
|
||||
assert 'markdown' in response['data'][0]
|
||||
assert "_Roast_" in response['data'][0]['markdown']
|
||||
assert 'content' not in response['data'][0]
|
||||
assert 'html' not in response['data'][0]
|
||||
assert 'rawHtml' not in response['data'][0]
|
||||
assert 'screenshot' not in response['data'][0]
|
||||
assert 'links' not in response['data'][0]
|
||||
assert 'metadata' in response['data'][0]
|
||||
assert 'title' in response['data'][0]['metadata']
|
||||
assert 'description' in response['data'][0]['metadata']
|
||||
assert 'language' in response['data'][0]['metadata']
|
||||
assert 'sourceURL' in response['data'][0]['metadata']
|
||||
assert 'statusCode' in response['data'][0]['metadata']
|
||||
assert 'error' not in response['data'][0]['metadata']
|
||||
|
||||
def test_crawl_url_with_options_and_wait_for_completion():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
response = app.crawl_url('https://roastmywebsite.ai', {
|
||||
'excludePaths': ['blog/*'],
|
||||
'includePaths': ['/'],
|
||||
'maxDepth': 2,
|
||||
'ignoreSitemap': True,
|
||||
'limit': 10,
|
||||
'allowBackwardLinks': True,
|
||||
'allowExternalLinks': True,
|
||||
'scrapeOptions': {
|
||||
'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
|
||||
'headers': {"x-key": "test"},
|
||||
'includeTags': ['h1'],
|
||||
'excludeTags': ['h2'],
|
||||
'onlyMainContent': True,
|
||||
'waitFor': 1000
|
||||
}
|
||||
}, True, 30)
|
||||
assert response is not None
|
||||
assert 'totalCount' in response
|
||||
assert response['totalCount'] > 0
|
||||
assert 'creditsUsed' in response
|
||||
assert response['creditsUsed'] > 0
|
||||
assert 'expiresAt' in response
|
||||
assert datetime.strptime(response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
|
||||
assert 'status' in response
|
||||
assert response['status'] == 'completed'
|
||||
assert 'next' not in response
|
||||
assert len(response['data']) > 0
|
||||
assert 'markdown' in response['data'][0]
|
||||
assert "_Roast_" in response['data'][0]['markdown']
|
||||
assert 'content' not in response['data'][0]
|
||||
assert 'html' in response['data'][0]
|
||||
assert "<h1" in response['data'][0]['html']
|
||||
assert 'rawHtml' in response['data'][0]
|
||||
assert "<h1" in response['data'][0]['rawHtml']
|
||||
assert 'screenshot' in response['data'][0]
|
||||
assert "https://" in response['data'][0]['screenshot']
|
||||
assert 'links' in response['data'][0]
|
||||
assert len(response['data'][0]['links']) > 0
|
||||
assert 'metadata' in response['data'][0]
|
||||
assert 'title' in response['data'][0]['metadata']
|
||||
assert 'description' in response['data'][0]['metadata']
|
||||
assert 'language' in response['data'][0]['metadata']
|
||||
assert 'sourceURL' in response['data'][0]['metadata']
|
||||
assert 'statusCode' in response['data'][0]['metadata']
|
||||
assert 'error' not in response['data'][0]['metadata']
|
||||
|
||||
def test_crawl_url_with_idempotency_key_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
uniqueIdempotencyKey = str(uuid4())
|
||||
response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, False, 2, uniqueIdempotencyKey)
|
||||
assert response is not None
|
||||
assert 'id' in response
|
||||
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.crawl_url('https://firecrawl.dev', {'excludePaths': ['blog/*']}, True, 2, uniqueIdempotencyKey)
|
||||
assert "Idempotency key already used" in str(excinfo.value)
|
||||
|
||||
def test_check_crawl_status_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
response = app.crawl_url('https://firecrawl.dev', {'scrapeOptions': {'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}}, False)
|
||||
assert response is not None
|
||||
assert 'id' in response
|
||||
|
||||
max_checks = 15
|
||||
checks = 0
|
||||
status_response = app.check_crawl_status(response['id'])
|
||||
|
||||
while status_response['status'] == 'scraping' and checks < max_checks:
|
||||
time.sleep(1) # wait for 1 second
|
||||
assert 'partial_data' not in status_response
|
||||
assert 'current' not in status_response
|
||||
assert 'data' in status_response
|
||||
assert 'totalCount' in status_response
|
||||
assert 'creditsUsed' in status_response
|
||||
assert 'expiresAt' in status_response
|
||||
assert 'status' in status_response
|
||||
assert 'next' in status_response
|
||||
assert status_response['totalCount'] > 0
|
||||
assert status_response['creditsUsed'] > 0
|
||||
assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
|
||||
assert status_response['status'] == 'scraping'
|
||||
assert '/v1/crawl/' in status_response['next']
|
||||
status_response = app.check_crawl_status(response['id'])
|
||||
checks += 1
|
||||
|
||||
assert status_response is not None
|
||||
assert 'totalCount' in status_response
|
||||
assert status_response['totalCount'] > 0
|
||||
assert 'creditsUsed' in status_response
|
||||
assert status_response['creditsUsed'] > 0
|
||||
assert 'expiresAt' in status_response
|
||||
assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
|
||||
assert 'status' in status_response
|
||||
assert status_response['status'] == 'completed'
|
||||
assert len(status_response['data']) > 0
|
||||
assert 'markdown' in status_response['data'][0]
|
||||
assert len(status_response['data'][0]['markdown']) > 10
|
||||
assert 'content' not in status_response['data'][0]
|
||||
assert 'html' in status_response['data'][0]
|
||||
assert "<div" in status_response['data'][0]['html']
|
||||
assert 'rawHtml' in status_response['data'][0]
|
||||
assert "<div" in status_response['data'][0]['rawHtml']
|
||||
assert 'screenshot' in status_response['data'][0]
|
||||
assert "https://" in status_response['data'][0]['screenshot']
|
||||
assert 'links' in status_response['data'][0]
|
||||
assert status_response['data'][0]['links'] is not None
|
||||
assert len(status_response['data'][0]['links']) > 0
|
||||
assert 'metadata' in status_response['data'][0]
|
||||
assert 'title' in status_response['data'][0]['metadata']
|
||||
assert 'description' in status_response['data'][0]['metadata']
|
||||
assert 'language' in status_response['data'][0]['metadata']
|
||||
assert 'sourceURL' in status_response['data'][0]['metadata']
|
||||
assert 'statusCode' in status_response['data'][0]['metadata']
|
||||
assert 'error' not in status_response['data'][0]['metadata']
|
||||
|
||||
def test_invalid_api_key_on_map():
|
||||
invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.map_url('https://roastmywebsite.ai')
|
||||
assert "Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_blocklisted_url_on_map():
|
||||
app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
|
||||
blocklisted_url = "https://facebook.com/fake-test"
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.map_url(blocklisted_url)
|
||||
assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
|
||||
|
||||
def test_successful_response_with_valid_preview_token_on_map():
|
||||
app = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL)
|
||||
response = app.map_url('https://roastmywebsite.ai')
|
||||
assert response is not None
|
||||
assert len(response) > 0
|
||||
|
||||
def test_successful_response_for_valid_map():
|
||||
app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
|
||||
response = app.map_url('https://roastmywebsite.ai')
|
||||
assert response is not None
|
||||
assert len(response) > 0
|
||||
assert any("https://" in link for link in response)
|
||||
filtered_links = [link for link in response if "roastmywebsite.ai" in link]
|
||||
assert len(filtered_links) > 0
|
||||
|
||||
def test_search_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
with pytest.raises(NotImplementedError) as excinfo:
|
||||
app.search("test query")
|
||||
assert "Search is not supported in v1" in str(excinfo.value)
|
||||
|
||||
# def test_llm_extraction():
|
||||
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
# response = app.scrape_url("https://mendable.ai", {
|
||||
# 'extractorOptions': {
|
||||
# 'mode': 'llm-extraction',
|
||||
# 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
|
||||
# 'extractionSchema': {
|
||||
# 'type': 'object',
|
||||
# 'properties': {
|
||||
# 'company_mission': {'type': 'string'},
|
||||
# 'supports_sso': {'type': 'boolean'},
|
||||
# 'is_open_source': {'type': 'boolean'}
|
||||
# },
|
||||
# 'required': ['company_mission', 'supports_sso', 'is_open_source']
|
||||
# }
|
||||
# }
|
||||
# })
|
||||
# assert response is not None
|
||||
# assert 'llm_extraction' in response
|
||||
# llm_extraction = response['llm_extraction']
|
||||
# assert 'company_mission' in llm_extraction
|
||||
# assert isinstance(llm_extraction['supports_sso'], bool)
|
||||
# assert isinstance(llm_extraction['is_open_source'], bool)
|
||||
|
||||
|
||||
|
|
@ -19,24 +19,22 @@ import requests
|
|||
logger : logging.Logger = logging.getLogger("firecrawl")
|
||||
|
||||
class FirecrawlApp:
|
||||
"""
|
||||
Initialize the FirecrawlApp instance.
|
||||
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None, version: str = 'v1') -> None:
|
||||
"""
|
||||
Initialize the FirecrawlApp instance with API key, API URL, and version.
|
||||
|
||||
Args:
|
||||
api_key (Optional[str]): API key for authenticating with the Firecrawl API.
|
||||
api_url (Optional[str]): Base URL for the Firecrawl API.
|
||||
"""
|
||||
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
|
||||
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
|
||||
if self.api_key is None:
|
||||
logger.warning("No API key provided")
|
||||
raise ValueError('No API key provided')
|
||||
else:
|
||||
logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key)
|
||||
|
||||
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
|
||||
if self.api_url != 'https://api.firecrawl.dev':
|
||||
logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)
|
||||
Args:
|
||||
api_key (Optional[str]): API key for authenticating with the Firecrawl API.
|
||||
api_url (Optional[str]): Base URL for the Firecrawl API.
|
||||
version (str): API version, either 'v0' or 'v1'.
|
||||
"""
|
||||
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
|
||||
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
|
||||
self.version = version
|
||||
if self.api_key is None:
|
||||
logger.warning("No API key provided")
|
||||
raise ValueError('No API key provided')
|
||||
logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key} and version: {self.version}")
|
||||
|
||||
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
|
||||
"""
|
||||
|
@ -75,9 +73,11 @@ class FirecrawlApp:
|
|||
for key, value in params.items():
|
||||
if key != 'extractorOptions':
|
||||
scrape_params[key] = value
|
||||
|
||||
endpoint = f'/{self.version}/scrape'
|
||||
# Make the POST request with the prepared headers and JSON data
|
||||
response = requests.post(
|
||||
f'{self.api_url}/v0/scrape',
|
||||
f'{self.api_url}{endpoint}',
|
||||
headers=headers,
|
||||
json=scrape_params,
|
||||
)
|
||||
|
@ -104,6 +104,9 @@ class FirecrawlApp:
|
|||
Raises:
|
||||
Exception: If the search request fails.
|
||||
"""
|
||||
if self.version == 'v1':
|
||||
raise NotImplementedError("Search is not supported in v1")
|
||||
|
||||
headers = self._prepare_headers()
|
||||
json_data = {'query': query}
|
||||
if params:
|
||||
|
@ -145,26 +148,37 @@ class FirecrawlApp:
|
|||
Raises:
|
||||
Exception: If the crawl job initiation or monitoring fails.
|
||||
"""
|
||||
endpoint = f'/{self.version}/crawl'
|
||||
headers = self._prepare_headers(idempotency_key)
|
||||
json_data = {'url': url}
|
||||
if params:
|
||||
json_data.update(params)
|
||||
response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
|
||||
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
|
||||
if response.status_code == 200:
|
||||
job_id = response.json().get('jobId')
|
||||
if wait_until_done:
|
||||
return self._monitor_job_status(job_id, headers, poll_interval)
|
||||
if self.version == 'v0':
|
||||
id = response.json().get('jobId')
|
||||
else:
|
||||
return {'jobId': job_id}
|
||||
id = response.json().get('id')
|
||||
|
||||
if wait_until_done:
|
||||
check_url = None
|
||||
if self.version == 'v1':
|
||||
check_url = response.json().get('url')
|
||||
return self._monitor_job_status(id, headers, poll_interval, check_url)
|
||||
else:
|
||||
if self.version == 'v0':
|
||||
return {'jobId': id}
|
||||
else:
|
||||
return {'id': id}
|
||||
else:
|
||||
self._handle_error(response, 'start crawl job')
|
||||
|
||||
def check_crawl_status(self, job_id: str) -> Any:
|
||||
def check_crawl_status(self, id: str) -> Any:
|
||||
"""
|
||||
Check the status of a crawl job using the Firecrawl API.
|
||||
|
||||
Args:
|
||||
job_id (str): The ID of the crawl job.
|
||||
id (str): The ID of the crawl job.
|
||||
|
||||
Returns:
|
||||
Any: The status of the crawl job.
|
||||
|
@@ -172,13 +186,72 @@ class FirecrawlApp:
         Raises:
             Exception: If the status check request fails.
         """
+        if self.version == 'v0':
+            endpoint = f'/{self.version}/crawl/status/{id}'
+        else:
+            endpoint = f'/{self.version}/crawl/{id}'
+
         headers = self._prepare_headers()
-        response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
+        response = self._get_request(f'{self.api_url}{endpoint}', headers)
         if response.status_code == 200:
-            return response.json()
+            data = response.json()
+            if self.version == 'v0':
+                return {
+                    'success': True,
+                    'status': data.get('status'),
+                    'current': data.get('current'),
+                    'current_url': data.get('current_url'),
+                    'current_step': data.get('current_step'),
+                    'total': data.get('total'),
+                    'data': data.get('data'),
+                    'partial_data': data.get('partial_data') if not data.get('data') else None,
+                }
+            elif self.version == 'v1':
+                return {
+                    'success': True,
+                    'status': data.get('status'),
+                    'totalCount': data.get('totalCount'),
+                    'creditsUsed': data.get('creditsUsed'),
+                    'expiresAt': data.get('expiresAt'),
+                    'next': data.get('next'),
+                    'data': data.get('data'),
+                    'error': data.get('error')
+                }
         else:
             self._handle_error(response, 'check crawl status')

+    def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
+        """
+        Perform a map search using the Firecrawl API.
+        """
+        if self.version == 'v0':
+            raise NotImplementedError("Map is not supported in v0")
+
+        endpoint = f'/{self.version}/map'
+        headers = self._prepare_headers()
+
+        # Prepare the base scrape parameters with the URL
+        json_data = {'url': url}
+        if params:
+            json_data.update(params)
+
+        # Make the POST request with the prepared headers and JSON data
+        response = requests.post(
+            f'{self.api_url}{endpoint}',
+            headers=headers,
+            json=json_data,
+        )
+        if response.status_code == 200:
+            response = response.json()
+            print(response)
+            if response['success'] and 'links' in response:
+                return response['links']
+            else:
+                raise Exception(f'Failed to map URL. Error: {response["error"]}')
+        else:
+            self._handle_error(response, 'map')
+
     def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
         """
         Prepare the headers for API requests.
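An illustrative one-off status check against the normalized dictionaries built above, followed by a map_url call; the crawl id and URL are placeholders.

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY", version="v1")

# check_crawl_status reshapes the raw v1 response into the dictionary shown above.
status = app.check_crawl_status("hypothetical-crawl-id")
print(status["status"], status.get("totalCount"), status.get("creditsUsed"))

# map_url is v1-only and returns the discovered links on success.
links = app.map_url("https://example.com")
print(links[:5])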
@@ -257,15 +330,15 @@ class FirecrawlApp:
                 return response
             return response

-    def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any:
+    def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int, check_url: Optional[str] = None) -> Any:
         """
         Monitor the status of a crawl job until completion.

         Args:
-            job_id (str): The ID of the crawl job.
+            id (str): The ID of the crawl job.
             headers (Dict[str, str]): The headers to include in the status check requests.
             poll_interval (int): Seconds between status checks.

+            check_url (Optional[str]): The URL to check for the crawl job.
         Returns:
             Any: The crawl results if the job is completed successfully.
@@ -273,15 +346,30 @@ class FirecrawlApp:
             Exception: If the job fails or an error occurs during status checks.
         """
         while True:
-            status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
+            api_url = ''
+            if (self.version == 'v0'):
+                if check_url:
+                    api_url = check_url
+                else:
+                    api_url = f'{self.api_url}/v0/crawl/status/{id}'
+            else:
+                if check_url:
+                    api_url = check_url
+                else:
+                    api_url = f'{self.api_url}/v1/crawl/{id}'
+
+            status_response = self._get_request(api_url, headers)
             if status_response.status_code == 200:
                 status_data = status_response.json()
                 if status_data['status'] == 'completed':
                     if 'data' in status_data:
-                        return status_data['data']
+                        if self.version == 'v0':
+                            return status_data['data']
+                        else:
+                            return status_data
                     else:
                         raise Exception('Crawl job completed but no data was returned')
-                elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']:
+                elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
                     poll_interval=max(poll_interval,2)
                     time.sleep(poll_interval)  # Wait for the specified interval before checking again
                 else:
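The same waiting behaviour can be reproduced outside the helper with check_crawl_status, treating the new 'scraping' state as in-progress; a rough sketch with placeholder values.

import time

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY", version="v1")
job = app.crawl_url("https://example.com", wait_until_done=False)

while True:
    status = app.check_crawl_status(job["id"])
    if status["status"] == "completed":
        docs = status["data"]
        break
    elif status["status"] in ("active", "paused", "pending", "queued", "waiting", "scraping"):
        time.sleep(5)  # the SDK's own loop enforces a minimum two-second interval
    else:
        raise RuntimeError(f"Crawl ended in unexpected state: {status['status']}")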
@@ -300,18 +388,19 @@ class FirecrawlApp:
         Raises:
             Exception: An exception with a message containing the status code and error details from the response.
         """
-        error_message = response.json().get('error', 'No additional error details provided.')
+        error_message = response.json().get('error', 'No error message provided.')
+        error_details = response.json().get('details', 'No additional error details provided.')

         if response.status_code == 402:
-            message = f"Payment Required: Failed to {action}. {error_message}"
+            message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"
         elif response.status_code == 408:
-            message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}"
+            message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
         elif response.status_code == 409:
-            message = f"Conflict: Failed to {action} due to a conflict. {error_message}"
+            message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
         elif response.status_code == 500:
-            message = f"Internal Server Error: Failed to {action}. {error_message}"
+            message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
         else:
-            message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}"
+            message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message} - {error_details}"

         # Raise an HTTPError with the custom message and attach the response
         raise requests.exceptions.HTTPError(message, response=response)
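Since _handle_error raises requests.exceptions.HTTPError with the response attached, callers can catch it directly; a sketch with placeholder values.

import requests

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY", version="v1")
try:
    app.scrape_url("https://example.com/does-not-exist")
except requests.exceptions.HTTPError as err:
    # The message already combines the API's 'error' and 'details' fields.
    print(err)
    if err.response is not None:
        print(err.response.status_code)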