diff --git a/README.md b/README.md index 01324690..e2197ded 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ _This repository is in its early development stages. We are still merging custom ## What is Firecrawl? -[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required. +[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required. Check out our [documentation](https://docs.firecrawl.dev). _Pst. hey, you, join our stargazers :)_ @@ -41,18 +41,26 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge Used to crawl a URL and all accessible subpages. This submits a crawl job and returns a job ID to check the status of the crawl. ```bash -curl -X POST https://api.firecrawl.dev/v0/crawl \ +curl -X POST https://api.firecrawl.dev/v1/crawl \ -H 'Content-Type: application/json' \ - -H 'Authorization: Bearer YOUR_API_KEY' \ + -H 'Authorization: Bearer fc-YOUR_API_KEY' \ -d '{ - "url": "https://mendable.ai" + "url": "https://docs.firecrawl.dev", + "limit": 100, + "scrapeOptions": { + "formats": ["markdown", "html"] + } }' ``` -Returns a jobId +Returns a crawl job ID and the URL to check the status of the crawl. ```json -{ "jobId": "1234-5678-9101" } +{ + "success": true, + "id": "123-456-789", + "url": "https://api.firecrawl.dev/v1/crawl/123-456-789" +} ``` ### Check Crawl Job @@ -60,7 +68,7 @@ Returns a jobId Used to check the status of a crawl job and get its result. 
```bash -curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \ +curl -X GET https://api.firecrawl.dev/v1/crawl/123-456-789 \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer YOUR_API_KEY' ``` @@ -68,18 +76,20 @@ curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \ ```json { "status": "completed", - "current": 22, - "total": 22, + "totalCount": 36, + "creditsUsed": 36, + "expiresAt": "2024-01-01T00:00:00.000Z", "data": [ { - "content": "Raw Content ", - "markdown": "# Markdown Content", - "provider": "web-scraper", + "markdown": "[Firecrawl Docs home page![light logo](https://mintlify.s3-us-west-1.amazonaws.com/firecrawl/logo/light.svg)!...", + "html": "...", "metadata": { - "title": "Mendable | AI for CX and Sales", - "description": "AI for CX and Sales", - "language": null, - "sourceURL": "https://www.mendable.ai/" + "title": "Build a 'Chat with website' using Groq Llama 3 | Firecrawl", + "language": "en", + "sourceURL": "https://docs.firecrawl.dev/learn/rag-llama3", + "description": "Learn how to use Firecrawl, Groq Llama 3, and Langchain to build a 'Chat with your website' bot.", + "ogLocaleAlternate": [], + "statusCode": 200 } } ] @@ -88,14 +98,15 @@ curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \ ### Scraping -Used to scrape a URL and get its content. +Used to scrape a URL and get its content in the specified formats. ```bash -curl -X POST https://api.firecrawl.dev/v0/scrape \ +curl -X POST https://api.firecrawl.dev/v1/scrape \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer YOUR_API_KEY' \ -d '{ - "url": "https://mendable.ai" + "url": "https://docs.firecrawl.dev", + "formats" : ["markdown", "html"] }' ``` @@ -105,55 +116,83 @@ Response: { "success": true, "data": { - "content": "Raw Content ", - "markdown": "# Markdown Content", - "provider": "web-scraper", + "markdown": "Launch Week I is here! 
[See our Day 2 Release 🚀](https://www.firecrawl.dev/blog/launch-week-i-day-2-doubled-rate-limits)[💥 Get 2 months free...", + "html": " { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .set("x-idempotency-key", uniqueIdempotencyKey) - .send({ url: 'https://mendable.ai' }); + .send({ url: 'https://docs.firecrawl.dev' }); expect(firstResponse.statusCode).toBe(200); @@ -414,7 +414,7 @@ describe("E2E Tests for API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .set("x-idempotency-key", uniqueIdempotencyKey) - .send({ url: 'https://mendable.ai' }); + .send({ url: 'https://docs.firecrawl.dev' }); expect(secondResponse.statusCode).toBe(409); expect(secondResponse.body.error).toBe('Idempotency key already used'); diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index 9c2eaec5..b960ae55 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -863,7 +863,7 @@ describe("GET /v1/crawl/:jobId", () => { .post("/v1/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://mendable.ai/blog" }); + .send({ url: "https://roastmywebsite.ai" }); expect(crawlResponse.statusCode).toBe(200); let isCompleted = false; diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index da66830b..c4c7de65 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -31,7 +31,8 @@ it('should return a list of links on the firecrawl.ai page', async () => { // Check if the result contains a list of links expect(result.linksOnPage).toBeDefined(); + console.log({result}); 
expect(Array.isArray(result.linksOnPage)).toBe(true); expect(result.linksOnPage.length).toBeGreaterThan(0); expect(result.linksOnPage).toContain('https://flutterbricks.com/features') -}, 10000); +}, 15000); diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 781d7026..bdcd62cd 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -146,7 +146,7 @@ export async function scrapSingleUrl( if (extractorOptions) { extractorOptions = { - mode: extractorOptions.mode ?? "llm-extraction-from-markdown", + mode: extractorOptions?.mode ?? "llm-extraction-from-markdown", } } @@ -392,7 +392,7 @@ export async function scrapSingleUrl( html: pageOptions.includeHtml ? html : undefined, rawHtml: pageOptions.includeRawHtml || - extractorOptions.mode === "llm-extraction-from-raw-html" + extractorOptions?.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined, linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined, @@ -411,7 +411,7 @@ export async function scrapSingleUrl( html: pageOptions.includeHtml ? html : undefined, rawHtml: pageOptions.includeRawHtml || - extractorOptions.mode === "llm-extraction-from-raw-html" + extractorOptions?.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined, metadata: {