mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Merge branch 'main' into feat/go-html-to-md-parser
This commit is contained in:
commit
34adf43200
1
.github/workflows/ci.yml
vendored
1
.github/workflows/ci.yml
vendored
|
@ -28,6 +28,7 @@ env:
|
|||
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
|
||||
HDX_NODE_BETA_MODE: 1
|
||||
FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
|
||||
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
|
||||
|
||||
|
||||
jobs:
|
||||
|
|
5
.github/workflows/fly.yml
vendored
5
.github/workflows/fly.yml
vendored
|
@ -28,6 +28,7 @@ env:
|
|||
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
|
||||
CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
|
||||
SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
|
||||
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
|
||||
|
||||
jobs:
|
||||
pre-deploy-e2e-tests:
|
||||
|
@ -57,6 +58,9 @@ jobs:
|
|||
run: npm run workers &
|
||||
working-directory: ./apps/api
|
||||
id: start_workers
|
||||
- name: Wait for the application to be ready
|
||||
run: |
|
||||
sleep 10
|
||||
- name: Run E2E tests
|
||||
run: |
|
||||
npm run test:prod
|
||||
|
@ -338,6 +342,7 @@ jobs:
|
|||
build-and-publish-rust-sdk:
|
||||
name: Build and publish Rust SDK
|
||||
runs-on: ubuntu-latest
|
||||
needs: deploy
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
|
|
10
README.md
10
README.md
|
@ -14,10 +14,9 @@
|
|||
<a href="https://GitHub.com/mendableai/firecrawl/graphs/contributors">
|
||||
<img src="https://img.shields.io/github/contributors/mendableai/firecrawl.svg" alt="GitHub Contributors">
|
||||
</a>
|
||||
<a href="https://github.com/mendableai/firecrawl">
|
||||
<img src="https://badgen.net/badge/Open%20Source%20%3F/Yes%21/blue?icon=github" alt="Open Source">
|
||||
<a href="https://firecrawl.dev">
|
||||
<img src="https://img.shields.io/badge/Visit-firecrawl.dev-orange" alt="Visit firecrawl.dev">
|
||||
</a>
|
||||
|
||||
</div>
|
||||
<div>
|
||||
<p align="center">
|
||||
|
@ -391,7 +390,7 @@ With LLM extraction, you can easily extract structured data from any URL. We sup
|
|||
|
||||
from firecrawl.firecrawl import FirecrawlApp
|
||||
|
||||
app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version="v0")
|
||||
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
|
||||
|
||||
class ArticleSchema(BaseModel):
|
||||
title: str
|
||||
|
@ -466,8 +465,7 @@ import FirecrawlApp from "@mendable/firecrawl-js";
|
|||
import { z } from "zod";
|
||||
|
||||
const app = new FirecrawlApp({
|
||||
apiKey: "fc-YOUR_API_KEY",
|
||||
version: "v0"
|
||||
apiKey: "fc-YOUR_API_KEY"
|
||||
});
|
||||
|
||||
// Define schema to extract contents into
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
import request from "supertest";
|
||||
import dotenv from "dotenv";
|
||||
import { configDotenv } from "dotenv";
|
||||
import {
|
||||
ScrapeRequest,
|
||||
ScrapeResponseRequestTest,
|
||||
} from "../../controllers/v1/types";
|
||||
|
||||
dotenv.config();
|
||||
configDotenv();
|
||||
const TEST_URL = "http://127.0.0.1:3002";
|
||||
|
||||
describe("E2E Tests for v1 API Routes", () => {
|
||||
|
@ -22,6 +22,13 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||
const response: ScrapeResponseRequestTest = await request(TEST_URL).get(
|
||||
"/is-production"
|
||||
);
|
||||
|
||||
console.log('process.env.USE_DB_AUTHENTICATION', process.env.USE_DB_AUTHENTICATION);
|
||||
console.log('?', process.env.USE_DB_AUTHENTICATION === 'true');
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
console.log('!!useDbAuthentication', !!useDbAuthentication);
|
||||
console.log('!useDbAuthentication', !useDbAuthentication);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("isProduction");
|
||||
});
|
||||
|
@ -29,9 +36,10 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||
|
||||
describe("POST /v1/scrape", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
|
||||
"/v1/scrape"
|
||||
);
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.send({ url: "https://firecrawl.dev"})
|
||||
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
|
@ -389,7 +397,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://ycombinator.com/companies",
|
||||
formats: ["markdown"],
|
||||
waitFor: 5000
|
||||
waitFor: 8000
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
|
@ -451,9 +459,9 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||
|
||||
describe("POST /v1/map", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
|
||||
"/v1/map"
|
||||
);
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
|
@ -534,7 +542,9 @@ describe("POST /v1/map", () => {
|
|||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
expect(links[0]).toContain("docs.firecrawl.dev");
|
||||
|
||||
const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev"));
|
||||
expect(containsDocsFirecrawlDev).toBe(true);
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => {
|
||||
|
@ -559,7 +569,9 @@ describe("POST /v1/map", () => {
|
|||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
expect(links[0]).toContain("docs.firecrawl.dev");
|
||||
|
||||
const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev"));
|
||||
expect(containsDocsFirecrawlDev).toBe(true);
|
||||
}, 10000)
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => {
|
||||
|
@ -609,9 +621,9 @@ describe("POST /v1/map", () => {
|
|||
|
||||
describe("POST /v1/crawl", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
|
||||
"/v1/crawl"
|
||||
);
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
|
@ -863,7 +875,7 @@ describe("GET /v1/crawl/:jobId", () => {
|
|||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://docs.mendable.ai" });
|
||||
.send({ url: "https://docs.firecrawl.dev" });
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
let isCompleted = false;
|
||||
|
@ -893,9 +905,7 @@ describe("GET /v1/crawl/:jobId", () => {
|
|||
expect(completedResponse.body.data[0]).not.toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(
|
||||
200
|
||||
);
|
||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
||||
expect(
|
||||
completedResponse.body.data[0].metadata.error
|
||||
).toBeUndefined();
|
||||
|
|
|
@ -659,7 +659,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://mendable.ai/blog" });
|
||||
.send({ url: "https://firecrawl.dev/blog" });
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
let isCompleted = false;
|
||||
|
@ -689,10 +689,8 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
||||
200
|
||||
);
|
||||
expect(completedResponse.body.data[0].content).toContain("Firecrawl");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(
|
||||
completedResponse.body.data[0].metadata.pageError
|
||||
).toBeUndefined();
|
||||
|
@ -701,7 +699,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||
(doc) =>
|
||||
doc.metadata &&
|
||||
doc.metadata.sourceURL &&
|
||||
doc.metadata.sourceURL.includes("mendable.ai/blog")
|
||||
doc.metadata.sourceURL.includes("firecrawl.dev/blog")
|
||||
);
|
||||
|
||||
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
|
||||
|
|
|
@ -5,6 +5,8 @@ import { supabase_service } from "../../../src/services/supabase";
|
|||
import { Logger } from "../../../src/lib/logger";
|
||||
import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export async function crawlCancelController(req: Request, res: Response) {
|
||||
try {
|
||||
|
|
|
@ -6,6 +6,8 @@ import { Logger } from "../../../src/lib/logger";
|
|||
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
|
||||
import { supabaseGetJobsById } from "../../../src/lib/supabase-jobs";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export async function getJobs(ids: string[]) {
|
||||
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
|
||||
|
|
|
@ -244,16 +244,12 @@ export async function scrapeController(req: Request, res: Response) {
|
|||
}
|
||||
if (creditsToBeBilled > 0) {
|
||||
// billing for doc done on queue end, bill only for llm extraction
|
||||
const billingResult = await billTeam(team_id, creditsToBeBilled);
|
||||
if (!billingResult.success) {
|
||||
return res.status(402).json({
|
||||
success: false,
|
||||
error:
|
||||
"Failed to bill team. Insufficient credits or subscription not found.",
|
||||
billTeam(team_id, creditsToBeBilled).catch(error => {
|
||||
Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let doc = result.data;
|
||||
if (!pageOptions || !pageOptions.includeRawHtml) {
|
||||
|
|
|
@ -54,18 +54,10 @@ export async function searchHelper(
|
|||
|
||||
|
||||
if (justSearch) {
|
||||
const billingResult = await billTeam(
|
||||
team_id,
|
||||
res.length
|
||||
);
|
||||
if (!billingResult.success) {
|
||||
return {
|
||||
success: false,
|
||||
error:
|
||||
"Failed to bill team. Insufficient credits or subscription not found.",
|
||||
returnCode: 402,
|
||||
};
|
||||
}
|
||||
billTeam(team_id, res.length).catch(error => {
|
||||
Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
});
|
||||
return { success: true, data: res, returnCode: 200 };
|
||||
}
|
||||
|
||||
|
|
|
@ -5,6 +5,8 @@ import { supabase_service } from "../../services/supabase";
|
|||
import { Logger } from "../../lib/logger";
|
||||
import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export async function crawlCancelController(req: Request, res: Response) {
|
||||
try {
|
||||
|
|
|
@ -3,6 +3,8 @@ import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentCo
|
|||
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export async function getJob(id: string) {
|
||||
const job = await getScrapeQueue().getJob(id);
|
||||
|
|
|
@ -18,6 +18,7 @@ import { fireEngineMap } from "../../search/fireEngine";
|
|||
import { billTeam } from "../../services/billing/credit_billing";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { performCosineSimilarity } from "../../lib/map-cosine";
|
||||
import { Logger } from "../../lib/logger";
|
||||
|
||||
configDotenv();
|
||||
|
||||
|
@ -61,8 +62,8 @@ export async function mapController(
|
|||
: `site:${req.body.url}`;
|
||||
// www. seems to exclude subdomains in some cases
|
||||
const mapResults = await fireEngineMap(mapUrl, {
|
||||
// limit to 50 results (beta)
|
||||
numResults: Math.min(limit, 50),
|
||||
// limit to 100 results (beta)
|
||||
numResults: Math.min(limit, 100),
|
||||
});
|
||||
|
||||
if (mapResults.length > 0) {
|
||||
|
@ -100,7 +101,10 @@ export async function mapController(
|
|||
// remove duplicates that could be due to http/https or www
|
||||
links = removeDuplicateUrls(links);
|
||||
|
||||
await billTeam(req.auth.team_id, 1);
|
||||
billTeam(req.auth.team_id, 1).catch(error => {
|
||||
Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
});
|
||||
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
|
@ -127,5 +131,6 @@ export async function mapController(
|
|||
return res.status(200).json({
|
||||
success: true,
|
||||
links: linksToReturn,
|
||||
scrape_id: req.body.origin?.includes("website") ? id : undefined,
|
||||
});
|
||||
}
|
||||
|
|
|
@ -106,14 +106,10 @@ export async function scrapeController(
|
|||
creditsToBeBilled = 50;
|
||||
}
|
||||
|
||||
const billingResult = await billTeam(req.auth.team_id, creditsToBeBilled);
|
||||
if (!billingResult.success) {
|
||||
return res.status(402).json({
|
||||
success: false,
|
||||
error:
|
||||
"Failed to bill team. Insufficient credits or subscription not found.",
|
||||
billTeam(req.auth.team_id, creditsToBeBilled).catch(error => {
|
||||
Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
});
|
||||
}
|
||||
|
||||
if (!pageOptions || !pageOptions.includeRawHtml) {
|
||||
if (doc && doc.rawHtml) {
|
||||
|
@ -147,5 +143,6 @@ export async function scrapeController(
|
|||
return res.status(200).json({
|
||||
success: true,
|
||||
data: legacyDocumentConverter(doc),
|
||||
scrape_id: origin?.includes("website") ? jobId : undefined,
|
||||
});
|
||||
}
|
||||
|
|
|
@ -225,6 +225,7 @@ export type ScrapeResponse =
|
|||
success: true;
|
||||
warning?: string;
|
||||
data: Document;
|
||||
scrape_id?: string;
|
||||
};
|
||||
|
||||
export interface ScrapeResponseRequestTest {
|
||||
|
@ -246,6 +247,7 @@ export type MapResponse =
|
|||
| {
|
||||
success: true;
|
||||
links: string[];
|
||||
scrape_id?: string;
|
||||
};
|
||||
|
||||
export type CrawlStatusParams = {
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
enum LogLevel {
|
||||
NONE = 'NONE', // No logs will be output.
|
||||
ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation.
|
||||
|
@ -25,7 +28,8 @@ export class Logger {
|
|||
const color = Logger.colors[level];
|
||||
console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);
|
||||
|
||||
// if (process.env.USE_DB_AUTH) {
|
||||
// const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
// if (useDbAuthentication) {
|
||||
// save to supabase? another place?
|
||||
// supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
|
||||
// }
|
||||
|
|
|
@ -2,6 +2,8 @@ import { Job } from "bullmq";
|
|||
import type { baseScrapers } from "../scraper/WebScraper/single_url";
|
||||
import { supabase_service as supabase } from "../services/supabase";
|
||||
import { Logger } from "./logger";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export type ScrapeErrorEvent = {
|
||||
type: "error",
|
||||
|
@ -36,7 +38,8 @@ export class ScrapeEvents {
|
|||
static async insert(jobId: string, content: ScrapeEvent) {
|
||||
if (jobId === "TEST") return null;
|
||||
|
||||
if (process.env.USE_DB_AUTHENTICATION) {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (useDbAuthentication) {
|
||||
try {
|
||||
const result = await supabase.from("scrape_events").insert({
|
||||
job_id: jobId,
|
||||
|
|
|
@ -1,5 +1,8 @@
|
|||
import { AuthResponse } from "../../src/types";
|
||||
import { Logger } from "./logger";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
let warningCount = 0;
|
||||
|
||||
|
@ -7,7 +10,8 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
|
|||
originalFunction: (...args: U) => Promise<T>
|
||||
) {
|
||||
return async function (...args: U): Promise<T> {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (!useDbAuthentication) {
|
||||
if (warningCount < 5) {
|
||||
Logger.warn("You're bypassing authentication");
|
||||
warningCount++;
|
||||
|
@ -17,6 +21,7 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
|
|||
try {
|
||||
return await originalFunction(...args);
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(`Error in withAuth function: ${error}`);
|
||||
return { success: false, error: error.message } as T;
|
||||
}
|
||||
|
|
|
@ -12,6 +12,8 @@ import { Document } from "../lib/entities";
|
|||
import { supabase_service } from "../services/supabase";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { ScrapeEvents } from "../lib/scrape-events";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export async function startWebScraperPipeline({
|
||||
job,
|
||||
|
@ -118,15 +120,10 @@ export async function runWebScraper({
|
|||
: docs;
|
||||
|
||||
if(is_scrape === false) {
|
||||
const billingResult = await billTeam(team_id, filteredDocs.length);
|
||||
if (!billingResult.success) {
|
||||
// throw new Error("Failed to bill team, no subscription was found");
|
||||
return {
|
||||
success: false,
|
||||
message: "Failed to bill team, no subscription was found",
|
||||
docs: [],
|
||||
};
|
||||
}
|
||||
billTeam(team_id, filteredDocs.length).catch(error => {
|
||||
Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
@ -144,7 +141,8 @@ export async function runWebScraper({
|
|||
|
||||
const saveJob = async (job: Job, result: any, token: string, mode: string) => {
|
||||
try {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (useDbAuthentication) {
|
||||
const { data, error } = await supabase_service
|
||||
.from("firecrawl_jobs")
|
||||
.update({ docs: result })
|
||||
|
|
|
@ -33,8 +33,10 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
|
|||
const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum);
|
||||
if (!success) {
|
||||
Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
|
||||
if (!res.headersSent) {
|
||||
return res.status(402).json({ success: false, error: "Insufficient credits" });
|
||||
}
|
||||
}
|
||||
req.account = { remainingCredits }
|
||||
next();
|
||||
})()
|
||||
|
@ -52,8 +54,10 @@ export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestW
|
|||
);
|
||||
|
||||
if (!success) {
|
||||
if (!res.headersSent) {
|
||||
return res.status(status).json({ success: false, error });
|
||||
}
|
||||
}
|
||||
|
||||
req.auth = { team_id, plan };
|
||||
next();
|
||||
|
@ -67,8 +71,10 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
|
|||
if (req.headers["x-idempotency-key"]) {
|
||||
const isIdempotencyValid = await validateIdempotencyKey(req);
|
||||
if (!isIdempotencyValid) {
|
||||
if (!res.headersSent) {
|
||||
return res.status(409).json({ success: false, error: "Idempotency key already used" });
|
||||
}
|
||||
}
|
||||
createIdempotencyKey(req);
|
||||
}
|
||||
next();
|
||||
|
@ -78,8 +84,10 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
|
|||
|
||||
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
|
||||
if (req.body.url && isUrlBlocked(req.body.url)) {
|
||||
if (!res.headersSent) {
|
||||
return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
|
||||
}
|
||||
}
|
||||
next();
|
||||
}
|
||||
|
||||
|
@ -96,26 +104,26 @@ export const v1Router = express.Router();
|
|||
|
||||
v1Router.post(
|
||||
"/scrape",
|
||||
blocklistMiddleware,
|
||||
authMiddleware(RateLimiterMode.Scrape),
|
||||
checkCreditsMiddleware(1),
|
||||
blocklistMiddleware,
|
||||
wrap(scrapeController)
|
||||
);
|
||||
|
||||
v1Router.post(
|
||||
"/crawl",
|
||||
blocklistMiddleware,
|
||||
authMiddleware(RateLimiterMode.Crawl),
|
||||
idempotencyMiddleware,
|
||||
checkCreditsMiddleware(),
|
||||
blocklistMiddleware,
|
||||
idempotencyMiddleware,
|
||||
wrap(crawlController)
|
||||
);
|
||||
|
||||
v1Router.post(
|
||||
"/map",
|
||||
blocklistMiddleware,
|
||||
authMiddleware(RateLimiterMode.Map),
|
||||
checkCreditsMiddleware(1),
|
||||
blocklistMiddleware,
|
||||
wrap(mapController)
|
||||
);
|
||||
|
||||
|
|
|
@ -23,12 +23,15 @@ import { clientSideError } from "../../strings";
|
|||
|
||||
dotenv.config();
|
||||
|
||||
const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
|
||||
const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
|
||||
|
||||
export const baseScrapers = [
|
||||
"fire-engine;chrome-cdp",
|
||||
"fire-engine",
|
||||
"scrapingBee",
|
||||
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
|
||||
"scrapingBeeLoad",
|
||||
useFireEngine ? "fire-engine;chrome-cdp" : undefined,
|
||||
useFireEngine ? "fire-engine" : undefined,
|
||||
useScrapingBee ? "scrapingBee" : undefined,
|
||||
useFireEngine ? undefined : "playwright",
|
||||
useScrapingBee ? "scrapingBeeLoad" : undefined,
|
||||
"fetch",
|
||||
].filter(Boolean);
|
||||
|
||||
|
@ -85,18 +88,18 @@ function getScrapingFallbackOrder(
|
|||
});
|
||||
|
||||
let defaultOrder = [
|
||||
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp",
|
||||
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
|
||||
"scrapingBee",
|
||||
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
|
||||
"scrapingBeeLoad",
|
||||
useFireEngine ? "fire-engine;chrome-cdp" : undefined,
|
||||
useFireEngine ? "fire-engine" : undefined,
|
||||
useScrapingBee ? "scrapingBee" : undefined,
|
||||
useScrapingBee ? "scrapingBeeLoad" : undefined,
|
||||
useFireEngine ? undefined : "playwright",
|
||||
"fetch",
|
||||
].filter(Boolean);
|
||||
|
||||
if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
|
||||
defaultOrder = [
|
||||
"fire-engine",
|
||||
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
|
||||
useFireEngine ? undefined : "playwright",
|
||||
...defaultOrder.filter(
|
||||
(scraper) => scraper !== "fire-engine" && scraper !== "playwright"
|
||||
),
|
||||
|
|
|
@ -5,7 +5,7 @@ import { supabase_service } from "../supabase";
|
|||
import { Logger } from "../../lib/logger";
|
||||
import { getValue, setValue } from "../redis";
|
||||
import { redlock } from "../redlock";
|
||||
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
const FREE_CREDITS = 500;
|
||||
|
||||
|
@ -40,14 +40,15 @@ export async function supaBillTeam(team_id: string, credits: number) {
|
|||
]);
|
||||
|
||||
let couponCredits = 0;
|
||||
let sortedCoupons = [];
|
||||
|
||||
if (coupons && coupons.length > 0) {
|
||||
couponCredits = coupons.reduce(
|
||||
(total, coupon) => total + coupon.credits,
|
||||
0
|
||||
);
|
||||
sortedCoupons = [...coupons].sort((a, b) => b.credits - a.credits);
|
||||
}
|
||||
|
||||
let sortedCoupons = coupons.sort((a, b) => b.credits - a.credits);
|
||||
// using coupon credits:
|
||||
if (couponCredits > 0) {
|
||||
// if there is no subscription and they have enough coupon credits
|
||||
|
@ -175,9 +176,24 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
|
||||
}
|
||||
|
||||
// Retrieve the team's active subscription and check for available coupons concurrently
|
||||
const [{ data: subscription, error: subscriptionError }, { data: coupons }] =
|
||||
await Promise.all([
|
||||
|
||||
let cacheKeySubscription = `subscription_${team_id}`;
|
||||
let cacheKeyCoupons = `coupons_${team_id}`;
|
||||
|
||||
// Try to get data from cache first
|
||||
const [cachedSubscription, cachedCoupons] = await Promise.all([
|
||||
getValue(cacheKeySubscription),
|
||||
getValue(cacheKeyCoupons)
|
||||
]);
|
||||
|
||||
let subscription, subscriptionError, coupons;
|
||||
|
||||
if (cachedSubscription && cachedCoupons) {
|
||||
subscription = JSON.parse(cachedSubscription);
|
||||
coupons = JSON.parse(cachedCoupons);
|
||||
} else {
|
||||
// If not in cache, retrieve from database
|
||||
const [subscriptionResult, couponsResult] = await Promise.all([
|
||||
supabase_service
|
||||
.from("subscriptions")
|
||||
.select("id, price_id, current_period_start, current_period_end")
|
||||
|
@ -191,6 +207,16 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
.eq("status", "active"),
|
||||
]);
|
||||
|
||||
subscription = subscriptionResult.data;
|
||||
subscriptionError = subscriptionResult.error;
|
||||
coupons = couponsResult.data;
|
||||
|
||||
// Cache the results for a minute, sub can be null and that's fine
|
||||
await setValue(cacheKeySubscription, JSON.stringify(subscription), 60); // Cache for 1 minute, even if null
|
||||
await setValue(cacheKeyCoupons, JSON.stringify(coupons), 60); // Cache for 1 minute
|
||||
|
||||
}
|
||||
|
||||
let couponCredits = 0;
|
||||
if (coupons && coupons.length > 0) {
|
||||
couponCredits = coupons.reduce(
|
||||
|
@ -211,6 +237,15 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
|
||||
let creditUsages;
|
||||
let creditUsageError;
|
||||
let totalCreditsUsed = 0;
|
||||
const cacheKeyCreditUsage = `credit_usage_${team_id}`;
|
||||
|
||||
// Try to get credit usage from cache
|
||||
const cachedCreditUsage = await getValue(cacheKeyCreditUsage);
|
||||
|
||||
if (cachedCreditUsage) {
|
||||
totalCreditsUsed = parseInt(cachedCreditUsage);
|
||||
} else {
|
||||
let retries = 0;
|
||||
const maxRetries = 3;
|
||||
const retryInterval = 2000; // 2 seconds
|
||||
|
@ -242,11 +277,15 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
);
|
||||
}
|
||||
|
||||
const totalCreditsUsed = creditUsages.reduce(
|
||||
totalCreditsUsed = creditUsages.reduce(
|
||||
(acc, usage) => acc + usage.credits_used,
|
||||
0
|
||||
);
|
||||
|
||||
// Cache the result for 30 seconds
|
||||
await setValue(cacheKeyCreditUsage, totalCreditsUsed.toString(), 30);
|
||||
}
|
||||
|
||||
Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`);
|
||||
|
||||
const end = new Date();
|
||||
|
@ -255,7 +294,9 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
const creditLimit = FREE_CREDITS;
|
||||
const creditUsagePercentage = (totalCreditsUsed + credits) / creditLimit;
|
||||
|
||||
if (creditUsagePercentage >= 0.8) {
|
||||
// Add a check to ensure totalCreditsUsed is greater than 0
|
||||
if (totalCreditsUsed > 0 && creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
|
||||
Logger.info(`Sending notification for team ${team_id}. Total credits used: ${totalCreditsUsed}, Credit usage percentage: ${creditUsagePercentage}`);
|
||||
await sendNotification(
|
||||
team_id,
|
||||
NotificationType.APPROACHING_LIMIT,
|
||||
|
@ -309,7 +350,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
|
||||
if (creditUsages && creditUsages.length > 0) {
|
||||
totalCreditsUsed = creditUsages[0].total_credits_used;
|
||||
await setValue(cacheKey, totalCreditsUsed.toString(), 1800); // Cache for 30 minutes
|
||||
await setValue(cacheKey, totalCreditsUsed.toString(), 500); // Cache for 8 minutes
|
||||
// Logger.info(`Cache set for credit usage: ${totalCreditsUsed}`);
|
||||
}
|
||||
}
|
||||
|
@ -322,8 +363,17 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
|
||||
// Adjust total credits used by subtracting coupon value
|
||||
const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);
|
||||
// Get the price details
|
||||
const { data: price, error: priceError } = await supabase_service
|
||||
|
||||
// Get the price details from cache or database
|
||||
const priceCacheKey = `price_${subscription.price_id}`;
|
||||
let price;
|
||||
|
||||
try {
|
||||
const cachedPrice = await getValue(priceCacheKey);
|
||||
if (cachedPrice) {
|
||||
price = JSON.parse(cachedPrice);
|
||||
} else {
|
||||
const { data, error: priceError } = await supabase_service
|
||||
.from("prices")
|
||||
.select("credits")
|
||||
.eq("id", subscription.price_id)
|
||||
|
@ -335,6 +385,18 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
);
|
||||
}
|
||||
|
||||
price = data;
|
||||
// There are only 21 records, so this is super fine
|
||||
// Cache the price for a long time (e.g., 1 day)
|
||||
await setValue(priceCacheKey, JSON.stringify(price), 86400);
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Error retrieving or caching price: ${error}`);
|
||||
Sentry.captureException(error);
|
||||
// If errors, just assume it's a big number so user don't get an error
|
||||
price = { credits: 1000000 };
|
||||
}
|
||||
|
||||
const creditLimit = price.credits;
|
||||
const creditUsagePercentage = (adjustedCreditsUsed + credits) / creditLimit;
|
||||
|
||||
|
@ -462,7 +524,7 @@ async function createCreditUsage({
|
|||
subscription_id?: string;
|
||||
credits: number;
|
||||
}) {
|
||||
const { data: credit_usage } = await supabase_service
|
||||
await supabase_service
|
||||
.from("credit_usage")
|
||||
.insert([
|
||||
{
|
||||
|
@ -471,8 +533,7 @@ async function createCreditUsage({
|
|||
subscription_id: subscription_id || null,
|
||||
created_at: new Date(),
|
||||
},
|
||||
])
|
||||
.select();
|
||||
]);
|
||||
|
||||
return { success: true, credit_usage };
|
||||
return { success: true };
|
||||
}
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
import { supabase_service } from "../supabase";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import "dotenv/config";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export async function logCrawl(job_id: string, team_id: string) {
|
||||
if (process.env.USE_DB_AUTHENTICATION === 'true') {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (useDbAuthentication) {
|
||||
try {
|
||||
const { data, error } = await supabase_service
|
||||
.from("bulljobs_teams")
|
||||
|
|
|
@ -4,10 +4,13 @@ import { FirecrawlJob } from "../../types";
|
|||
import { posthog } from "../posthog";
|
||||
import "dotenv/config";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export async function logJob(job: FirecrawlJob) {
|
||||
try {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (!useDbAuthentication) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -3,12 +3,15 @@ import { ScrapeLog } from "../../types";
|
|||
import { supabase_service } from "../supabase";
|
||||
import { PageOptions } from "../../lib/entities";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export async function logScrape(
|
||||
scrapeLog: ScrapeLog,
|
||||
pageOptions?: PageOptions
|
||||
) {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (!useDbAuthentication) {
|
||||
Logger.debug("Skipping logging scrape to Supabase");
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -67,6 +67,6 @@ export function waitForJob(jobId: string, timeout: number) {
|
|||
reject((await getScrapeQueue().getJob(jobId)).failedReason);
|
||||
}
|
||||
}
|
||||
}, 1000);
|
||||
}, 500);
|
||||
})
|
||||
}
|
||||
|
|
|
@ -36,6 +36,8 @@ import {
|
|||
} from "../../src/lib/job-priority";
|
||||
import { PlanType } from "../types";
|
||||
import { getJobs } from "../../src/controllers/v1/crawl-status";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
if (process.env.ENV === "production") {
|
||||
initSDK({
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
import { createClient, SupabaseClient } from "@supabase/supabase-js";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
// SupabaseService class initializes the Supabase client conditionally based on environment variables.
|
||||
class SupabaseService {
|
||||
|
@ -8,8 +10,9 @@ class SupabaseService {
|
|||
constructor() {
|
||||
const supabaseUrl = process.env.SUPABASE_URL;
|
||||
const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN;
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
// Only initialize the Supabase client if both URL and Service Token are provided.
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
if (!useDbAuthentication) {
|
||||
// Warn the user that Authentication is disabled by setting the client to null
|
||||
Logger.warn(
|
||||
"Authentication is disabled. Supabase client will not be initialized."
|
||||
|
|
|
@ -3,6 +3,8 @@ import { legacyDocumentConverter } from "../../src/controllers/v1/types";
|
|||
import { Logger } from "../../src/lib/logger";
|
||||
import { supabase_service } from "./supabase";
|
||||
import { WebhookEventType } from "../types";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export const callWebhook = async (
|
||||
teamId: string,
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@mendable/firecrawl-js",
|
||||
"version": "1.2.1",
|
||||
"version": "1.2.2",
|
||||
"description": "JavaScript SDK for Firecrawl API",
|
||||
"main": "build/cjs/index.js",
|
||||
"types": "types/index.d.ts",
|
||||
|
|
|
@ -454,20 +454,27 @@ export default class FirecrawlApp {
|
|||
checkInterval: number
|
||||
): Promise<CrawlStatusResponse> {
|
||||
while (true) {
|
||||
const statusResponse: AxiosResponse = await this.getRequest(
|
||||
let statusResponse: AxiosResponse = await this.getRequest(
|
||||
`${this.apiUrl}/v1/crawl/${id}`,
|
||||
headers
|
||||
);
|
||||
if (statusResponse.status === 200) {
|
||||
const statusData = statusResponse.data;
|
||||
let statusData = statusResponse.data;
|
||||
if (statusData.status === "completed") {
|
||||
if ("data" in statusData) {
|
||||
let data = statusData.data;
|
||||
while ('next' in statusData) {
|
||||
statusResponse = await this.getRequest(statusData.next, headers);
|
||||
statusData = statusResponse.data;
|
||||
data = data.concat(statusData.data);
|
||||
}
|
||||
statusData.data = data;
|
||||
return statusData;
|
||||
} else {
|
||||
throw new Error("Crawl job completed but no data was returned");
|
||||
}
|
||||
} else if (
|
||||
["active", "paused", "pending", "queued", "scraping"].includes(statusData.status)
|
||||
["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
|
||||
) {
|
||||
checkInterval = Math.max(checkInterval, 2);
|
||||
await new Promise((resolve) =>
|
||||
|
|
|
@ -13,7 +13,7 @@ import os
|
|||
|
||||
from .firecrawl import FirecrawlApp
|
||||
|
||||
__version__ = "1.2.1"
|
||||
__version__ = "1.2.3"
|
||||
|
||||
# Define the logger for the Firecrawl project
|
||||
logger: logging.Logger = logging.getLogger("firecrawl")
|
||||
|
|
|
@ -238,7 +238,6 @@ class FirecrawlApp:
|
|||
)
|
||||
if response.status_code == 200:
|
||||
response = response.json()
|
||||
print(response)
|
||||
if response['success'] and 'links' in response:
|
||||
return response['links']
|
||||
else:
|
||||
|
@ -346,6 +345,12 @@ class FirecrawlApp:
|
|||
status_data = status_response.json()
|
||||
if status_data['status'] == 'completed':
|
||||
if 'data' in status_data:
|
||||
data = status_data['data']
|
||||
while 'next' in status_data:
|
||||
status_response = self._get_request(status_data['next'], headers)
|
||||
status_data = status_response.json()
|
||||
data.extend(status_data['data'])
|
||||
status_data['data'] = data
|
||||
return status_data
|
||||
else:
|
||||
raise Exception('Crawl job completed but no data was returned')
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import { createClient, SupabaseClient } from "@supabase/supabase-js";
|
||||
import "dotenv/config";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
// SupabaseService class initializes the Supabase client conditionally based on environment variables.
|
||||
class SupabaseService {
|
||||
|
@ -9,7 +10,8 @@ class SupabaseService {
|
|||
const supabaseUrl = process.env.SUPABASE_URL;
|
||||
const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN;
|
||||
// Only initialize the Supabase client if both URL and Service Token are provided.
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (!useDbAuthentication) {
|
||||
// Warn the user that Authentication is disabled by setting the client to null
|
||||
console.warn(
|
||||
"Authentication is disabled. Supabase client will not be initialized."
|
||||
|
@ -36,7 +38,8 @@ export const supabase_service: SupabaseClient = new Proxy(
|
|||
new SupabaseService(),
|
||||
{
|
||||
get: function (target, prop, receiver) {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (!useDbAuthentication) {
|
||||
console.debug(
|
||||
"Attempted to access Supabase client when it's not configured."
|
||||
);
|
||||
|
|
Loading…
Reference in New Issue
Block a user