mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 11:42:24 +08:00)

Merge branch 'main' into fix-attw
commit 51da038405
.github/workflows/ci.yml (vendored, 3 changes)

@@ -28,7 +28,8 @@ env:
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}

USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
ENV: ${{ secrets.ENV }}

jobs:
pre-deploy:
.github/workflows/fly-direct.yml (vendored, 6 changes)

@@ -22,7 +22,13 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }}
PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
ENV: ${{ secrets.ENV }}

jobs:
deploy:
.github/workflows/fly.yml (vendored, 6 changes)

@@ -28,6 +28,8 @@ env:
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
ENV: ${{ secrets.ENV }}

jobs:
pre-deploy-e2e-tests:

@@ -57,6 +59,9 @@ jobs:
run: npm run workers &
working-directory: ./apps/api
id: start_workers
- name: Wait for the application to be ready
run: |
sleep 10
- name: Run E2E tests
run: |
npm run test:prod

@@ -338,6 +343,7 @@ jobs:
build-and-publish-rust-sdk:
name: Build and publish Rust SDK
runs-on: ubuntu-latest
needs: deploy

steps:
- name: Checkout repository
README.md (21 changes)

@@ -14,10 +14,9 @@
<a href="https://GitHub.com/mendableai/firecrawl/graphs/contributors">
<img src="https://img.shields.io/github/contributors/mendableai/firecrawl.svg" alt="GitHub Contributors">
</a>
<a href="https://github.com/mendableai/firecrawl">
<img src="https://badgen.net/badge/Open%20Source%20%3F/Yes%21/blue?icon=github" alt="Open Source">
<a href="https://firecrawl.dev">
<img src="https://img.shields.io/badge/Visit-firecrawl.dev-orange" alt="Visit firecrawl.dev">
</a>

</div>
<div>
<p align="center">

@@ -391,7 +390,7 @@ With LLM extraction, you can easily extract structured data from any URL. We sup

from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version="v0")
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

class ArticleSchema(BaseModel):
title: str

@@ -403,15 +402,12 @@ class TopArticlesSchema(BaseModel):
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

data = app.scrape_url('https://news.ycombinator.com', {
'extractorOptions': {
'extractionSchema': TopArticlesSchema.model_json_schema(),
'mode': 'llm-extraction'
},
'pageOptions':{
'onlyMainContent': True
'formats': ['extract'],
'extract': {
'schema': TopArticlesSchema.model_json_schema()
}
})
print(data["llm_extraction"])
print(data["extract"])
```

## Using the Node SDK

@@ -466,8 +462,7 @@ import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";

const app = new FirecrawlApp({
apiKey: "fc-YOUR_API_KEY",
version: "v0"
apiKey: "fc-YOUR_API_KEY"
});

// Define schema to extract contents into
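Note: the README fragments above drop the explicit `version: "v0"` option from both SDK constructors and switch the extraction options from `extractorOptions`/`pageOptions` to the v1 `formats`/`extract` shape. A minimal sketch of the updated Node initialization implied by this diff (the API key is a placeholder; the constructor shape is taken directly from the fragment above):

```typescript
import FirecrawlApp from "@mendable/firecrawl-js";

// Before: new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY", version: "v0" })
// After: the version option is removed, so the v1 API is used by default.
const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });
```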
@@ -106,7 +106,7 @@ You should be able to see the Bull Queue Manager UI on `http://localhost:3002/ad
If you’d like to test the crawl endpoint, you can run this:

```bash
curl -X POST http://localhost:3002/v0/crawl \
curl -X POST http://localhost:3002/v1/crawl \
-H 'Content-Type: application/json' \
-d '{
"url": "https://mendable.ai"
@@ -17,8 +17,15 @@ RUN pnpm install
RUN --mount=type=secret,id=SENTRY_AUTH_TOKEN \
bash -c 'export SENTRY_AUTH_TOKEN="$(cat /run/secrets/SENTRY_AUTH_TOKEN)"; if [ -z $SENTRY_AUTH_TOKEN ]; then pnpm run build:nosentry; else pnpm run build; fi'

# Install packages needed for deployment
# Install Go
FROM golang:1.19 AS go-base
COPY src/lib/go-html-to-md /app/src/lib/go-html-to-md

# Install Go dependencies and build parser lib
RUN cd /app/src/lib/go-html-to-md && \
go mod tidy && \
go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go && \
chmod +x html-to-markdown.so

FROM base
RUN apt-get update -qq && \

@@ -26,10 +33,8 @@ RUN apt-get update -qq && \
rm -rf /var/lib/apt/lists /var/cache/apt/archives
COPY --from=prod-deps /app/node_modules /app/node_modules
COPY --from=build /app /app

COPY --from=go-base /app/src/lib/go-html-to-md/html-to-markdown.so /app/dist/src/lib/go-html-to-md/html-to-markdown.so

# Start the server by default, this can be overwritten at runtime
EXPOSE 8080
ENV PUPPETEER_EXECUTABLE_PATH="/usr/bin/chromium"
ENV PUPPETEER_EXECUTABLE_PATH="/usr/bin/chromium"
@@ -86,6 +86,7 @@
"joplin-turndown-plugin-gfm": "^1.0.12",
"json-schema-to-zod": "^2.3.0",
"keyword-extractor": "^0.0.28",
"koffi": "^2.9.0",
"langchain": "^0.2.8",
"languagedetect": "^2.0.0",
"logsnag": "^1.0.0",
@@ -122,6 +122,9 @@ importers:
keyword-extractor:
specifier: ^0.0.28
version: 0.0.28
koffi:
specifier: ^2.9.0
version: 2.9.0
langchain:
specifier: ^0.2.8
version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)

@@ -3170,6 +3173,9 @@ packages:
resolution: {integrity: sha512-eTIzlVOSUR+JxdDFepEYcBMtZ9Qqdef+rnzWdRZuMbOywu5tO2w2N7rqjoANZ5k9vywhL6Br1VRjUIgTQx4E8w==}
engines: {node: '>=6'}

koffi@2.9.0:
resolution: {integrity: sha512-KCsuJ2gM58n6bNdR2Z7gqsh/3TchxxQFbVgax2/UvAjRTgwNSYAJDx9E3jrkBP4jEDHWRCfE47Y2OG+/fiSvEw==}

langchain@0.2.8:
resolution: {integrity: sha512-kb2IOMA71xH8e6EXFg0l4S+QSMC/c796pj1+7mPBkR91HHwoyHZhFRrBaZv4tV+Td+Ba91J2uEDBmySklZLpNQ==}
engines: {node: '>=18'}

@@ -8492,6 +8498,8 @@ snapshots:

kleur@3.0.3: {}

koffi@2.9.0: {}

langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0):
dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
@@ -1,11 +1,11 @@
import request from "supertest";
import dotenv from "dotenv";
import { configDotenv } from "dotenv";
import {
ScrapeRequest,
ScrapeResponseRequestTest,
} from "../../controllers/v1/types";

dotenv.config();
configDotenv();
const TEST_URL = "http://127.0.0.1:3002";

describe("E2E Tests for v1 API Routes", () => {

@@ -22,6 +22,13 @@ describe("E2E Tests for v1 API Routes", () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL).get(
"/is-production"
);

console.log('process.env.USE_DB_AUTHENTICATION', process.env.USE_DB_AUTHENTICATION);
console.log('?', process.env.USE_DB_AUTHENTICATION === 'true');
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
console.log('!!useDbAuthentication', !!useDbAuthentication);
console.log('!useDbAuthentication', !useDbAuthentication);

expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("isProduction");
});
@@ -29,9 +36,10 @@

describe("POST /v1/scrape", () => {
it.concurrent("should require authorization", async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
"/v1/scrape"
);
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/scrape")
.send({ url: "https://firecrawl.dev"})

expect(response.statusCode).toBe(401);
});

@@ -389,7 +397,7 @@
const scrapeRequest: ScrapeRequest = {
url: "https://ycombinator.com/companies",
formats: ["markdown"],
waitFor: 5000
waitFor: 8000
};

const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -451,9 +459,9 @@

describe("POST /v1/map", () => {
it.concurrent("should require authorization", async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
"/v1/map"
);
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/map")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
});

@@ -534,7 +542,9 @@ describe("POST /v1/map", () => {
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
expect(links[0]).toContain("docs.firecrawl.dev");

const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev"));
expect(containsDocsFirecrawlDev).toBe(true);
});

it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => {
@@ -559,7 +569,9 @@ describe("POST /v1/map", () => {
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
expect(links[0]).toContain("docs.firecrawl.dev");

const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev"));
expect(containsDocsFirecrawlDev).toBe(true);
}, 10000)

it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => {

@@ -609,9 +621,9 @@ describe("POST /v1/map", () => {

describe("POST /v1/crawl", () => {
it.concurrent("should require authorization", async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
"/v1/crawl"
);
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/crawl")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
});
@@ -863,7 +875,7 @@ describe("GET /v1/crawl/:jobId", () => {
.post("/v1/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://docs.mendable.ai" });
.send({ url: "https://docs.firecrawl.dev" });
expect(crawlResponse.statusCode).toBe(200);

let isCompleted = false;

@@ -893,9 +905,7 @@
expect(completedResponse.body.data[0]).not.toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].metadata.statusCode).toBe(
200
);
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
expect(
completedResponse.body.data[0].metadata.error
).toBeUndefined();
@@ -659,7 +659,7 @@ describe("E2E Tests for v0 API Routes", () => {
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://mendable.ai/blog" });
.send({ url: "https://firecrawl.dev/blog" });
expect(crawlResponse.statusCode).toBe(200);

let isCompleted = false;

@@ -689,10 +689,8 @@ describe("E2E Tests for v0 API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("Mendable");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
200
);
expect(completedResponse.body.data[0].content).toContain("Firecrawl");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
expect(
completedResponse.body.data[0].metadata.pageError
).toBeUndefined();

@@ -701,7 +699,7 @@ describe("E2E Tests for v0 API Routes", () => {
(doc) =>
doc.metadata &&
doc.metadata.sourceURL &&
doc.metadata.sourceURL.includes("mendable.ai/blog")
doc.metadata.sourceURL.includes("firecrawl.dev/blog")
);

expect(childrenLinks.length).toBe(completedResponse.body.data.length);
@@ -5,6 +5,8 @@ import { supabase_service } from "../../../src/services/supabase";
import { Logger } from "../../../src/lib/logger";
import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
configDotenv();

export async function crawlCancelController(req: Request, res: Response) {
try {
@@ -6,6 +6,8 @@ import { Logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { supabaseGetJobsById } from "../../../src/lib/supabase-jobs";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
configDotenv();

export async function getJobs(ids: string[]) {
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
@@ -39,7 +39,7 @@ export async function scrapeHelper(
returnCode: number;
}> {
const url = req.body.url;
if (!url) {
if (typeof url !== "string") {
return { success: false, error: "Url is required", returnCode: 400 };
}
@@ -244,14 +244,10 @@ export async function scrapeController(req: Request, res: Response) {
}
if (creditsToBeBilled > 0) {
// billing for doc done on queue end, bill only for llm extraction
const billingResult = await billTeam(team_id, creditsToBeBilled);
if (!billingResult.success) {
return res.status(402).json({
success: false,
error:
"Failed to bill team. Insufficient credits or subscription not found.",
});
}
billTeam(team_id, creditsToBeBilled).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
}
}
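Note: this hunk, and the matching ones in the search, map, v1 scrape, and runWebScraper changes below, replace a blocking `await billTeam(...)` plus a 402 response with fire-and-forget billing. A minimal sketch of that pattern, with `billTeam` and `Logger` declared as stand-ins for the real services (hypothetical signatures, not the actual modules):

```typescript
// Stand-ins for the real billing service and logger (assumed shapes).
declare function billTeam(teamId: string, credits: number): Promise<{ success: boolean }>;
declare const Logger: { error(msg: string): void };

function billAsync(teamId: string, credits: number): void {
  // Deliberately not awaited: the request is no longer rejected with a 402
  // when billing fails; the failure is only logged.
  billTeam(teamId, credits).catch((error) => {
    // A retry queue or admin notification could be added here.
    Logger.error(`Failed to bill team ${teamId} for ${credits} credits: ${error}`);
  });
}
```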
@@ -54,18 +54,10 @@ export async function searchHelper(


if (justSearch) {
const billingResult = await billTeam(
team_id,
res.length
);
if (!billingResult.success) {
return {
success: false,
error:
"Failed to bill team. Insufficient credits or subscription not found.",
returnCode: 402,
};
}
billTeam(team_id, res.length).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
return { success: true, data: res, returnCode: 200 };
}
@@ -5,6 +5,8 @@ import { supabase_service } from "../../services/supabase";
import { Logger } from "../../lib/logger";
import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
configDotenv();

export async function crawlCancelController(req: Request, res: Response) {
try {
@@ -103,6 +103,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
send(ws, {
type: "catchup",
data: {
success: true,
status,
total: jobIDs.length,
completed: doneJobIDs.length,
@@ -3,6 +3,8 @@ import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentCo
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
import { configDotenv } from "dotenv";
configDotenv();

export async function getJob(id: string) {
const job = await getScrapeQueue().getJob(id);

@@ -92,7 +94,8 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara

const data = doneJobs.map(x => x.returnvalue);

const nextURL = new URL(`${req.protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);
const protocol = process.env.ENV === "local" ? req.protocol : "https";
const nextURL = new URL(`${protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);

nextURL.searchParams.set("skip", (start + data.length).toString());

@@ -111,6 +114,7 @@
}

res.status(200).json({
success: true,
status,
completed: doneJobsLength,
total: jobIDs.length,
@@ -155,10 +155,12 @@ export async function crawlController(
await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "crawl.started");
}

const protocol = process.env.ENV === "local" ? req.protocol : "https";

return res.status(200).json({
success: true,
id,
url: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
url: `${protocol}://${req.get("host")}/v1/crawl/${id}`,
});
}
@@ -18,6 +18,7 @@ import { fireEngineMap } from "../../search/fireEngine";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
import { performCosineSimilarity } from "../../lib/map-cosine";
import { Logger } from "../../lib/logger";

configDotenv();

@@ -61,8 +62,8 @@ export async function mapController(
: `site:${req.body.url}`;
// www. seems to exclude subdomains in some cases
const mapResults = await fireEngineMap(mapUrl, {
// limit to 50 results (beta)
numResults: Math.min(limit, 50),
// limit to 100 results (beta)
numResults: Math.min(limit, 100),
});

if (mapResults.length > 0) {

@@ -87,7 +88,13 @@
links = performCosineSimilarity(links, searchQuery);
}

links = links.map((x) => checkAndUpdateURLForMap(x).url.trim());
links = links.map((x) => {
try {
return checkAndUpdateURLForMap(x).url.trim()
} catch (_) {
return null;
}
}).filter(x => x !== null);

// allows for subdomains to be included
links = links.filter((x) => isSameDomain(x, req.body.url));

@@ -100,7 +107,10 @@
// remove duplicates that could be due to http/https or www
links = removeDuplicateUrls(links);

await billTeam(req.auth.team_id, 1);
billTeam(req.auth.team_id, 1).catch(error => {
Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});

const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -106,14 +106,10 @@ export async function scrapeController(
creditsToBeBilled = 50;
}

const billingResult = await billTeam(req.auth.team_id, creditsToBeBilled);
if (!billingResult.success) {
return res.status(402).json({
success: false,
error:
"Failed to bill team. Insufficient credits or subscription not found.",
});
}
billTeam(req.auth.team_id, creditsToBeBilled).catch(error => {
Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});

if (!pageOptions || !pageOptions.includeRawHtml) {
if (doc && doc.rawHtml) {
@@ -30,7 +30,14 @@ export const url = z.preprocess(
"URL must have a valid top-level domain or be a valid path"
)
.refine(
(x) => checkUrl(x as string),
(x) => {
try {
checkUrl(x as string)
return true;
} catch (_) {
return false;
}
},
"Invalid URL"
)
.refine(

@@ -257,6 +264,7 @@ export type CrawlStatusParams = {
export type CrawlStatusResponse =
| ErrorResponse
| {
success: true;
status: "scraping" | "completed" | "failed" | "cancelled";
completed: number;
total: number;

@@ -322,6 +330,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
removeTags: x.excludeTags,
onlyMainContent: x.onlyMainContent,
waitFor: x.waitFor,
headers: x.headers,
includeLinks: x.formats.includes("links"),
screenshot: x.formats.includes("screenshot"),
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),

@@ -339,7 +348,7 @@ export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
}

export function legacyDocumentConverter(doc: any): Document {
if (doc === null || doc === undefined) return doc;
if (doc === null || doc === undefined) return null;

if (doc.metadata) {
if (doc.metadata.screenshot) {
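Note on the URL schema change above: the old `.refine((x) => checkUrl(x as string), ...)` passed the validator directly, which suggests `checkUrl` signals failure by throwing rather than by returning false; the new wrapper converts that into the boolean predicate zod expects. A generic sketch of the same idea (the `checkUrl` declaration is a hypothetical stand-in, assumed to throw on invalid input):

```typescript
import { z } from "zod";

// Hypothetical stand-in for the real checkUrl, assumed to throw on invalid URLs.
declare function checkUrl(url: string): string;

// Adapt a throwing validator into the boolean predicate that .refine expects.
const url = z.string().refine(
  (value) => {
    try {
      checkUrl(value);
      return true;
    } catch (_) {
      return false;
    }
  },
  "Invalid URL"
);
```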
@@ -201,16 +201,20 @@ if (cluster.isMaster) {
Sentry.setupExpressErrorHandler(app);

app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry<ErrorResponse>, next: NextFunction) => {
if (err instanceof SyntaxError && 'status' in err && err.status === 400 && 'body' in err) {
return res.status(400).json({ success: false, error: 'Bad request, malformed JSON' });
}

const id = res.sentry ?? uuidv4();
let verbose = JSON.stringify(err);
if (verbose === "{}") {
if (err instanceof Error) {
verbose = JSON.stringify({
message: err.message,
name: err.name,
stack: err.stack,
});
}
if (err instanceof Error) {
verbose = JSON.stringify({
message: err.message,
name: err.name,
stack: err.stack,
});
}
}

Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
apps/api/src/lib/__tests__/html-to-markdown.test.ts (new file, 40 lines)

@@ -0,0 +1,40 @@
import { parseMarkdown } from '../html-to-markdown';

describe('parseMarkdown', () => {
it('should correctly convert simple HTML to Markdown', async () => {
const html = '<p>Hello, world!</p>';
const expectedMarkdown = 'Hello, world!';
await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
});

it('should convert complex HTML with nested elements to Markdown', async () => {
const html = '<div><p>Hello <strong>bold</strong> world!</p><ul><li>List item</li></ul></div>';
const expectedMarkdown = 'Hello **bold** world!\n\n- List item';
await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
});

it('should return empty string when input is empty', async () => {
const html = '';
const expectedMarkdown = '';
await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
});

it('should handle null input gracefully', async () => {
const html = null;
const expectedMarkdown = '';
await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
});

it('should handle various types of invalid HTML gracefully', async () => {
const invalidHtmls = [
{ html: '<html><p>Unclosed tag', expected: 'Unclosed tag' },
{ html: '<div><span>Missing closing div', expected: 'Missing closing div' },
{ html: '<p><strong>Wrong nesting</em></strong></p>', expected: '**Wrong nesting**' },
{ html: '<a href="http://example.com">Link without closing tag', expected: '[Link without closing tag](http://example.com)' }
];

for (const { html, expected } of invalidHtmls) {
await expect(parseMarkdown(html)).resolves.toBe(expected);
}
});
});
@@ -28,7 +28,7 @@ export type PageOptions = {
onlyIncludeTags?: string | string[];
includeLinks?: boolean;
useFastMode?: boolean; // beta
disableJSDom?: boolean; // beta
disableJsDom?: boolean; // beta
atsv?: boolean; // beta
};
apps/api/src/lib/go-html-to-md/README.md (new file, 7 lines)

@@ -0,0 +1,7 @@
To build the go-html-to-md library, run the following command:

```bash
cd apps/api/src/lib/go-html-to-md
go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go
chmod +x html-to-markdown.so
```
apps/api/src/lib/go-html-to-md/go.mod (new file, 14 lines)

@@ -0,0 +1,14 @@
module html-to-markdown.go

go 1.19

require github.com/JohannesKaufmann/html-to-markdown v1.6.0

require (
github.com/PuerkitoBio/goquery v1.9.2 // indirect
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/kr/pretty v0.3.0 // indirect
golang.org/x/net v0.25.0 // indirect
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
)
apps/api/src/lib/go-html-to-md/go.sum (new file, 93 lines)

@@ -0,0 +1,93 @@
github.com/JohannesKaufmann/html-to-markdown v1.6.0 h1:04VXMiE50YYfCfLboJCLcgqF5x+rHJnb1ssNmqpLH/k=
github.com/JohannesKaufmann/html-to-markdown v1.6.0/go.mod h1:NUI78lGg/a7vpEJTz/0uOcYMaibytE4BUOQS8k78yPQ=
github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k=
github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y=
github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=
github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/yuin/goldmark v1.7.1 h1:3bajkSilaCbjdKVsKdZjZCLBNPL9pYzrCakKaf4U49U=
github.com/yuin/goldmark v1.7.1/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M=
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8=
golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk=
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
apps/api/src/lib/go-html-to-md/html-to-markdown.go (new file, 25 lines)

@@ -0,0 +1,25 @@
package main

import (
"C"
"log"

md "github.com/JohannesKaufmann/html-to-markdown"
"github.com/JohannesKaufmann/html-to-markdown/plugin"
)

//export ConvertHTMLToMarkdown
func ConvertHTMLToMarkdown(html *C.char) *C.char {
converter := md.NewConverter("", true, nil)
converter.Use(plugin.GitHubFlavored())

markdown, err := converter.ConvertString(C.GoString(html))
if err != nil {
log.Fatal(err)
}
return C.CString(markdown)
}

func main() {
// This function is required for the main package
}
|
|||
|
||||
export async function parseMarkdown(html: string) {
|
||||
import koffi from 'koffi';
|
||||
import { join } from 'path';
|
||||
import "../services/sentry"
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import { Logger } from './logger';
|
||||
dotenv.config();
|
||||
|
||||
// TODO: add a timeout to the Go parser
|
||||
|
||||
class GoMarkdownConverter {
|
||||
private static instance: GoMarkdownConverter;
|
||||
private convert: any;
|
||||
|
||||
private constructor() {
|
||||
const goExecutablePath = join(__dirname, 'go-html-to-md/html-to-markdown.so');
|
||||
const lib = koffi.load(goExecutablePath);
|
||||
this.convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']);
|
||||
}
|
||||
|
||||
public static getInstance(): GoMarkdownConverter {
|
||||
if (!GoMarkdownConverter.instance) {
|
||||
GoMarkdownConverter.instance = new GoMarkdownConverter();
|
||||
}
|
||||
return GoMarkdownConverter.instance;
|
||||
}
|
||||
|
||||
public async convertHTMLToMarkdown(html: string): Promise<string> {
|
||||
return new Promise<string>((resolve, reject) => {
|
||||
this.convert.async(html, (err: Error, res: string) => {
|
||||
if (err) {
|
||||
reject(err);
|
||||
} else {
|
||||
resolve(res);
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
export async function parseMarkdown(html: string): Promise<string> {
|
||||
if (!html) {
|
||||
return '';
|
||||
}
|
||||
|
||||
try {
|
||||
if (process.env.USE_GO_MARKDOWN_PARSER == "true") {
|
||||
const converter = GoMarkdownConverter.getInstance();
|
||||
let markdownContent = await converter.convertHTMLToMarkdown(html);
|
||||
|
||||
markdownContent = processMultiLineLinks(markdownContent);
|
||||
markdownContent = removeSkipToContentLinks(markdownContent);
|
||||
Logger.info(`HTML to Markdown conversion using Go parser successful`);
|
||||
return markdownContent;
|
||||
}
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
|
||||
}
|
||||
|
||||
// Fallback to TurndownService if Go parser fails or is not enabled
|
||||
var TurndownService = require("turndown");
|
||||
var turndownPluginGfm = require('joplin-turndown-plugin-gfm')
|
||||
|
||||
var turndownPluginGfm = require('joplin-turndown-plugin-gfm');
|
||||
|
||||
const turndownService = new TurndownService();
|
||||
turndownService.addRule("inlineLink", {
|
||||
|
@@ -21,29 +81,20 @@ export async function parseMarkdown(html: string) {
});
var gfm = turndownPluginGfm.gfm;
turndownService.use(gfm);
let markdownContent = "";
const turndownPromise = new Promise<string>((resolve, reject) => {
try {
const result = turndownService.turndown(html);
resolve(result);
} catch (error) {
reject("Error converting HTML to Markdown: " + error);
}
});

const timeoutPromise = new Promise<string>((resolve, reject) => {
const timeout = 5000; // Timeout in milliseconds
setTimeout(() => reject("Conversion timed out after " + timeout + "ms"), timeout);
});

try {
markdownContent = await Promise.race([turndownPromise, timeoutPromise]);
let markdownContent = await turndownService.turndown(html);
markdownContent = processMultiLineLinks(markdownContent);
markdownContent = removeSkipToContentLinks(markdownContent);

return markdownContent;
} catch (error) {
console.error(error);
console.error("Error converting HTML to Markdown: ", error);
return ""; // Optionally return an empty string or handle the error as needed
}
}

// multiple line links
function processMultiLineLinks(markdownContent: string): string {
let insideLinkContent = false;
let newMarkdownContent = "";
let linkOpenCount = 0;
@@ -63,12 +114,14 @@ export async function parseMarkdown(html: string) {
newMarkdownContent += char;
}
}
markdownContent = newMarkdownContent;
return newMarkdownContent;
}

function removeSkipToContentLinks(markdownContent: string): string {
// Remove [Skip to Content](#page) and [Skip to content](#skip)
markdownContent = markdownContent.replace(
const newMarkdownContent = markdownContent.replace(
/\[Skip to Content\]\(#[^\)]*\)/gi,
""
);
return markdownContent;
}
return newMarkdownContent;
}
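Taken together, the new html-to-markdown.ts prefers the Go shared library (loaded through koffi) when USE_GO_MARKDOWN_PARSER is set and falls back to Turndown otherwise. A rough usage sketch, assuming the module path and that the .so has been built as described in the new go-html-to-md README:

```typescript
// Assumed relative path to the module added in this commit.
import { parseMarkdown } from "./lib/html-to-markdown";

async function demo(): Promise<void> {
  // Opt in to the Go parser; without this flag the Turndown fallback is used.
  process.env.USE_GO_MARKDOWN_PARSER = "true";

  const markdown = await parseMarkdown("<p>Hello, <strong>world</strong>!</p>");
  console.log(markdown); // Expected to resemble: Hello, **world**!
}

demo();
```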
@@ -1,3 +1,6 @@
import { configDotenv } from "dotenv";
configDotenv();

enum LogLevel {
NONE = 'NONE', // No logs will be output.
ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation.
@@ -2,6 +2,8 @@ import { Job } from "bullmq";
import type { baseScrapers } from "../scraper/WebScraper/single_url";
import { supabase_service as supabase } from "../services/supabase";
import { Logger } from "./logger";
import { configDotenv } from "dotenv";
configDotenv();

export type ScrapeErrorEvent = {
type: "error",
@@ -1,5 +1,8 @@
import { AuthResponse } from "../../src/types";
import { Logger } from "./logger";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
configDotenv();

let warningCount = 0;

@@ -18,6 +21,7 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
try {
return await originalFunction(...args);
} catch (error) {
Sentry.captureException(error);
Logger.error(`Error in withAuth function: ${error}`);
return { success: false, error: error.message } as T;
}
@@ -12,6 +12,8 @@ import { Document } from "../lib/entities";
import { supabase_service } from "../services/supabase";
import { Logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
import { configDotenv } from "dotenv";
configDotenv();

export async function startWebScraperPipeline({
job,

@@ -118,15 +120,10 @@ export async function runWebScraper({
: docs;

if(is_scrape === false) {
const billingResult = await billTeam(team_id, filteredDocs.length);
if (!billingResult.success) {
// throw new Error("Failed to bill team, no subscription was found");
return {
success: false,
message: "Failed to bill team, no subscription was found",
docs: [],
};
}
billTeam(team_id, filteredDocs.length).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
}
@@ -33,7 +33,9 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum);
if (!success) {
Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
return res.status(402).json({ success: false, error: "Insufficient credits" });
if (!res.headersSent) {
return res.status(402).json({ success: false, error: "Insufficient credits" });
}
}
req.account = { remainingCredits }
next();

@@ -52,7 +54,9 @@ export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestW
);

if (!success) {
return res.status(status).json({ success: false, error });
if (!res.headersSent) {
return res.status(status).json({ success: false, error });
}
}

req.auth = { team_id, plan };

@@ -67,7 +71,9 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
if (req.headers["x-idempotency-key"]) {
const isIdempotencyValid = await validateIdempotencyKey(req);
if (!isIdempotencyValid) {
return res.status(409).json({ success: false, error: "Idempotency key already used" });
if (!res.headersSent) {
return res.status(409).json({ success: false, error: "Idempotency key already used" });
}
}
createIdempotencyKey(req);
}

@@ -77,8 +83,10 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
}

function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
if (req.body.url && isUrlBlocked(req.body.url)) {
return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) {
if (!res.headersSent) {
return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
}
}
next();
}

@@ -96,26 +104,26 @@ export const v1Router = express.Router();

v1Router.post(
"/scrape",
blocklistMiddleware,
authMiddleware(RateLimiterMode.Scrape),
checkCreditsMiddleware(1),
blocklistMiddleware,
wrap(scrapeController)
);

v1Router.post(
"/crawl",
blocklistMiddleware,
authMiddleware(RateLimiterMode.Crawl),
idempotencyMiddleware,
checkCreditsMiddleware(),
blocklistMiddleware,
idempotencyMiddleware,
wrap(crawlController)
);

v1Router.post(
"/map",
blocklistMiddleware,
authMiddleware(RateLimiterMode.Map),
checkCreditsMiddleware(1),
blocklistMiddleware,
wrap(mapController)
);
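Note: the middleware changes above reorder the blocklist/auth/credit checks and guard every early response with `res.headersSent`, so two middlewares in the same chain cannot both try to answer one request. A stripped-down Express sketch of that guard (the route, predicate, and error text are illustrative, not the real Firecrawl logic):

```typescript
import express, { NextFunction, Request, Response } from "express";

function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
  const blocked =
    typeof req.body.url === "string" && req.body.url.includes("blocked.example");
  if (blocked && !res.headersSent) {
    // Only respond if nothing upstream has already sent headers;
    // otherwise fall through instead of throwing "headers already sent".
    return res.status(403).json({ success: false, error: "URL is blocked" });
  }
  next();
}

const app = express();
app.use(express.json());
app.post("/v1/scrape", blocklistMiddleware, (_req, res) => {
  res.json({ success: true });
});
```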
@@ -589,6 +589,9 @@ export class WebScraperDataProvider {
includeLinks: options.pageOptions?.includeLinks ?? true,
fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
screenshot: options.pageOptions?.screenshot ?? false,
useFastMode: options.pageOptions?.useFastMode ?? false,
disableJsDom: options.pageOptions?.disableJsDom ?? false,
atsv: options.pageOptions?.atsv ?? false
};
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
this.replaceAllPathsWithAbsolutePaths =
@@ -55,7 +55,7 @@ export async function scrapWithFireEngine({
try {
const reqParams = await generateRequestParams(url);
let waitParam = reqParams["params"]?.wait ?? waitFor;
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp";
let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;

@@ -69,15 +69,15 @@ export async function scrapWithFireEngine({

let engine = engineParam; // do we want fireEngineOptions as first choice?

Logger.info(
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
);

if (pageOptions?.useFastMode) {
fireEngineOptionsParam.engine = "tlsclient";
engine = "tlsclient";
}

Logger.info(
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
);

// atsv is only available for beta customers
const betaCustomersString = process.env.BETA_CUSTOMERS;
const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : [];

@@ -96,6 +96,7 @@ export async function scrapWithFireEngine({
const _response = await Sentry.startSpan({
name: "Call to fire-engine"
}, async span => {

return await axiosInstance.post(
process.env.FIRE_ENGINE_BETA_URL + endpoint,
{

@@ -104,12 +105,13 @@
screenshot: screenshotParam,
fullPageScreenshot: fullPageScreenshotParam,
headers: headers,
pageOptions: pageOptions,
disableJsDom: pageOptions?.disableJsDom ?? false,
priority,
engine,
instantReturn: true,
...fireEngineOptionsParam,
atsv: pageOptions?.atsv ?? false,
scrollXPaths: pageOptions?.scrollXPaths ?? [],
},
{
headers: {
@@ -96,15 +96,15 @@ function getScrapingFallbackOrder(
"fetch",
].filter(Boolean);

if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
defaultOrder = [
"fire-engine",
useFireEngine ? undefined : "playwright",
...defaultOrder.filter(
(scraper) => scraper !== "fire-engine" && scraper !== "playwright"
),
].filter(Boolean);
}
// if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
// defaultOrder = [
// "fire-engine",
// useFireEngine ? undefined : "playwright",
// ...defaultOrder.filter(
// (scraper) => scraper !== "fire-engine" && scraper !== "playwright"
// ),
// ].filter(Boolean);
// }

const filteredDefaultOrder = defaultOrder.filter(
(scraper: (typeof baseScrapers)[number]) =>

@@ -146,6 +146,9 @@ export async function scrapSingleUrl(
parsePDF: pageOptions.parsePDF ?? true,
removeTags: pageOptions.removeTags ?? [],
onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
useFastMode: pageOptions.useFastMode ?? false,
disableJsDom: pageOptions.disableJsDom ?? false,
atsv: pageOptions.atsv ?? false
}

if (extractorOptions) {

@@ -200,6 +203,7 @@ export async function scrapSingleUrl(
fireEngineOptions: {
engine: engine,
atsv: pageOptions.atsv,
disableJsDom: pageOptions.disableJsDom,
},
priority,
teamId,
@@ -242,5 +242,13 @@ export const urlSpecificParams = {
engine: "chrome-cdp",
},
},
},
"lorealparis.hu":{
defaultScraper: "fire-engine",
params:{
fireEngineOptions:{
engine: "tlsclient",
},
},
}
};
@@ -39,16 +39,8 @@ export const excludeNonMainTags = [
"#search",
".share",
"#share",
".pagination",
"#pagination",
".widget",
"#widget",
".related",
"#related",
".tag",
"#tag",
".category",
"#category",
".cookie",
"#cookie"
];
@@ -5,7 +5,7 @@ import { supabase_service } from "../supabase";
import { Logger } from "../../lib/logger";
import { getValue, setValue } from "../redis";
import { redlock } from "../redlock";

import * as Sentry from "@sentry/node";

const FREE_CREDITS = 500;
@@ -40,14 +40,15 @@ export async function supaBillTeam(team_id: string, credits: number) {
]);

let couponCredits = 0;
let sortedCoupons = [];

if (coupons && coupons.length > 0) {
couponCredits = coupons.reduce(
(total, coupon) => total + coupon.credits,
0
);
sortedCoupons = [...coupons].sort((a, b) => b.credits - a.credits);
}

let sortedCoupons = coupons.sort((a, b) => b.credits - a.credits);
// using coupon credits:
if (couponCredits > 0) {
// if there is no subscription and they have enough coupon credits
@@ -175,9 +176,25 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
}

// Retrieve the team's active subscription and check for available coupons concurrently
const [{ data: subscription, error: subscriptionError }, { data: coupons }] =
await Promise.all([

let cacheKeySubscription = `subscription_${team_id}`;
let cacheKeyCoupons = `coupons_${team_id}`;

// Try to get data from cache first
const [cachedSubscription, cachedCoupons] = await Promise.all([
getValue(cacheKeySubscription),
getValue(cacheKeyCoupons)
]);

let subscription, subscriptionError;
let coupons : {credits: number}[];

if (cachedSubscription && cachedCoupons) {
subscription = JSON.parse(cachedSubscription);
coupons = JSON.parse(cachedCoupons);
} else {
// If not in cache, retrieve from database
const [subscriptionResult, couponsResult] = await Promise.all([
supabase_service
.from("subscriptions")
.select("id, price_id, current_period_start, current_period_end")
@@ -191,6 +208,16 @@
.eq("status", "active"),
]);

subscription = subscriptionResult.data;
subscriptionError = subscriptionResult.error;
coupons = couponsResult.data;

// Cache the results for a minute, sub can be null and that's fine
await setValue(cacheKeySubscription, JSON.stringify(subscription), 60); // Cache for 1 minute, even if null
await setValue(cacheKeyCoupons, JSON.stringify(coupons), 60); // Cache for 1 minute

}

let couponCredits = 0;
if (coupons && coupons.length > 0) {
couponCredits = coupons.reduce(
@@ -199,53 +226,67 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
);
}

// If there are available coupons and they are enough for the operation
if (couponCredits >= credits) {
return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits };
}

// Free credits, no coupons
if (!subscription || subscriptionError) {

// If there is no active subscription but there are available coupons
if (couponCredits >= credits) {
return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits };
}

let creditUsages;
let creditUsageError;
let retries = 0;
const maxRetries = 3;
const retryInterval = 2000; // 2 seconds
let totalCreditsUsed = 0;
const cacheKeyCreditUsage = `credit_usage_${team_id}`;

while (retries < maxRetries) {
const result = await supabase_service
.from("credit_usage")
.select("credits_used")
.is("subscription_id", null)
.eq("team_id", team_id);
// Try to get credit usage from cache
const cachedCreditUsage = await getValue(cacheKeyCreditUsage);

creditUsages = result.data;
creditUsageError = result.error;
if (cachedCreditUsage) {
totalCreditsUsed = parseInt(cachedCreditUsage);
} else {
let retries = 0;
const maxRetries = 3;
const retryInterval = 2000; // 2 seconds

if (!creditUsageError) {
break;
while (retries < maxRetries) {
// Reminder, this has an 1000 limit.
const result = await supabase_service
.from("credit_usage")
.select("credits_used")
.is("subscription_id", null)
.eq("team_id", team_id);

creditUsages = result.data;
creditUsageError = result.error;

if (!creditUsageError) {
break;
}

retries++;
if (retries < maxRetries) {
await new Promise(resolve => setTimeout(resolve, retryInterval));
}
}

retries++;
if (retries < maxRetries) {
await new Promise(resolve => setTimeout(resolve, retryInterval));
if (creditUsageError) {
Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`);
throw new Error(
`Failed to retrieve credit usage for team_id: ${team_id}`
);
}
}

if (creditUsageError) {
Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`);
throw new Error(
`Failed to retrieve credit usage for team_id: ${team_id}`
totalCreditsUsed = creditUsages.reduce(
(acc, usage) => acc + usage.credits_used,
0
);
}

const totalCreditsUsed = creditUsages.reduce(
(acc, usage) => acc + usage.credits_used,
0
);
// Cache the result for 30 seconds
await setValue(cacheKeyCreditUsage, totalCreditsUsed.toString(), 30);
}

Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`);
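The hunk above wraps the free-tier credit_usage query in a bounded retry loop and caches the summed total in Redis. The following is a minimal sketch of that retry-then-give-up pattern, written for illustration only; `fetchFn` is a stand-in for the Supabase query in the diff and is not a helper that exists in the codebase.

```ts
// Bounded retry: try up to maxRetries times, waiting retryInterval ms
// between attempts, then surface the last error.
async function fetchWithRetries<T>(
  fetchFn: () => Promise<{ data: T | null; error: unknown }>,
  maxRetries = 3,
  retryInterval = 2000
): Promise<T> {
  let lastError: unknown;
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    const { data, error } = await fetchFn();
    if (!error && data !== null) {
      return data;
    }
    lastError = error;
    if (attempt < maxRetries - 1) {
      await new Promise((resolve) => setTimeout(resolve, retryInterval));
    }
  }
  throw new Error(`Query failed after ${maxRetries} attempts: ${lastError}`);
}
```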
@@ -253,9 +294,11 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
end.setDate(end.getDate() + 30);
// check if usage is within 80% of the limit
const creditLimit = FREE_CREDITS;
const creditUsagePercentage = (totalCreditsUsed + credits) / creditLimit;
const creditUsagePercentage = totalCreditsUsed / creditLimit;

if (creditUsagePercentage >= 0.8) {
// Add a check to ensure totalCreditsUsed is greater than 0
if (totalCreditsUsed > 0 && creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
Logger.info(`Sending notification for team ${team_id}. Total credits used: ${totalCreditsUsed}, Credit usage percentage: ${creditUsagePercentage}`);
await sendNotification(
team_id,
NotificationType.APPROACHING_LIMIT,
@@ -265,7 +308,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
}

// 5. Compare the total credits used with the credits allowed by the plan.
if (totalCreditsUsed + credits > FREE_CREDITS) {
if (totalCreditsUsed >= FREE_CREDITS) {
// Send email notification for insufficient credits
await sendNotification(
team_id,
@@ -309,7 +352,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {

if (creditUsages && creditUsages.length > 0) {
totalCreditsUsed = creditUsages[0].total_credits_used;
await setValue(cacheKey, totalCreditsUsed.toString(), 1800); // Cache for 30 minutes
await setValue(cacheKey, totalCreditsUsed.toString(), 500); // Cache for 8 minutes
// Logger.info(`Cache set for credit usage: ${totalCreditsUsed}`);
}
}
@@ -322,39 +365,62 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {

// Adjust total credits used by subtracting coupon value
const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);
// Get the price details
const { data: price, error: priceError } = await supabase_service
.from("prices")
.select("credits")
.eq("id", subscription.price_id)
.single();

if (priceError) {
throw new Error(
`Failed to retrieve price for price_id: ${subscription.price_id}`
);
// Get the price details from cache or database
const priceCacheKey = `price_${subscription.price_id}`;
let price : {credits: number};

try {
const cachedPrice = await getValue(priceCacheKey);
if (cachedPrice) {
price = JSON.parse(cachedPrice);
} else {
const { data, error: priceError } = await supabase_service
.from("prices")
.select("credits")
.eq("id", subscription.price_id)
.single();

if (priceError) {
throw new Error(
`Failed to retrieve price for price_id: ${subscription.price_id}`
);
}

price = data;
// There are only 21 records, so this is super fine
// Cache the price for a long time (e.g., 1 day)
await setValue(priceCacheKey, JSON.stringify(price), 86400);
}
} catch (error) {
Logger.error(`Error retrieving or caching price: ${error}`);
Sentry.captureException(error);
// If errors, just assume it's a big number so user don't get an error
price = { credits: 10000000 };
}

const creditLimit = price.credits;
const creditUsagePercentage = (adjustedCreditsUsed + credits) / creditLimit;

// Removal of + credits
const creditUsagePercentage = adjustedCreditsUsed / creditLimit;

// Compare the adjusted total credits used with the credits allowed by the plan
if (adjustedCreditsUsed + credits > price.credits) {
// await sendNotification(
// team_id,
// NotificationType.LIMIT_REACHED,
// subscription.current_period_start,
// subscription.current_period_end
// );
if (adjustedCreditsUsed >= price.credits) {
await sendNotification(
team_id,
NotificationType.LIMIT_REACHED,
subscription.current_period_start,
subscription.current_period_end
);
return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: creditLimit - adjustedCreditsUsed };
} else if (creditUsagePercentage >= 0.8) {
} else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
// Send email notification for approaching credit limit
// await sendNotification(
// team_id,
// NotificationType.APPROACHING_LIMIT,
// subscription.current_period_start,
// subscription.current_period_end
// );
await sendNotification(
team_id,
NotificationType.APPROACHING_LIMIT,
subscription.current_period_start,
subscription.current_period_end
);
}

return { success: true, message: "Sufficient credits available", remainingCredits: creditLimit - adjustedCreditsUsed };
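The price lookup above follows a cache-aside pattern: read the value from Redis via getValue, fall back to the database on a miss, then write it back with setValue and a TTL. Below is a generic sketch of that pattern under the same assumptions as the diff (getValue/setValue are the Redis helpers imported at the top of the file); the cachedJson helper itself and the loadFromDb callback are illustrative and do not exist in the codebase.

```ts
import { getValue, setValue } from "../redis";

// Cache-aside helper: return the cached JSON value if present,
// otherwise load it, cache it for `ttlSeconds`, and return it.
async function cachedJson<T>(
  key: string,
  ttlSeconds: number,
  loadFromDb: () => Promise<T>
): Promise<T> {
  const cached = await getValue(key);
  if (cached) {
    return JSON.parse(cached) as T;
  }
  const fresh = await loadFromDb();
  await setValue(key, JSON.stringify(fresh), ttlSeconds);
  return fresh;
}

// Example, mirroring the one-day TTL used for prices in the diff:
// const price = await cachedJson(`price_${priceId}`, 86400, () => fetchPriceFromDb(priceId));
```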
@@ -462,8 +528,8 @@ async function createCreditUsage({
subscription_id?: string;
credits: number;
}) {
const { data: credit_usage } = await supabase_service
.from("credit_usage")
await supabase_service
.from("credit_usage")
.insert([
{
team_id,
@@ -471,8 +537,7 @@ async function createCreditUsage({
subscription_id: subscription_id || null,
created_at: new Date(),
},
])
.select();
]);

return { success: true, credit_usage };
return { success: true };
}
@@ -1,6 +1,7 @@
import { supabase_service } from "../supabase";
import { Logger } from "../../../src/lib/logger";
import "dotenv/config";
import { configDotenv } from "dotenv";
configDotenv();

export async function logCrawl(job_id: string, team_id: string) {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
@@ -4,6 +4,8 @@ import { FirecrawlJob } from "../../types";
import { posthog } from "../posthog";
import "dotenv/config";
import { Logger } from "../../lib/logger";
import { configDotenv } from "dotenv";
configDotenv();

export async function logJob(job: FirecrawlJob) {
try {
@@ -3,6 +3,8 @@ import { ScrapeLog } from "../../types";
import { supabase_service } from "../supabase";
import { PageOptions } from "../../lib/entities";
import { Logger } from "../../lib/logger";
import { configDotenv } from "dotenv";
configDotenv();

export async function logScrape(
scrapeLog: ScrapeLog,
@@ -67,6 +67,6 @@ export function waitForJob(jobId: string, timeout: number) {
reject((await getScrapeQueue().getJob(jobId)).failedReason);
}
}
}, 1000);
}, 500);
})
}
@@ -36,6 +36,8 @@ import {
} from "../../src/lib/job-priority";
import { PlanType } from "../types";
import { getJobs } from "../../src/controllers/v1/crawl-status";
import { configDotenv } from "dotenv";
configDotenv();

if (process.env.ENV === "production") {
initSDK({
@@ -6,7 +6,7 @@ const RATE_LIMITS = {
crawl: {
default: 3,
free: 2,
starter: 3,
starter: 10,
standard: 5,
standardOld: 40,
scale: 50,
@@ -19,9 +19,9 @@ const RATE_LIMITS = {
scrape: {
default: 20,
free: 10,
starter: 20,
starter: 100,
standard: 100,
standardOld: 40,
standardOld: 100,
scale: 500,
hobby: 20,
standardNew: 100,
@@ -32,8 +32,8 @@ const RATE_LIMITS = {
search: {
default: 20,
free: 5,
starter: 20,
standard: 40,
starter: 50,
standard: 50,
standardOld: 40,
scale: 500,
hobby: 10,
@@ -45,9 +45,9 @@ const RATE_LIMITS = {
map:{
default: 20,
free: 5,
starter: 20,
standard: 40,
standardOld: 40,
starter: 50,
standard: 50,
standardOld: 50,
scale: 500,
hobby: 10,
standardNew: 50,
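For reference, the RATE_LIMITS table above maps a request mode (crawl, scrape, search, map) and a plan name to a requests-per-minute ceiling. A small sketch of how such a table can be read with a fallback to the mode's default follows; the getRateLimit function and the trimmed table excerpt are illustrative, not part of this commit.

```ts
type PlanLimits = { default: number } & Record<string, number>;

// Trimmed excerpt of the table, using the updated values from the diff.
const RATE_LIMITS: Record<string, PlanLimits> = {
  crawl: { default: 3, free: 2, starter: 10, standard: 5, standardOld: 40, scale: 50 },
  scrape: { default: 20, free: 10, starter: 100, standard: 100, standardOld: 100, scale: 500 },
};

// Look up the per-minute limit for a mode/plan pair, falling back to the mode default.
function getRateLimit(mode: string, plan?: string): number {
  const limits = RATE_LIMITS[mode] ?? { default: 20 };
  return (plan && limits[plan]) || limits.default;
}
```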
@@ -1,5 +1,7 @@
import { createClient, SupabaseClient } from "@supabase/supabase-js";
import { Logger } from "../lib/logger";
import { configDotenv } from "dotenv";
configDotenv();

// SupabaseService class initializes the Supabase client conditionally based on environment variables.
class SupabaseService {
@@ -3,6 +3,8 @@ import { legacyDocumentConverter } from "../../src/controllers/v1/types";
import { Logger } from "../../src/lib/logger";
import { supabase_service } from "./supabase";
import { WebhookEventType } from "../types";
import { configDotenv } from "dotenv";
configDotenv();

export const callWebhook = async (
teamId: string,
@@ -1,4 +1,4 @@
import FirecrawlApp from '@mendable/firecrawl-js';
import FirecrawlApp from 'firecrawl';

const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
@@ -1,4 +1,4 @@
import FirecrawlApp, { CrawlStatusResponse, ErrorResponse } from '@mendable/firecrawl-js';
import FirecrawlApp, { CrawlStatusResponse, ErrorResponse } from 'firecrawl';

const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
8
apps/js-sdk/firecrawl/package-lock.json
generated
@@ -10,10 +10,8 @@
"license": "MIT",
"dependencies": {
"axios": "^1.6.8",
"dotenv": "^16.4.5",
"isows": "^1.0.4",
"typescript-event-target": "^1.1.1",
"uuid": "^9.0.1",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"
},
@@ -25,10 +23,12 @@
"@types/mocha": "^10.0.6",
"@types/node": "^20.12.12",
"@types/uuid": "^9.0.8",
"dotenv": "^16.4.5",
"jest": "^29.7.0",
"ts-jest": "^29.2.2",
"tsup": "^8.2.4",
"typescript": "^5.4.5"
"typescript": "^5.4.5",
"uuid": "^9.0.1"
}
},
"node_modules/@ampproject/remapping": {
@@ -2502,6 +2502,7 @@
"version": "16.4.5",
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz",
"integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==",
"dev": true,
"engines": {
"node": ">=12"
},
@@ -5290,6 +5291,7 @@
"version": "9.0.1",
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
"integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
"dev": true,
"funding": [
"https://github.com/sponsors/broofa",
"https://github.com/sponsors/ctavan"
@@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "1.2.1",
"version": "1.2.3",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",
@@ -26,10 +26,8 @@
"license": "MIT",
"dependencies": {
"axios": "^1.6.8",
"dotenv": "^16.4.5",
"isows": "^1.0.4",
"typescript-event-target": "^1.1.1",
"uuid": "^9.0.1",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"
},
@@ -38,6 +36,8 @@
},
"homepage": "https://github.com/mendableai/firecrawl#readme",
"devDependencies": {
"uuid": "^9.0.1",
"dotenv": "^16.4.5",
"@jest/globals": "^29.7.0",
"@types/axios": "^0.14.0",
"@types/dotenv": "^8.2.0",
@@ -1,5 +1,5 @@
import axios, { type AxiosResponse, type AxiosRequestHeaders } from "axios";
import { z } from "zod";
import type { ZodSchema } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";
import { WebSocket } from "isows";
import { TypedEventTarget } from "typescript-event-target";
@@ -81,7 +81,7 @@ export interface ScrapeParams {
onlyMainContent?: boolean;
extract?: {
prompt?: string;
schema?: z.ZodSchema | any;
schema?: ZodSchema | any;
systemPrompt?: string;
};
waitFor?: number;
@@ -131,15 +131,14 @@ export interface CrawlResponse {
*/
export interface CrawlStatusResponse {
success: true;
total: number;
status: "scraping" | "completed" | "failed" | "cancelled";
completed: number;
total: number;
creditsUsed: number;
expiresAt: Date;
status: "scraping" | "completed" | "failed";
next: string;
data?: FirecrawlDocument[];
error?: string;
}
next?: string;
data: FirecrawlDocument[];
};

/**
* Parameters for mapping operations.
@@ -329,9 +328,10 @@ export default class FirecrawlApp {
/**
* Checks the status of a crawl job using the Firecrawl API.
* @param id - The ID of the crawl operation.
* @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
* @returns The response containing the job status.
*/
async checkCrawlStatus(id?: string): Promise<CrawlStatusResponse | ErrorResponse> {
async checkCrawlStatus(id?: string, getAllData = false): Promise<CrawlStatusResponse | ErrorResponse> {
if (!id) {
throw new Error("No crawl ID provided");
}
@@ -342,17 +342,29 @@ export default class FirecrawlApp {
`${this.apiUrl}/v1/crawl/${id}`,
headers
);
if (response.status === 200) {
if (response.status === 200 && getAllData) {
let allData = response.data.data;
if (response.data.status === "completed") {
let statusData = response.data
if ("data" in statusData) {
let data = statusData.data;
while ('next' in statusData) {
statusData = (await this.getRequest(statusData.next, headers)).data;
data = data.concat(statusData.data);
}
allData = data;
}
}
return ({
success: true,
success: response.data.success,
status: response.data.status,
total: response.data.total,
completed: response.data.completed,
creditsUsed: response.data.creditsUsed,
expiresAt: new Date(response.data.expiresAt),
next: response.data.next,
data: response.data.data,
error: response.data.error
data: allData,
error: response.data.error,
})
} else {
this.handleError(response, "check crawl status");
@@ -452,22 +464,29 @@ export default class FirecrawlApp {
id: string,
headers: AxiosRequestHeaders,
checkInterval: number
): Promise<CrawlStatusResponse> {
): Promise<CrawlStatusResponse | ErrorResponse> {
while (true) {
const statusResponse: AxiosResponse = await this.getRequest(
let statusResponse: AxiosResponse = await this.getRequest(
`${this.apiUrl}/v1/crawl/${id}`,
headers
);
if (statusResponse.status === 200) {
const statusData = statusResponse.data;
if (statusData.status === "completed") {
if ("data" in statusData) {
return statusData;
} else {
throw new Error("Crawl job completed but no data was returned");
}
} else if (
["active", "paused", "pending", "queued", "scraping"].includes(statusData.status)
let statusData = statusResponse.data;
if (statusData.status === "completed") {
if ("data" in statusData) {
let data = statusData.data;
while ('next' in statusData) {
statusResponse = await this.getRequest(statusData.next, headers);
statusData = statusResponse.data;
data = data.concat(statusData.data);
}
statusData.data = data;
return statusData;
} else {
throw new Error("Crawl job completed but no data was returned");
}
} else if (
["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
) {
checkInterval = Math.max(checkInterval, 2);
await new Promise((resolve) =>
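As the hunks above show, checkCrawlStatus gains a getAllData flag that follows the paginated `next` links until every document has been collected. A hedged usage sketch against the v1 JS SDK follows; the crawl ID handling and API key are placeholders, and the fetchFullCrawl wrapper is illustrative rather than part of the SDK.

```ts
import FirecrawlApp from "firecrawl";

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

async function fetchFullCrawl(crawlId: string) {
  // Pass getAllData = true so the SDK walks the `next` links and
  // concatenates every page of documents before returning.
  const status = await app.checkCrawlStatus(crawlId, true);
  if ("data" in status) {
    console.log(`Fetched ${status.data.length} documents (status: ${status.status})`);
  } else {
    console.error(`Failed to check crawl: ${status.error}`);
  }
  return status;
}
```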
76
apps/js-sdk/package-lock.json
generated
@@ -9,8 +9,8 @@
"version": "1.0.0",
"license": "ISC",
"dependencies": {
"@mendable/firecrawl-js": "^0.0.36",
"axios": "^1.6.8",
"firecrawl": "^1.2.0",
"ts-node": "^10.9.2",
"typescript": "^5.4.5",
"uuid": "^10.0.0",
@@ -422,12 +422,14 @@
}
},
"node_modules/@mendable/firecrawl-js": {
"version": "0.0.36",
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.36.tgz",
"integrity": "sha512-5zQMWUD49r6Q7cxj+QBthQ964Bm9fMooW4E8E4nIca3BMXCeEuQFVf5C3OEWwZf0SjJvR+5Yx2wUbXJWd1wCOA==",
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.2.2.tgz",
"integrity": "sha512-2A1GzLD0bczlFIlcjxHcm/x8i76ndtV4EUzOfc81oOJ/HbycE2mbT6EUthoL+r4s5A8yO3bKr9o/GxmEn456VA==",
"dependencies": {
"axios": "^1.6.8",
"dotenv": "^16.4.5",
"isows": "^1.0.4",
"typescript-event-target": "^1.1.1",
"uuid": "^9.0.1",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"
@@ -594,6 +596,32 @@
"@esbuild/win32-x64": "0.20.2"
}
},
"node_modules/firecrawl": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/firecrawl/-/firecrawl-1.2.0.tgz",
"integrity": "sha512-Sy1BCCvs5FhGc4yxPP7NG9iWnK8RXdvA1ZS/K1Gj+LrEN3iAT2WRzhYET7x8G2bif25F6rHJg57vdVb5sr6RyQ==",
"dependencies": {
"axios": "^1.6.8",
"dotenv": "^16.4.5",
"isows": "^1.0.4",
"typescript-event-target": "^1.1.1",
"uuid": "^9.0.1",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"
}
},
"node_modules/firecrawl/node_modules/uuid": {
"version": "9.0.1",
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
"integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
"funding": [
"https://github.com/sponsors/broofa",
"https://github.com/sponsors/ctavan"
],
"bin": {
"uuid": "dist/bin/uuid"
}
},
"node_modules/follow-redirects": {
"version": "1.15.6",
"resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz",
@@ -652,6 +680,20 @@
"url": "https://github.com/privatenumber/get-tsconfig?sponsor=1"
}
},
"node_modules/isows": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/isows/-/isows-1.0.4.tgz",
"integrity": "sha512-hEzjY+x9u9hPmBom9IIAqdJCwNLax+xrPb51vEPpERoFlIxgmZcHzsT5jKG06nvInKOBGvReAVz80Umed5CczQ==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/wagmi-dev"
}
],
"peerDependencies": {
"ws": "*"
}
},
"node_modules/make-error": {
"version": "1.3.6",
"resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz",
@@ -763,6 +805,11 @@
"node": ">=14.17"
}
},
"node_modules/typescript-event-target": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/typescript-event-target/-/typescript-event-target-1.1.1.tgz",
"integrity": "sha512-dFSOFBKV6uwaloBCCUhxlD3Pr/P1a/tJdcmPrTXCHlEFD3faj0mztjcGn6VBAhQ0/Bdy8K3VWrrqwbt/ffsYsg=="
},
"node_modules/undici-types": {
"version": "5.26.5",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
@@ -786,6 +833,27 @@
"resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz",
"integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg=="
},
"node_modules/ws": {
"version": "8.18.0",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.18.0.tgz",
"integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==",
"peer": true,
"engines": {
"node": ">=10.0.0"
},
"peerDependencies": {
"bufferutil": "^4.0.1",
"utf-8-validate": ">=5.0.2"
},
"peerDependenciesMeta": {
"bufferutil": {
"optional": true
},
"utf-8-validate": {
"optional": true
}
}
},
"node_modules/yn": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz",
@@ -13,6 +13,7 @@
"dependencies": {
"@mendable/firecrawl-js": "^1.0.3",
"axios": "^1.6.8",
"firecrawl": "^1.2.0",
"ts-node": "^10.9.2",
"typescript": "^5.4.5",
"uuid": "^10.0.0",
@@ -13,7 +13,7 @@ import os

from .firecrawl import FirecrawlApp

__version__ = "1.2.1"
__version__ = "1.2.4"

# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")
@@ -13,7 +13,6 @@ import logging
import os
import time
from typing import Any, Dict, Optional, List
import asyncio
import json

import requests
@@ -238,7 +237,6 @@ class FirecrawlApp:
)
if response.status_code == 200:
    response = response.json()
    print(response)
    if response['success'] and 'links' in response:
        return response['links']
    else:
@@ -346,6 +344,12 @@ class FirecrawlApp:
status_data = status_response.json()
if status_data['status'] == 'completed':
    if 'data' in status_data:
        data = status_data['data']
        while 'next' in status_data:
            status_response = self._get_request(status_data['next'], headers)
            status_data = status_response.json()
            data.extend(status_data['data'])
        status_data['data'] = data
        return status_data
    else:
        raise Exception('Crawl job completed but no data was returned')
@@ -12,8 +12,7 @@ dependencies = [
"requests",
"python-dotenv",
"websockets",
"asyncio",
"nest-asyncio"
"nest-asyncio"
]
authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
@@ -2,5 +2,4 @@ requests
pytest
python-dotenv
websockets
asyncio
nest-asyncio
@@ -31,6 +31,7 @@ describe("Scraping Checkup (E2E)", () => {

describe("Scraping website tests with a dataset", () => {
it("Should scrape the website and prompt it against OpenAI", async () => {
let totalTimeTaken = 0;
let passedTests = 0;
const batchSize = 15; // Adjusted to comply with the rate limit of 15 per minute
const batchPromises = [];
@@ -51,11 +52,16 @@ describe("Scraping Checkup (E2E)", () => {
const batchPromise = Promise.all(
batch.map(async (websiteData: WebsiteData) => {
try {
const startTime = new Date().getTime();
const scrapedContent = await request(TEST_URL || "")
.post("/v0/scrape")
.post("/v1/scrape")
.set("Content-Type", "application/json")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.send({ url: websiteData.website, pageOptions: { onlyMainContent: true } });
.send({ url: websiteData.website });

const endTime = new Date().getTime();
const timeTaken = endTime - startTime;
totalTimeTaken += timeTaken;

if (scrapedContent.statusCode !== 200) {
console.error(`Failed to scrape ${websiteData.website} ${scrapedContent.statusCode}`);
@@ -165,6 +171,7 @@ describe("Scraping Checkup (E2E)", () => {
const timeTaken = (endTime - startTime) / 1000;
console.log(`Score: ${score}%`);
console.log(`Total tokens: ${totalTokens}`);
console.log(`Total time taken: ${totalTimeTaken} miliseconds`);

await logErrors(errorLog, timeTaken, totalTokens, score, websitesData.length);
@@ -1,5 +1,6 @@
import { createClient, SupabaseClient } from "@supabase/supabase-js";
import "dotenv/config";
import { configDotenv } from "dotenv";
configDotenv();

// SupabaseService class initializes the Supabase client conditionally based on environment variables.
class SupabaseService {
@@ -0,0 +1,137 @@
# %%
import os
import datetime
import time
from firecrawl import FirecrawlApp
import json
import google.generativeai as genai
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Retrieve API keys from environment variables
google_api_key = os.getenv("GOOGLE_API_KEY")
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")

# Configure the Google Generative AI module with the API key
genai.configure(api_key=google_api_key)
model = genai.GenerativeModel("gemini-1.5-pro-001")

# Set the docs URL
docs_url = "https://docs.firecrawl.dev/api-reference"

# Initialize the FirecrawlApp with your API key
app = FirecrawlApp(api_key=firecrawl_api_key)

# %%
# Crawl all pages on docs
crawl_result = app.crawl_url(docs_url)
print(f"Total pages crawled: {len(crawl_result['data'])}")

# %%
# Define the prompt instructions for generating OpenAPI specs
prompt_instructions = """
Given the following API documentation content, generate an OpenAPI 3.0 specification in JSON format ONLY if you are 100% confident and clear about all details. Focus on extracting the main endpoints, their HTTP methods, parameters, request bodies, and responses. The specification should follow OpenAPI 3.0 structure and conventions. Include only the 200 response for each endpoint. Limit all descriptions to 5 words or less.

If there is ANY uncertainty, lack of complete information, or if you are not 100% confident about ANY part of the specification, return an empty JSON object {{}}.

Do not make anything up. Only include information that is explicitly provided in the documentation. If any detail is unclear or missing, do not attempt to fill it in.

API Documentation Content:
{{content}}

Generate the OpenAPI 3.0 specification in JSON format ONLY if you are 100% confident about every single detail. Include only the JSON object, no additional text, and ensure it has no errors in the JSON format so it can be parsed. Remember to include only the 200 response for each endpoint and keep all descriptions to 5 words maximum.

Once again, if there is ANY doubt, uncertainty, or lack of complete information, return an empty JSON object {{}}.

To reiterate: accuracy is paramount. Do not make anything up. If you are not 100% clear or confident about the entire OpenAPI spec, return an empty JSON object {{}}.
"""

# %%
# Initialize a list to store all API specs
all_api_specs = []

# Process each page in crawl_result
for index, page in enumerate(crawl_result['data']):
    if 'markdown' in page:
        # Update prompt_instructions with the current page's content
        current_prompt = prompt_instructions.replace("{content}", page['markdown'])
        try:
            # Query the model
            response = model.generate_content([current_prompt])
            response_dict = response.to_dict()
            response_text = response_dict['candidates'][0]['content']['parts'][0]['text']

            # Remove the ```json code wrap if present
            response_text = response_text.strip().removeprefix('```json').removesuffix('```').strip()

            # Parse JSON
            json_data = json.loads(response_text)

            # Add non-empty API specs to the list
            if json_data != {}:
                all_api_specs.append(json_data)
                print(f"API specification generated for page {index}")
            else:
                print(f"No API specification found for page {index}")

        except json.JSONDecodeError:
            print(f"Error parsing JSON response for page {index}")
        except Exception as e:
            print(f"An error occurred for page {index}: {str(e)}")

# Print the total number of API specs collected
print(f"Total API specifications collected: {len(all_api_specs)}")

# %%
# Combine all API specs and keep the most filled out spec for each path and method
combined_spec = {
    "openapi": "3.0.0",
    "info": {
        "title": f"{docs_url} API Specification",
        "version": "1.0.0"
    },
    "paths": {},
    "components": {
        "schemas": {}
    }
}

# Helper function to count properties in an object
def count_properties(obj):
    if isinstance(obj, dict):
        return sum(count_properties(v) for v in obj.values()) + len(obj)
    elif isinstance(obj, list):
        return sum(count_properties(item) for item in obj)
    else:
        return 1

# Combine specs, keeping the most detailed version of each path and schema
for spec in all_api_specs:
    # Combine paths
    if "paths" in spec:
        for path, methods in spec["paths"].items():
            if path not in combined_spec["paths"]:
                combined_spec["paths"][path] = {}
            for method, details in methods.items():
                if method not in combined_spec["paths"][path] or count_properties(details) > count_properties(combined_spec["paths"][path][method]):
                    combined_spec["paths"][path][method] = details

    # Combine schemas
    if "components" in spec and "schemas" in spec["components"]:
        for schema_name, schema in spec["components"]["schemas"].items():
            if schema_name not in combined_spec["components"]["schemas"] or count_properties(schema) > count_properties(combined_spec["components"]["schemas"][schema_name]):
                combined_spec["components"]["schemas"][schema_name] = schema

# Print summary of combined spec
print(f"Combined API specification generated")
print(f"Total paths in combined spec: {len(combined_spec['paths'])}")
print(f"Total schemas in combined spec: {len(combined_spec['components']['schemas'])}")

# Save the combined spec to a JSON file in the same directory as the Python file
output_file = os.path.join(os.path.dirname(__file__), "combined_api_spec.json")
with open(output_file, "w") as f:
    json.dump(combined_spec, f, indent=2)

print(f"Combined API specification saved to {output_file}")