mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Merge pull request #326 from mendableai/feat/save-docs-on-supabase
[Feat] Added implementation for saving docs on supabase
This commit is contained in:
commit
30c1118713
6
.github/workflows/fly-direct.yml
vendored
6
.github/workflows/fly-direct.yml
vendored
|
@ -29,9 +29,9 @@ jobs:
|
|||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Change directory
|
||||
run: cd apps/api
|
||||
- uses: superfly/flyctl-actions/setup-flyctl@master
|
||||
- run: flyctl deploy ./apps/api --remote-only -a firecrawl-scraper-js
|
||||
- run: flyctl deploy --remote-only -a firecrawl-scraper-js && curl -X POST https://api.firecrawl.dev/admin/$BULL_AUTH_KEY/unpause
|
||||
working-directory: ./apps/api
|
||||
env:
|
||||
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
||||
BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
|
||||
|
|
6
.github/workflows/fly.yml
vendored
6
.github/workflows/fly.yml
vendored
|
@ -175,12 +175,12 @@ jobs:
|
|||
needs: [pre-deploy-test-suite, python-sdk-tests, js-sdk-tests]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Change directory
|
||||
run: cd apps/api
|
||||
- uses: superfly/flyctl-actions/setup-flyctl@master
|
||||
- run: flyctl deploy ./apps/api --remote-only -a firecrawl-scraper-js
|
||||
- run: flyctl deploy --remote-only -a firecrawl-scraper-js && curl -X POST https://api.firecrawl.dev/admin/$BULL_AUTH_KEY/unpause
|
||||
working-directory: ./apps/api
|
||||
env:
|
||||
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
||||
BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
|
||||
|
||||
build-and-publish-python-sdk:
|
||||
name: Build and publish Python SDK
|
||||
|
|
|
@ -31,6 +31,3 @@ COPY --from=build /app /app
|
|||
# Start the server by default, this can be overwritten at runtime
|
||||
EXPOSE 8080
|
||||
ENV PUPPETEER_EXECUTABLE_PATH="/usr/bin/chromium"
|
||||
CMD [ "pnpm", "run", "start:production" ]
|
||||
CMD [ "pnpm", "run", "worker:production" ]
|
||||
|
||||
|
|
|
@ -6,13 +6,16 @@
|
|||
app = 'staging-firecrawl-scraper-js'
|
||||
primary_region = 'mia'
|
||||
kill_signal = 'SIGINT'
|
||||
kill_timeout = '5s'
|
||||
kill_timeout = '30s'
|
||||
|
||||
[deploy]
|
||||
release_command = 'node dist/src/trigger-shutdown.js https://staging-firecrawl-scraper-js.fly.dev'
|
||||
|
||||
[build]
|
||||
|
||||
[processes]
|
||||
app = 'npm run start:production'
|
||||
worker = 'npm run worker:production'
|
||||
app = 'node dist/src/index.js'
|
||||
worker = 'node dist/src/services/queue-worker.js'
|
||||
|
||||
[http_service]
|
||||
internal_port = 8080
|
||||
|
|
|
@ -6,13 +6,16 @@
|
|||
app = 'firecrawl-scraper-js'
|
||||
primary_region = 'mia'
|
||||
kill_signal = 'SIGINT'
|
||||
kill_timeout = '5s'
|
||||
kill_timeout = '30s'
|
||||
|
||||
[deploy]
|
||||
release_command = 'node dist/src/trigger-shutdown.js https://api.firecrawl.dev'
|
||||
|
||||
[build]
|
||||
|
||||
[processes]
|
||||
app = 'npm run start:production'
|
||||
worker = 'npm run worker:production'
|
||||
app = 'node dist/src/index.js'
|
||||
worker = 'node dist/src/services/queue-worker.js'
|
||||
|
||||
[http_service]
|
||||
internal_port = 8080
|
||||
|
|
|
@ -19,7 +19,8 @@
|
|||
"mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest",
|
||||
"mongo-docker-console": "docker exec -it mongodb mongosh",
|
||||
"run-example": "npx ts-node src/example.ts",
|
||||
"deploy:fly:staging": "fly deploy -c fly.staging.toml"
|
||||
"deploy:fly": "flyctl deploy && node postdeploy.js https://api.firecrawl.dev",
|
||||
"deploy:fly:staging": "fly deploy -c fly.staging.toml && node postdeploy.js https://staging-firecrawl-scraper-js.fly.dev"
|
||||
},
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
|
|
11
apps/api/postdeploy.js
Normal file
11
apps/api/postdeploy.js
Normal file
|
@ -0,0 +1,11 @@
|
|||
require("dotenv").config();

// Post-deploy hook: asks the freshly deployed API to resume its queue.
//
// Usage: node postdeploy.js <base-url>
//
// Sends POST <base-url>/admin/<BULL_AUTH_KEY>/unpause, where BULL_AUTH_KEY is
// read from the environment (populated from .env by dotenv above).
// Exits 0 when the endpoint answers with a success status; exits 1 when the
// base URL is missing, the request fails, or the endpoint answers with an
// HTTP error status.
const baseUrl = process.argv[2];

if (!baseUrl) {
  console.error("Usage: node postdeploy.js <base-url>");
  process.exit(1);
}

fetch(baseUrl + "/admin/" + process.env.BULL_AUTH_KEY + "/unpause", {
  method: "POST"
}).then(async x => {
  const body = await x.text();

  // fetch() only rejects on network-level failures; an HTTP 4xx/5xx still
  // resolves, so the status has to be checked explicitly. Without this a
  // failed unpause would be reported as a successful deploy step.
  if (!x.ok) {
    console.error("Unpause request failed with status " + x.status + ":", body);
    process.exit(1);
  }

  console.log(body);
  process.exit(0);
}).catch(e => {
  console.error(e);
  process.exit(1);
});
|
|
@ -331,7 +331,7 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
}, 60000); // 60 seconds
|
||||
}, 180000); // 180 seconds
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => {
|
||||
const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
|
||||
|
|
|
@ -3,6 +3,7 @@ import { authenticateUser } from "./auth";
|
|||
import { RateLimiterMode } from "../../src/types";
|
||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||
import { getWebScraperQueue } from "../../src/services/queue-service";
|
||||
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
|
||||
|
||||
export async function crawlStatusController(req: Request, res: Response) {
|
||||
try {
|
||||
|
@ -20,15 +21,27 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||
}
|
||||
|
||||
const { current, current_url, total, current_step, partialDocs } = await job.progress();
|
||||
|
||||
let data = job.returnvalue;
|
||||
if (process.env.USE_DB_AUTHENTICATION) {
|
||||
const supabaseData = await supabaseGetJobById(req.params.jobId);
|
||||
|
||||
if (supabaseData) {
|
||||
data = supabaseData.docs;
|
||||
}
|
||||
}
|
||||
|
||||
const jobStatus = await job.getState();
|
||||
|
||||
res.json({
|
||||
status: await job.getState(),
|
||||
status: jobStatus,
|
||||
// progress: job.progress(),
|
||||
current: current,
|
||||
current_url: current_url,
|
||||
current_step: current_step,
|
||||
total: total,
|
||||
data: job.returnvalue,
|
||||
partial_data: partialDocs ?? [],
|
||||
current,
|
||||
current_url,
|
||||
current_step,
|
||||
total,
|
||||
data: data ? data : null,
|
||||
partial_data: jobStatus == 'completed' ? [] : partialDocs,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import { Request, Response } from "express";
|
||||
import { getWebScraperQueue } from "../../src/services/queue-service";
|
||||
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
|
||||
|
||||
export async function crawlJobStatusPreviewController(req: Request, res: Response) {
|
||||
try {
|
||||
|
@ -9,15 +10,26 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
|
|||
}
|
||||
|
||||
const { current, current_url, total, current_step, partialDocs } = await job.progress();
|
||||
let data = job.returnvalue;
|
||||
if (process.env.USE_DB_AUTHENTICATION) {
|
||||
const supabaseData = await supabaseGetJobById(req.params.jobId);
|
||||
|
||||
if (supabaseData) {
|
||||
data = supabaseData.docs;
|
||||
}
|
||||
}
|
||||
|
||||
const jobStatus = await job.getState();
|
||||
|
||||
res.json({
|
||||
status: await job.getState(),
|
||||
status: jobStatus,
|
||||
// progress: job.progress(),
|
||||
current: current,
|
||||
current_url: current_url,
|
||||
current_step: current_step,
|
||||
total: total,
|
||||
data: job.returnvalue,
|
||||
partial_data: partialDocs ?? [],
|
||||
current,
|
||||
current_url,
|
||||
current_step,
|
||||
total,
|
||||
data: data ? data : null,
|
||||
partial_data: jobStatus == 'completed' ? [] : partialDocs,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
|
|
|
@ -26,9 +26,11 @@ if (cluster.isMaster) {
|
|||
}
|
||||
|
||||
cluster.on("exit", (worker, code, signal) => {
|
||||
if (code !== null) {
|
||||
console.log(`Worker ${worker.process.pid} exited`);
|
||||
console.log("Starting a new worker");
|
||||
cluster.fork();
|
||||
}
|
||||
});
|
||||
} else {
|
||||
const app = express();
|
||||
|
@ -97,6 +99,7 @@ if (cluster.isMaster) {
|
|||
app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
|
||||
const [webScraperActive] = await Promise.all([
|
||||
webScraperQueue.getActiveCount(),
|
||||
]);
|
||||
|
@ -113,6 +116,49 @@ if (cluster.isMaster) {
|
|||
}
|
||||
});
|
||||
|
||||
/**
 * POST /admin/<BULL_AUTH_KEY>/shutdown
 *
 * Pauses the web scraper queue and recycles every job that is currently
 * active: each active job has its lock key deleted, is locked by this
 * process, moved to failed with the message "interrupted", removed, and then
 * re-added to the queue with the same data and job id.
 *
 * Responds with `{ ok: true }` on success (whether or not any jobs were
 * active) and with HTTP 500 `{ error }` if any step throws.
 */
app.post(`/admin/${process.env.BULL_AUTH_KEY}/shutdown`, async (req, res) => {
  try {
    const wsq = getWebScraperQueue();

    console.log("Gracefully shutting down...");

    // NOTE(review): in Bull's Queue#pause(isLocal, doNotWaitActive) these
    // arguments mean "pause globally, do not wait for active jobs" — confirm
    // against the queue library version in use.
    await wsq.pause(false, true);

    const jobs = await wsq.getActive();

    if (jobs.length > 0) {
      console.log("Removing", jobs.length, "jobs...");

      await Promise.all(jobs.map(async x => {
        // Drop the existing lock first so this process can take it and is
        // then allowed to fail and remove the job.
        await wsq.client.del(await x.lockKey());
        await x.takeLock();
        await x.moveToFailed({ message: "interrupted" });
        await x.remove();
      }));

      console.log("Re-adding", jobs.length, "jobs...");
      await wsq.addBulk(jobs.map(x => ({
        data: x.data,
        opts: {
          // Keep the original id so callers polling by job id still find it.
          jobId: x.id,
        },
      })));

      console.log("Done!");
    }

    // Always answer, even when there was nothing to recycle; otherwise the
    // caller waits on a response that never comes.
    return res.json({ ok: true });
  } catch (error) {
    console.error(error);
    return res.status(500).json({
      error: error instanceof Error ? error.message : String(error),
    });
  }
});
|
||||
|
||||
/**
 * POST /admin/<BULL_AUTH_KEY>/unpause
 *
 * Resumes the web scraper queue via `resume(false)`.
 *
 * Responds with `{ ok: true }` on success and with HTTP 500 `{ error }` if
 * resuming throws, so the caller always gets an answer.
 */
app.post(`/admin/${process.env.BULL_AUTH_KEY}/unpause`, async (req, res) => {
  try {
    await getWebScraperQueue().resume(false);
    return res.json({ ok: true });
  } catch (error) {
    // Without this guard a rejected resume() would leave the request
    // unanswered instead of reporting the failure.
    console.error(error);
    return res.status(500).json({
      error: error instanceof Error ? error.message : String(error),
    });
  }
});
|
||||
|
||||
app.get(`/serverHealthCheck`, async (req, res) => {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
|
|
19
apps/api/src/lib/supabase-jobs.ts
Normal file
19
apps/api/src/lib/supabase-jobs.ts
Normal file
|
@ -0,0 +1,19 @@
|
|||
import { supabase_service } from "../services/supabase";
|
||||
|
||||
/**
 * Looks up a single row of the `firecrawl_jobs` table by its `job_id` column.
 *
 * @param jobId - Value matched against the `job_id` column.
 * @returns The row when the query succeeds and yields data; `null` when the
 *   query reports an error or returns no data.
 */
export const supabaseGetJobById = async (jobId: string) => {
  const result = await supabase_service
    .from('firecrawl_jobs')
    .select('*')
    .eq('job_id', jobId)
    .single();

  // A query error and an empty result are both surfaced to the caller as
  // "no job"; the error itself is not propagated.
  return result.error || !result.data ? null : result.data;
}
|
|
@ -4,6 +4,7 @@ import { WebScraperDataProvider } from "../scraper/WebScraper";
|
|||
import { DocumentUrl, Progress } from "../lib/entities";
|
||||
import { billTeam } from "../services/billing/credit_billing";
|
||||
import { Document } from "../lib/entities";
|
||||
import { supabase_service } from "../services/supabase";
|
||||
|
||||
export async function startWebScraperPipeline({
|
||||
job,
|
||||
|
@ -26,7 +27,7 @@ export async function startWebScraperPipeline({
|
|||
}
|
||||
},
|
||||
onSuccess: (result) => {
|
||||
job.moveToCompleted(result);
|
||||
saveJob(job, result);
|
||||
},
|
||||
onError: (error) => {
|
||||
job.moveToFailed(error);
|
||||
|
@ -107,3 +108,22 @@ export async function runWebScraper({
|
|||
return { success: false, message: error.message, docs: [] };
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Persists a finished job's result and marks the queue job as completed.
 *
 * When `USE_DB_AUTHENTICATION` is set, the result is written to the `docs`
 * column of the matching `firecrawl_jobs` row and the queue job is completed
 * with a `null` return value. If that write fails, the failure is logged and
 * the job is completed with the result itself, exactly as in the non-DB path,
 * so the result is not dropped. Without `USE_DB_AUTHENTICATION` the job is
 * always completed with the result.
 *
 * Never throws: failures are logged to the console.
 *
 * @param job - Queue job to complete.
 * @param result - Value produced by the scraper pipeline for this job.
 */
const saveJob = async (job: Job, result: any) => {
  // Return value handed to the queue; stays `result` unless the docs were
  // stored in the database successfully.
  let returnValue = result;

  if (process.env.USE_DB_AUTHENTICATION) {
    try {
      const { error } = await supabase_service
        .from("firecrawl_jobs")
        .update({ docs: result })
        .eq("job_id", job.id);

      if (error) throw new Error(error.message);

      returnValue = null;
    } catch (error) {
      console.error("Failed to save job docs, keeping them on the queue job:", error);
    }
  }

  try {
    await job.moveToCompleted(returnValue);
  } catch (error) {
    console.error("Failed to update job status:", error);
  }
}
|
||||
|
||||
|
|
|
@ -6,8 +6,7 @@ import "dotenv/config";
|
|||
|
||||
export async function logJob(job: FirecrawlJob) {
|
||||
try {
|
||||
// Only log jobs in production
|
||||
if (process.env.ENV !== "production") {
|
||||
if (!process.env.USE_DB_AUTHENTICATION) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -25,6 +24,7 @@ export async function logJob(job: FirecrawlJob) {
|
|||
.from("firecrawl_jobs")
|
||||
.insert([
|
||||
{
|
||||
job_id: job.job_id ? job.job_id : null,
|
||||
success: job.success,
|
||||
message: job.message,
|
||||
num_docs: job.num_docs,
|
||||
|
@ -38,6 +38,7 @@ export async function logJob(job: FirecrawlJob) {
|
|||
origin: job.origin,
|
||||
extractor_options: job.extractor_options,
|
||||
num_tokens: job.num_tokens,
|
||||
retry: !!job.retry,
|
||||
},
|
||||
]);
|
||||
|
||||
|
@ -61,6 +62,7 @@ export async function logJob(job: FirecrawlJob) {
|
|||
origin: job.origin,
|
||||
extractor_options: job.extractor_options,
|
||||
num_tokens: job.num_tokens,
|
||||
retry: job.retry,
|
||||
},
|
||||
};
|
||||
posthog.capture(phLog);
|
||||
|
|
|
@ -8,13 +8,17 @@ import { logJob } from "./logging/log_job";
|
|||
import { initSDK } from '@hyperdx/node-opentelemetry';
|
||||
|
||||
if(process.env.ENV === 'production') {
|
||||
initSDK({ consoleCapture: true, additionalInstrumentations: []});
|
||||
initSDK({
|
||||
consoleCapture: true,
|
||||
additionalInstrumentations: [],
|
||||
});
|
||||
}
|
||||
|
||||
getWebScraperQueue().process(
|
||||
const wsq = getWebScraperQueue();
|
||||
|
||||
wsq.process(
|
||||
Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
|
||||
async function (job, done) {
|
||||
|
||||
try {
|
||||
job.progress({
|
||||
current: 1,
|
||||
|
@ -41,6 +45,7 @@ getWebScraperQueue().process(
|
|||
await callWebhook(job.data.team_id, job.id as string, data);
|
||||
|
||||
await logJob({
|
||||
job_id: job.id as string,
|
||||
success: success,
|
||||
message: message,
|
||||
num_docs: docs.length,
|
||||
|
@ -55,6 +60,10 @@ getWebScraperQueue().process(
|
|||
});
|
||||
done(null, data);
|
||||
} catch (error) {
|
||||
if (await getWebScraperQueue().isPaused(false)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (error instanceof CustomError) {
|
||||
// Here we handle the error, then save the failed job
|
||||
console.error(error.message); // or any other error handling
|
||||
|
@ -80,6 +89,7 @@ getWebScraperQueue().process(
|
|||
};
|
||||
await callWebhook(job.data.team_id, job.id as string, data);
|
||||
await logJob({
|
||||
job_id: job.id as string,
|
||||
success: false,
|
||||
message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"),
|
||||
num_docs: 0,
|
||||
|
|
9
apps/api/src/trigger-shutdown.ts
Normal file
9
apps/api/src/trigger-shutdown.ts
Normal file
|
@ -0,0 +1,9 @@
|
|||
// Release-time helper: POSTs to <argv[2]>/admin/<BULL_AUTH_KEY>/shutdown,
// where the base URL is the first CLI argument and BULL_AUTH_KEY comes from
// the environment.
//
// Prints the response body and exits 0 for any response that arrives,
// regardless of its HTTP status; prints the error and exits 1 if the request
// or reading the body fails.
(async () => {
  const shutdownUrl = process.argv[2] + "/admin/" + process.env.BULL_AUTH_KEY + "/shutdown";

  try {
    const response = await fetch(shutdownUrl, { method: "POST" });
    const body = await response.text();
    console.log(body);
    process.exit(0);
  } catch (e) {
    console.error(e);
    process.exit(1);
  }
})();
|
|
@ -48,6 +48,7 @@ export interface RunWebScraperResult {
|
|||
}
|
||||
|
||||
export interface FirecrawlJob {
|
||||
job_id?: string;
|
||||
success: boolean;
|
||||
message: string;
|
||||
num_docs: number;
|
||||
|
@ -61,6 +62,7 @@ export interface FirecrawlJob {
|
|||
origin: string;
|
||||
extractor_options?: ExtractorOptions,
|
||||
num_tokens?: number,
|
||||
retry?: boolean,
|
||||
}
|
||||
|
||||
export interface FirecrawlScrapeResponse {
|
||||
|
|
Loading…
Reference in New Issue
Block a user