Remove Logtail and HyperDX instrumentation and move crawling and scraping onto the v1 scrapeOptions / scrapeURL pipeline, dropping the legacy pageOptions and extractorOptions converters and temporarily commenting out the v0 routes, admin routes, and ScrapeEvents logging

Gergő Móricz 2024-10-05 00:25:18 +02:00
parent 33555742a7
commit 33cde0588d
39 changed files with 261 additions and 2145 deletions

View File

@ -8,7 +8,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@ -21,7 +20,6 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
jobs:

View File

@ -8,7 +8,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@ -21,7 +20,6 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
jobs:

View File

@ -8,7 +8,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@ -21,7 +20,6 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1

View File

@ -12,7 +12,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@ -25,7 +24,6 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}

View File

@ -9,7 +9,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}

View File

@ -10,7 +10,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}

View File

@ -41,7 +41,6 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages

View File

@ -62,7 +62,6 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages

View File

@ -28,8 +28,6 @@ SCRAPING_BEE_API_KEY=
# add for LLM dependednt features (image alt generation, etc.)
OPENAI_API_KEY=
BULL_AUTH_KEY=@
# use if you're configuring basic logging with logtail
LOGTAIL_KEY=
# set if you have a llamaparse key you'd like to use to parse pdfs
LLAMAPARSE_API_KEY=
# set if you'd like to send slack server health status messages
@ -49,9 +47,6 @@ STRIPE_PRICE_ID_STANDARD_NEW_YEARLY=
STRIPE_PRICE_ID_GROWTH=
STRIPE_PRICE_ID_GROWTH_YEARLY=
HYPERDX_API_KEY=
HDX_NODE_BETA_MODE=1
# set if you'd like to use the fire engine closed beta
FIRE_ENGINE_BETA_URL=

View File

@ -56,8 +56,6 @@
"@bull-board/express": "^5.20.5",
"@devil7softwares/pos": "^1.0.2",
"@dqbd/tiktoken": "^1.0.16",
"@hyperdx/node-opentelemetry": "^0.8.1",
"@logtail/node": "^0.4.12",
"@nangohq/node": "^0.40.8",
"@sentry/cli": "^2.33.1",
"@sentry/node": "^8.26.0",

File diff suppressed because it is too large

View File

@ -20,7 +20,6 @@ describe("E2E Tests for API Routes with No Authentication", () => {
process.env.SCRAPING_BEE_API_KEY = "";
process.env.OPENAI_API_KEY = "";
process.env.BULL_AUTH_KEY = "";
process.env.LOGTAIL_KEY = "";
process.env.PLAYWRIGHT_MICROSERVICE_URL = "";
process.env.LLAMAPARSE_API_KEY = "";
process.env.TEST_API_KEY = "";

View File

@ -9,7 +9,6 @@ import {
import { supabase_service } from "../services/supabase";
import { withAuth } from "../lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
import { sendNotification } from "../services/notification/email_notification";
import { logger } from "../lib/logger";
import { redlock } from "../services/redlock";
@ -109,18 +108,6 @@ export async function authenticateUser(
return withAuth(supaAuthenticateUser, { success: true, chunk: null, team_id: "bypass" })(req, res, mode);
}
function setTrace(team_id: string, api_key: string) {
try {
setTraceAttributes({
team_id,
api_key,
});
} catch (error) {
Sentry.captureException(error);
logger.error(`Error setting trace attributes: ${error.message}`);
}
}
export async function supaAuthenticateUser(
req,
res,
@ -182,8 +169,6 @@ export async function supaAuthenticateUser(
priceId = chunk.price_id;
const plan = getPlanByPriceId(priceId);
// HyperDX Logging
setTrace(teamId, normalizedApi);
subscriptionData = {
team_id: teamId,
plan,

View File

@ -1,7 +1,7 @@
import { authMiddleware } from "../../routes/v1";
import { RateLimiterMode } from "../../types";
import { authenticateUser } from "../auth";
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, RequestWithAuth } from "./types";
import { WebSocket } from "ws";
import { v4 as uuidv4 } from "uuid";
import { logger } from "../../lib/logger";
@ -78,7 +78,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
if (job.returnvalue) {
send(ws, {
type: "document",
data: legacyDocumentConverter(job.returnvalue),
data: job.returnvalue,
})
} else {
return close(ws, 3000, { type: "error", error: job.failedReason });
@ -109,7 +109,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
completed: doneJobIDs.length,
creditsUsed: jobIDs.length,
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
data: data.map(x => legacyDocumentConverter(x)),
data: data,
}
});

View File

@ -1,5 +1,5 @@
import { Response } from "express";
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, RequestWithAuth } from "./types";
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
@ -81,7 +81,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) {
const job = jobs[ii];
doneJobs.push(job);
bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length;
bytes += JSON.stringify(job.returnvalue).length;
}
}
@ -105,7 +105,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
}
if (data.length > 0) {
if (!doneJobs[0].data.pageOptions.includeRawHtml) {
if (!doneJobs[0].data.scrapeOptions.formats.includes("rawHtml")) {
for (let ii = 0; ii < doneJobs.length; ii++) {
if (data[ii]) {
delete data[ii].rawHtml;
@ -125,7 +125,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
? undefined
: nextURL.href,
data: data.map(x => legacyDocumentConverter(x)),
data: data,
});
}

View File

@ -5,7 +5,6 @@ import {
crawlRequestSchema,
CrawlResponse,
legacyCrawlerOptions,
legacyScrapeOptions,
RequestWithAuth,
} from "./types";
import {
@ -23,6 +22,7 @@ import { addScrapeJob } from "../../services/queue-jobs";
import { logger } from "../../lib/logger";
import { getJobPriority } from "../../lib/job-priority";
import { callWebhook } from "../../services/webhook";
import { scrapeOptions as scrapeOptionsSchema } from "./types";
export async function crawlController(
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
@ -37,7 +37,7 @@ export async function crawlController(
const remainingCredits = req.account?.remainingCredits ?? 0;
const crawlerOptions = legacyCrawlerOptions(req.body);
const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
const scrapeOptions = req.body.scrapeOptions;
// TODO: @rafa, is this right? copied from v0
if (Array.isArray(crawlerOptions.includes)) {
@ -65,7 +65,8 @@ export async function crawlController(
const sc: StoredCrawl = {
originUrl: req.body.url,
crawlerOptions,
pageOptions,
scrapeOptions,
internalOptions: {},
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,
@ -107,7 +108,7 @@ export async function crawlController(
mode: "single_urls",
team_id: req.auth.team_id,
crawlerOptions,
pageOptions,
scrapeOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
@ -136,9 +137,8 @@ export async function crawlController(
{
url: req.body.url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id: req.auth.team_id,
pageOptions: pageOptions,
scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
origin: "api",
crawl_id: id,
webhook: req.body.webhook,

View File

@ -4,6 +4,7 @@ import {
legacyCrawlerOptions,
mapRequestSchema,
RequestWithAuth,
scrapeOptions,
} from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
@ -45,7 +46,8 @@ export async function mapController(
const sc: StoredCrawl = {
originUrl: req.body.url,
crawlerOptions: legacyCrawlerOptions(req.body),
pageOptions: {},
scrapeOptions: scrapeOptions.parse({}),
internalOptions: {},
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,
@ -175,9 +177,8 @@ export async function mapController(
mode: "map",
url: req.body.url,
crawlerOptions: {},
pageOptions: {},
scrapeOptions: {},
origin: req.body.origin,
extractor_options: { mode: "markdown" },
num_tokens: 0,
});

View File

@ -1,10 +1,7 @@
import { Request, Response } from "express";
import { Response } from "express";
import { logger } from "../../lib/logger";
import {
Document,
legacyDocumentConverter,
legacyExtractorOptions,
legacyScrapeOptions,
RequestWithAuth,
ScrapeRequest,
scrapeRequestSchema,
@ -27,8 +24,6 @@ export async function scrapeController(
const origin = req.body.origin;
const timeout = req.body.timeout;
const pageOptions = legacyScrapeOptions(req.body);
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
const jobId = uuidv4();
const startTime = new Date().getTime();
@ -42,10 +37,9 @@ export async function scrapeController(
{
url: req.body.url,
mode: "single_urls",
crawlerOptions: {},
team_id: req.auth.team_id,
pageOptions,
extractorOptions,
scrapeOptions: req.body,
internalOptions: {},
origin: req.body.origin,
is_scrape: true,
},
@ -56,9 +50,9 @@ export async function scrapeController(
const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds : 0) + a, 0);
let doc: any | undefined;
let doc: Document;
try {
doc = (await waitForJob<any[]>(job.id, timeout + totalWait))[0]; // TODO: better types for this
doc = await waitForJob<Document>(job.id, timeout + totalWait); // TODO: better types for this
} catch (e) {
logger.error(`Error in scrapeController: ${e}`);
if (e instanceof Error && e.message.startsWith("Job wait")) {
@ -69,34 +63,19 @@ export async function scrapeController(
} else {
return res.status(500).json({
success: false,
error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
extractorOptions && extractorOptions.mode !== "markdown"
? " - Could be due to LLM parsing issues"
: ""
}`,
error: `(Internal server error) - ${e && e?.message ? e.message : e}`,
});
}
}
await job.remove();
if (!doc) {
console.error("!!! PANIC DOC IS", doc, job);
return res.status(200).json({
success: true,
warning: "No page found",
data: doc,
});
}
delete doc.index;
delete doc.provider;
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens =
doc && doc.markdown
? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
doc && doc.extract
// ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
? 0 // TODO: fix
: 0;
let creditsToBeBilled = 1; // Assuming 1 credit per document
@ -113,18 +92,12 @@ export async function scrapeController(
// Optionally, you could notify an admin or add to a retry queue here
});
if (!pageOptions || !pageOptions.includeRawHtml) {
if (!req.body.formats.includes("rawHtml")) {
if (doc && doc.rawHtml) {
delete doc.rawHtml;
}
}
if(pageOptions && pageOptions.includeExtract) {
if(!pageOptions.includeMarkdown && doc && doc.markdown) {
delete doc.markdown;
}
}
logJob({
job_id: jobId,
success: true,
@ -135,16 +108,14 @@ export async function scrapeController(
team_id: req.auth.team_id,
mode: "scrape",
url: req.body.url,
crawlerOptions: {},
pageOptions: pageOptions,
scrapeOptions: req.body,
origin: origin,
extractor_options: { mode: "markdown" },
num_tokens: numTokens,
});
return res.status(200).json({
success: true,
data: legacyDocumentConverter(doc),
data: doc,
scrape_id: origin?.includes("website") ? jobId : undefined,
});
}

View File

@ -1,7 +1,6 @@
import { Request, Response } from "express";
import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
import { PlanType } from "../../types";
@ -394,68 +393,3 @@ export function legacyCrawlerOptions(x: CrawlerOptions) {
allowExternalContentLinks: x.allowExternalLinks,
};
}
export function legacyScrapeOptions(x: Omit<ScrapeOptions, "timeout">): PageOptions {
return {
includeMarkdown: x.formats.includes("markdown"),
includeHtml: x.formats.includes("html"),
includeRawHtml: x.formats.includes("rawHtml"),
includeExtract: x.formats.includes("extract"),
onlyIncludeTags: x.includeTags,
removeTags: x.excludeTags,
onlyMainContent: x.onlyMainContent,
waitFor: x.waitFor,
headers: x.headers,
includeLinks: x.formats.includes("links"),
screenshot: x.formats.includes("screenshot"),
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
parsePDF: x.parsePDF,
actions: x.actions as Action[], // no strict null checking grrrr - mogery
};
}
export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
return {
mode: x.mode ? "llm-extraction" : "markdown",
extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.",
extractionSchema: x.schema,
userPrompt: x.prompt ?? "",
};
}
export function legacyDocumentConverter(doc: null | undefined): null
export function legacyDocumentConverter(doc: any): Document
export function legacyDocumentConverter(doc: any): Document | null {
if (doc === null || doc === undefined) return null as any;
if (doc.metadata) {
if (doc.metadata.screenshot) {
doc.screenshot = doc.metadata.screenshot;
delete doc.metadata.screenshot;
}
if (doc.metadata.fullPageScreenshot) {
doc.fullPageScreenshot = doc.metadata.fullPageScreenshot;
delete doc.metadata.fullPageScreenshot;
}
}
const document: Document = {
markdown: doc.markdown,
links: doc.linksOnPage,
rawHtml: doc.rawHtml,
html: doc.html,
extract: doc.llm_extraction,
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
actions: doc.actions ?? undefined,
metadata: {
...doc.metadata,
pageError: undefined,
pageStatusCode: undefined,
error: doc.metadata.pageError,
statusCode: doc.metadata.pageStatusCode,
},
};
return document;
}
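
With legacyScrapeOptions, legacyExtractorOptions and legacyDocumentConverter gone, the controllers work directly on the zod-parsed v1 request body, and format-dependent behaviour is decided by checking formats rather than the old boolean pageOptions flags. A minimal sketch of that pattern, assuming the file sits next to this one so the relative import resolves; the stripUnrequestedFormats helper name is illustrative, not part of the codebase:

import { Document, ScrapeOptions } from "./types";

// Illustrative helper mirroring the controllers above: rawHtml is only kept
// in the response when the caller asked for the "rawHtml" format.
function stripUnrequestedFormats(doc: Document, options: ScrapeOptions): Document {
  if (!options.formats.includes("rawHtml") && doc.rawHtml) {
    delete doc.rawHtml;
  }
  return doc;
}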

View File

@ -5,8 +5,7 @@ import express, { NextFunction, Request, Response } from "express";
import bodyParser from "body-parser";
import cors from "cors";
import { getScrapeQueue } from "./services/queue-service";
import { v0Router } from "./routes/v0";
import { initSDK } from "@hyperdx/node-opentelemetry";
// import { v0Router } from "./routes/v0";
import cluster from "cluster";
import os from "os";
import { logger } from "./lib/logger";
@ -86,18 +85,13 @@ if (cluster.isMaster) {
});
// register router
app.use(v0Router);
// app.use(v0Router);
app.use("/v1", v1Router);
app.use(adminRouter);
const DEFAULT_PORT = process.env.PORT ?? 3002;
const HOST = process.env.HOST ?? "localhost";
// HyperDX OpenTelemetry
if (process.env.ENV === "production") {
initSDK({ consoleCapture: true, additionalInstrumentations: [] });
}
function startServer(port = DEFAULT_PORT) {
const server = app.listen(Number(port), HOST, () => {
logger.info(`Worker ${process.pid} listening on port ${port}`);

View File

@ -1,10 +1,13 @@
import { InternalOptions } from "../scraper/scrapeURL";
import { ScrapeOptions } from "../controllers/v1/types";
import { WebCrawler } from "../scraper/WebScraper/crawler";
import { redisConnection } from "../services/queue-service";
export type StoredCrawl = {
originUrl: string;
crawlerOptions: any;
pageOptions: any;
scrapeOptions: Omit<ScrapeOptions, "timeout">;
internalOptions: InternalOptions;
team_id: string;
plan?: string;
robots?: string;
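
A stored crawl now carries the parsed v1 scrape options plus the new internal options instead of the loosely typed pageOptions. A minimal sketch of constructing one, mirroring what mapController builds above; the import paths assume the file lives under src/lib, and every concrete value is a placeholder:

import { scrapeOptions } from "../controllers/v1/types";
import { StoredCrawl } from "./crawl-redis";

// Minimal StoredCrawl under the new shape; the zod schema supplies the v1
// defaults for scrapeOptions, and internalOptions starts empty.
const sc: StoredCrawl = {
  originUrl: "https://example.com",
  crawlerOptions: {},
  scrapeOptions: scrapeOptions.parse({}),
  internalOptions: {},
  team_id: "team-id-placeholder",
  createdAt: Date.now(),
};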

View File

@ -1,3 +1,5 @@
import type { Document as V1Document } from "../controllers/v1/types";
export interface Progress {
current: number;
total: number;

View File

@ -1,42 +0,0 @@
// import { scrapWithFireEngine } from "../../src/scraper/WebScraper/single_url";
// const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
// const scrapInBatches = async (
// urls: string[],
// batchSize: number,
// delayMs: number
// ) => {
// let successCount = 0;
// let errorCount = 0;
// for (let i = 0; i < urls.length; i += batchSize) {
// const batch = urls
// .slice(i, i + batchSize)
// .map((url) => scrapWithFireEngine(url));
// try {
// const results = await Promise.all(batch);
// results.forEach((data, index) => {
// if (data.trim() === "") {
// errorCount++;
// } else {
// successCount++;
// console.log(
// `Scraping result ${i + index + 1}:`,
// data.trim().substring(0, 20) + "..."
// );
// }
// });
// } catch (error) {
// console.error("Error during scraping:", error);
// }
// await delay(delayMs);
// }
// console.log(`Total successful scrapes: ${successCount}`);
// console.log(`Total errored scrapes: ${errorCount}`);
// };
// function run() {
// const urls = Array.from({ length: 200 }, () => "https://scrapethissite.com");
// scrapInBatches(urls, 10, 1000);
// }

View File

@ -1,8 +1,8 @@
import { Job } from "bullmq";
import type { baseScrapers } from "../scraper/WebScraper/single_url";
import { supabase_service as supabase } from "../services/supabase";
import { logger } from "./logger";
import { configDotenv } from "dotenv";
import { Engine } from "../scraper/scrapeURL/engines";
configDotenv();
export type ScrapeErrorEvent = {
@ -15,7 +15,7 @@ export type ScrapeScrapeEvent = {
type: "scrape",
url: string,
worker?: string,
method: (typeof baseScrapers)[number],
method: Engine,
result: null | {
success: boolean,
response_code?: number,

View File

@ -1,18 +1,16 @@
import { Job } from "bullmq";
import {
CrawlResult,
WebScraperOptions,
RunWebScraperParams,
RunWebScraperResult,
} from "../types";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { DocumentUrl, Progress } from "../lib/entities";
import { billTeam } from "../services/billing/credit_billing";
import { Document } from "../lib/entities";
import { Document } from "../controllers/v1/types";
import { supabase_service } from "../services/supabase";
import { logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
import { configDotenv } from "dotenv";
import { scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL";
configDotenv();
export async function startWebScraperPipeline({
@ -22,120 +20,75 @@ export async function startWebScraperPipeline({
job: Job<WebScraperOptions> & { id: string };
token: string;
}) {
let partialDocs: Document[] = [];
return (await runWebScraper({
url: job.data.url,
mode: job.data.mode,
crawlerOptions: job.data.crawlerOptions,
extractorOptions: job.data.extractorOptions,
pageOptions: {
...job.data.pageOptions,
scrapeOptions: {
...job.data.scrapeOptions,
...(job.data.crawl_id ? ({
includeRawHtml: true,
formats: job.data.scrapeOptions.formats.concat(["rawHtml"]),
}): {}),
},
inProgress: (progress) => {
logger.debug(`🐂 Job in progress ${job.id}`);
if (progress.currentDocument) {
partialDocs.push(progress.currentDocument);
if (partialDocs.length > 50) {
partialDocs = partialDocs.slice(-50);
}
// job.updateProgress({ ...progress, partialDocs: partialDocs });
}
},
onSuccess: (result, mode) => {
logger.debug(`🐂 Job completed ${job.id}`);
saveJob(job, result, token, mode);
},
onError: (error) => {
logger.error(`🐂 Job failed ${job.id}`);
ScrapeEvents.logJobEvent(job, "failed");
job.moveToFailed(error, token, false);
},
internalOptions: job.data.internalOptions,
// onSuccess: (result, mode) => {
// logger.debug(`🐂 Job completed ${job.id}`);
// saveJob(job, result, token, mode);
// },
// onError: (error) => {
// logger.error(`🐂 Job failed ${job.id}`);
// ScrapeEvents.logJobEvent(job, "failed");
// },
team_id: job.data.team_id,
bull_job_id: job.id.toString(),
priority: job.opts.priority,
is_scrape: job.data.is_scrape ?? false,
})) as { success: boolean; message: string; docs: Document[] };
}));
}
export async function runWebScraper({
url,
mode,
crawlerOptions,
pageOptions,
extractorOptions,
inProgress,
onSuccess,
onError,
scrapeOptions,
internalOptions,
// onSuccess,
// onError,
team_id,
bull_job_id,
priority,
is_scrape=false,
}: RunWebScraperParams): Promise<RunWebScraperResult> {
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
let response: ScrapeUrlResponse | undefined = undefined;
try {
const provider = new WebScraperDataProvider();
if (mode === "crawl") {
await provider.setOptions({
jobId: bull_job_id,
mode: mode,
urls: [url],
extractorOptions,
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
bullJobId: bull_job_id,
priority,
});
} else {
await provider.setOptions({
jobId: bull_job_id,
mode: mode,
urls: url.split(","),
extractorOptions,
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
priority,
teamId: team_id
});
response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions });
if (!response.success) {
if (response.error instanceof Error) {
throw response.error;
} else {
throw new Error("scrapeURL error: " + (Array.isArray(response.error) ? JSON.stringify(response.error) : typeof response.error === "object" ? JSON.stringify({ ...response.error }) : response.error));
}
}
const docs = (await provider.getDocuments(false, (progress: Progress) => {
inProgress(progress);
})) as Document[];
if (docs.length === 0) {
return {
success: true,
message: "No pages found",
docs: [],
};
}
// remove docs with empty content
const filteredDocs = crawlerOptions.returnOnlyUrls
? docs.map((doc) => {
if (doc.metadata.sourceURL) {
return { url: doc.metadata.sourceURL };
}
}).filter(x => x !== undefined) as DocumentUrl[]
: docs;
if(is_scrape === false) {
billTeam(team_id, undefined, filteredDocs.length).catch(error => {
logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
billTeam(team_id, undefined, 1).catch(error => {
logger.error(`Failed to bill team ${team_id} for 1 credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
}
// This is where the returnvalue from the job is set
onSuccess(filteredDocs, mode);
// onSuccess(response.document, mode);
// this return doesn't matter too much for the job completion result
return { success: true, message: "", docs: filteredDocs };
return response;
} catch (error) {
onError(error);
return { success: false, message: error.message, docs: [] };
if (response !== undefined) {
return {
...response,
success: false,
error,
}
} else {
return { success: false, error, logs: ["no logs -- error coming from runWebScraper"] };
}
// onError(error);
}
}
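
runWebScraper is now a thin wrapper around scrapeURL and returns its ScrapeUrlResponse union instead of a docs array, so callers branch on success rather than inspecting a message. A rough sketch of consuming it, as the queue worker further down does with real job data; the import paths and the values ending in "placeholder" are assumptions:

import { runWebScraper } from "./runWebScraper";
import { scrapeOptions } from "../controllers/v1/types";

// Sketch: run a single scrape and unwrap the discriminated result.
async function scrapeOnce(url: string) {
  const result = await runWebScraper({
    url,
    mode: "single_urls",
    scrapeOptions: scrapeOptions.parse({}),
    internalOptions: {},
    team_id: "team-id-placeholder",
    bull_job_id: "job-id-placeholder",
    is_scrape: true,
  });

  if (result.success) {
    return result.document; // a single v1 Document
  }
  throw result.error; // the failure branch carries the error (and the scrape logs)
}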

View File

@ -1,35 +1,35 @@
import express from "express";
import { redisHealthController } from "../controllers/v0/admin/redis-health";
import {
autoscalerController,
checkQueuesController,
cleanBefore24hCompleteJobsController,
queuesController,
} from "../controllers/v0/admin/queue";
// import { redisHealthController } from "../controllers/v0/admin/redis-health";
// import {
// autoscalerController,
// checkQueuesController,
// cleanBefore24hCompleteJobsController,
// queuesController,
// } from "../controllers/v0/admin/queue";
export const adminRouter = express.Router();
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/redis-health`,
redisHealthController
);
// adminRouter.get(
// `/admin/${process.env.BULL_AUTH_KEY}/redis-health`,
// redisHealthController
// );
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
cleanBefore24hCompleteJobsController
);
// adminRouter.get(
// `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
// cleanBefore24hCompleteJobsController
// );
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/check-queues`,
checkQueuesController
);
// adminRouter.get(
// `/admin/${process.env.BULL_AUTH_KEY}/check-queues`,
// checkQueuesController
// );
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/queues`,
queuesController
);
// adminRouter.get(
// `/admin/${process.env.BULL_AUTH_KEY}/queues`,
// queuesController
// );
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
autoscalerController
);
// adminRouter.get(
// `/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
// autoscalerController
// );

View File

@ -1,30 +1,30 @@
import express from "express";
import { crawlController } from "../../src/controllers/v0/crawl";
import { crawlStatusController } from "../../src/controllers/v0/crawl-status";
import { scrapeController } from "../../src/controllers/v0/scrape";
import { crawlPreviewController } from "../../src/controllers/v0/crawlPreview";
import { crawlJobStatusPreviewController } from "../../src/controllers/v0/status";
import { searchController } from "../../src/controllers/v0/search";
import { crawlCancelController } from "../../src/controllers/v0/crawl-cancel";
import { keyAuthController } from "../../src/controllers/v0/keyAuth";
import { livenessController } from "../controllers/v0/liveness";
import { readinessController } from "../controllers/v0/readiness";
// import { crawlController } from "../../src/controllers/v0/crawl";
// import { crawlStatusController } from "../../src/controllers/v0/crawl-status";
// import { scrapeController } from "../../src/controllers/v0/scrape";
// import { crawlPreviewController } from "../../src/controllers/v0/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v0/status";
// import { searchController } from "../../src/controllers/v0/search";
// import { crawlCancelController } from "../../src/controllers/v0/crawl-cancel";
// import { keyAuthController } from "../../src/controllers/v0/keyAuth";
// import { livenessController } from "../controllers/v0/liveness";
// import { readinessController } from "../controllers/v0/readiness";
export const v0Router = express.Router();
v0Router.post("/v0/scrape", scrapeController);
v0Router.post("/v0/crawl", crawlController);
v0Router.post("/v0/crawlWebsitePreview", crawlPreviewController);
v0Router.get("/v0/crawl/status/:jobId", crawlStatusController);
v0Router.delete("/v0/crawl/cancel/:jobId", crawlCancelController);
v0Router.get("/v0/checkJobStatus/:jobId", crawlJobStatusPreviewController);
// v0Router.post("/v0/scrape", scrapeController);
// v0Router.post("/v0/crawl", crawlController);
// v0Router.post("/v0/crawlWebsitePreview", crawlPreviewController);
// v0Router.get("/v0/crawl/status/:jobId", crawlStatusController);
// v0Router.delete("/v0/crawl/cancel/:jobId", crawlCancelController);
// v0Router.get("/v0/checkJobStatus/:jobId", crawlJobStatusPreviewController);
// Auth route for key based authentication
v0Router.get("/v0/keyAuth", keyAuthController);
// // Auth route for key based authentication
// v0Router.get("/v0/keyAuth", keyAuthController);
// Search routes
v0Router.post("/v0/search", searchController);
// // Search routes
// v0Router.post("/v0/search", searchController);
// Health/Probe routes
v0Router.get("/v0/health/liveness", livenessController);
v0Router.get("/v0/health/readiness", readinessController);
// // Health/Probe routes
// v0Router.get("/v0/health/liveness", livenessController);
// v0Router.get("/v0/health/readiness", readinessController);

View File

@ -327,27 +327,27 @@ export class WebScraperDataProvider {
return Promise.all(
pdfLinks.map(async (pdfLink) => {
const timer = Date.now();
const logInsertPromise = ScrapeEvents.insert(this.jobId, {
type: "scrape",
url: pdfLink,
worker: process.env.FLY_MACHINE_ID,
method: "pdf-scrape",
result: null,
});
// const logInsertPromise = ScrapeEvents.insert(this.jobId, {
// type: "scrape",
// url: pdfLink,
// worker: process.env.FLY_MACHINE_ID,
// method: "pdf-scrape",
// result: null,
// });
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
pdfLink,
this.pageOptions?.parsePDF ?? true
);
const insertedLogId = await logInsertPromise;
ScrapeEvents.updateScrapeResult(insertedLogId, {
response_size: content.length,
success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
error: pageError,
response_code: pageStatusCode,
time_taken: Date.now() - timer,
});
// const insertedLogId = await logInsertPromise;
// ScrapeEvents.updateScrapeResult(insertedLogId, {
// response_size: content.length,
// success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
// error: pageError,
// response_code: pageStatusCode,
// time_taken: Date.now() - timer,
// });
return {
content: content,
markdown: content,
@ -361,26 +361,26 @@ export class WebScraperDataProvider {
return Promise.all(
docxLinks.map(async (docxLink) => {
const timer = Date.now();
const logInsertPromise = ScrapeEvents.insert(this.jobId, {
type: "scrape",
url: docxLink,
worker: process.env.FLY_MACHINE_ID,
method: "docx-scrape",
result: null,
});
// const logInsertPromise = ScrapeEvents.insert(this.jobId, {
// type: "scrape",
// url: docxLink,
// worker: process.env.FLY_MACHINE_ID,
// method: "docx-scrape",
// result: null,
// });
const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(
docxLink
);
const insertedLogId = await logInsertPromise;
ScrapeEvents.updateScrapeResult(insertedLogId, {
response_size: content.length,
success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
error: pageError,
response_code: pageStatusCode,
time_taken: Date.now() - timer,
});
// const insertedLogId = await logInsertPromise;
// ScrapeEvents.updateScrapeResult(insertedLogId, {
// response_size: content.length,
// success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
// error: pageError,
// response_code: pageStatusCode,
// time_taken: Date.now() - timer,
// });
return {
content,

View File

@ -187,13 +187,13 @@ export async function scrapSingleUrl(
let screenshot = "";
const timer = Date.now();
const logInsertPromise = ScrapeEvents.insert(jobId, {
type: "scrape",
url,
worker: process.env.FLY_MACHINE_ID,
method,
result: null,
});
// const logInsertPromise = ScrapeEvents.insert(jobId, {
// type: "scrape",
// url,
// worker: process.env.FLY_MACHINE_ID,
// method,
// result: null,
// });
switch (method) {
case "fire-engine":
@ -348,14 +348,14 @@ export async function scrapSingleUrl(
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
const text = await parseMarkdown(cleanedHtml);
const insertedLogId = await logInsertPromise;
ScrapeEvents.updateScrapeResult(insertedLogId, {
response_size: scraperResponse.text.length,
success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100),
error: scraperResponse.metadata.pageError ?? undefined,
response_code: scraperResponse.metadata.pageStatusCode,
time_taken: Date.now() - timer,
});
// const insertedLogId = await logInsertPromise;
// ScrapeEvents.updateScrapeResult(insertedLogId, {
// response_size: scraperResponse.text.length,
// success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100),
// error: scraperResponse.metadata.pageError ?? undefined,
// response_code: scraperResponse.metadata.pageStatusCode,
// time_taken: Date.now() - timer,
// });
return {
text,

View File

@ -56,7 +56,7 @@ export function scrapeURLWithScrapingBee(wait_browser: "domcontentloaded" | "net
url: body["resolved-url"] ?? meta.url,
html: body.body,
error: response.status >= 300 ? "Request failed with error code " + response.status : undefined,
error: response.status >= 300 ? response.statusText : undefined,
statusCode: response.status,
...(body.screenshot ? ({
screenshot: `data:image/png;base64,${body.screenshot}`,

View File

@ -9,7 +9,7 @@ import { AddFeatureError, EngineError, NoEnginesLeftError, TimeoutError } from "
import { executeTransformers } from "./transformers";
import { LLMRefusalError } from "./transformers/llmExtract";
type ScrapeUrlResponse = ({
export type ScrapeUrlResponse = ({
success: true,
document: Document,
} | {
@ -104,6 +104,7 @@ async function scrapeURLLoop(
meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`,);
// TODO: handle sitemap data, see WebScraper/index.ts:280
// TODO: ScrapeEvents
const fallbackList = buildFallbackList(meta);
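
ScrapeUrlResponse is now exported, so code outside the engine can call scrapeURL directly and type the result. A sketch of that, mirroring how runWebScraper invokes it above; the import paths, the placeholder job id, and the priority value are assumptions, and the fourth argument carries the internal options together with the priority, as runWebScraper spreads them:

import { scrapeURL, ScrapeUrlResponse } from "./scraper/scrapeURL";
import { scrapeOptions } from "./controllers/v1/types";

// Sketch: call the engine entry point directly with default v1 options.
async function scrapeDirect(url: string): Promise<ScrapeUrlResponse> {
  return scrapeURL("job-id-placeholder", url, scrapeOptions.parse({}), { priority: 10 });
}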

View File

@ -94,6 +94,10 @@ export function coerceFieldsToFormats(meta: Meta, document: Document): Document
meta.logger.warn("Request had format: extract, but there was no extract field in the result.");
}
if (meta.options.actions === undefined || meta.options.actions.length === 0) {
delete document.actions;
}
return document;
}

View File

@ -16,11 +16,11 @@ export async function logJob(job: FirecrawlJob) {
// Redact any pages that have an authorization header
if (
job.pageOptions &&
job.pageOptions.headers &&
job.pageOptions.headers["Authorization"]
job.scrapeOptions &&
job.scrapeOptions.headers &&
job.scrapeOptions.headers["Authorization"]
) {
job.pageOptions.headers["Authorization"] = "REDACTED";
job.scrapeOptions.headers["Authorization"] = "REDACTED";
job.docs = [{ content: "REDACTED DUE TO AUTHORIZATION HEADER", html: "REDACTED DUE TO AUTHORIZATION HEADER" }];
}
@ -38,9 +38,8 @@ export async function logJob(job: FirecrawlJob) {
mode: job.mode,
url: job.url,
crawler_options: job.crawlerOptions,
page_options: job.pageOptions,
page_options: job.scrapeOptions,
origin: job.origin,
extractor_options: job.extractor_options,
num_tokens: job.num_tokens,
retry: !!job.retry,
crawl_id: job.crawl_id,
@ -63,9 +62,8 @@ export async function logJob(job: FirecrawlJob) {
mode: job.mode,
url: job.url,
crawler_options: job.crawlerOptions,
page_options: job.pageOptions,
page_options: job.scrapeOptions,
origin: job.origin,
extractor_options: job.extractor_options,
num_tokens: job.num_tokens,
retry: job.retry,
},

View File

@ -1,20 +0,0 @@
import { Logtail } from "@logtail/node";
import "dotenv/config";
import { logger } from "../lib/logger";
// A mock Logtail class to handle cases where LOGTAIL_KEY is not provided
class MockLogtail {
info(message: string, context?: Record<string, any>): void {
logger.debug(`${message} - ${context}`);
}
error(message: string, context: Record<string, any> = {}): void {
logger.error(`${message} - ${context}`);
}
}
// Using the actual Logtail class if LOGTAIL_KEY exists, otherwise using the mock class
// Additionally, print a warning to the terminal if LOGTAIL_KEY is not provided
export const logtail = process.env.LOGTAIL_KEY ? new Logtail(process.env.LOGTAIL_KEY) : (() => {
logger.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more.");
return new MockLogtail();
})();

View File

@ -7,11 +7,9 @@ import {
redisConnection,
scrapeQueueName,
} from "./queue-service";
import { logtail } from "./logtail";
import { startWebScraperPipeline } from "../main/runWebScraper";
import { callWebhook } from "./webhook";
import { logJob } from "./logging/log_job";
import { initSDK } from "@hyperdx/node-opentelemetry";
import { Job } from "bullmq";
import { logger } from "../lib/logger";
import { Worker } from "bullmq";
@ -28,23 +26,17 @@ import {
} from "../lib/crawl-redis";
import { StoredCrawl } from "../lib/crawl-redis";
import { addScrapeJob } from "./queue-jobs";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
import {
addJobPriority,
deleteJobPriority,
getJobPriority,
} from "../../src/lib/job-priority";
import { PlanType } from "../types";
import { getJobs } from "../../src/controllers/v1/crawl-status";
import { getJobs } from "..//controllers/v1/crawl-status";
import { configDotenv } from "dotenv";
import { scrapeOptions } from "../controllers/v1/types";
configDotenv();
if (process.env.ENV === "production") {
initSDK({
consoleCapture: true,
additionalInstrumentations: [],
});
}
const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
const workerLockDuration = Number(process.env.WORKER_LOCK_DURATION) || 60000;
@ -71,13 +63,18 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
let err = null;
try {
const result = await processJob(job, token);
try {
if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
await job.moveToCompleted(null, token, false);
} else {
await job.moveToCompleted(result.docs, token, false);
}
} catch (e) {}
if (result.success) {
try {
if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
await job.moveToCompleted(null, token, false);
} else {
await job.moveToCompleted(result.document, token, false);
}
} catch (e) {}
} else {
await job.moveToFailed((result as any).error, token, false);
}
} catch (error) {
console.log("Job failed, error:", error);
Sentry.captureException(error);
@ -210,12 +207,11 @@ async function processJob(job: Job & { id: string }, token: string) {
logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`);
const data = {
success: false,
docs: [],
document: null,
project_id: job.data.project_id,
error:
"URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.",
};
await job.moveToCompleted(data.docs, token, false);
return data;
}
@ -228,46 +224,36 @@ async function processJob(job: Job & { id: string }, token: string) {
});
const start = Date.now();
const { success, message, docs } = await startWebScraperPipeline({
const pipeline = await startWebScraperPipeline({
job,
token,
});
// Better if we throw here so we capture with the correct error
if (!success) {
throw new Error(message);
if (!pipeline.success) {
// TODO: let's Not do this
throw pipeline.error;
}
const end = Date.now();
const timeTakenInSeconds = (end - start) / 1000;
const rawHtml = docs[0] ? docs[0].rawHtml : "";
const doc = pipeline.document;
const rawHtml = doc.rawHtml ?? "";
const data = {
success,
success: true,
result: {
links: docs.map((doc) => {
return {
content: doc,
source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
};
}),
links: [{
content: doc,
source: doc?.metadata?.sourceURL ?? doc?.metadata?.url ?? "",
}],
},
project_id: job.data.project_id,
error: message /* etc... */,
docs,
document: doc,
};
// No idea what this does and when it is called.
if (job.data.mode === "crawl" && !job.data.v1) {
callWebhook(
job.data.team_id,
job.id as string,
data,
job.data.webhook,
job.data.v1
);
}
if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
if (job.data.webhook && job.data.v1) {
await callWebhook(
job.data.team_id,
job.data.crawl_id,
@ -280,26 +266,25 @@ async function processJob(job: Job & { id: string }, token: string) {
}
if (job.data.crawl_id) {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
await logJob({
job_id: job.id as string,
success: success,
message: message,
num_docs: docs.length,
docs: docs,
success: true,
num_docs: 1,
docs: [doc],
time_taken: timeTakenInSeconds,
team_id: job.data.team_id,
mode: job.data.mode,
url: job.data.url,
crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions,
crawlerOptions: sc.crawlerOptions,
scrapeOptions: job.data.scrapeOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
});
await addCrawlJobDone(job.data.crawl_id, job.id);
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
if (!job.data.sitemapped) {
if (!sc.cancelled) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
@ -329,9 +314,9 @@ async function processJob(job: Job & { id: string }, token: string) {
{
url: link,
mode: "single_urls",
crawlerOptions: sc.crawlerOptions,
team_id: sc.team_id,
pageOptions: sc.pageOptions,
scrapeOptions: scrapeOptions.parse(sc.scrapeOptions),
internalOptions: sc.internalOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
v1: job.data.v1,
@ -367,15 +352,15 @@ async function processJob(job: Job & { id: string }, token: string) {
await logJob({
job_id: job.data.crawl_id,
success: jobStatus === "completed",
message: sc.cancelled ? "Cancelled" : message,
message: sc.cancelled ? "Cancelled" : undefined,
num_docs: fullDocs.length,
docs: [],
time_taken: (Date.now() - sc.createdAt) / 1000,
team_id: job.data.team_id,
mode: "crawl",
url: sc.originUrl,
scrapeOptions: sc.scrapeOptions,
crawlerOptions: sc.crawlerOptions,
pageOptions: sc.pageOptions,
origin: job.data.origin,
});
@ -390,7 +375,6 @@ async function processJob(job: Job & { id: string }, token: string) {
}),
},
project_id: job.data.project_id,
error: message /* etc... */,
docs: fullDocs,
};
@ -428,15 +412,15 @@ async function processJob(job: Job & { id: string }, token: string) {
await logJob({
job_id: job.data.crawl_id,
success: jobStatus === "completed",
message: sc.cancelled ? "Cancelled" : message,
message: sc.cancelled ? "Cancelled" : undefined,
num_docs: jobIDs.length,
docs: [],
time_taken: (Date.now() - sc.createdAt) / 1000,
team_id: job.data.team_id,
mode: "crawl",
url: sc.originUrl,
scrapeOptions: sc.scrapeOptions,
crawlerOptions: sc.crawlerOptions,
pageOptions: sc.pageOptions,
origin: job.data.origin,
});
}
@ -448,37 +432,24 @@ async function processJob(job: Job & { id: string }, token: string) {
} catch (error) {
logger.error(`🐂 Job errored ${job.id} - ${error}`);
if (!(error instanceof Error && error.message.includes("JSON parsing error(s): "))) {
Sentry.captureException(error, {
data: {
job: job.id,
},
});
}
Sentry.captureException(error, {
data: {
job: job.id,
},
});
if (error instanceof CustomError) {
// Here we handle the error, then save the failed job
logger.error(error.message); // or any other error handling
logtail.error("Custom error while ingesting", {
job_id: job.id,
error: error.message,
dataIngestionJob: error.dataIngestionJob,
});
}
logger.error(error);
if (error.stack) {
logger.error(error.stack);
}
logtail.error("Overall error ingesting", {
job_id: job.id,
error: error.message,
});
const data = {
success: false,
docs: [],
document: null,
project_id: job.data.project_id,
error:
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
@ -505,6 +476,8 @@ async function processJob(job: Job & { id: string }, token: string) {
}
if (job.data.crawl_id) {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
await logJob({
job_id: job.id as string,
success: false,
@ -519,14 +492,12 @@ async function processJob(job: Job & { id: string }, token: string) {
team_id: job.data.team_id,
mode: job.data.mode,
url: job.data.url,
crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions,
crawlerOptions: sc.crawlerOptions,
scrapeOptions: job.data.scrapeOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
});
const sc = await getCrawl(job.data.crawl_id);
await logJob({
job_id: job.data.crawl_id,
success: false,
@ -541,8 +512,8 @@ async function processJob(job: Job & { id: string }, token: string) {
team_id: job.data.team_id,
mode: "crawl",
url: sc ? sc.originUrl : job.data.url,
crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions,
pageOptions: sc ? sc.pageOptions : job.data.pageOptions,
crawlerOptions: sc ? sc.crawlerOptions : undefined,
scrapeOptions: sc ? sc.scrapeOptions : job.data.scrapeOptions,
origin: job.data.origin,
});
}

View File

@ -1,5 +1,4 @@
import axios from "axios";
import { Document, legacyDocumentConverter } from "../../src/controllers/v1/types";
import { logger } from "../../src/lib/logger";
import { supabase_service } from "./supabase";
import { WebhookEventType } from "../types";
@ -59,7 +58,7 @@ export const callWebhook = async (
for (let i = 0; i < data.result.links.length; i++) {
if (v1) {
dataToSend.push(
legacyDocumentConverter(data.result.links[i].content)
data.result.links[i].content
);
} else {
dataToSend.push({

View File

@ -1,5 +1,6 @@
import { AuthCreditUsageChunk } from "./controllers/v1/types";
import { ExtractorOptions, Document, DocumentUrl } from "./lib/entities";
import { AuthCreditUsageChunk, ScrapeOptions, Document as V1Document } from "./controllers/v1/types";
import { ExtractorOptions, Document } from "./lib/entities";
import { InternalOptions } from "./scraper/scrapeURL";
type Mode = "crawl" | "single_urls" | "sitemap";
@ -24,9 +25,8 @@ export interface IngestResult {
export interface WebScraperOptions {
url: string;
mode: Mode;
crawlerOptions: any;
pageOptions: any;
extractorOptions?: any;
scrapeOptions: ScrapeOptions;
internalOptions?: InternalOptions;
team_id: string;
origin?: string;
crawl_id?: string;
@ -39,22 +39,22 @@ export interface WebScraperOptions {
export interface RunWebScraperParams {
url: string;
mode: Mode;
crawlerOptions: any;
pageOptions?: any;
extractorOptions?: any;
inProgress: (progress: any) => void;
onSuccess: (result: any, mode: string) => void;
onError: (error: Error) => void;
scrapeOptions: ScrapeOptions;
internalOptions?: InternalOptions;
// onSuccess: (result: V1Document, mode: string) => void;
// onError: (error: Error) => void;
team_id: string;
bull_job_id: string;
priority?: number;
is_scrape?: boolean;
}
export interface RunWebScraperResult {
success: boolean;
message: string;
docs: Document[] | DocumentUrl[];
export type RunWebScraperResult = {
success: false;
error: Error;
} | {
success: true;
document: V1Document;
}
export interface FirecrawlJob {
@ -68,9 +68,8 @@ export interface FirecrawlJob {
mode: string;
url: string;
crawlerOptions?: any;
pageOptions?: any;
scrapeOptions?: any;
origin: string;
extractor_options?: ExtractorOptions,
num_tokens?: number,
retry?: boolean,
crawl_id?: string;
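
RunWebScraperResult is now a discriminated union over success instead of a struct with a docs array, so exhaustive handling falls out of a single check. A small sketch, assuming the file sits under src so the import resolves; describeResult is a hypothetical helper:

import { RunWebScraperResult } from "./types";

// Narrow the union: success yields one v1 document, failure yields an Error.
function describeResult(result: RunWebScraperResult): string {
  if (result.success) {
    return "scraped " + (result.document.metadata?.sourceURL ?? "unknown URL");
  }
  return "failed: " + result.error.message;
}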

View File

@ -16,7 +16,6 @@ x-common-service: &common-service
- MODEL_NAME=${MODEL_NAME:-gpt-4o}
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
- LOGTAIL_KEY=${LOGTAIL_KEY}
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
- TEST_API_KEY=${TEST_API_KEY}
- POSTHOG_API_KEY=${POSTHOG_API_KEY}

View File

@ -7,7 +7,6 @@ data:
OPENAI_API_KEY: ""
SLACK_WEBHOOK_URL: ""
LLAMAPARSE_API_KEY: ""
LOGTAIL_KEY: ""
BULL_AUTH_KEY: ""
TEST_API_KEY: ""
POSTHOG_API_KEY: ""
@ -15,6 +14,5 @@ data:
SCRAPING_BEE_API_KEY: ""
STRIPE_PRICE_ID_STANDARD: ""
STRIPE_PRICE_ID_SCALE: ""
HYPERDX_API_KEY: ""
FIRE_ENGINE_BETA_URL: ""
REDIS_PASSWORD: ""