Mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 11:42:24 +08:00)

Commit 33cde0588d ("bunch of stuff"), parent 33555742a7.

.github/archive/js-sdk.yml (vendored, 2 changes)
@@ -8,7 +8,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@@ -21,7 +20,6 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
jobs:

.github/archive/python-sdk.yml (vendored, 2 changes)
@@ -8,7 +8,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@@ -21,7 +20,6 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
jobs:

.github/archive/rust-sdk.yml (vendored, 2 changes)
@@ -8,7 +8,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@@ -21,7 +20,6 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1

.github/workflows/ci.yml (vendored, 2 changes)
@@ -12,7 +12,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@@ -25,7 +24,6 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}

.github/workflows/fly-direct.yml (vendored, 1 change)
@@ -9,7 +9,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}

.github/workflows/fly.yml (vendored, 1 change)
@@ -10,7 +10,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}

@@ -41,7 +41,6 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
@@ -62,7 +62,6 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages

@@ -28,8 +28,6 @@ SCRAPING_BEE_API_KEY=
# add for LLM dependednt features (image alt generation, etc.)
OPENAI_API_KEY=
BULL_AUTH_KEY=@
# use if you're configuring basic logging with logtail
LOGTAIL_KEY=
# set if you have a llamaparse key you'd like to use to parse pdfs
LLAMAPARSE_API_KEY=
# set if you'd like to send slack server health status messages
@@ -49,9 +47,6 @@ STRIPE_PRICE_ID_STANDARD_NEW_YEARLY=
STRIPE_PRICE_ID_GROWTH=
STRIPE_PRICE_ID_GROWTH_YEARLY=
HYPERDX_API_KEY=
HDX_NODE_BETA_MODE=1
# set if you'd like to use the fire engine closed beta
FIRE_ENGINE_BETA_URL=

@@ -56,8 +56,6 @@
"@bull-board/express": "^5.20.5",
"@devil7softwares/pos": "^1.0.2",
"@dqbd/tiktoken": "^1.0.16",
"@hyperdx/node-opentelemetry": "^0.8.1",
"@logtail/node": "^0.4.12",
"@nangohq/node": "^0.40.8",
"@sentry/cli": "^2.33.1",
"@sentry/node": "^8.26.0",

(File diff suppressed because it is too large.)

@@ -20,7 +20,6 @@ describe("E2E Tests for API Routes with No Authentication", () => {
process.env.SCRAPING_BEE_API_KEY = "";
process.env.OPENAI_API_KEY = "";
process.env.BULL_AUTH_KEY = "";
process.env.LOGTAIL_KEY = "";
process.env.PLAYWRIGHT_MICROSERVICE_URL = "";
process.env.LLAMAPARSE_API_KEY = "";
process.env.TEST_API_KEY = "";

@@ -9,7 +9,6 @@ import {
import { supabase_service } from "../services/supabase";
import { withAuth } from "../lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
import { sendNotification } from "../services/notification/email_notification";
import { logger } from "../lib/logger";
import { redlock } from "../services/redlock";
@@ -109,18 +108,6 @@ export async function authenticateUser(
return withAuth(supaAuthenticateUser, { success: true, chunk: null, team_id: "bypass" })(req, res, mode);
}
function setTrace(team_id: string, api_key: string) {
try {
setTraceAttributes({
team_id,
api_key,
});
} catch (error) {
Sentry.captureException(error);
logger.error(`Error setting trace attributes: ${error.message}`);
}
}
export async function supaAuthenticateUser(
req,
res,
@@ -182,8 +169,6 @@ export async function supaAuthenticateUser(
priceId = chunk.price_id;
const plan = getPlanByPriceId(priceId);
// HyperDX Logging
setTrace(teamId, normalizedApi);
subscriptionData = {
team_id: teamId,
plan,

@@ -1,7 +1,7 @@
import { authMiddleware } from "../../routes/v1";
import { RateLimiterMode } from "../../types";
import { authenticateUser } from "../auth";
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, RequestWithAuth } from "./types";
import { WebSocket } from "ws";
import { v4 as uuidv4 } from "uuid";
import { logger } from "../../lib/logger";
@@ -78,7 +78,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
if (job.returnvalue) {
send(ws, {
type: "document",
data: legacyDocumentConverter(job.returnvalue),
data: job.returnvalue,
})
} else {
return close(ws, 3000, { type: "error", error: job.failedReason });
@@ -109,7 +109,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
completed: doneJobIDs.length,
creditsUsed: jobIDs.length,
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
data: data.map(x => legacyDocumentConverter(x)),
data: data,
}
});

@@ -1,5 +1,5 @@
import { Response } from "express";
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, RequestWithAuth } from "./types";
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
@@ -81,7 +81,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) {
const job = jobs[ii];
doneJobs.push(job);
bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length;
bytes += JSON.stringify(job.returnvalue).length;
}
}
@@ -105,7 +105,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
}
if (data.length > 0) {
if (!doneJobs[0].data.pageOptions.includeRawHtml) {
if (!doneJobs[0].data.scrapeOptions.formats.includes("rawHtml")) {
for (let ii = 0; ii < doneJobs.length; ii++) {
if (data[ii]) {
delete data[ii].rawHtml;
@@ -125,7 +125,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
? undefined
: nextURL.href,
data: data.map(x => legacyDocumentConverter(x)),
data: data,
});
}

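The hunk above keeps the byte-capped pagination loop and only swaps the legacy document conversion for the job's raw return value. A minimal sketch of that accumulation pattern in isolation, assuming a jobs array whose returnvalue holds the serialized document payload; the function name and types here are illustrative, not the controller's actual helpers.

// Sketch: collect completed jobs until a response-size budget is reached.
// `bytesLimit` and the DoneJob shape are assumptions taken from the hunk above.
type DoneJob = { returnvalue: unknown };

function takeWithinBudget(jobs: DoneJob[], bytesLimit: number): DoneJob[] {
  const doneJobs: DoneJob[] = [];
  let bytes = 0;
  for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) {
    const job = jobs[ii];
    doneJobs.push(job);
    // Size is measured on the serialized return value, as the controller now does.
    bytes += JSON.stringify(job.returnvalue).length;
  }
  return doneJobs;
}
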
@@ -5,7 +5,6 @@ import {
crawlRequestSchema,
CrawlResponse,
legacyCrawlerOptions,
legacyScrapeOptions,
RequestWithAuth,
} from "./types";
import {
@@ -23,6 +22,7 @@ import { addScrapeJob } from "../../services/queue-jobs";
import { logger } from "../../lib/logger";
import { getJobPriority } from "../../lib/job-priority";
import { callWebhook } from "../../services/webhook";
import { scrapeOptions as scrapeOptionsSchema } from "./types";
export async function crawlController(
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
@@ -37,7 +37,7 @@ export async function crawlController(
const remainingCredits = req.account?.remainingCredits ?? 0;
const crawlerOptions = legacyCrawlerOptions(req.body);
const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
const scrapeOptions = req.body.scrapeOptions;
// TODO: @rafa, is this right? copied from v0
if (Array.isArray(crawlerOptions.includes)) {
@@ -65,7 +65,8 @@ export async function crawlController(
const sc: StoredCrawl = {
originUrl: req.body.url,
crawlerOptions,
pageOptions,
scrapeOptions,
internalOptions: {},
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,
@@ -107,7 +108,7 @@ export async function crawlController(
mode: "single_urls",
team_id: req.auth.team_id,
crawlerOptions,
pageOptions,
scrapeOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
@@ -136,9 +137,8 @@ export async function crawlController(
{
url: req.body.url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id: req.auth.team_id,
pageOptions: pageOptions,
scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
origin: "api",
crawl_id: id,
webhook: req.body.webhook,

@@ -4,6 +4,7 @@ import {
legacyCrawlerOptions,
mapRequestSchema,
RequestWithAuth,
scrapeOptions,
} from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
@@ -45,7 +46,8 @@ export async function mapController(
const sc: StoredCrawl = {
originUrl: req.body.url,
crawlerOptions: legacyCrawlerOptions(req.body),
pageOptions: {},
scrapeOptions: scrapeOptions.parse({}),
internalOptions: {},
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,
@@ -175,9 +177,8 @@ export async function mapController(
mode: "map",
url: req.body.url,
crawlerOptions: {},
pageOptions: {},
scrapeOptions: {},
origin: req.body.origin,
extractor_options: { mode: "markdown" },
num_tokens: 0,
});

@@ -1,10 +1,7 @@
import { Request, Response } from "express";
import { Response } from "express";
import { logger } from "../../lib/logger";
import {
Document,
legacyDocumentConverter,
legacyExtractorOptions,
legacyScrapeOptions,
RequestWithAuth,
ScrapeRequest,
scrapeRequestSchema,
@@ -27,8 +24,6 @@ export async function scrapeController(
const origin = req.body.origin;
const timeout = req.body.timeout;
const pageOptions = legacyScrapeOptions(req.body);
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
const jobId = uuidv4();
const startTime = new Date().getTime();
@@ -42,10 +37,9 @@ export async function scrapeController(
{
url: req.body.url,
mode: "single_urls",
crawlerOptions: {},
team_id: req.auth.team_id,
pageOptions,
extractorOptions,
scrapeOptions: req.body,
internalOptions: {},
origin: req.body.origin,
is_scrape: true,
},
@@ -56,9 +50,9 @@ export async function scrapeController(
const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds : 0) + a, 0);
let doc: any | undefined;
let doc: Document;
try {
doc = (await waitForJob<any[]>(job.id, timeout + totalWait))[0]; // TODO: better types for this
doc = await waitForJob<Document>(job.id, timeout + totalWait); // TODO: better types for this
} catch (e) {
logger.error(`Error in scrapeController: ${e}`);
if (e instanceof Error && e.message.startsWith("Job wait")) {
@@ -69,34 +63,19 @@ export async function scrapeController(
} else {
return res.status(500).json({
success: false,
error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
extractorOptions && extractorOptions.mode !== "markdown"
? " - Could be due to LLM parsing issues"
: ""
}`,
error: `(Internal server error) - ${e && e?.message ? e.message : e}`,
});
}
}
await job.remove();
if (!doc) {
console.error("!!! PANIC DOC IS", doc, job);
return res.status(200).json({
success: true,
warning: "No page found",
data: doc,
});
}
delete doc.index;
delete doc.provider;
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens =
doc && doc.markdown
? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
doc && doc.extract
// ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
? 0 // TODO: fix
: 0;
let creditsToBeBilled = 1; // Assuming 1 credit per document
@@ -113,18 +92,12 @@ export async function scrapeController(
// Optionally, you could notify an admin or add to a retry queue here
});
if (!pageOptions || !pageOptions.includeRawHtml) {
if (!req.body.formats.includes("rawHtml")) {
if (doc && doc.rawHtml) {
delete doc.rawHtml;
}
}
if(pageOptions && pageOptions.includeExtract) {
if(!pageOptions.includeMarkdown && doc && doc.markdown) {
delete doc.markdown;
}
}
logJob({
job_id: jobId,
success: true,
@@ -135,16 +108,14 @@ export async function scrapeController(
team_id: req.auth.team_id,
mode: "scrape",
url: req.body.url,
crawlerOptions: {},
pageOptions: pageOptions,
scrapeOptions: req.body,
origin: origin,
extractor_options: { mode: "markdown" },
num_tokens: numTokens,
});
return res.status(200).json({
success: true,
data: legacyDocumentConverter(doc),
data: doc,
scrape_id: origin?.includes("website") ? jobId : undefined,
});
}

@@ -1,7 +1,6 @@
import { Request, Response } from "express";
import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
import { PlanType } from "../../types";
@@ -394,68 +393,3 @@ export function legacyCrawlerOptions(x: CrawlerOptions) {
allowExternalContentLinks: x.allowExternalLinks,
};
}
export function legacyScrapeOptions(x: Omit<ScrapeOptions, "timeout">): PageOptions {
return {
includeMarkdown: x.formats.includes("markdown"),
includeHtml: x.formats.includes("html"),
includeRawHtml: x.formats.includes("rawHtml"),
includeExtract: x.formats.includes("extract"),
onlyIncludeTags: x.includeTags,
removeTags: x.excludeTags,
onlyMainContent: x.onlyMainContent,
waitFor: x.waitFor,
headers: x.headers,
includeLinks: x.formats.includes("links"),
screenshot: x.formats.includes("screenshot"),
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
parsePDF: x.parsePDF,
actions: x.actions as Action[], // no strict null checking grrrr - mogery
};
}
export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
return {
mode: x.mode ? "llm-extraction" : "markdown",
extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.",
extractionSchema: x.schema,
userPrompt: x.prompt ?? "",
};
}
export function legacyDocumentConverter(doc: null | undefined): null
export function legacyDocumentConverter(doc: any): Document
export function legacyDocumentConverter(doc: any): Document | null {
if (doc === null || doc === undefined) return null as any;
if (doc.metadata) {
if (doc.metadata.screenshot) {
doc.screenshot = doc.metadata.screenshot;
delete doc.metadata.screenshot;
}
if (doc.metadata.fullPageScreenshot) {
doc.fullPageScreenshot = doc.metadata.fullPageScreenshot;
delete doc.metadata.fullPageScreenshot;
}
}
const document: Document = {
markdown: doc.markdown,
links: doc.linksOnPage,
rawHtml: doc.rawHtml,
html: doc.html,
extract: doc.llm_extraction,
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
actions: doc.actions ?? undefined,
metadata: {
...doc.metadata,
pageError: undefined,
pageStatusCode: undefined,
error: doc.metadata.pageError,
statusCode: doc.metadata.pageStatusCode,
},
};
return document;
}

@@ -5,8 +5,7 @@ import express, { NextFunction, Request, Response } from "express";
import bodyParser from "body-parser";
import cors from "cors";
import { getScrapeQueue } from "./services/queue-service";
import { v0Router } from "./routes/v0";
import { initSDK } from "@hyperdx/node-opentelemetry";
// import { v0Router } from "./routes/v0";
import cluster from "cluster";
import os from "os";
import { logger } from "./lib/logger";
@@ -86,18 +85,13 @@ if (cluster.isMaster) {
});
// register router
app.use(v0Router);
// app.use(v0Router);
app.use("/v1", v1Router);
app.use(adminRouter);
const DEFAULT_PORT = process.env.PORT ?? 3002;
const HOST = process.env.HOST ?? "localhost";
// HyperDX OpenTelemetry
if (process.env.ENV === "production") {
initSDK({ consoleCapture: true, additionalInstrumentations: [] });
}
function startServer(port = DEFAULT_PORT) {
const server = app.listen(Number(port), HOST, () => {
logger.info(`Worker ${process.pid} listening on port ${port}`);

|
|||
import { InternalOptions } from "../scraper/scrapeURL";
|
||||
import { ScrapeOptions } from "../controllers/v1/types";
|
||||
import { WebCrawler } from "../scraper/WebScraper/crawler";
|
||||
import { redisConnection } from "../services/queue-service";
|
||||
|
||||
export type StoredCrawl = {
|
||||
originUrl: string;
|
||||
crawlerOptions: any;
|
||||
pageOptions: any;
|
||||
scrapeOptions: Omit<ScrapeOptions, "timeout">;
|
||||
internalOptions: InternalOptions;
|
||||
team_id: string;
|
||||
plan?: string;
|
||||
robots?: string;
|
||||
|
|
|
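The StoredCrawl change above drops the untyped pageOptions field in favor of a typed scrapeOptions plus internalOptions. A small sketch of building the new shape, assuming the zod-based scrapeOptions schema exported from controllers/v1/types that other hunks in this commit call with .parse(), and using the import paths as they appear in the hunk above; the concrete values are placeholders.

import { scrapeOptions, ScrapeOptions } from "../controllers/v1/types";
import { InternalOptions } from "../scraper/scrapeURL";

// Assumed to mirror the StoredCrawl fields shown in the hunk above.
type StoredCrawlSketch = {
  originUrl: string;
  crawlerOptions: any;
  scrapeOptions: Omit<ScrapeOptions, "timeout">;
  internalOptions: InternalOptions;
  team_id: string;
  createdAt: number;
};

// scrapeOptions.parse({}) fills in schema defaults, the same call the map controller now makes.
const sc: StoredCrawlSketch = {
  originUrl: "https://example.com", // placeholder URL
  crawlerOptions: {},
  scrapeOptions: scrapeOptions.parse({}),
  internalOptions: {},
  team_id: "placeholder-team-id", // placeholder
  createdAt: Date.now(),
};
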
@@ -1,3 +1,5 @@
import type { Document as V1Document } from "../controllers/v1/types";
export interface Progress {
current: number;
total: number;

@@ -1,42 +0,0 @@
// import { scrapWithFireEngine } from "../../src/scraper/WebScraper/single_url";
// const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
// const scrapInBatches = async (
// urls: string[],
// batchSize: number,
// delayMs: number
// ) => {
// let successCount = 0;
// let errorCount = 0;
// for (let i = 0; i < urls.length; i += batchSize) {
// const batch = urls
// .slice(i, i + batchSize)
// .map((url) => scrapWithFireEngine(url));
// try {
// const results = await Promise.all(batch);
// results.forEach((data, index) => {
// if (data.trim() === "") {
// errorCount++;
// } else {
// successCount++;
// console.log(
// `Scraping result ${i + index + 1}:`,
// data.trim().substring(0, 20) + "..."
// );
// }
// });
// } catch (error) {
// console.error("Error during scraping:", error);
// }
// await delay(delayMs);
// }
// console.log(`Total successful scrapes: ${successCount}`);
// console.log(`Total errored scrapes: ${errorCount}`);
// };
// function run() {
// const urls = Array.from({ length: 200 }, () => "https://scrapethissite.com");
// scrapInBatches(urls, 10, 1000);
// }

@@ -1,8 +1,8 @@
import { Job } from "bullmq";
import type { baseScrapers } from "../scraper/WebScraper/single_url";
import { supabase_service as supabase } from "../services/supabase";
import { logger } from "./logger";
import { configDotenv } from "dotenv";
import { Engine } from "../scraper/scrapeURL/engines";
configDotenv();
export type ScrapeErrorEvent = {
@@ -15,7 +15,7 @@ export type ScrapeScrapeEvent = {
type: "scrape",
url: string,
worker?: string,
method: (typeof baseScrapers)[number],
method: Engine,
result: null | {
success: boolean,
response_code?: number,

@@ -1,18 +1,16 @@
import { Job } from "bullmq";
import {
CrawlResult,
WebScraperOptions,
RunWebScraperParams,
RunWebScraperResult,
} from "../types";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { DocumentUrl, Progress } from "../lib/entities";
import { billTeam } from "../services/billing/credit_billing";
import { Document } from "../lib/entities";
import { Document } from "../controllers/v1/types";
import { supabase_service } from "../services/supabase";
import { logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
import { configDotenv } from "dotenv";
import { scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL";
configDotenv();
export async function startWebScraperPipeline({
@@ -22,120 +20,75 @@ export async function startWebScraperPipeline({
job: Job<WebScraperOptions> & { id: string };
token: string;
}) {
let partialDocs: Document[] = [];
return (await runWebScraper({
url: job.data.url,
mode: job.data.mode,
crawlerOptions: job.data.crawlerOptions,
extractorOptions: job.data.extractorOptions,
pageOptions: {
...job.data.pageOptions,
scrapeOptions: {
...job.data.scrapeOptions,
...(job.data.crawl_id ? ({
includeRawHtml: true,
formats: job.data.scrapeOptions.formats.concat(["rawHtml"]),
}): {}),
},
inProgress: (progress) => {
logger.debug(`🐂 Job in progress ${job.id}`);
if (progress.currentDocument) {
partialDocs.push(progress.currentDocument);
if (partialDocs.length > 50) {
partialDocs = partialDocs.slice(-50);
}
// job.updateProgress({ ...progress, partialDocs: partialDocs });
}
},
onSuccess: (result, mode) => {
logger.debug(`🐂 Job completed ${job.id}`);
saveJob(job, result, token, mode);
},
onError: (error) => {
logger.error(`🐂 Job failed ${job.id}`);
ScrapeEvents.logJobEvent(job, "failed");
job.moveToFailed(error, token, false);
},
internalOptions: job.data.internalOptions,
// onSuccess: (result, mode) => {
// logger.debug(`🐂 Job completed ${job.id}`);
// saveJob(job, result, token, mode);
// },
// onError: (error) => {
// logger.error(`🐂 Job failed ${job.id}`);
// ScrapeEvents.logJobEvent(job, "failed");
// },
team_id: job.data.team_id,
bull_job_id: job.id.toString(),
priority: job.opts.priority,
is_scrape: job.data.is_scrape ?? false,
})) as { success: boolean; message: string; docs: Document[] };
}));
}
export async function runWebScraper({
url,
mode,
crawlerOptions,
pageOptions,
extractorOptions,
inProgress,
onSuccess,
onError,
scrapeOptions,
internalOptions,
// onSuccess,
// onError,
team_id,
bull_job_id,
priority,
is_scrape=false,
}: RunWebScraperParams): Promise<RunWebScraperResult> {
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
let response: ScrapeUrlResponse | undefined = undefined;
try {
const provider = new WebScraperDataProvider();
if (mode === "crawl") {
await provider.setOptions({
jobId: bull_job_id,
mode: mode,
urls: [url],
extractorOptions,
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
bullJobId: bull_job_id,
priority,
});
} else {
await provider.setOptions({
jobId: bull_job_id,
mode: mode,
urls: url.split(","),
extractorOptions,
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
priority,
teamId: team_id
});
response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions });
if (!response.success) {
if (response.error instanceof Error) {
throw response.error;
} else {
throw new Error("scrapeURL error: " + (Array.isArray(response.error) ? JSON.stringify(response.error) : typeof response.error === "object" ? JSON.stringify({ ...response.error }) : response.error));
}
}
const docs = (await provider.getDocuments(false, (progress: Progress) => {
inProgress(progress);
})) as Document[];
if (docs.length === 0) {
return {
success: true,
message: "No pages found",
docs: [],
};
}
// remove docs with empty content
const filteredDocs = crawlerOptions.returnOnlyUrls
? docs.map((doc) => {
if (doc.metadata.sourceURL) {
return { url: doc.metadata.sourceURL };
}
}).filter(x => x !== undefined) as DocumentUrl[]
: docs;
if(is_scrape === false) {
billTeam(team_id, undefined, filteredDocs.length).catch(error => {
logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
billTeam(team_id, undefined, 1).catch(error => {
logger.error(`Failed to bill team ${team_id} for 1 credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
}
// This is where the returnvalue from the job is set
onSuccess(filteredDocs, mode);
// onSuccess(response.document, mode);
// this return doesn't matter too much for the job completion result
return { success: true, message: "", docs: filteredDocs };
return response;
} catch (error) {
onError(error);
return { success: false, message: error.message, docs: [] };
if (response !== undefined) {
return {
...response,
success: false,
error,
}
} else {
return { success: false, error, logs: ["no logs -- error coming from runWebScraper"] };
}
// onError(error);
}
}

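runWebScraper now resolves to the scrapeURL response instead of a docs array, so callers branch on the success flag and read a single document. A minimal sketch of consuming that shape, mirroring the success/document and success:false/error branches visible in this hunk; the local type and the wrapper function are assumptions for illustration, not the exported ScrapeUrlResponse itself.

import type { Document } from "../controllers/v1/types";

// Local stand-in mirroring the union this diff returns from the pipeline.
type ScrapeUrlResponseSketch =
  | { success: true; document: Document; logs?: string[] }
  | { success: false; error: unknown; logs?: string[] };

// Hypothetical caller: throw on failure, hand back the single document on success,
// roughly what processJob does with pipeline.error and pipeline.document further down.
async function unwrapPipeline(
  run: () => Promise<ScrapeUrlResponseSketch>
): Promise<Document> {
  const response = await run();
  if (!response.success) {
    throw response.error instanceof Error
      ? response.error
      : new Error("scrapeURL error: " + JSON.stringify(response.error));
  }
  return response.document;
}
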
@@ -1,35 +1,35 @@
import express from "express";
import { redisHealthController } from "../controllers/v0/admin/redis-health";
import {
autoscalerController,
checkQueuesController,
cleanBefore24hCompleteJobsController,
queuesController,
} from "../controllers/v0/admin/queue";
// import { redisHealthController } from "../controllers/v0/admin/redis-health";
// import {
// autoscalerController,
// checkQueuesController,
// cleanBefore24hCompleteJobsController,
// queuesController,
// } from "../controllers/v0/admin/queue";
export const adminRouter = express.Router();
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/redis-health`,
redisHealthController
);
// adminRouter.get(
// `/admin/${process.env.BULL_AUTH_KEY}/redis-health`,
// redisHealthController
// );
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
cleanBefore24hCompleteJobsController
);
// adminRouter.get(
// `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
// cleanBefore24hCompleteJobsController
// );
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/check-queues`,
checkQueuesController
);
// adminRouter.get(
// `/admin/${process.env.BULL_AUTH_KEY}/check-queues`,
// checkQueuesController
// );
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/queues`,
queuesController
);
// adminRouter.get(
// `/admin/${process.env.BULL_AUTH_KEY}/queues`,
// queuesController
// );
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
autoscalerController
);
// adminRouter.get(
// `/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
// autoscalerController
// );

@@ -1,30 +1,30 @@
import express from "express";
import { crawlController } from "../../src/controllers/v0/crawl";
import { crawlStatusController } from "../../src/controllers/v0/crawl-status";
import { scrapeController } from "../../src/controllers/v0/scrape";
import { crawlPreviewController } from "../../src/controllers/v0/crawlPreview";
import { crawlJobStatusPreviewController } from "../../src/controllers/v0/status";
import { searchController } from "../../src/controllers/v0/search";
import { crawlCancelController } from "../../src/controllers/v0/crawl-cancel";
import { keyAuthController } from "../../src/controllers/v0/keyAuth";
import { livenessController } from "../controllers/v0/liveness";
import { readinessController } from "../controllers/v0/readiness";
// import { crawlController } from "../../src/controllers/v0/crawl";
// import { crawlStatusController } from "../../src/controllers/v0/crawl-status";
// import { scrapeController } from "../../src/controllers/v0/scrape";
// import { crawlPreviewController } from "../../src/controllers/v0/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v0/status";
// import { searchController } from "../../src/controllers/v0/search";
// import { crawlCancelController } from "../../src/controllers/v0/crawl-cancel";
// import { keyAuthController } from "../../src/controllers/v0/keyAuth";
// import { livenessController } from "../controllers/v0/liveness";
// import { readinessController } from "../controllers/v0/readiness";
export const v0Router = express.Router();
v0Router.post("/v0/scrape", scrapeController);
v0Router.post("/v0/crawl", crawlController);
v0Router.post("/v0/crawlWebsitePreview", crawlPreviewController);
v0Router.get("/v0/crawl/status/:jobId", crawlStatusController);
v0Router.delete("/v0/crawl/cancel/:jobId", crawlCancelController);
v0Router.get("/v0/checkJobStatus/:jobId", crawlJobStatusPreviewController);
// v0Router.post("/v0/scrape", scrapeController);
// v0Router.post("/v0/crawl", crawlController);
// v0Router.post("/v0/crawlWebsitePreview", crawlPreviewController);
// v0Router.get("/v0/crawl/status/:jobId", crawlStatusController);
// v0Router.delete("/v0/crawl/cancel/:jobId", crawlCancelController);
// v0Router.get("/v0/checkJobStatus/:jobId", crawlJobStatusPreviewController);
// Auth route for key based authentication
v0Router.get("/v0/keyAuth", keyAuthController);
// // Auth route for key based authentication
// v0Router.get("/v0/keyAuth", keyAuthController);
// Search routes
v0Router.post("/v0/search", searchController);
// // Search routes
// v0Router.post("/v0/search", searchController);
// Health/Probe routes
v0Router.get("/v0/health/liveness", livenessController);
v0Router.get("/v0/health/readiness", readinessController);
// // Health/Probe routes
// v0Router.get("/v0/health/liveness", livenessController);
// v0Router.get("/v0/health/readiness", readinessController);

@@ -327,27 +327,27 @@ export class WebScraperDataProvider {
return Promise.all(
pdfLinks.map(async (pdfLink) => {
const timer = Date.now();
const logInsertPromise = ScrapeEvents.insert(this.jobId, {
type: "scrape",
url: pdfLink,
worker: process.env.FLY_MACHINE_ID,
method: "pdf-scrape",
result: null,
});
// const logInsertPromise = ScrapeEvents.insert(this.jobId, {
// type: "scrape",
// url: pdfLink,
// worker: process.env.FLY_MACHINE_ID,
// method: "pdf-scrape",
// result: null,
// });
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
pdfLink,
this.pageOptions?.parsePDF ?? true
);
const insertedLogId = await logInsertPromise;
ScrapeEvents.updateScrapeResult(insertedLogId, {
response_size: content.length,
success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
error: pageError,
response_code: pageStatusCode,
time_taken: Date.now() - timer,
});
// const insertedLogId = await logInsertPromise;
// ScrapeEvents.updateScrapeResult(insertedLogId, {
// response_size: content.length,
// success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
// error: pageError,
// response_code: pageStatusCode,
// time_taken: Date.now() - timer,
// });
return {
content: content,
markdown: content,
@@ -361,26 +361,26 @@ export class WebScraperDataProvider {
return Promise.all(
docxLinks.map(async (docxLink) => {
const timer = Date.now();
const logInsertPromise = ScrapeEvents.insert(this.jobId, {
type: "scrape",
url: docxLink,
worker: process.env.FLY_MACHINE_ID,
method: "docx-scrape",
result: null,
});
// const logInsertPromise = ScrapeEvents.insert(this.jobId, {
// type: "scrape",
// url: docxLink,
// worker: process.env.FLY_MACHINE_ID,
// method: "docx-scrape",
// result: null,
// });
const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(
docxLink
);
const insertedLogId = await logInsertPromise;
ScrapeEvents.updateScrapeResult(insertedLogId, {
response_size: content.length,
success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
error: pageError,
response_code: pageStatusCode,
time_taken: Date.now() - timer,
});
// const insertedLogId = await logInsertPromise;
// ScrapeEvents.updateScrapeResult(insertedLogId, {
// response_size: content.length,
// success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
// error: pageError,
// response_code: pageStatusCode,
// time_taken: Date.now() - timer,
// });
return {
content,

@@ -187,13 +187,13 @@ export async function scrapSingleUrl(
let screenshot = "";
const timer = Date.now();
const logInsertPromise = ScrapeEvents.insert(jobId, {
type: "scrape",
url,
worker: process.env.FLY_MACHINE_ID,
method,
result: null,
});
// const logInsertPromise = ScrapeEvents.insert(jobId, {
// type: "scrape",
// url,
// worker: process.env.FLY_MACHINE_ID,
// method,
// result: null,
// });
switch (method) {
case "fire-engine":
@@ -348,14 +348,14 @@ export async function scrapSingleUrl(
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
const text = await parseMarkdown(cleanedHtml);
const insertedLogId = await logInsertPromise;
ScrapeEvents.updateScrapeResult(insertedLogId, {
response_size: scraperResponse.text.length,
success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100),
error: scraperResponse.metadata.pageError ?? undefined,
response_code: scraperResponse.metadata.pageStatusCode,
time_taken: Date.now() - timer,
});
// const insertedLogId = await logInsertPromise;
// ScrapeEvents.updateScrapeResult(insertedLogId, {
// response_size: scraperResponse.text.length,
// success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100),
// error: scraperResponse.metadata.pageError ?? undefined,
// response_code: scraperResponse.metadata.pageStatusCode,
// time_taken: Date.now() - timer,
// });
return {
text,

@@ -56,7 +56,7 @@ export function scrapeURLWithScrapingBee(wait_browser: "domcontentloaded" | "net
url: body["resolved-url"] ?? meta.url,
html: body.body,
error: response.status >= 300 ? "Request failed with error code " + response.status : undefined,
error: response.status >= 300 ? response.statusText : undefined,
statusCode: response.status,
...(body.screenshot ? ({
screenshot: `data:image/png;base64,${body.screenshot}`,

@@ -9,7 +9,7 @@ import { AddFeatureError, EngineError, NoEnginesLeftError, TimeoutError } from "
import { executeTransformers } from "./transformers";
import { LLMRefusalError } from "./transformers/llmExtract";
type ScrapeUrlResponse = ({
export type ScrapeUrlResponse = ({
success: true,
document: Document,
} | {
@@ -104,6 +104,7 @@ async function scrapeURLLoop(
meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`,);
// TODO: handle sitemap data, see WebScraper/index.ts:280
// TODO: ScrapeEvents
const fallbackList = buildFallbackList(meta);

@@ -94,6 +94,10 @@ export function coerceFieldsToFormats(meta: Meta, document: Document): Document
meta.logger.warn("Request had format: extract, but there was no extract field in the result.");
}
if (meta.options.actions === undefined || meta.options.actions.length === 0) {
delete document.actions;
}
return document;
}

@@ -16,11 +16,11 @@ export async function logJob(job: FirecrawlJob) {
// Redact any pages that have an authorization header
if (
job.pageOptions &&
job.pageOptions.headers &&
job.pageOptions.headers["Authorization"]
job.scrapeOptions &&
job.scrapeOptions.headers &&
job.scrapeOptions.headers["Authorization"]
) {
job.pageOptions.headers["Authorization"] = "REDACTED";
job.scrapeOptions.headers["Authorization"] = "REDACTED";
job.docs = [{ content: "REDACTED DUE TO AUTHORIZATION HEADER", html: "REDACTED DUE TO AUTHORIZATION HEADER" }];
}
@@ -38,9 +38,8 @@ export async function logJob(job: FirecrawlJob) {
mode: job.mode,
url: job.url,
crawler_options: job.crawlerOptions,
page_options: job.pageOptions,
page_options: job.scrapeOptions,
origin: job.origin,
extractor_options: job.extractor_options,
num_tokens: job.num_tokens,
retry: !!job.retry,
crawl_id: job.crawl_id,
@@ -63,9 +62,8 @@ export async function logJob(job: FirecrawlJob) {
mode: job.mode,
url: job.url,
crawler_options: job.crawlerOptions,
page_options: job.pageOptions,
page_options: job.scrapeOptions,
origin: job.origin,
extractor_options: job.extractor_options,
num_tokens: job.num_tokens,
retry: job.retry,
},

@@ -1,20 +0,0 @@
import { Logtail } from "@logtail/node";
import "dotenv/config";
import { logger } from "../lib/logger";
// A mock Logtail class to handle cases where LOGTAIL_KEY is not provided
class MockLogtail {
info(message: string, context?: Record<string, any>): void {
logger.debug(`${message} - ${context}`);
}
error(message: string, context: Record<string, any> = {}): void {
logger.error(`${message} - ${context}`);
}
}
// Using the actual Logtail class if LOGTAIL_KEY exists, otherwise using the mock class
// Additionally, print a warning to the terminal if LOGTAIL_KEY is not provided
export const logtail = process.env.LOGTAIL_KEY ? new Logtail(process.env.LOGTAIL_KEY) : (() => {
logger.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more.");
return new MockLogtail();
})();

@@ -7,11 +7,9 @@ import {
redisConnection,
scrapeQueueName,
} from "./queue-service";
import { logtail } from "./logtail";
import { startWebScraperPipeline } from "../main/runWebScraper";
import { callWebhook } from "./webhook";
import { logJob } from "./logging/log_job";
import { initSDK } from "@hyperdx/node-opentelemetry";
import { Job } from "bullmq";
import { logger } from "../lib/logger";
import { Worker } from "bullmq";
@@ -28,23 +26,17 @@ import {
} from "../lib/crawl-redis";
import { StoredCrawl } from "../lib/crawl-redis";
import { addScrapeJob } from "./queue-jobs";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
import {
addJobPriority,
deleteJobPriority,
getJobPriority,
} from "../../src/lib/job-priority";
import { PlanType } from "../types";
import { getJobs } from "../../src/controllers/v1/crawl-status";
import { getJobs } from "..//controllers/v1/crawl-status";
import { configDotenv } from "dotenv";
import { scrapeOptions } from "../controllers/v1/types";
configDotenv();
if (process.env.ENV === "production") {
initSDK({
consoleCapture: true,
additionalInstrumentations: [],
});
}
const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
const workerLockDuration = Number(process.env.WORKER_LOCK_DURATION) || 60000;
@@ -71,13 +63,18 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
let err = null;
try {
const result = await processJob(job, token);
try {
if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
await job.moveToCompleted(null, token, false);
} else {
await job.moveToCompleted(result.docs, token, false);
}
} catch (e) {}
if (result.success) {
try {
if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
await job.moveToCompleted(null, token, false);
} else {
await job.moveToCompleted(result.document, token, false);
}
} catch (e) {}
} else {
await job.moveToFailed((result as any).error, token, false);
}
} catch (error) {
console.log("Job failed, error:", error);
Sentry.captureException(error);
@@ -210,12 +207,11 @@ async function processJob(job: Job & { id: string }, token: string) {
logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`);
const data = {
success: false,
docs: [],
document: null,
project_id: job.data.project_id,
error:
"URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.",
};
await job.moveToCompleted(data.docs, token, false);
return data;
}
@@ -228,46 +224,36 @@ async function processJob(job: Job & { id: string }, token: string) {
});
const start = Date.now();
const { success, message, docs } = await startWebScraperPipeline({
const pipeline = await startWebScraperPipeline({
job,
token,
});
// Better if we throw here so we capture with the correct error
if (!success) {
throw new Error(message);
if (!pipeline.success) {
// TODO: let's Not do this
throw pipeline.error;
}
const end = Date.now();
const timeTakenInSeconds = (end - start) / 1000;
const rawHtml = docs[0] ? docs[0].rawHtml : "";
const doc = pipeline.document;
const rawHtml = doc.rawHtml ?? "";
const data = {
success,
success: true,
result: {
links: docs.map((doc) => {
return {
content: doc,
source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
};
}),
links: [{
content: doc,
source: doc?.metadata?.sourceURL ?? doc?.metadata?.url ?? "",
}],
},
project_id: job.data.project_id,
error: message /* etc... */,
docs,
document: doc,
};
// No idea what this does and when it is called.
if (job.data.mode === "crawl" && !job.data.v1) {
callWebhook(
job.data.team_id,
job.id as string,
data,
job.data.webhook,
job.data.v1
);
}
if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
if (job.data.webhook && job.data.v1) {
await callWebhook(
job.data.team_id,
job.data.crawl_id,
@@ -280,26 +266,25 @@ async function processJob(job: Job & { id: string }, token: string) {
}
if (job.data.crawl_id) {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
await logJob({
job_id: job.id as string,
success: success,
message: message,
num_docs: docs.length,
docs: docs,
success: true,
num_docs: 1,
docs: [doc],
time_taken: timeTakenInSeconds,
team_id: job.data.team_id,
mode: job.data.mode,
url: job.data.url,
crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions,
crawlerOptions: sc.crawlerOptions,
scrapeOptions: job.data.scrapeOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
});
await addCrawlJobDone(job.data.crawl_id, job.id);
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
if (!job.data.sitemapped) {
if (!sc.cancelled) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
@@ -329,9 +314,9 @@ async function processJob(job: Job & { id: string }, token: string) {
{
url: link,
mode: "single_urls",
crawlerOptions: sc.crawlerOptions,
team_id: sc.team_id,
pageOptions: sc.pageOptions,
scrapeOptions: scrapeOptions.parse(sc.scrapeOptions),
internalOptions: sc.internalOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
v1: job.data.v1,
@@ -367,15 +352,15 @@ async function processJob(job: Job & { id: string }, token: string) {
await logJob({
job_id: job.data.crawl_id,
success: jobStatus === "completed",
message: sc.cancelled ? "Cancelled" : message,
message: sc.cancelled ? "Cancelled" : undefined,
num_docs: fullDocs.length,
docs: [],
time_taken: (Date.now() - sc.createdAt) / 1000,
team_id: job.data.team_id,
mode: "crawl",
url: sc.originUrl,
scrapeOptions: sc.scrapeOptions,
crawlerOptions: sc.crawlerOptions,
pageOptions: sc.pageOptions,
origin: job.data.origin,
});
@@ -390,7 +375,6 @@ async function processJob(job: Job & { id: string }, token: string) {
}),
},
project_id: job.data.project_id,
error: message /* etc... */,
docs: fullDocs,
};
@@ -428,15 +412,15 @@ async function processJob(job: Job & { id: string }, token: string) {
await logJob({
job_id: job.data.crawl_id,
success: jobStatus === "completed",
message: sc.cancelled ? "Cancelled" : message,
message: sc.cancelled ? "Cancelled" : undefined,
num_docs: jobIDs.length,
docs: [],
time_taken: (Date.now() - sc.createdAt) / 1000,
team_id: job.data.team_id,
mode: "crawl",
url: sc.originUrl,
scrapeOptions: sc.scrapeOptions,
crawlerOptions: sc.crawlerOptions,
pageOptions: sc.pageOptions,
origin: job.data.origin,
});
}
@@ -448,37 +432,24 @@ async function processJob(job: Job & { id: string }, token: string) {
} catch (error) {
logger.error(`🐂 Job errored ${job.id} - ${error}`);
if (!(error instanceof Error && error.message.includes("JSON parsing error(s): "))) {
Sentry.captureException(error, {
data: {
job: job.id,
},
});
}
Sentry.captureException(error, {
data: {
job: job.id,
},
});
if (error instanceof CustomError) {
// Here we handle the error, then save the failed job
logger.error(error.message); // or any other error handling
logtail.error("Custom error while ingesting", {
job_id: job.id,
error: error.message,
dataIngestionJob: error.dataIngestionJob,
});
}
logger.error(error);
if (error.stack) {
logger.error(error.stack);
}
logtail.error("Overall error ingesting", {
job_id: job.id,
error: error.message,
});
const data = {
success: false,
docs: [],
document: null,
project_id: job.data.project_id,
error:
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
@@ -505,6 +476,8 @@ async function processJob(job: Job & { id: string }, token: string) {
}
if (job.data.crawl_id) {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
await logJob({
job_id: job.id as string,
success: false,
@@ -519,14 +492,12 @@ async function processJob(job: Job & { id: string }, token: string) {
team_id: job.data.team_id,
mode: job.data.mode,
url: job.data.url,
crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions,
crawlerOptions: sc.crawlerOptions,
scrapeOptions: job.data.scrapeOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
});
const sc = await getCrawl(job.data.crawl_id);
await logJob({
job_id: job.data.crawl_id,
success: false,
@@ -541,8 +512,8 @@ async function processJob(job: Job & { id: string }, token: string) {
team_id: job.data.team_id,
mode: "crawl",
url: sc ? sc.originUrl : job.data.url,
crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions,
pageOptions: sc ? sc.pageOptions : job.data.pageOptions,
crawlerOptions: sc ? sc.crawlerOptions : undefined,
scrapeOptions: sc ? sc.scrapeOptions : job.data.scrapeOptions,
origin: job.data.origin,
});
}

@@ -1,5 +1,4 @@
import axios from "axios";
import { Document, legacyDocumentConverter } from "../../src/controllers/v1/types";
import { logger } from "../../src/lib/logger";
import { supabase_service } from "./supabase";
import { WebhookEventType } from "../types";
@@ -59,7 +58,7 @@ export const callWebhook = async (
for (let i = 0; i < data.result.links.length; i++) {
if (v1) {
dataToSend.push(
legacyDocumentConverter(data.result.links[i].content)
data.result.links[i].content
);
} else {
dataToSend.push({

@@ -1,5 +1,6 @@
import { AuthCreditUsageChunk } from "./controllers/v1/types";
import { ExtractorOptions, Document, DocumentUrl } from "./lib/entities";
import { AuthCreditUsageChunk, ScrapeOptions, Document as V1Document } from "./controllers/v1/types";
import { ExtractorOptions, Document } from "./lib/entities";
import { InternalOptions } from "./scraper/scrapeURL";
type Mode = "crawl" | "single_urls" | "sitemap";
@@ -24,9 +25,8 @@ export interface IngestResult {
export interface WebScraperOptions {
url: string;
mode: Mode;
crawlerOptions: any;
pageOptions: any;
extractorOptions?: any;
scrapeOptions: ScrapeOptions;
internalOptions?: InternalOptions;
team_id: string;
origin?: string;
crawl_id?: string;
@@ -39,22 +39,22 @@ export interface WebScraperOptions {
export interface RunWebScraperParams {
url: string;
mode: Mode;
crawlerOptions: any;
pageOptions?: any;
extractorOptions?: any;
inProgress: (progress: any) => void;
onSuccess: (result: any, mode: string) => void;
onError: (error: Error) => void;
scrapeOptions: ScrapeOptions;
internalOptions?: InternalOptions;
// onSuccess: (result: V1Document, mode: string) => void;
// onError: (error: Error) => void;
team_id: string;
bull_job_id: string;
priority?: number;
is_scrape?: boolean;
}
export interface RunWebScraperResult {
success: boolean;
message: string;
docs: Document[] | DocumentUrl[];
export type RunWebScraperResult = {
success: false;
error: Error;
} | {
success: true;
document: V1Document;
}
export interface FirecrawlJob {
@@ -68,9 +68,8 @@ export interface FirecrawlJob {
mode: string;
url: string;
crawlerOptions?: any;
pageOptions?: any;
scrapeOptions?: any;
origin: string;
extractor_options?: ExtractorOptions,
num_tokens?: number,
retry?: boolean,
crawl_id?: string;

@@ -16,7 +16,6 @@ x-common-service: &common-service
- MODEL_NAME=${MODEL_NAME:-gpt-4o}
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
- LOGTAIL_KEY=${LOGTAIL_KEY}
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
- TEST_API_KEY=${TEST_API_KEY}
- POSTHOG_API_KEY=${POSTHOG_API_KEY}

@@ -7,7 +7,6 @@ data:
OPENAI_API_KEY: ""
SLACK_WEBHOOK_URL: ""
LLAMAPARSE_API_KEY: ""
LOGTAIL_KEY: ""
BULL_AUTH_KEY: ""
TEST_API_KEY: ""
POSTHOG_API_KEY: ""
@@ -15,6 +14,5 @@ data:
SCRAPING_BEE_API_KEY: ""
STRIPE_PRICE_ID_STANDARD: ""
STRIPE_PRICE_ID_SCALE: ""
HYPERDX_API_KEY: ""
FIRE_ENGINE_BETA_URL: ""
REDIS_PASSWORD: ""