Mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 11:42:24 +08:00)

Commit: e31a5007d5
Parent: 1bc36e1a56
Message: Nick: speed improvements
@@ -73,28 +73,6 @@ export async function scrapeHelper(
       });
     }
 
-    let creditsToBeBilled = filteredDocs.length;
-    const creditsPerLLMExtract = 50;
-
-    if (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "llm-extraction-from-raw-html" || extractorOptions.mode === "llm-extraction-from-markdown") {
-      creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
-    }
-
-    const billingResult = await billTeam(
-      team_id,
-      creditsToBeBilled
-    );
-    if (!billingResult.success) {
-      return {
-        success: false,
-        error:
-          "Failed to bill team. Insufficient credits or subscription not found.",
-        returnCode: 402,
-      };
-    }
-
     return {
       success: true,
       data: filteredDocs[0],
@@ -104,6 +82,7 @@ export async function scrapeHelper(
 
 export async function scrapeController(req: Request, res: Response) {
   try {
+    let earlyReturn = false;
     // make sure to authenticate user first, Bearer <token>
     const { success, team_id, error, status, plan } = await authenticateUser(
       req,
@@ -113,28 +92,41 @@ export async function scrapeController(req: Request, res: Response) {
     if (!success) {
       return res.status(status).json({ error });
     }
 
     const crawlerOptions = req.body.crawlerOptions ?? {};
     const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
     const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
     const origin = req.body.origin ?? defaultOrigin;
     let timeout = req.body.timeout ?? defaultTimeout;
 
-    if (extractorOptions.mode === "llm-extraction") {
+    if (extractorOptions.mode.includes("llm-extraction")) {
       pageOptions.onlyMainContent = true;
+      timeout = req.body.timeout ?? 90000;
     }
 
-    try {
-      const { success: creditsCheckSuccess, message: creditsCheckMessage } =
-        await checkTeamCredits(team_id, 1);
-      if (!creditsCheckSuccess) {
-        return res.status(402).json({ error: "Insufficient credits" });
-      }
-    } catch (error) {
-      console.error(error);
-      return res.status(500).json({ error: "Internal server error" });
-    }
+    const checkCredits = async () => {
+      try {
+        const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
+        if (!creditsCheckSuccess) {
+          earlyReturn = true;
+          return res.status(402).json({ error: "Insufficient credits" });
+        }
+      } catch (error) {
+        console.error(error);
+        earlyReturn = true;
+        return res.status(402).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
+      }
+    };
+
+    // Async check saves 500ms in average case
+    // Don't async check in llm extraction mode as it could be expensive
+    if (extractorOptions.mode.includes("llm-extraction")) {
+      await checkCredits();
+    } else {
+      checkCredits();
+    }
 
     const startTime = new Date().getTime();
     const result = await scrapeHelper(
       req,
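Note: the latency win in this hunk comes from not awaiting the credit check on the common path; the check is started and only awaited when an llm-extraction mode makes a wasted scrape expensive, with the `earlyReturn` flag consulted later before billing. A minimal, self-contained sketch of that fire-and-forget pattern (the `checkTeamCredits` stub and the 500 ms figure below are illustrative stand-ins, not the Firecrawl implementation):

    // Sketch of the awaited-vs-fire-and-forget credit check pattern.
    type CreditCheck = { success: boolean; message?: string };

    // Stand-in for the real credit check: pretend it is a ~500ms DB round trip.
    async function checkTeamCredits(teamId: string, credits: number): Promise<CreditCheck> {
      await new Promise((resolve) => setTimeout(resolve, 500));
      return { success: true };
    }

    async function handleScrape(teamId: string, expensiveMode: boolean): Promise<string> {
      let earlyReturn = false;

      const checkCredits = async () => {
        const { success } = await checkTeamCredits(teamId, 1);
        if (!success) {
          // Flag the failure so the later billing step can skip charging.
          earlyReturn = true;
        }
      };

      if (expensiveMode) {
        // Expensive work (e.g. LLM extraction): pay the round trip up front.
        await checkCredits();
        if (earlyReturn) return "402: Insufficient credits";
      } else {
        // Cheap work: start the check but do not block the scrape on it.
        void checkCredits();
      }

      return "scrape result";
    }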
@@ -149,6 +141,33 @@ export async function scrapeController(req: Request, res: Response) {
     const timeTakenInSeconds = (endTime - startTime) / 1000;
     const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
 
+    if (result.success) {
+      let creditsToBeBilled = 1; // Assuming 1 credit per document
+      const creditsPerLLMExtract = 50;
+
+      if (extractorOptions.mode.includes("llm-extraction")) {
+        creditsToBeBilled += creditsPerLLMExtract;
+      }
+
+      let startTimeBilling = new Date().getTime();
+
+      if (earlyReturn) {
+        // Don't bill if we're early returning
+        return;
+      }
+      const billingResult = await billTeam(
+        team_id,
+        creditsToBeBilled
+      );
+      if (!billingResult.success) {
+        return res.status(402).json({
+          success: false,
+          error: "Failed to bill team. Insufficient credits or subscription not found.",
+        });
+      }
+      console.log("Billed team in", new Date().getTime() - startTimeBilling, "ms");
+    }
+
     logJob({
       success: result.success,
       message: result.error,
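Note: billing now happens once, in the controller, after the scrape succeeds: one base credit per request plus a flat 50-credit surcharge when any llm-extraction mode ran, and nothing at all if the earlier credit check already flagged `earlyReturn`. A rough sketch of that pricing rule in isolation (the constant mirrors the diff; the helper name is made up for illustration):

    // Illustrative credit calculation matching the rule in the diff:
    // 1 credit per successful scrape, +50 when an llm-extraction mode ran.
    const CREDITS_PER_LLM_EXTRACT = 50;

    function creditsForScrape(extractorMode: string): number {
      let credits = 1; // base credit per document
      if (extractorMode.includes("llm-extraction")) {
        credits += CREDITS_PER_LLM_EXTRACT;
      }
      return credits;
    }

    // creditsForScrape("markdown")                     -> 1
    // creditsForScrape("llm-extraction-from-markdown") -> 51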
@@ -164,7 +164,6 @@ export class WebScraperDataProvider {
   private async handleCrawlMode(
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
-
     const crawler = new WebCrawler({
       initialUrl: this.urls[0],
       includes: this.includes,
@@ -225,7 +224,6 @@ export class WebScraperDataProvider {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
 
-
     let documents = await this.processLinks(links, inProgress);
     return this.cacheAndFinalizeDocuments(documents, links);
   }
@@ -253,35 +251,60 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void,
     allHtmls?: string[]
   ): Promise<Document[]> {
-    const pdfLinks = links.filter(link => link.endsWith(".pdf"));
-    const docLinks = links.filter(link => link.endsWith(".doc") || link.endsWith(".docx"));
-
-    const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
-    const docxDocuments = await this.fetchDocxDocuments(docLinks);
-
-    links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));
-
-    let documents = await this.convertUrlsToDocuments(
-      links,
-      inProgress,
-      allHtmls
-    );
-
-    documents = await this.getSitemapData(this.urls[0], documents);
+    const pdfLinks = links.filter((link) => link.endsWith(".pdf"));
+    const docLinks = links.filter(
+      (link) => link.endsWith(".doc") || link.endsWith(".docx")
+    );
+
+    const [pdfDocuments, docxDocuments] = await Promise.all([
+      this.fetchPdfDocuments(pdfLinks),
+      this.fetchDocxDocuments(docLinks),
+    ]);
+
+    links = links.filter(
+      (link) => !pdfLinks.includes(link) && !docLinks.includes(link)
+    );
+
+    let [documents, sitemapData] = await Promise.all([
+      this.convertUrlsToDocuments(links, inProgress, allHtmls),
+      this.mode === "single_urls" && links.length > 0
+        ? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch(
+            (error) => {
+              console.error("Failed to fetch sitemap data:", error);
+              return null;
+            }
+          )
+        : Promise.resolve(null),
+    ]);
+
+    if (this.mode === "single_urls" && documents.length > 0) {
+      documents[0].metadata.sitemap = sitemapData;
+    } else {
+      documents = await this.getSitemapData(this.urls[0], documents);
+    }
 
     documents = this.applyPathReplacements(documents);
     // documents = await this.applyImgAltText(documents);
 
     if (
-      (this.extractorOptions.mode === "llm-extraction" || this.extractorOptions.mode === "llm-extraction-from-markdown") &&
+      (this.extractorOptions.mode === "llm-extraction" ||
+        this.extractorOptions.mode === "llm-extraction-from-markdown") &&
       this.mode === "single_urls"
     ) {
-      documents = await generateCompletions(documents, this.extractorOptions, "markdown");
+      documents = await generateCompletions(
+        documents,
+        this.extractorOptions,
+        "markdown"
+      );
     }
     if (
-      (this.extractorOptions.mode === "llm-extraction-from-raw-html") &&
+      this.extractorOptions.mode === "llm-extraction-from-raw-html" &&
      this.mode === "single_urls"
     ) {
-      documents = await generateCompletions(documents, this.extractorOptions, "raw-html");
+      documents = await generateCompletions(
+        documents,
+        this.extractorOptions,
+        "raw-html"
+      );
     }
     return documents.concat(pdfDocuments).concat(docxDocuments);
   }
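Note: two previously sequential awaits become concurrent here. PDF and DOCX fetching run under one Promise.all, and the single-URL sitemap lookup rides alongside convertUrlsToDocuments with a per-promise .catch so a sitemap failure cannot reject the whole batch. A generic sketch of that shape (the fetchers below are placeholders, not the WebScraperDataProvider methods):

    // Run independent async steps concurrently, and isolate the optional one
    // with .catch so its failure degrades to null instead of rejecting Promise.all.
    async function fetchDocs(urls: string[]): Promise<string[]> {
      return urls.map((u) => `doc:${u}`); // placeholder for real fetching
    }

    async function fetchSitemapEntry(url: string): Promise<{ lastmod?: string } | null> {
      throw new Error("sitemap unreachable"); // placeholder failure
    }

    async function process(urls: string[]) {
      const [docs, sitemap] = await Promise.all([
        fetchDocs(urls),
        urls.length > 0
          ? fetchSitemapEntry(urls[0]).catch((error) => {
              console.error("Failed to fetch sitemap data:", error);
              return null; // optional data: swallow the error
            })
          : Promise.resolve(null),
      ]);
      return { docs, sitemap }; // sitemap is null here, docs still resolved
    }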
@@ -289,7 +312,10 @@ export class WebScraperDataProvider {
   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
     return Promise.all(
       pdfLinks.map(async (pdfLink) => {
-        const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
+        const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
+          pdfLink,
+          this.pageOptions.parsePDF
+        );
         return {
           content: content,
           metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
@@ -301,7 +327,8 @@ export class WebScraperDataProvider {
   private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
     return Promise.all(
       docxLinks.map(async (p) => {
-        const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(p);
+        const { content, pageStatusCode, pageError } =
+          await fetchAndProcessDocx(p);
         return {
           content,
           metadata: { sourceURL: p, pageStatusCode, pageError },
@@ -489,16 +516,21 @@ export class WebScraperDataProvider {
       includeHtml: false,
       replaceAllPathsWithAbsolutePaths: false,
       parsePDF: true,
-      removeTags: []
+      removeTags: [],
     };
-    this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
-    this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
+    this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
+    this.replaceAllPathsWithAbsolutePaths =
+      options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
+      options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
+      false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
     this.crawlerMode = options.crawlerOptions?.mode ?? "default";
     this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
-    this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? false;
-    this.allowExternalContentLinks = options.crawlerOptions?.allowExternalContentLinks ?? false;
+    this.allowBackwardCrawling =
+      options.crawlerOptions?.allowBackwardCrawling ?? false;
+    this.allowExternalContentLinks =
+      options.crawlerOptions?.allowExternalContentLinks ?? false;
 
     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {
@@ -537,6 +569,34 @@ export class WebScraperDataProvider {
     }
     return documents;
   }
+  private async getSitemapDataForSingleUrl(
+    baseUrl: string,
+    url: string,
+    timeout?: number
+  ) {
+    const sitemapData = await fetchSitemapData(baseUrl, timeout);
+    if (sitemapData) {
+      const docInSitemapData = sitemapData.find(
+        (data) => this.normalizeUrl(data.loc) === this.normalizeUrl(url)
+      );
+      if (docInSitemapData) {
+        let sitemapDocData: Partial<SitemapEntry> = {};
+        if (docInSitemapData.changefreq) {
+          sitemapDocData.changefreq = docInSitemapData.changefreq;
+        }
+        if (docInSitemapData.priority) {
+          sitemapDocData.priority = Number(docInSitemapData.priority);
+        }
+        if (docInSitemapData.lastmod) {
+          sitemapDocData.lastmod = docInSitemapData.lastmod;
+        }
+        if (Object.keys(sitemapDocData).length !== 0) {
+          return sitemapDocData;
+        }
+      }
+    }
+    return null;
+  }
   generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
     await Promise.all(
       documents.map(async (document) => {
@@ -54,10 +54,10 @@ export async function getLinksFromSitemap(
   return allUrls;
 }
 
-export const fetchSitemapData = async (url: string): Promise<SitemapEntry[] | null> => {
+export const fetchSitemapData = async (url: string, timeout?: number): Promise<SitemapEntry[] | null> => {
   const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
   try {
-    const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
+    const response = await axios.get(sitemapUrl, { timeout: timeout || axiosTimeout });
     if (response.status === 200) {
       const xml = response.data;
       const parsedXml = await parseStringPromise(xml);
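Note: fetchSitemapData gains an optional timeout so hot-path callers (the single-URL case above passes 1500 ms) can cap how long the sitemap lookup may stall, while other callers keep the default axiosTimeout. A small sketch of the same optional-timeout signature (the default constant here is assumed for illustration, not taken from the repo):

    import axios from "axios";

    const DEFAULT_TIMEOUT_MS = 10_000; // assumed default, analogous to axiosTimeout

    // A per-call timeout overrides the module default when provided.
    async function fetchXml(url: string, timeout?: number): Promise<string | null> {
      try {
        const response = await axios.get(url, { timeout: timeout ?? DEFAULT_TIMEOUT_MS });
        return response.status === 200 ? response.data : null;
      } catch {
        return null; // treat timeouts and network errors as "no sitemap"
      }
    }

    // fetchXml("https://example.com/sitemap.xml", 1500) caps the wait at 1.5s.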
@@ -19,20 +19,20 @@ export async function supaBillTeam(team_id: string, credits: number) {
   // credits_used: The number of credits consumed by the API call.
   // created_at: The timestamp of the API usage.
 
-  // 1. get the subscription
-  const { data: subscription } = await supabase_service
-    .from("subscriptions")
-    .select("*")
-    .eq("team_id", team_id)
-    .eq("status", "active")
-    .single();
-
-  // 2. Check for available coupons
-  const { data: coupons } = await supabase_service
-    .from("coupons")
-    .select("id, credits")
-    .eq("team_id", team_id)
-    .eq("status", "active");
+  // 1. get the subscription and check for available coupons concurrently
+  const [{ data: subscription }, { data: coupons }] = await Promise.all([
+    supabase_service
+      .from("subscriptions")
+      .select("*")
+      .eq("team_id", team_id)
+      .eq("status", "active")
+      .single(),
+    supabase_service
+      .from("coupons")
+      .select("id, credits")
+      .eq("team_id", team_id)
+      .eq("status", "active"),
+  ]);
 
   let couponCredits = 0;
   if (coupons && coupons.length > 0) {
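Note: the subscription and coupon lookups were two back-to-back round trips; since supabase-js query builders are thenables that execute when awaited, wrapping both in Promise.all lets the requests overlap. A hedged sketch of the pattern with generic async functions standing in for the supabase_service calls:

    // Two independent lookups overlapped with Promise.all instead of run sequentially.
    type Row = Record<string, unknown>;

    // Placeholder query functions, not the real Supabase client.
    async function getActiveSubscription(teamId: string): Promise<Row | null> {
      return { team_id: teamId, status: "active" };
    }

    async function getActiveCoupons(teamId: string): Promise<Row[]> {
      return [{ credits: 100 }];
    }

    async function loadBillingState(teamId: string) {
      // Before: await one query, then the other (roughly 2x latency).
      // After: both requests are in flight at the same time.
      const [subscription, coupons] = await Promise.all([
        getActiveSubscription(teamId),
        getActiveCoupons(teamId),
      ]);

      const couponCredits = coupons.reduce((sum, c) => sum + Number(c.credits ?? 0), 0);
      return { subscription, couponCredits };
    }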
@@ -169,21 +169,21 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
     return { success: true, message: "Preview team, no credits used" };
   }
 
-  // Retrieve the team's active subscription
-  const { data: subscription, error: subscriptionError } =
-    await supabase_service
-      .from("subscriptions")
-      .select("id, price_id, current_period_start, current_period_end")
-      .eq("team_id", team_id)
-      .eq("status", "active")
-      .single();
-
-  // Check for available coupons
-  const { data: coupons } = await supabase_service
-    .from("coupons")
-    .select("credits")
-    .eq("team_id", team_id)
-    .eq("status", "active");
+  // Retrieve the team's active subscription and check for available coupons concurrently
+  const [{ data: subscription, error: subscriptionError }, { data: coupons }] =
+    await Promise.all([
+      supabase_service
+        .from("subscriptions")
+        .select("id, price_id, current_period_start, current_period_end")
+        .eq("team_id", team_id)
+        .eq("status", "active")
+        .single(),
+      supabase_service
+        .from("coupons")
+        .select("credits")
+        .eq("team_id", team_id)
+        .eq("status", "active"),
+    ]);
 
   let couponCredits = 0;
   if (coupons && coupons.length > 0) {
@@ -238,7 +238,6 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
   // 5. Compare the total credits used with the credits allowed by the plan.
   if (totalCreditsUsed + credits > FREE_CREDITS) {
     // Send email notification for insufficient credits
-
     await sendNotification(
       team_id,
       NotificationType.LIMIT_REACHED,
@@ -275,7 +274,6 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
 
   // Adjust total credits used by subtracting coupon value
   const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);
-
   // Get the price details
   const { data: price, error: priceError } = await supabase_service
     .from("prices")