Nick: speed improvements

Nicolas 2024-07-22 18:30:58 -04:00
parent 1bc36e1a56
commit e31a5007d5
4 changed files with 172 additions and 92 deletions

View File

@@ -73,28 +73,6 @@ export async function scrapeHelper(
       });
     }
-    let creditsToBeBilled = filteredDocs.length;
-    const creditsPerLLMExtract = 50;
-    if (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "llm-extraction-from-raw-html" || extractorOptions.mode === "llm-extraction-from-markdown") {
-      creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
-    }
-    const billingResult = await billTeam(
-      team_id,
-      creditsToBeBilled
-    );
-    if (!billingResult.success) {
-      return {
-        success: false,
-        error:
-          "Failed to bill team. Insufficient credits or subscription not found.",
-        returnCode: 402,
-      };
-    }
     return {
       success: true,
       data: filteredDocs[0],
@@ -104,6 +82,7 @@ export async function scrapeHelper(
 export async function scrapeController(req: Request, res: Response) {
   try {
+    let earlyReturn = false;
     // make sure to authenticate user first, Bearer <token>
     const { success, team_id, error, status, plan } = await authenticateUser(
       req,
@@ -113,28 +92,41 @@ export async function scrapeController(req: Request, res: Response) {
     if (!success) {
       return res.status(status).json({ error });
     }
     const crawlerOptions = req.body.crawlerOptions ?? {};
     const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
     const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
     const origin = req.body.origin ?? defaultOrigin;
     let timeout = req.body.timeout ?? defaultTimeout;
-    if (extractorOptions.mode === "llm-extraction") {
+    if (extractorOptions.mode.includes("llm-extraction")) {
       pageOptions.onlyMainContent = true;
       timeout = req.body.timeout ?? 90000;
     }
+    const checkCredits = async () => {
       try {
-        const { success: creditsCheckSuccess, message: creditsCheckMessage } =
-          await checkTeamCredits(team_id, 1);
+        const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
         if (!creditsCheckSuccess) {
+          earlyReturn = true;
           return res.status(402).json({ error: "Insufficient credits" });
         }
       } catch (error) {
         console.error(error);
-        return res.status(500).json({ error: "Internal server error" });
+        earlyReturn = true;
+        return res.status(402).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
       }
+    };
+
+    // Async check saves 500ms in average case
+    // Don't async check in llm extraction mode as it could be expensive
+    if (extractorOptions.mode.includes("llm-extraction")) {
+      await checkCredits();
+    } else {
+      checkCredits();
+    }

     const startTime = new Date().getTime();
     const result = await scrapeHelper(
       req,
@@ -149,6 +141,33 @@ export async function scrapeController(req: Request, res: Response) {
     const timeTakenInSeconds = (endTime - startTime) / 1000;
     const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
+    if (result.success) {
+      let creditsToBeBilled = 1; // Assuming 1 credit per document
+      const creditsPerLLMExtract = 50;
+      if (extractorOptions.mode.includes("llm-extraction")) {
+        creditsToBeBilled += creditsPerLLMExtract;
+      }
+      let startTimeBilling = new Date().getTime();
+      if (earlyReturn) {
+        // Don't bill if we're early returning
+        return;
+      }
+      const billingResult = await billTeam(
+        team_id,
+        creditsToBeBilled
+      );
+      if (!billingResult.success) {
+        return res.status(402).json({
+          success: false,
+          error: "Failed to bill team. Insufficient credits or subscription not found.",
+        });
+      }
+      console.log("Billed team in", new Date().getTime() - startTimeBilling, "ms");
+    }
     logJob({
       success: result.success,
       message: result.error,
@@ -164,6 +183,9 @@ export async function scrapeController(req: Request, res: Response) {
       extractor_options: extractorOptions,
       num_tokens: numTokens,
     });
     return res.status(result.returnCode).json(result);
   } catch (error) {
     console.error(error);
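
The net effect of the hunks above: billing moves out of scrapeHelper into the controller, and the credit check runs fire-and-forget on the cheap path, with the earlyReturn flag suppressing billing if the check fails mid-flight. A minimal standalone sketch of that pattern; checkTeamCredits and doWork here are hypothetical stubs, not Firecrawl's actual functions:

// Sketch: fire-and-forget credit check guarded by an earlyReturn flag.
// checkTeamCredits and doWork are illustrative stubs.
async function checkTeamCredits(teamId: string): Promise<{ success: boolean }> {
  return new Promise((resolve) =>
    setTimeout(() => resolve({ success: true }), 500)
  );
}

async function doWork(): Promise<string> {
  return "scraped content";
}

async function scrape(teamId: string, expensive: boolean): Promise<string | null> {
  let earlyReturn = false;

  const checkCredits = async () => {
    const { success } = await checkTeamCredits(teamId);
    if (!success) {
      earlyReturn = true; // remembered even though nobody awaited the check
    }
  };

  if (expensive) {
    await checkCredits(); // costly request: verify credits before starting
  } else {
    checkCredits(); // cheap request: let the check race the scrape (~500ms saved)
  }

  const result = await doWork();
  return earlyReturn ? null : result; // don't bill or return data after a failed check
}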

View File

@@ -164,7 +164,6 @@ export class WebScraperDataProvider {
   private async handleCrawlMode(
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
-
     const crawler = new WebCrawler({
       initialUrl: this.urls[0],
       includes: this.includes,
@@ -225,7 +224,6 @@ export class WebScraperDataProvider {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
     let documents = await this.processLinks(links, inProgress);
-
     return this.cacheAndFinalizeDocuments(documents, links);
   }
@@ -253,35 +251,60 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void,
     allHtmls?: string[]
   ): Promise<Document[]> {
-    const pdfLinks = links.filter(link => link.endsWith(".pdf"));
-    const docLinks = links.filter(link => link.endsWith(".doc") || link.endsWith(".docx"));
-
-    const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
-    const docxDocuments = await this.fetchDocxDocuments(docLinks);
-
-    links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));
-
-    let documents = await this.convertUrlsToDocuments(
-      links,
-      inProgress,
-      allHtmls
-    );
-
-    documents = await this.getSitemapData(this.urls[0], documents);
+    const pdfLinks = links.filter((link) => link.endsWith(".pdf"));
+    const docLinks = links.filter(
+      (link) => link.endsWith(".doc") || link.endsWith(".docx")
+    );
+
+    const [pdfDocuments, docxDocuments] = await Promise.all([
+      this.fetchPdfDocuments(pdfLinks),
+      this.fetchDocxDocuments(docLinks),
+    ]);
+
+    links = links.filter(
+      (link) => !pdfLinks.includes(link) && !docLinks.includes(link)
+    );
+
+    let [documents, sitemapData] = await Promise.all([
+      this.convertUrlsToDocuments(links, inProgress, allHtmls),
+      this.mode === "single_urls" && links.length > 0
+        ? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch(
+            (error) => {
+              console.error("Failed to fetch sitemap data:", error);
+              return null;
+            }
+          )
+        : Promise.resolve(null),
+    ]);
+
+    if (this.mode === "single_urls" && documents.length > 0) {
+      documents[0].metadata.sitemap = sitemapData;
+    } else {
+      documents = await this.getSitemapData(this.urls[0], documents);
+    }
+
     documents = this.applyPathReplacements(documents);
     // documents = await this.applyImgAltText(documents);

     if (
-      (this.extractorOptions.mode === "llm-extraction" || this.extractorOptions.mode === "llm-extraction-from-markdown") &&
+      (this.extractorOptions.mode === "llm-extraction" ||
+        this.extractorOptions.mode === "llm-extraction-from-markdown") &&
       this.mode === "single_urls"
     ) {
-      documents = await generateCompletions(documents, this.extractorOptions, "markdown");
+      documents = await generateCompletions(
+        documents,
+        this.extractorOptions,
+        "markdown"
+      );
     }
     if (
-      (this.extractorOptions.mode === "llm-extraction-from-raw-html") &&
+      this.extractorOptions.mode === "llm-extraction-from-raw-html" &&
       this.mode === "single_urls"
     ) {
-      documents = await generateCompletions(documents, this.extractorOptions, "raw-html");
+      documents = await generateCompletions(
+        documents,
+        this.extractorOptions,
+        "raw-html"
+      );
     }
     return documents.concat(pdfDocuments).concat(docxDocuments);
   }
@@ -289,7 +312,10 @@ export class WebScraperDataProvider {
   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
     return Promise.all(
       pdfLinks.map(async (pdfLink) => {
-        const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
+        const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
+          pdfLink,
+          this.pageOptions.parsePDF
+        );
         return {
           content: content,
           metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
@@ -301,7 +327,8 @@ export class WebScraperDataProvider {
   private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
     return Promise.all(
       docxLinks.map(async (p) => {
-        const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(p);
+        const { content, pageStatusCode, pageError } =
+          await fetchAndProcessDocx(p);
         return {
           content,
           metadata: { sourceURL: p, pageStatusCode, pageError },
@@ -489,16 +516,21 @@ export class WebScraperDataProvider {
       includeHtml: false,
       replaceAllPathsWithAbsolutePaths: false,
       parsePDF: true,
-      removeTags: []
+      removeTags: [],
     };
-    this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
-    this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
+    this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
+    this.replaceAllPathsWithAbsolutePaths =
+      options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
+      options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
+      false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
     this.crawlerMode = options.crawlerOptions?.mode ?? "default";
     this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
-    this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? false;
-    this.allowExternalContentLinks = options.crawlerOptions?.allowExternalContentLinks ?? false;
+    this.allowBackwardCrawling =
+      options.crawlerOptions?.allowBackwardCrawling ?? false;
+    this.allowExternalContentLinks =
+      options.crawlerOptions?.allowExternalContentLinks ?? false;

     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {
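
A side note on the ?? chains this hunk reflows: the first defined value wins, crawler-level options beating page-level ones beating the hard default, and unlike ||, an explicit false from the caller is respected. A tiny illustrative sketch with generic stand-in option names:

// Nullish-coalescing fallback chain: first defined value wins, and an
// explicit false or 0 from the caller is honored (unlike `||`).
type Options = {
  crawlerOptions?: { replaceAllPaths?: boolean };
  pageOptions?: { replaceAllPaths?: boolean };
};

function resolveReplaceAllPaths(options: Options): boolean {
  return (
    options.crawlerOptions?.replaceAllPaths ??
    options.pageOptions?.replaceAllPaths ??
    false
  );
}

console.log(resolveReplaceAllPaths({ pageOptions: { replaceAllPaths: false } })); // false, not the default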
@@ -537,6 +569,34 @@ export class WebScraperDataProvider {
     }
     return documents;
   }
+
+  private async getSitemapDataForSingleUrl(
+    baseUrl: string,
+    url: string,
+    timeout?: number
+  ) {
+    const sitemapData = await fetchSitemapData(baseUrl, timeout);
+    if (sitemapData) {
+      const docInSitemapData = sitemapData.find(
+        (data) => this.normalizeUrl(data.loc) === this.normalizeUrl(url)
+      );
+      if (docInSitemapData) {
+        let sitemapDocData: Partial<SitemapEntry> = {};
+        if (docInSitemapData.changefreq) {
+          sitemapDocData.changefreq = docInSitemapData.changefreq;
+        }
+        if (docInSitemapData.priority) {
+          sitemapDocData.priority = Number(docInSitemapData.priority);
+        }
+        if (docInSitemapData.lastmod) {
+          sitemapDocData.lastmod = docInSitemapData.lastmod;
+        }
+        if (Object.keys(sitemapDocData).length !== 0) {
+          return sitemapDocData;
+        }
+      }
+    }
+    return null;
+  }
+
   generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
     await Promise.all(
       documents.map(async (document) => {
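
The recurring move in this file is folding independent awaits into Promise.all, with the optional sitemap branch catching its own error and resolving to null so a sitemap failure cannot reject the batch. A self-contained sketch of that shape; fetchDocs and fetchSitemap are illustrative stubs, not Firecrawl APIs:

// Sketch: run independent async steps concurrently instead of sequentially.
// fetchDocs and fetchSitemap are illustrative stubs.
type Sitemap = { lastmod?: string } | null;

const fetchDocs = async (urls: string[]): Promise<string[]> =>
  urls.map((u) => `doc for ${u}`);

const fetchSitemap = async (url: string): Promise<Sitemap> => {
  throw new Error(`no sitemap at ${url}`); // simulate a failure
};

async function process(urls: string[]): Promise<[string[], Sitemap]> {
  // Both requests are in flight at once; the optional one swallows its own
  // error and resolves to null rather than rejecting the whole Promise.all.
  const [docs, sitemap] = await Promise.all([
    fetchDocs(urls),
    urls.length > 0
      ? fetchSitemap(urls[0]).catch((err) => {
          console.error("Failed to fetch sitemap data:", err);
          return null;
        })
      : Promise.resolve(null),
  ]);
  return [docs, sitemap];
}

process(["https://example.com"]).then(([docs, sitemap]) =>
  console.log(docs.length, "docs; sitemap:", sitemap)
);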

View File

@@ -54,10 +54,10 @@ export async function getLinksFromSitemap(
   return allUrls;
 }

-export const fetchSitemapData = async (url: string): Promise<SitemapEntry[] | null> => {
+export const fetchSitemapData = async (url: string, timeout?: number): Promise<SitemapEntry[] | null> => {
   const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
   try {
-    const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
+    const response = await axios.get(sitemapUrl, { timeout: timeout || axiosTimeout });
     if (response.status === 200) {
       const xml = response.data;
       const parsedXml = await parseStringPromise(xml);
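
The new optional timeout parameter lets a caller cap this one request (the provider above passes 1500 ms) instead of inheriting the module-wide axios default, which matters once the fetch sits inside a Promise.all where the slowest member gates the result. A hedged sketch of the same signature pattern; DEFAULT_TIMEOUT is an assumed stand-in for axiosTimeout:

import axios from "axios";

// Assumed stand-in for the module's axiosTimeout constant.
const DEFAULT_TIMEOUT = 10_000;

// Optional per-call timeout with a module-level fallback. Note `||` treats
// an explicit 0 as "use the default", which is fine for a timeout budget.
export const fetchWithBudget = async (url: string, timeout?: number) => {
  try {
    const response = await axios.get(url, { timeout: timeout || DEFAULT_TIMEOUT });
    return response.status === 200 ? response.data : null;
  } catch {
    return null; // timeouts and network errors degrade to "no data"
  }
};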

View File

@@ -19,20 +19,20 @@ export async function supaBillTeam(team_id: string, credits: number) {
   // credits_used: The number of credits consumed by the API call.
   // created_at: The timestamp of the API usage.

-  // 1. get the subscription
-  const { data: subscription } = await supabase_service
-    .from("subscriptions")
-    .select("*")
-    .eq("team_id", team_id)
-    .eq("status", "active")
-    .single();
-
-  // 2. Check for available coupons
-  const { data: coupons } = await supabase_service
-    .from("coupons")
-    .select("id, credits")
-    .eq("team_id", team_id)
-    .eq("status", "active");
+  // 1. get the subscription and check for available coupons concurrently
+  const [{ data: subscription }, { data: coupons }] = await Promise.all([
+    supabase_service
+      .from("subscriptions")
+      .select("*")
+      .eq("team_id", team_id)
+      .eq("status", "active")
+      .single(),
+    supabase_service
+      .from("coupons")
+      .select("id, credits")
+      .eq("team_id", team_id)
+      .eq("status", "active"),
+  ]);

   let couponCredits = 0;
   if (coupons && coupons.length > 0) {
@@ -169,21 +169,21 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
     return { success: true, message: "Preview team, no credits used" };
   }

-  // Retrieve the team's active subscription
-  const { data: subscription, error: subscriptionError } =
-    await supabase_service
-      .from("subscriptions")
-      .select("id, price_id, current_period_start, current_period_end")
-      .eq("team_id", team_id)
-      .eq("status", "active")
-      .single();
-
-  // Check for available coupons
-  const { data: coupons } = await supabase_service
-    .from("coupons")
-    .select("credits")
-    .eq("team_id", team_id)
-    .eq("status", "active");
+  // Retrieve the team's active subscription and check for available coupons concurrently
+  const [{ data: subscription, error: subscriptionError }, { data: coupons }] =
+    await Promise.all([
+      supabase_service
+        .from("subscriptions")
+        .select("id, price_id, current_period_start, current_period_end")
+        .eq("team_id", team_id)
+        .eq("status", "active")
+        .single(),
+      supabase_service
+        .from("coupons")
+        .select("credits")
+        .eq("team_id", team_id)
+        .eq("status", "active"),
+    ]);

   let couponCredits = 0;
   if (coupons && coupons.length > 0) {
@@ -238,7 +238,6 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
   // 5. Compare the total credits used with the credits allowed by the plan.
   if (totalCreditsUsed + credits > FREE_CREDITS) {
     // Send email notification for insufficient credits
-
     await sendNotification(
       team_id,
       NotificationType.LIMIT_REACHED,
@@ -275,7 +274,6 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
   // Adjust total credits used by subtracting coupon value
   const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);
-
   // Get the price details
   const { data: price, error: priceError } = await supabase_service
     .from("prices")