From a0f9ab2be74b53fa0f5af8632c344900245a2b2b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 6 Sep 2024 20:14:47 -0300 Subject: [PATCH] Update map.ts --- apps/api/src/controllers/v1/map.ts | 44 +++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index e6abd9ae..9142f5c7 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -47,24 +47,42 @@ export async function mapController( const crawler = crawlToCrawler(id, sc); - const sitemap = req.body.ignoreSitemap ? null : await crawler.tryGetSitemap(); - - if (sitemap !== null) { - sitemap.map((x) => { - links.push(x.url); - }); - } - let urlWithoutWww = req.body.url.replace("www.", ""); let mapUrl = req.body.search ? `"${req.body.search}" site:${urlWithoutWww}` : `site:${req.body.url}`; - // www. seems to exclude subdomains in some cases - const mapResults = await fireEngineMap(mapUrl, { - // limit to 100 results (beta) - numResults: Math.min(limit, 100), - }); + + const maxResults = 5000; + const resultsPerPage = 100; + const maxPages = Math.ceil(maxResults / resultsPerPage); + + const fetchPage = async (page: number) => { + return fireEngineMap(mapUrl, { + numResults: resultsPerPage, + page: page + }); + }; + + const pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1)); + + // Parallelize sitemap fetch with serper search + const [sitemap, ...allResults] = await Promise.all([ + req.body.ignoreSitemap ? null : crawler.tryGetSitemap(), + ...pagePromises + ]); + + if (sitemap !== null) { + sitemap.forEach((x) => { + links.push(x.url); + }); + } + + let mapResults = allResults.flat().filter(result => result !== null && result !== undefined); + + if (mapResults.length > maxResults) { + mapResults = mapResults.slice(0, maxResults); + } if (mapResults.length > 0) { if (req.body.search) {