From 47123be783582dd977ef30edd252da62adc3676b Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 16 Aug 2024 22:01:56 -0400
Subject: [PATCH] Nick: weird activity block

---
 apps/api/src/scraper/WebScraper/utils/blocklist.ts |  1 +
 apps/api/src/services/queue-worker.ts              | 11 ++++++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
index 0bdf9876..fd3c9ad1 100644
--- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts
+++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
@@ -15,6 +15,7 @@ const socialMediaBlocklist = [
   'whatsapp.com',
   'wechat.com',
   'telegram.org',
+  'researchhub.com'
 ];
 
 const allowedKeywords = [
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index c15201be..890e6e7b 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -173,9 +173,14 @@ async function processJob(job: Job, token: string) {
       if (!job.data.sitemapped) {
         if (!sc.cancelled) {
           const crawler = crawlToCrawler(job.data.crawl_id, sc);
-
-          const links = crawler.filterLinks((data.docs[0].linksOnPage ?? [])
-            .map(href => crawler.filterURL(href.trim(), sc.originUrl))
+          let linksOnPage = [];
+          try{
+            linksOnPage = data.docs[0]?.linksOnPage ?? [];
+          }catch(e){
+            linksOnPage = []
+          }
+          const links = crawler.filterLinks(
+            linksOnPage.map(href => crawler.filterURL(href.trim(), sc.originUrl))
               .filter(x => x !== null),
             Infinity,
             sc.crawlerOptions?.maxDepth ?? 10