mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 19:58:08 +08:00
Nick: weird activity block
This commit is contained in:
parent
c281fe62c0
commit
47123be783
|
@ -15,6 +15,7 @@ const socialMediaBlocklist = [
|
||||||
'whatsapp.com',
|
'whatsapp.com',
|
||||||
'wechat.com',
|
'wechat.com',
|
||||||
'telegram.org',
|
'telegram.org',
|
||||||
|
'researchhub.com'
|
||||||
];
|
];
|
||||||
|
|
||||||
const allowedKeywords = [
|
const allowedKeywords = [
|
||||||
|
|
|
@ -173,9 +173,14 @@ async function processJob(job: Job, token: string) {
|
||||||
if (!job.data.sitemapped) {
|
if (!job.data.sitemapped) {
|
||||||
if (!sc.cancelled) {
|
if (!sc.cancelled) {
|
||||||
const crawler = crawlToCrawler(job.data.crawl_id, sc);
|
const crawler = crawlToCrawler(job.data.crawl_id, sc);
|
||||||
|
let linksOnPage = [];
|
||||||
const links = crawler.filterLinks((data.docs[0].linksOnPage ?? [])
|
try{
|
||||||
.map(href => crawler.filterURL(href.trim(), sc.originUrl))
|
linksOnPage = data.docs[0]?.linksOnPage ?? [];
|
||||||
|
}catch(e){
|
||||||
|
linksOnPage = []
|
||||||
|
}
|
||||||
|
const links = crawler.filterLinks(
|
||||||
|
linksOnPage.map(href => crawler.filterURL(href.trim(), sc.originUrl))
|
||||||
.filter(x => x !== null),
|
.filter(x => x !== null),
|
||||||
Infinity,
|
Infinity,
|
||||||
sc.crawlerOptions?.maxDepth ?? 10
|
sc.crawlerOptions?.maxDepth ?? 10
|
||||||
|
|
Loading…
Reference in New Issue
Block a user