mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Nick:
This commit is contained in:
parent
48d1ec05b2
commit
c69c89f838
|
@ -0,0 +1,63 @@
|
|||
import { isUrlBlocked } from '../blocklist';
|
||||
|
||||
/**
 * Test suite for `isUrlBlocked`.
 *
 * Three groups of inputs are exercised:
 *  1. URLs on social media domains, which are expected to be reported as blocked;
 *  2. URLs on social media domains whose path contains a keyword such as
 *     `privacy`, `terms` or `legal`, which are expected NOT to be blocked;
 *  3. URLs on other domains, including bare hostnames given without a scheme,
 *     which are expected NOT to be blocked.
 */
describe('isUrlBlocked', () => {
  it('should return true for blocked social media URLs', () => {
    const socialMediaUrls = [
      'https://www.facebook.com',
      'https://twitter.com/someuser',
      'https://instagram.com/someuser',
      'https://www.linkedin.com/in/someuser',
      'https://pinterest.com/someuser',
      'https://snapchat.com/someuser',
      'https://tiktok.com/@someuser',
      'https://reddit.com/r/somesubreddit',
      'https://flickr.com/photos/someuser',
      'https://whatsapp.com/someuser',
      'https://wechat.com/someuser',
      'https://telegram.org/someuser',
    ];

    for (const candidate of socialMediaUrls) {
      // Log the offending URL first: the assertion failure below would
      // otherwise not say which entry of the list was the culprit.
      if (!isUrlBlocked(candidate)) {
        console.log(`URL not blocked: ${candidate}`);
      }
      expect(isUrlBlocked(candidate)).toBe(true);
    }
  });

  it('should return false for URLs containing allowed keywords', () => {
    // Same social media hosts as above, but each path carries a keyword
    // (privacy, terms, legal, help, ...) that the test expects to be let through.
    const keywordUrls = [
      'https://www.facebook.com/privacy',
      'https://twitter.com/terms',
      'https://instagram.com/legal',
      'https://www.linkedin.com/help',
      'https://pinterest.com/about',
      'https://snapchat.com/support',
      'https://tiktok.com/contact',
      'https://reddit.com/user-agreement',
      'https://tumblr.com/policy',
      'https://flickr.com/blog',
      'https://whatsapp.com/press',
      'https://wechat.com/careers',
      'https://telegram.org/conditions',
    ];

    for (const candidate of keywordUrls) {
      expect(isUrlBlocked(candidate)).toBe(false);
    }
  });

  it('should return false for non-blocked URLs', () => {
    // The last two entries are bare hostnames without a scheme.
    const ordinaryUrls = [
      'https://www.example.com',
      'https://www.somewebsite.org',
      'https://subdomain.example.com',
      'firecrawl.dev',
      'amazon.com',
    ];

    for (const candidate of ordinaryUrls) {
      expect(isUrlBlocked(candidate)).toBe(false);
    }
  });
});
|
|
@ -38,12 +38,17 @@ export function isUrlBlocked(url: string): boolean {
|
|||
return false;
|
||||
}
|
||||
|
||||
// Check if the URL matches any domain in the blocklist
|
||||
return socialMediaBlocklist.some(domain => {
|
||||
// Create a regular expression to match the exact domain
|
||||
const domainPattern = new RegExp(`(^|\\.)${domain.replace('.', '\\.')}$`);
|
||||
// Test the hostname of the URL against the pattern
|
||||
return domainPattern.test(new URL(url).hostname);
|
||||
});
|
||||
try {
|
||||
// Check if the URL matches any domain in the blocklist
|
||||
return socialMediaBlocklist.some(domain => {
|
||||
// Create a regular expression to match the exact domain
|
||||
const domainPattern = new RegExp(`(^|\\.)${domain.replace('.', '\\.')}$`);
|
||||
// Test the hostname of the URL against the pattern
|
||||
return domainPattern.test(new URL(url).hostname);
|
||||
});
|
||||
} catch (e) {
|
||||
// If an error occurs (e.g., invalid URL), return false
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user