Merge pull request #878 from mendableai/mog/deduplicate-urls

feat(crawl): Similar URL deduplication
Nicolas 2024-11-11 14:33:13 -05:00 committed by GitHub
commit 56a1ac07a4
5 changed files with 91 additions and 9 deletions

View File

@@ -203,6 +203,7 @@ const crawlerOptions = z.object({
  allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
  allowExternalLinks: z.boolean().default(false),
  ignoreSitemap: z.boolean().default(true),
  deduplicateSimilarURLs: z.boolean().default(true),
}).strict(strictMessage);
// export type CrawlerOptions = {
@@ -457,6 +458,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
    allowBackwardCrawling: x.allowBackwardLinks,
    allowExternalContentLinks: x.allowExternalLinks,
    ignoreSitemap: x.ignoreSitemap,
    deduplicateSimilarURLs: x.deduplicateSimilarURLs,
  };
}
@@ -470,7 +472,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
      allowBackwardLinks: x.allowBackwardCrawling,
      allowExternalLinks: x.allowExternalContentLinks,
      ignoreSitemap: x.ignoreSitemap,
      // TODO: returnOnlyUrls support
      deduplicateSimilarURLs: x.deduplicateSimilarURLs,
    }),
    internalOptions: {
      v0CrawlOnlyUrls: x.returnOnlyUrls,
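
The flag defaults to true, so http/https and www/non-www variants of a page are collapsed into one visit unless the caller opts out. A minimal sketch of opting out on a v1 crawl request, assuming the public POST /v1/crawl endpoint and that crawler options sit at the top level of the request body (the API key and target URL are placeholders):

const resp = await fetch("https://api.firecrawl.dev/v1/crawl", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer fc-YOUR_API_KEY",
  },
  body: JSON.stringify({
    url: "https://firecrawl.dev",
    deduplicateSimilarURLs: false, // keep scheme and www variants as distinct URLs
  }),
});
const job = await resp.json(); // contains the crawl job id to poll for status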

View File

@@ -0,0 +1,33 @@
import { generateURLPermutations } from "./crawl-redis";

describe("generateURLPermutations", () => {
  it("generates permutations correctly", () => {
    const bareHttps = generateURLPermutations("https://firecrawl.dev").map(x => x.href);
    expect(bareHttps.length).toBe(4);
    expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true);
    expect(bareHttps.includes("https://www.firecrawl.dev/")).toBe(true);
    expect(bareHttps.includes("http://firecrawl.dev/")).toBe(true);
    expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true);

    const bareHttp = generateURLPermutations("http://firecrawl.dev").map(x => x.href);
    expect(bareHttp.length).toBe(4);
    expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true);
    expect(bareHttp.includes("https://www.firecrawl.dev/")).toBe(true);
    expect(bareHttp.includes("http://firecrawl.dev/")).toBe(true);
    expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true);

    const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map(x => x.href);
    expect(wwwHttps.length).toBe(4);
    expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true);
    expect(wwwHttps.includes("https://www.firecrawl.dev/")).toBe(true);
    expect(wwwHttps.includes("http://firecrawl.dev/")).toBe(true);
    expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true);

    const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map(x => x.href);
    expect(wwwHttp.length).toBe(4);
    expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true);
    expect(wwwHttp.includes("https://www.firecrawl.dev/")).toBe(true);
    expect(wwwHttp.includes("http://firecrawl.dev/")).toBe(true);
    expect(wwwHttp.includes("http://www.firecrawl.dev/")).toBe(true);
  })
});
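
The tests cover bare domains, but the permutations preserve the input's path, so deduplication operates per page rather than per site. An illustrative call, with output derived from the implementation shown below:

generateURLPermutations("https://www.firecrawl.dev/docs").map(x => x.href);
// => [
//   "http://www.firecrawl.dev/docs",
//   "https://www.firecrawl.dev/docs",
//   "http://firecrawl.dev/docs",
//   "https://firecrawl.dev/docs",
// ]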

View File

@@ -90,6 +90,44 @@ export async function getThrottledJobs(teamId: string): Promise<string[]> {
  return await redisConnection.zrangebyscore("concurrency-limiter:" + teamId + ":throttled", Date.now(), Infinity);
}

export function normalizeURL(url: string): string {
  const urlO = new URL(url);
  urlO.search = "";
  urlO.hash = "";
  return urlO.href;
}

export function generateURLPermutations(url: string | URL): URL[] {
  const urlO = new URL(url);

  // Construct two versions, one with www., one without
  const urlWithWWW = new URL(urlO);
  const urlWithoutWWW = new URL(urlO);

  if (urlO.hostname.startsWith("www.")) {
    urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4);
  } else {
    urlWithWWW.hostname = "www." + urlWithoutWWW.hostname;
  }

  let permutations = [urlWithWWW, urlWithoutWWW];

  // Construct more versions for http/https
  permutations = permutations.flatMap(urlO => {
    if (!["http:", "https:"].includes(urlO.protocol)) {
      return [urlO];
    }

    const urlWithHTTP = new URL(urlO);
    const urlWithHTTPS = new URL(urlO);
    urlWithHTTP.protocol = "http:";
    urlWithHTTPS.protocol = "https:";

    return [urlWithHTTP, urlWithHTTPS];
  });

  return permutations;
}

export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
  if (typeof sc.crawlerOptions?.limit === "number") {
    if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
@@ -97,16 +135,16 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
    }
  }

  try {
    url = normalizeURL(url);
  } catch (error) {
    logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
  }

  let res: boolean;
  if (!sc.crawlerOptions.deduplicateSimilarURLs) {
    res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0;
  } else {
    const permutations = generateURLPermutations(url);
    res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
  }

  await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
  return res;
}
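
With deduplication enabled, the lock hinges on SADD's return value: it reports how many members were newly added, so a URL counts as fresh only if none of its permutations were already in the visited set. A small sketch of that check in isolation, assuming an ioredis connection (the helper name is invented for illustration):

import Redis from "ioredis";

const redis = new Redis();

// True only when none of the given permutations were marked visited before.
async function isFirstVisit(crawlId: string, permutations: string[]): Promise<boolean> {
  const added = await redis.sadd("crawl:" + crawlId + ":visited", ...permutations);
  return added === permutations.length;
}

A second call that shares even one permutation gets a lower count back and is treated as already visited.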

View File

@@ -23,6 +23,7 @@ import {
  getCrawl,
  getCrawlJobs,
  lockURL,
  normalizeURL,
} from "../lib/crawl-redis";
import { StoredCrawl } from "../lib/crawl-redis";
import { addScrapeJob } from "./queue-jobs";
@@ -318,6 +319,11 @@ async function processJob(job: Job & { id: string }, token: string) {
    if (job.data.crawl_id) {
      const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;

      if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url) !== normalizeURL(doc.metadata.sourceURL)) {
        logger.debug("Was redirected, locking new URL...");
        await lockURL(job.data.crawl_id, sc, doc.metadata.url);
      }

      await logJob({
        job_id: job.id as string,
View File

@@ -86,6 +86,8 @@ export interface CrawlScrapeOptions {
    country?: string;
    languages?: string[];
  };
  skipTlsVerification?: boolean;
  removeBase64Images?: boolean;
}
export type Action = {
@@ -151,6 +153,7 @@ export interface CrawlParams {
  ignoreSitemap?: boolean;
  scrapeOptions?: CrawlScrapeOptions;
  webhook?: string;
  deduplicateSimilarURLs?: boolean;
}
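
On the SDK side the field is optional and is passed through with the other crawl parameters. A minimal sketch, assuming the JS SDK's FirecrawlApp and its crawlUrl(url, params) entry point (the API key is a placeholder):

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

const crawl = await app.crawlUrl("https://firecrawl.dev", {
  limit: 100,
  deduplicateSimilarURLs: false, // opt out of collapsing scheme and www variants
});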
/**