Nick: fixed bugs associated with absolute path replacements

This commit is contained in:
Nicolas 2024-06-11 12:43:16 -07:00
parent 788abdce6e
commit 520739c9f4
4 changed files with 24 additions and 18 deletions

View File

@ -190,11 +190,6 @@
"description": "Ignore the website sitemap when crawling",
"default": false
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
},
"limit": {
"type": "integer",
"description": "Maximum number of pages to crawl",
@ -223,6 +218,11 @@
"headers": {
"type": "object",
"description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
}
}
}

View File

@ -18,6 +18,7 @@ export type PageOptions = {
waitFor?: number;
screenshot?: boolean;
headers?: Record<string, string>;
replaceAllPathsWithAbsolutePaths?: boolean;
};
export type ExtractorOptions = {

View File

@ -302,9 +302,10 @@ export class WebScraperDataProvider {
}
private applyPathReplacements(documents: Document[]): Document[] {
return this.replaceAllPathsWithAbsolutePaths
? replacePathsWithAbsolutePaths(documents)
: replaceImgPathsWithAbsolutePaths(documents);
if (this.replaceAllPathsWithAbsolutePaths) {
documents = replacePathsWithAbsolutePaths(documents);
}
return replaceImgPathsWithAbsolutePaths(documents);
}
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
@ -473,9 +474,9 @@ export class WebScraperDataProvider {
this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
this.excludes = this.excludes.filter((item) => item !== "");
this.crawlerMode = options.crawlerOptions?.mode ?? "default";

View File

@ -10,7 +10,8 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
) || [];
paths.forEach((path: string) => {
const isImage = path.startsWith("!");
try {
const isImage = path.startsWith("!");
let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/);
let url = matchedUrl[1];
@ -22,18 +23,18 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
}
const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0];
if (isImage) {
document.content = document.content.replace(
path,
`${markdownLinkOrImageText}(${url})`
);
} else {
// Image is handled afterwards
if (!isImage) {
document.content = document.content.replace(
path,
`${markdownLinkOrImageText}(${url})`
);
}
} catch (error) {
}
});
document.markdown = document.content;
});
return documents;
@ -60,8 +61,10 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
if (!imageUrl.startsWith("http")) {
if (imageUrl.startsWith("/")) {
imageUrl = imageUrl.substring(1);
imageUrl = new URL(imageUrl, baseUrl).toString();
} else {
imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString();
}
imageUrl = new URL(imageUrl, baseUrl).toString();
}
}
@ -70,6 +73,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
`![${altText}](${imageUrl})`
);
});
document.markdown = document.content;
});
return documents;