mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Update metadata.ts
This commit is contained in:
parent
f49552e413
commit
795e5a9228
|
@ -70,11 +70,12 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
||||||
let pageStatusCode: number | null = null;
|
let pageStatusCode: number | null = null;
|
||||||
let pageError: string | null = null;
|
let pageError: string | null = null;
|
||||||
|
|
||||||
|
const customMetadata: Record<string, string | string[]> = {};
|
||||||
|
|
||||||
try {
|
try {
|
||||||
title = soup("title").text() || null;
|
title = soup("title").text() || null;
|
||||||
description = soup('meta[name="description"]').attr("content") || null;
|
description = soup('meta[name="description"]').attr("content") || null;
|
||||||
|
|
||||||
// Assuming the language is part of the URL as per the regex pattern
|
|
||||||
language = soup('html').attr('lang') || null;
|
language = soup('html').attr('lang') || null;
|
||||||
|
|
||||||
keywords = soup('meta[name="keywords"]').attr("content") || null;
|
keywords = soup('meta[name="keywords"]').attr("content") || null;
|
||||||
|
@ -104,6 +105,22 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
||||||
dcDateCreated = soup('meta[name="dc.date.created"]').attr("content") || null;
|
dcDateCreated = soup('meta[name="dc.date.created"]').attr("content") || null;
|
||||||
dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null;
|
dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null;
|
||||||
|
|
||||||
|
// Extract all meta tags for custom metadata
|
||||||
|
soup("meta").each((i, elem) => {
|
||||||
|
const name = soup(elem).attr("name") || soup(elem).attr("property");
|
||||||
|
const content = soup(elem).attr("content");
|
||||||
|
|
||||||
|
if (name && content) {
|
||||||
|
if (customMetadata[name] === undefined) {
|
||||||
|
customMetadata[name] = content;
|
||||||
|
} else if (Array.isArray(customMetadata[name])) {
|
||||||
|
(customMetadata[name] as string[]).push(content);
|
||||||
|
} else {
|
||||||
|
customMetadata[name] = [customMetadata[name] as string, content];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`Error extracting metadata: ${error}`);
|
Logger.error(`Error extracting metadata: ${error}`);
|
||||||
}
|
}
|
||||||
|
@ -141,5 +158,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
||||||
...(sourceURL ? { sourceURL } : {}),
|
...(sourceURL ? { sourceURL } : {}),
|
||||||
...(pageStatusCode ? { pageStatusCode } : {}),
|
...(pageStatusCode ? { pageStatusCode } : {}),
|
||||||
...(pageError ? { pageError } : {}),
|
...(pageError ? { pageError } : {}),
|
||||||
|
...customMetadata,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user