mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Merge pull request #204 from mendableai/feat/custom-scraping-readme
[Feat] Added custom scraping conditions for readme docs
This commit is contained in:
commit
51b0b88cd4
|
@ -210,6 +210,14 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea
|
|||
return scrapersInOrder as typeof baseScrapers[number][];
|
||||
}
|
||||
|
||||
/**
 * Checks page content for known special cases that need a different scraper.
 *
 * Currently one rule exists: when the content contains a
 * `<meta name="readme-deploy"` tag, the URL is handed to
 * `scrapWithFireEngine` with a wait value of 1000 and that result is returned.
 *
 * @param text - Page content to inspect for the marker tag.
 * @param url - Address of the page; logged and passed to `scrapWithFireEngine`.
 * @returns The `scrapWithFireEngine` result when the marker is present,
 *   otherwise `null` to signal that no custom rule applied.
 */
async function handleCustomScraping(text: string, url: string): Promise<string | null> {
  const hasReadmeMarker = text.includes('<meta name="readme-deploy"');
  if (!hasReadmeMarker) {
    // No custom rule matched; the caller keeps the content it already has.
    return null;
  }

  console.log(`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`);
  const fireEngineContent = await scrapWithFireEngine(url, 1000);
  return fireEngineContent;
}
|
||||
|
||||
export async function scrapSingleUrl(
|
||||
urlToScrap: string,
|
||||
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0},
|
||||
|
@ -266,6 +274,12 @@ export async function scrapSingleUrl(
|
|||
break;
|
||||
}
|
||||
|
||||
// Check for custom scraping conditions
|
||||
const customScrapedContent = await handleCustomScraping(text, url);
|
||||
if (customScrapedContent) {
|
||||
text = customScrapedContent;
|
||||
}
|
||||
|
||||
//* TODO: add an option to return markdown or structured/extracted content
|
||||
let cleanedHtml = removeUnwantedElements(text, pageOptions);
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user