This commit is contained in:
Yanlong Wang 2024-04-12 09:25:19 +08:00
parent 2dc0850c8c
commit 664d4b1c9f
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
2 changed files with 23 additions and 23 deletions

View File

@@ -32,13 +32,21 @@ export class CrawlerHost extends RPCHost {
const toBeTurnedToMd = snapshot.parsed?.content;
const contentText = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd) : snapshot.text;
const formatted = `Title: ${(snapshot.parsed?.title || snapshot.title || '').trim()}
const formatted = {
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
urlSource: snapshot.href.trim(),
markdownContent: contentText.trim(),
URL Source: ${snapshot.href.trim()}
toString() {
return `Title: ${this.title}
URL Source: ${this.urlSource}
Markdown Content:
${contentText.trim()}
${contentText}
`;
}
};
return formatted;
}
@@ -47,6 +55,7 @@ ${contentText.trim()}
runtime: {
memory: '4GiB',
timeoutSeconds: 540,
concurrency: 4,
},
httpMethod: ['get', 'post'],
returnType: [String, OutputServerEventStream],
@@ -60,20 +69,22 @@ ${contentText.trim()}
) {
const noSlashURL = ctx.req.url.slice(1);
const urlToCrawl = new URL(normalizeUrl(noSlashURL));
const screenshotEnabled = Boolean(ctx.req.headers['x-screenshot']);
const noCache = Boolean(ctx.req.headers['x-no-cache']);
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
const sseStream = new OutputServerEventStream();
rpcReflect.return(sseStream);
try {
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) {
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
if (!scrapped) {
continue;
}
const formatted = this.formatSnapshot(scrapped);
if (scrapped.screenshot) {
if (scrapped.screenshot && screenshotEnabled) {
sseStream.write({
event: 'screenshot',
data: scrapped.screenshot.toString('base64'),
@@ -99,37 +110,25 @@ ${contentText.trim()}
}
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) {
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
if (!scrapped?.parsed?.content) {
continue;
}
const formatted = this.formatSnapshot(scrapped);
if (scrapped.screenshot) {
return [
{
type: 'image_url', image_url: {
url: `data:image/jpeg;base64,${scrapped.screenshot.toString('base64')}`,
}
},
{ type: 'text', content: formatted },
];
}
return formatted;
}
}
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) {
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
if (!scrapped?.parsed?.content) {
continue;
}
const formatted = this.formatSnapshot(scrapped);
return assignTransferProtocolMeta(formatted, { contentType: 'text/plain', envelope: null });
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
}
throw new Error('Unreachable');

View File

@@ -49,9 +49,9 @@ export class PuppeteerControl extends AsyncService {
return page.browser().connected && !page.isClosed();
}
}, {
max: 1 + Math.floor(os.freemem() / 1024 * 1024 * 1024),
max: Math.max(1 + Math.floor(os.freemem() / 1024 * 1024 * 1024), 4),
min: 1,
acquireTimeoutMillis: 15_000,
acquireTimeoutMillis: 60_000,
testOnBorrow: true,
testOnReturn: true,
});
@@ -72,7 +72,7 @@ export class PuppeteerControl extends AsyncService {
}
this.browser = await puppeteer.launch({
headless: true,
timeout: 300_000
timeout: 60_000
});
this.browser.once('disconnected', () => {
this.logger.warn(`Browser disconnected`);
@@ -91,6 +91,7 @@ export class PuppeteerControl extends AsyncService {
const preparations = [];
preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
preparations.push(page.setBypassCSP(true));
preparations.push(page.setViewport({ width: 1920, height: 1080 }));
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
page.emit('snapshot', snapshot);