mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 03:32:25 +08:00
feat: add image captioning (#6)
* Fix contentText assignment in CrawlerHost class * fix: recover vscode configurations * feat: add image captioning * feat: add image captioning * clean: vscode config * chore: fix some ts warnings * feat: auto alt text * fix * chore: improve prompt * clean: unused config * fix: failure condition * fix: remove redundant code * fix: catch parse error * fix: catch parse error --------- Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>
This commit is contained in:
parent
18373626b2
commit
b3fb4c5c57
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -1,4 +1,2 @@
|
|||
node_modules/
|
||||
.DS_Store
|
||||
.vscode
|
||||
.cache
|
||||
.DS_Store
|
10
.vscode/exensions.json
vendored
Normal file
10
.vscode/exensions.json
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
{
|
||||
"recommendations": [
|
||||
"editorconfig.editorconfig",
|
||||
"octref.vetur",
|
||||
"redhat.vscode-yaml",
|
||||
"dbaeumer.vscode-eslint",
|
||||
"esbenp.prettier-vscode",
|
||||
"streetsidesoftware.code-spell-checker"
|
||||
]
|
||||
}
|
60
.vscode/launch.json
vendored
Normal file
60
.vscode/launch.json
vendored
Normal file
|
@ -0,0 +1,60 @@
|
|||
{
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Debug Fullstack: attach",
|
||||
"request": "attach",
|
||||
"cwd": "${workspaceFolder}/backend/functions",
|
||||
"skipFiles": [
|
||||
"<node_internals>/**"
|
||||
],
|
||||
"type": "node",
|
||||
"preLaunchTask": "Fullstack:debug"
|
||||
},
|
||||
{
|
||||
"name": "Debug Fullstack: attach: with proxy",
|
||||
"request": "attach",
|
||||
"cwd": "${workspaceFolder}/backend/functions",
|
||||
"skipFiles": [
|
||||
"<node_internals>/**"
|
||||
],
|
||||
"type": "node",
|
||||
"preLaunchTask": "Fullstack:debug:with-proxy"
|
||||
},
|
||||
{
|
||||
"name": "Attach",
|
||||
"port": 9229,
|
||||
"request": "attach",
|
||||
"skipFiles": [
|
||||
"<node_internals>/**"
|
||||
],
|
||||
"type": "node"
|
||||
},
|
||||
{
|
||||
"name": "Attach by Process ID",
|
||||
"processId": "${command:PickProcess}",
|
||||
"request": "attach",
|
||||
"skipFiles": [
|
||||
"<node_internals>/**"
|
||||
],
|
||||
"type": "node"
|
||||
},
|
||||
{
|
||||
"name": "Debug Fullstack",
|
||||
"request": "launch",
|
||||
"runtimeArgs": [
|
||||
"emulators:start",
|
||||
"--import=../.firebase-emu",
|
||||
"--export-on-exit=../.firebase-emu",
|
||||
],
|
||||
"cwd": "${workspaceFolder}/backend/functions",
|
||||
"runtimeExecutable": "${workspaceFolder}/node_modules/.bin/firebase",
|
||||
"skipFiles": [
|
||||
"<node_internals>/**"
|
||||
],
|
||||
"type": "node",
|
||||
"preLaunchTask": "Fullstack:prepare",
|
||||
"killBehavior": "polite"
|
||||
},
|
||||
]
|
||||
}
|
32
.vscode/settings.json
vendored
Normal file
32
.vscode/settings.json
vendored
Normal file
|
@ -0,0 +1,32 @@
|
|||
{
|
||||
"editor.wordWrap": "on",
|
||||
"editor.wordWrapColumn": 120,
|
||||
"files.trimTrailingWhitespace": true,
|
||||
"files.trimFinalNewlines": true,
|
||||
"[javascript]": {
|
||||
"editor.defaultFormatter": "vscode.typescript-language-features"
|
||||
},
|
||||
"[jsonc]": {
|
||||
"editor.defaultFormatter": "vscode.json-language-features"
|
||||
},
|
||||
"[typescript]": {
|
||||
"editor.defaultFormatter": "vscode.typescript-language-features"
|
||||
},
|
||||
"[json]": {
|
||||
"editor.defaultFormatter": "vscode.json-language-features"
|
||||
},
|
||||
"[yaml]": {
|
||||
"editor.defaultFormatter": "redhat.vscode-yaml"
|
||||
},
|
||||
"[markdown]": {
|
||||
"files.trimTrailingWhitespace": false
|
||||
},
|
||||
"typescript.tsdk": "node_modules/typescript/lib",
|
||||
"typescript.preferences.quoteStyle": "single",
|
||||
"typescript.format.semicolons": "insert",
|
||||
"typescript.preferences.importModuleSpecifier": "project-relative",
|
||||
"typescript.locale": "en",
|
||||
"cSpell.enabled": true,
|
||||
"cSpell.words": [
|
||||
],
|
||||
}
|
156
.vscode/tasks.json
vendored
Normal file
156
.vscode/tasks.json
vendored
Normal file
|
@ -0,0 +1,156 @@
|
|||
{
|
||||
"version": "2.0.0",
|
||||
"tasks": [
|
||||
{
|
||||
"type": "npm",
|
||||
"script": "build",
|
||||
"group": "build",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/backend/functions"
|
||||
},
|
||||
"problemMatcher": [],
|
||||
"label": "Backend:rebuild",
|
||||
"detail": "Backend:rebuild"
|
||||
},
|
||||
{
|
||||
"type": "npm",
|
||||
"script": "emu:reset",
|
||||
"group": "build",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/backend/functions"
|
||||
},
|
||||
"problemMatcher": [],
|
||||
"label": "Backend:reset-emulator",
|
||||
"detail": "Backend:reset-emulator"
|
||||
},
|
||||
{
|
||||
"type": "typescript",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/backend/functions"
|
||||
},
|
||||
"tsconfig": "backend/functions/tsconfig.json",
|
||||
"option": "watch",
|
||||
"isBackground": true,
|
||||
"problemMatcher": [
|
||||
"$tsc-watch"
|
||||
],
|
||||
"group": "build",
|
||||
"label": "Backend:build:watch"
|
||||
},
|
||||
{
|
||||
"type": "npm",
|
||||
"script": "emu:debug",
|
||||
"group": "none",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/backend/functions"
|
||||
},
|
||||
"problemMatcher": [
|
||||
{
|
||||
"base": "$tsc",
|
||||
"background": {
|
||||
"activeOnStart": false,
|
||||
"beginsPattern": "shutdown requested|Starting emulators",
|
||||
"endsPattern": "Debugger listening"
|
||||
}
|
||||
}
|
||||
],
|
||||
"label": "Backend:start-emulator-debug",
|
||||
"detail": "Backend:start-emulator-debug",
|
||||
"dependsOn": [
|
||||
"Backend:build:watch"
|
||||
],
|
||||
"isBackground": true,
|
||||
},
|
||||
{
|
||||
"type": "npm",
|
||||
"script": "dev",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/webapp",
|
||||
},
|
||||
"group": "build",
|
||||
"label": "Frontend:start:dev",
|
||||
"detail": "Frontend:start:dev",
|
||||
"isBackground": true,
|
||||
"problemMatcher": {
|
||||
"base": "$vite",
|
||||
"background": {
|
||||
"activeOnStart": true,
|
||||
"endsPattern": "OK",
|
||||
"beginsPattern": "vite"
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "npm",
|
||||
"script": "dev",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/webapp",
|
||||
"env": {
|
||||
"FIREBASE_EMULATE": "true",
|
||||
}
|
||||
},
|
||||
"group": "build",
|
||||
"label": "Frontend:start:emu",
|
||||
"detail": "Frontend:start:emu",
|
||||
"isBackground": true,
|
||||
"problemMatcher": {
|
||||
"base": "$vite",
|
||||
"background": {
|
||||
"activeOnStart": true,
|
||||
"endsPattern": "OK",
|
||||
"beginsPattern": "vite"
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "npm",
|
||||
"script": "emu:debug2",
|
||||
"group": "none",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/backend/functions",
|
||||
"env": {
|
||||
"https_proxy": "http://127.0.0.1:7890",
|
||||
"http_proxy": "http://127.0.0.1:7890",
|
||||
"all_proxy": "socks5://127.0.0.1:7890"
|
||||
}
|
||||
},
|
||||
"problemMatcher": [
|
||||
{
|
||||
"base": "$tsc",
|
||||
"background": {
|
||||
"activeOnStart": false,
|
||||
"beginsPattern": "shutdown requested|Starting emulators",
|
||||
"endsPattern": "Debugger listening"
|
||||
}
|
||||
}
|
||||
],
|
||||
"label": "Backend:start-emulator-debug:with-proxy",
|
||||
"detail": "Backend:start-emulator-debug:with-proxy",
|
||||
"dependsOn": [
|
||||
"Backend:build:watch"
|
||||
],
|
||||
"isBackground": true,
|
||||
},
|
||||
{
|
||||
"label": "Fullstack:prepare",
|
||||
"dependsOn": [
|
||||
"Frontend:start:emu",
|
||||
"Backend:build:watch",
|
||||
],
|
||||
},
|
||||
{
|
||||
"label": "Fullstack:debug",
|
||||
"dependsOn": [
|
||||
// "Frontend:start:emu",
|
||||
"Backend:start-emulator-debug",
|
||||
],
|
||||
},
|
||||
{
|
||||
"label": "Fullstack:debug:with-proxy",
|
||||
"dependsOn": [
|
||||
"Frontend:start:emu",
|
||||
"Backend:start-emulator-debug:with-proxy",
|
||||
],
|
||||
}
|
||||
]
|
||||
}
|
|
@ -1,14 +1,9 @@
|
|||
const { join } = require('path');
|
||||
|
||||
let config = {};
|
||||
if (!process.env.FUNCTIONS_EMULATOR) {
|
||||
config = {
|
||||
// Changes the cache location for Puppeteer.
|
||||
cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'),
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @type {import("puppeteer").Configuration}
|
||||
*/
|
||||
module.exports = config;
|
||||
module.exports = {
|
||||
// Changes the cache location for Puppeteer.
|
||||
cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'),
|
||||
};
|
||||
|
|
|
@ -3,9 +3,10 @@ import { singleton } from 'tsyringe';
|
|||
import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
||||
import _ from 'lodash';
|
||||
import { PageSnapshot, PuppeteerControl } from '../services/puppeteer';
|
||||
import TurnDownService from 'turndown';
|
||||
import { Request, Response } from 'express';
|
||||
import normalizeUrl from "@esm2cjs/normalize-url";
|
||||
import { AltTextService } from '../services/alt-text';
|
||||
import TurndownService from 'turndown';
|
||||
|
||||
function tidyMarkdown(markdown: string): string {
|
||||
|
||||
|
@ -50,11 +51,14 @@ function tidyMarkdown(markdown: string): string {
|
|||
export class CrawlerHost extends RPCHost {
|
||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||
|
||||
turnDownService = new TurnDownService().use(require('turndown-plugin-gfm').gfm);
|
||||
turnDownPlugins = [require('turndown-plugin-gfm').gfm];
|
||||
|
||||
imageShortUrlPrefix?: string;
|
||||
|
||||
constructor(
|
||||
protected globalLogger: Logger,
|
||||
protected puppeteerControl: PuppeteerControl,
|
||||
protected altTextService: AltTextService,
|
||||
) {
|
||||
super(...arguments);
|
||||
}
|
||||
|
@ -65,14 +69,57 @@ export class CrawlerHost extends RPCHost {
|
|||
this.emit('ready');
|
||||
}
|
||||
|
||||
formatSnapshot(snapshot: PageSnapshot) {
|
||||
|
||||
async formatSnapshot(snapshot: PageSnapshot) {
|
||||
const toBeTurnedToMd = snapshot.parsed?.content;
|
||||
const turnedDown = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd).trim() : '';
|
||||
let turnDownService = new TurndownService();
|
||||
for (const plugin of this.turnDownPlugins) {
|
||||
turnDownService = turnDownService.use(plugin);
|
||||
}
|
||||
|
||||
const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text?.trim();
|
||||
let contentText = '';
|
||||
if (toBeTurnedToMd) {
|
||||
const urlToAltMap: { [k: string]: { shortDigest: string, alt?: string; }; } = {};
|
||||
const tasks = (snapshot.imgs || []).map(async (x) => {
|
||||
const r = await this.altTextService.getAltTextAndShortDigest(x).catch((err)=> {
|
||||
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
||||
return undefined;
|
||||
});
|
||||
if (r) {
|
||||
urlToAltMap[x.src.trim()] = r;
|
||||
}
|
||||
});
|
||||
|
||||
const cleanText = tidyMarkdown(contentText).trim();
|
||||
await Promise.all(tasks);
|
||||
let imgIdx = 0;
|
||||
|
||||
turnDownService.addRule('img-generated-alt', {
|
||||
filter: 'img',
|
||||
replacement: (_content, node) => {
|
||||
const src = (node.getAttribute('src') || '').trim();
|
||||
const alt = cleanAttribute(node.getAttribute('alt'));
|
||||
if (!src) {
|
||||
return '';
|
||||
}
|
||||
const mapped = urlToAltMap[src];
|
||||
imgIdx++;
|
||||
if (mapped) {
|
||||
return `![Image ${imgIdx}: ${mapped.alt || alt}](${this.imageShortUrlPrefix ? `${this.imageShortUrlPrefix}/${mapped.shortDigest}` : src})`;
|
||||
}
|
||||
return `![Image ${imgIdx}: ${alt}](${src})`;
|
||||
}
|
||||
});
|
||||
|
||||
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
||||
}
|
||||
|
||||
if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
|
||||
contentText = turnDownService.turndown(snapshot.html);
|
||||
}
|
||||
if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
|
||||
contentText = snapshot.text;
|
||||
}
|
||||
|
||||
const cleanText = tidyMarkdown(contentText || '').trim();
|
||||
|
||||
const formatted = {
|
||||
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
||||
|
@ -142,7 +189,7 @@ ${this.content}
|
|||
continue;
|
||||
}
|
||||
|
||||
const formatted = this.formatSnapshot(scrapped);
|
||||
const formatted = await this.formatSnapshot(scrapped);
|
||||
|
||||
if (scrapped.screenshot && screenshotEnabled) {
|
||||
sseStream.write({
|
||||
|
@ -177,7 +224,7 @@ ${this.content}
|
|||
continue;
|
||||
}
|
||||
|
||||
const formatted = this.formatSnapshot(scrapped);
|
||||
const formatted = await this.formatSnapshot(scrapped);
|
||||
|
||||
return formatted;
|
||||
}
|
||||
|
@ -186,7 +233,7 @@ ${this.content}
|
|||
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
||||
}
|
||||
|
||||
return this.formatSnapshot(lastScrapped);
|
||||
return await this.formatSnapshot(lastScrapped);
|
||||
}
|
||||
|
||||
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
||||
|
@ -195,7 +242,7 @@ ${this.content}
|
|||
continue;
|
||||
}
|
||||
|
||||
const formatted = this.formatSnapshot(scrapped);
|
||||
const formatted = await this.formatSnapshot(scrapped);
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
||||
}
|
||||
|
@ -204,8 +251,12 @@ ${this.content}
|
|||
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
||||
}
|
||||
|
||||
return `${this.formatSnapshot(lastScrapped)}`;
|
||||
return `${await this.formatSnapshot(lastScrapped)}`;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
 * Normalises an HTML attribute value so it can be embedded in Markdown.
 *
 * Every run of line breaks, together with the whitespace that follows each
 * break, is collapsed into a single `\n`.
 *
 * @param attribute - Raw attribute value. `null` and `undefined` are accepted
 *   because `Element.getAttribute()` yields `null` for a missing attribute.
 * @returns The cleaned value, or an empty string when the attribute is
 *   missing or empty.
 */
function cleanAttribute(attribute: string | null | undefined): string {
    return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : '';
}
|
||||
|
|
42
backend/functions/src/db/img-alt.ts
Normal file
42
backend/functions/src/db/img-alt.ts
Normal file
|
@ -0,0 +1,42 @@
|
|||
import { Also, Prop } from 'civkit';
|
||||
import { FirestoreRecord } from '../shared/lib/firestore';
|
||||
import _ from 'lodash';
|
||||
|
||||
/**
 * Firestore record for one image and its alt text, kept in the `imgAlts`
 * collection.
 */
@Also({ dictOf: Object })
export class ImgAlt extends FirestoreRecord {
    static override collectionName = 'imgAlts';

    /** Document id. */
    override _id!: string;

    /** Image URL (required). */
    @Prop({ required: true })
    src!: string;

    /** Digest of the image URL (required). */
    @Prop({ required: true })
    urlDigest!: string;

    /** Image width, when known. */
    @Prop()
    width?: number;

    /** Image height, when known. */
    @Prop()
    height?: number;

    /** Alt text produced by captioning, if any. */
    @Prop()
    generatedAlt?: string;

    /** Alt text the image carried on the source page, if any. */
    @Prop()
    originalAlt?: string;

    /** Time the record was written. */
    @Prop()
    createdAt!: Date;

    /** Optional expiry time. NOTE(review): presumably consumed by a TTL policy; confirm. */
    @Prop()
    expireAt?: Date;

    // Allows arbitrary extra fields on the record.
    [k: string]: any;
}
|
91
backend/functions/src/services/alt-text.ts
Normal file
91
backend/functions/src/services/alt-text.ts
Normal file
|
@ -0,0 +1,91 @@
|
|||
import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
|
||||
import { singleton } from 'tsyringe';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
import { CanvasService } from '../shared/services/canvas';
|
||||
import { ImageInterrogationManager } from '../shared/services/common-iminterrogate';
|
||||
import { ImgBrief } from './puppeteer';
|
||||
import { ImgAlt } from '../db/img-alt';
|
||||
|
||||
|
||||
const md5Hasher = new HashManager('md5', 'hex');
|
||||
|
||||
/**
 * Produces alt text for images and caches the outcome in the `ImgAlt`
 * Firestore collection, keyed by a short digest of the image URL.
 */
@singleton()
export class AltTextService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected imageInterrogator: ImageInterrogationManager,
        protected canvasService: CanvasService
    ) {
        super(...arguments);
    }

    /** Waits for the injected dependencies, then signals readiness. */
    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

    /**
     * Generates a caption for the image at `url`.
     *
     * The image is loaded, fitted into a 1024 square box, exported as PNG and
     * handed to the `'blip2'` interrogator. Newlines, double quotes and a
     * trailing period are removed from the answer before it is trimmed.
     *
     * @param url - Location of the image to caption.
     * @returns The cleaned caption.
     * @throws AssertionFailureError wrapping (as `cause`) any failure raised
     *   while loading, converting or interrogating the image.
     */
    async caption(url: string) {
        try {
            const source = await this.canvasService.loadImage(url);
            const boxed = this.canvasService.fitImageToSquareBox(source, 1024);
            const png = await this.canvasService.canvasToBuffer(boxed, 'image/png');

            // A custom prompt used to be passed here and is kept for reference:
            // prompt: `A formal caption in one sentence, concise and in the third person: HTML <img> alt text of this image. Return "**NSFW**" if you don't feel comfortable captioning it.`
            const answer = await this.imageInterrogator.interrogate('blip2', { image: png });

            return answer.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim();
        } catch (err) {
            throw new AssertionFailureError({ message: `Could not generate alt text for url ${url}`, cause: err });
        }
    }

    /**
     * Resolves the short digest and, where available, generated alt text for
     * an image.
     *
     * A previously stored generated caption is returned straight away.
     * Otherwise a caption is generated only when the image has no alt text of
     * its own, and the record is written back with `merge: true` whether or
     * not a caption was produced.
     *
     * @param imgBrief - Image summary; only `src`, `alt`, `naturalWidth` and
     *   `naturalHeight` are read.
     * @returns `undefined` when `src` is empty; otherwise the short digest
     *   with `alt` set to the generated caption, or `undefined` if none exists.
     */
    async getAltTextAndShortDigest(imgBrief: ImgBrief) {
        const { src, alt: originalAlt } = imgBrief;
        if (!src) {
            return undefined;
        }

        // md5 of the URL as hex, re-encoded as base64url to get a shorter id.
        const urlDigest = md5Hasher.hash(src);
        const shortDigest = Buffer.from(urlDigest, 'hex').toString('base64url');

        const cached = await ImgAlt.fromFirestore(shortDigest);
        if (cached?.generatedAlt) {
            return { shortDigest, alt: cached.generatedAlt };
        }

        let freshCaption;
        if (!originalAlt) {
            // Captioning is best-effort: a failure is logged and the record is
            // still stored, with an empty generated alt.
            try {
                freshCaption = await this.caption(src);
            } catch (err) {
                this.logger.warn(`Unable to generate alt text for ${src}`, { err });
            }
        }

        const docRef = ImgAlt.COLLECTION.doc(shortDigest);
        await docRef.set(
            {
                _id: shortDigest,
                src,
                width: imgBrief.naturalWidth || 0,
                height: imgBrief.naturalHeight || 0,
                urlDigest,
                originalAlt: originalAlt || '',
                generatedAlt: freshCaption || '',
                createdAt: new Date()
            },
            { merge: true }
        );

        return { shortDigest, alt: freshCaption };
    }
}
|
|
@ -7,11 +7,19 @@ import os from 'os';
|
|||
import fs from 'fs';
|
||||
import { Crawled } from '../db/crawled';
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import puppeteerStealth from 'puppeteer-extra-plugin-stealth';
|
||||
|
||||
|
||||
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
||||
|
||||
/**
 * Compact description of one `<img>` element captured from a page.
 */
export interface ImgBrief {
    /** The element's `src` value. */
    src: string;

    /** Whether the element reported itself as `complete`. */
    loaded: boolean;

    /** The element's `width`. */
    width: number;

    /** The element's `height`. */
    height: number;

    /** The element's `naturalWidth`. */
    naturalWidth: number;

    /** The element's `naturalHeight`. */
    naturalHeight: number;

    /** The element's `alt` text, falling back to its `title`; may be absent. */
    alt?: string;
}
|
||||
|
||||
export interface PageSnapshot {
|
||||
title: string;
|
||||
href: string;
|
||||
|
@ -30,13 +38,16 @@ export interface PageSnapshot {
|
|||
publishedTime: string;
|
||||
} | null;
|
||||
screenshot?: Buffer;
|
||||
imgs?: ImgBrief[];
|
||||
}
|
||||
const md5Hasher = new HashManager('md5', 'hex');
|
||||
|
||||
const puppeteerStealth = require('puppeteer-extra-plugin-stealth');
|
||||
puppeteer.use(puppeteerStealth());
|
||||
// const puppeteerUAOverride = require('puppeteer-extra-plugin-stealth/evasions/user-agent-override');
|
||||
// puppeteer.use(puppeteerUAOverride({
|
||||
// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`
|
||||
// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`,
|
||||
// platform: `Linux`,
|
||||
// }))
|
||||
|
||||
@singleton()
|
||||
|
@ -84,7 +95,7 @@ export class PuppeteerControl extends AsyncService {
|
|||
this.browser = await puppeteer.launch({
|
||||
headless: true,
|
||||
timeout: 10_000
|
||||
}).catch((err) => {
|
||||
}).catch((err: any) => {
|
||||
this.logger.error(`Unknown firebase issue, just die fast.`, { err });
|
||||
process.nextTick(() => {
|
||||
this.emit('error', err);
|
||||
|
@ -117,23 +128,42 @@ export class PuppeteerControl extends AsyncService {
|
|||
}));
|
||||
preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
|
||||
preparations.push(page.evaluateOnNewDocument(`
|
||||
function briefImgs(elem) {
|
||||
const imageTags = Array.from((elem || document).querySelectorAll('img[src]'));
|
||||
|
||||
return imageTags.map((x)=> ({
|
||||
src: x.src,
|
||||
loaded: x.complete,
|
||||
width: x.width,
|
||||
height: x.height,
|
||||
naturalWidth: x.naturalWidth,
|
||||
naturalHeight: x.naturalHeight,
|
||||
alt: x.alt || x.title,
|
||||
}));
|
||||
}
|
||||
function giveSnapshot() {
|
||||
let parsedContent;
|
||||
let parsed;
|
||||
try {
|
||||
// Attempt to parse the cloned document
|
||||
parsedContent = new Readability(document.cloneNode(true)).parse();
|
||||
} catch (error) {
|
||||
// If an error occurs, log it and set parsedContent to undefined
|
||||
parsedContent = undefined;
|
||||
parsed = new Readability(document.cloneNode(true)).parse();
|
||||
} catch (err) {
|
||||
void 0;
|
||||
}
|
||||
|
||||
return {
|
||||
const r = {
|
||||
title: document.title,
|
||||
href: document.location.href,
|
||||
html: document.documentElement.outerHTML,
|
||||
text: document.body.innerText,
|
||||
parsed: parsedContent
|
||||
parsed: parsed,
|
||||
imgs: [],
|
||||
};
|
||||
if (parsed && parsed.content) {
|
||||
const elem = document.createElement('div');
|
||||
elem.innerHTML = parsed.content;
|
||||
r.imgs = briefImgs(elem);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
`));
|
||||
preparations.push(page.evaluateOnNewDocument(() => {
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit 9f0fa1dd7f8cfcea4c8d79252319b151fae6ed19
|
||||
Subproject commit bea967a371581c1109dc0101dbcab196e9ed9ade
|
Loading…
Reference in New Issue
Block a user