mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 11:42:32 +08:00
feat: add image captioning (#6)
* Fix contentText assignment in CrawlerHost class * fix: recover vscode configurations * feat: add image captioning * feat: add image captioning * clean: vscode config * chore: fix some ts warnings * feat: auto alt text * fix * chore: improve prompt * clean: unused config * fix: failure condition * fix: remove redundant code * fix: catch parse error * fix: catch parse error --------- Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>
This commit is contained in:
parent
18373626b2
commit
b3fb4c5c57
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1,4 +1,2 @@
|
||||||
node_modules/
|
node_modules/
|
||||||
.DS_Store
|
.DS_Store
|
||||||
.vscode
|
|
||||||
.cache
|
|
10
.vscode/exensions.json
vendored
Normal file
10
.vscode/exensions.json
vendored
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
{
|
||||||
|
"recommendations": [
|
||||||
|
"editorconfig.editorconfig",
|
||||||
|
"octref.vetur",
|
||||||
|
"redhat.vscode-yaml",
|
||||||
|
"dbaeumer.vscode-eslint",
|
||||||
|
"esbenp.prettier-vscode",
|
||||||
|
"streetsidesoftware.code-spell-checker"
|
||||||
|
]
|
||||||
|
}
|
60
.vscode/launch.json
vendored
Normal file
60
.vscode/launch.json
vendored
Normal file
|
@ -0,0 +1,60 @@
|
||||||
|
{
|
||||||
|
"version": "0.2.0",
|
||||||
|
"configurations": [
|
||||||
|
{
|
||||||
|
"name": "Debug Fullstack: attach",
|
||||||
|
"request": "attach",
|
||||||
|
"cwd": "${workspaceFolder}/backend/functions",
|
||||||
|
"skipFiles": [
|
||||||
|
"<node_internals>/**"
|
||||||
|
],
|
||||||
|
"type": "node",
|
||||||
|
"preLaunchTask": "Fullstack:debug"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Debug Fullstack: attach: with proxy",
|
||||||
|
"request": "attach",
|
||||||
|
"cwd": "${workspaceFolder}/backend/functions",
|
||||||
|
"skipFiles": [
|
||||||
|
"<node_internals>/**"
|
||||||
|
],
|
||||||
|
"type": "node",
|
||||||
|
"preLaunchTask": "Fullstack:debug:with-proxy"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Attach",
|
||||||
|
"port": 9229,
|
||||||
|
"request": "attach",
|
||||||
|
"skipFiles": [
|
||||||
|
"<node_internals>/**"
|
||||||
|
],
|
||||||
|
"type": "node"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Attach by Process ID",
|
||||||
|
"processId": "${command:PickProcess}",
|
||||||
|
"request": "attach",
|
||||||
|
"skipFiles": [
|
||||||
|
"<node_internals>/**"
|
||||||
|
],
|
||||||
|
"type": "node"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Debug Fullstack",
|
||||||
|
"request": "launch",
|
||||||
|
"runtimeArgs": [
|
||||||
|
"emulators:start",
|
||||||
|
"--import=../.firebase-emu",
|
||||||
|
"--export-on-exit=../.firebase-emu",
|
||||||
|
],
|
||||||
|
"cwd": "${workspaceFolder}/backend/functions",
|
||||||
|
"runtimeExecutable": "${workspaceFolder}/node_modules/.bin/firebase",
|
||||||
|
"skipFiles": [
|
||||||
|
"<node_internals>/**"
|
||||||
|
],
|
||||||
|
"type": "node",
|
||||||
|
"preLaunchTask": "Fullstack:prepare",
|
||||||
|
"killBehavior": "polite"
|
||||||
|
},
|
||||||
|
]
|
||||||
|
}
|
32
.vscode/settings.json
vendored
Normal file
32
.vscode/settings.json
vendored
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
{
|
||||||
|
"editor.wordWrap": "on",
|
||||||
|
"editor.wordWrapColumn": 120,
|
||||||
|
"files.trimTrailingWhitespace": true,
|
||||||
|
"files.trimFinalNewlines": true,
|
||||||
|
"[javascript]": {
|
||||||
|
"editor.defaultFormatter": "vscode.typescript-language-features"
|
||||||
|
},
|
||||||
|
"[jsonc]": {
|
||||||
|
"editor.defaultFormatter": "vscode.json-language-features"
|
||||||
|
},
|
||||||
|
"[typescript]": {
|
||||||
|
"editor.defaultFormatter": "vscode.typescript-language-features"
|
||||||
|
},
|
||||||
|
"[json]": {
|
||||||
|
"editor.defaultFormatter": "vscode.json-language-features"
|
||||||
|
},
|
||||||
|
"[yaml]": {
|
||||||
|
"editor.defaultFormatter": "redhat.vscode-yaml"
|
||||||
|
},
|
||||||
|
"[markdown]": {
|
||||||
|
"files.trimTrailingWhitespace": false
|
||||||
|
},
|
||||||
|
"typescript.tsdk": "node_modules/typescript/lib",
|
||||||
|
"typescript.preferences.quoteStyle": "single",
|
||||||
|
"typescript.format.semicolons": "insert",
|
||||||
|
"typescript.preferences.importModuleSpecifier": "project-relative",
|
||||||
|
"typescript.locale": "en",
|
||||||
|
"cSpell.enabled": true,
|
||||||
|
"cSpell.words": [
|
||||||
|
],
|
||||||
|
}
|
156
.vscode/tasks.json
vendored
Normal file
156
.vscode/tasks.json
vendored
Normal file
|
@ -0,0 +1,156 @@
|
||||||
|
{
|
||||||
|
"version": "2.0.0",
|
||||||
|
"tasks": [
|
||||||
|
{
|
||||||
|
"type": "npm",
|
||||||
|
"script": "build",
|
||||||
|
"group": "build",
|
||||||
|
"options": {
|
||||||
|
"cwd": "${workspaceFolder}/backend/functions"
|
||||||
|
},
|
||||||
|
"problemMatcher": [],
|
||||||
|
"label": "Backend:rebuild",
|
||||||
|
"detail": "Backend:rebuild"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "npm",
|
||||||
|
"script": "emu:reset",
|
||||||
|
"group": "build",
|
||||||
|
"options": {
|
||||||
|
"cwd": "${workspaceFolder}/backend/functions"
|
||||||
|
},
|
||||||
|
"problemMatcher": [],
|
||||||
|
"label": "Backend:reset-emulator",
|
||||||
|
"detail": "Backend:reset-emulator"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "typescript",
|
||||||
|
"options": {
|
||||||
|
"cwd": "${workspaceFolder}/backend/functions"
|
||||||
|
},
|
||||||
|
"tsconfig": "backend/functions/tsconfig.json",
|
||||||
|
"option": "watch",
|
||||||
|
"isBackground": true,
|
||||||
|
"problemMatcher": [
|
||||||
|
"$tsc-watch"
|
||||||
|
],
|
||||||
|
"group": "build",
|
||||||
|
"label": "Backend:build:watch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "npm",
|
||||||
|
"script": "emu:debug",
|
||||||
|
"group": "none",
|
||||||
|
"options": {
|
||||||
|
"cwd": "${workspaceFolder}/backend/functions"
|
||||||
|
},
|
||||||
|
"problemMatcher": [
|
||||||
|
{
|
||||||
|
"base": "$tsc",
|
||||||
|
"background": {
|
||||||
|
"activeOnStart": false,
|
||||||
|
"beginsPattern": "shutdown requested|Starting emulators",
|
||||||
|
"endsPattern": "Debugger listening"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"label": "Backend:start-emulator-debug",
|
||||||
|
"detail": "Backend:start-emulator-debug",
|
||||||
|
"dependsOn": [
|
||||||
|
"Backend:build:watch"
|
||||||
|
],
|
||||||
|
"isBackground": true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "npm",
|
||||||
|
"script": "dev",
|
||||||
|
"options": {
|
||||||
|
"cwd": "${workspaceFolder}/webapp",
|
||||||
|
},
|
||||||
|
"group": "build",
|
||||||
|
"label": "Frontend:start:dev",
|
||||||
|
"detail": "Frontend:start:dev",
|
||||||
|
"isBackground": true,
|
||||||
|
"problemMatcher": {
|
||||||
|
"base": "$vite",
|
||||||
|
"background": {
|
||||||
|
"activeOnStart": true,
|
||||||
|
"endsPattern": "OK",
|
||||||
|
"beginsPattern": "vite"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "npm",
|
||||||
|
"script": "dev",
|
||||||
|
"options": {
|
||||||
|
"cwd": "${workspaceFolder}/webapp",
|
||||||
|
"env": {
|
||||||
|
"FIREBASE_EMULATE": "true",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"group": "build",
|
||||||
|
"label": "Frontend:start:emu",
|
||||||
|
"detail": "Frontend:start:emu",
|
||||||
|
"isBackground": true,
|
||||||
|
"problemMatcher": {
|
||||||
|
"base": "$vite",
|
||||||
|
"background": {
|
||||||
|
"activeOnStart": true,
|
||||||
|
"endsPattern": "OK",
|
||||||
|
"beginsPattern": "vite"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "npm",
|
||||||
|
"script": "emu:debug2",
|
||||||
|
"group": "none",
|
||||||
|
"options": {
|
||||||
|
"cwd": "${workspaceFolder}/backend/functions",
|
||||||
|
"env": {
|
||||||
|
"https_proxy": "http://127.0.0.1:7890",
|
||||||
|
"http_proxy": "http://127.0.0.1:7890",
|
||||||
|
"all_proxy": "socks5://127.0.0.1:7890"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"problemMatcher": [
|
||||||
|
{
|
||||||
|
"base": "$tsc",
|
||||||
|
"background": {
|
||||||
|
"activeOnStart": false,
|
||||||
|
"beginsPattern": "shutdown requested|Starting emulators",
|
||||||
|
"endsPattern": "Debugger listening"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"label": "Backend:start-emulator-debug:with-proxy",
|
||||||
|
"detail": "Backend:start-emulator-debug:with-proxy",
|
||||||
|
"dependsOn": [
|
||||||
|
"Backend:build:watch"
|
||||||
|
],
|
||||||
|
"isBackground": true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "Fullstack:prepare",
|
||||||
|
"dependsOn": [
|
||||||
|
"Frontend:start:emu",
|
||||||
|
"Backend:build:watch",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "Fullstack:debug",
|
||||||
|
"dependsOn": [
|
||||||
|
// "Frontend:start:emu",
|
||||||
|
"Backend:start-emulator-debug",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "Fullstack:debug:with-proxy",
|
||||||
|
"dependsOn": [
|
||||||
|
"Frontend:start:emu",
|
||||||
|
"Backend:start-emulator-debug:with-proxy",
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
|
@ -1,14 +1,9 @@
|
||||||
const { join } = require('path');
|
const { join } = require('path');
|
||||||
|
|
||||||
let config = {};
|
|
||||||
if (!process.env.FUNCTIONS_EMULATOR) {
|
|
||||||
config = {
|
|
||||||
// Changes the cache location for Puppeteer.
|
|
||||||
cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'),
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @type {import("puppeteer").Configuration}
|
* @type {import("puppeteer").Configuration}
|
||||||
*/
|
*/
|
||||||
module.exports = config;
|
module.exports = {
|
||||||
|
// Changes the cache location for Puppeteer.
|
||||||
|
cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'),
|
||||||
|
};
|
||||||
|
|
|
@ -3,9 +3,10 @@ import { singleton } from 'tsyringe';
|
||||||
import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
||||||
import _ from 'lodash';
|
import _ from 'lodash';
|
||||||
import { PageSnapshot, PuppeteerControl } from '../services/puppeteer';
|
import { PageSnapshot, PuppeteerControl } from '../services/puppeteer';
|
||||||
import TurnDownService from 'turndown';
|
|
||||||
import { Request, Response } from 'express';
|
import { Request, Response } from 'express';
|
||||||
import normalizeUrl from "@esm2cjs/normalize-url";
|
import normalizeUrl from "@esm2cjs/normalize-url";
|
||||||
|
import { AltTextService } from '../services/alt-text';
|
||||||
|
import TurndownService from 'turndown';
|
||||||
|
|
||||||
function tidyMarkdown(markdown: string): string {
|
function tidyMarkdown(markdown: string): string {
|
||||||
|
|
||||||
|
@ -50,11 +51,14 @@ function tidyMarkdown(markdown: string): string {
|
||||||
export class CrawlerHost extends RPCHost {
|
export class CrawlerHost extends RPCHost {
|
||||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||||
|
|
||||||
turnDownService = new TurnDownService().use(require('turndown-plugin-gfm').gfm);
|
turnDownPlugins = [require('turndown-plugin-gfm').gfm];
|
||||||
|
|
||||||
|
imageShortUrlPrefix?: string;
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
protected globalLogger: Logger,
|
protected globalLogger: Logger,
|
||||||
protected puppeteerControl: PuppeteerControl,
|
protected puppeteerControl: PuppeteerControl,
|
||||||
|
protected altTextService: AltTextService,
|
||||||
) {
|
) {
|
||||||
super(...arguments);
|
super(...arguments);
|
||||||
}
|
}
|
||||||
|
@ -65,14 +69,57 @@ export class CrawlerHost extends RPCHost {
|
||||||
this.emit('ready');
|
this.emit('ready');
|
||||||
}
|
}
|
||||||
|
|
||||||
formatSnapshot(snapshot: PageSnapshot) {
|
async formatSnapshot(snapshot: PageSnapshot) {
|
||||||
|
|
||||||
const toBeTurnedToMd = snapshot.parsed?.content;
|
const toBeTurnedToMd = snapshot.parsed?.content;
|
||||||
const turnedDown = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd).trim() : '';
|
let turnDownService = new TurndownService();
|
||||||
|
for (const plugin of this.turnDownPlugins) {
|
||||||
|
turnDownService = turnDownService.use(plugin);
|
||||||
|
}
|
||||||
|
|
||||||
const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text?.trim();
|
let contentText = '';
|
||||||
|
if (toBeTurnedToMd) {
|
||||||
|
const urlToAltMap: { [k: string]: { shortDigest: string, alt?: string; }; } = {};
|
||||||
|
const tasks = (snapshot.imgs || []).map(async (x) => {
|
||||||
|
const r = await this.altTextService.getAltTextAndShortDigest(x).catch((err)=> {
|
||||||
|
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
||||||
|
return undefined;
|
||||||
|
});
|
||||||
|
if (r) {
|
||||||
|
urlToAltMap[x.src.trim()] = r;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
const cleanText = tidyMarkdown(contentText).trim();
|
await Promise.all(tasks);
|
||||||
|
let imgIdx = 0;
|
||||||
|
|
||||||
|
turnDownService.addRule('img-generated-alt', {
|
||||||
|
filter: 'img',
|
||||||
|
replacement: (_content, node) => {
|
||||||
|
const src = (node.getAttribute('src') || '').trim();
|
||||||
|
const alt = cleanAttribute(node.getAttribute('alt'));
|
||||||
|
if (!src) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
const mapped = urlToAltMap[src];
|
||||||
|
imgIdx++;
|
||||||
|
if (mapped) {
|
||||||
|
return `![Image ${imgIdx}: ${mapped.alt || alt}](${this.imageShortUrlPrefix ? `${this.imageShortUrlPrefix}/${mapped.shortDigest}` : src})`;
|
||||||
|
}
|
||||||
|
return `![Image ${imgIdx}: ${alt}](${src})`;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
|
||||||
|
contentText = turnDownService.turndown(snapshot.html);
|
||||||
|
}
|
||||||
|
if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
|
||||||
|
contentText = snapshot.text;
|
||||||
|
}
|
||||||
|
|
||||||
|
const cleanText = tidyMarkdown(contentText || '').trim();
|
||||||
|
|
||||||
const formatted = {
|
const formatted = {
|
||||||
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
||||||
|
@ -142,7 +189,7 @@ ${this.content}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = this.formatSnapshot(scrapped);
|
const formatted = await this.formatSnapshot(scrapped);
|
||||||
|
|
||||||
if (scrapped.screenshot && screenshotEnabled) {
|
if (scrapped.screenshot && screenshotEnabled) {
|
||||||
sseStream.write({
|
sseStream.write({
|
||||||
|
@ -177,7 +224,7 @@ ${this.content}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = this.formatSnapshot(scrapped);
|
const formatted = await this.formatSnapshot(scrapped);
|
||||||
|
|
||||||
return formatted;
|
return formatted;
|
||||||
}
|
}
|
||||||
|
@ -186,7 +233,7 @@ ${this.content}
|
||||||
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return this.formatSnapshot(lastScrapped);
|
return await this.formatSnapshot(lastScrapped);
|
||||||
}
|
}
|
||||||
|
|
||||||
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
||||||
|
@ -195,7 +242,7 @@ ${this.content}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = this.formatSnapshot(scrapped);
|
const formatted = await this.formatSnapshot(scrapped);
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
||||||
}
|
}
|
||||||
|
@ -204,8 +251,12 @@ ${this.content}
|
||||||
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return `${this.formatSnapshot(lastScrapped)}`;
|
return `${await this.formatSnapshot(lastScrapped)}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Normalizes an HTML attribute value for use inside Markdown image syntax.
 *
 * Every run of newlines (together with any whitespace that follows them) is
 * collapsed into a single `\n`.
 *
 * @param attribute - Raw attribute value. `null`/`undefined` are accepted
 *   because `Element.getAttribute()` returns `null` for a missing attribute.
 * @returns The cleaned value, or an empty string when the input is falsy.
 */
function cleanAttribute(attribute: string | null | undefined) {
    if (!attribute) {
        return '';
    }

    return attribute.replace(/(\n+\s*)+/g, '\n');
}
|
||||||
|
|
42
backend/functions/src/db/img-alt.ts
Normal file
42
backend/functions/src/db/img-alt.ts
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
import { Also, Prop } from 'civkit';
|
||||||
|
import { FirestoreRecord } from '../shared/lib/firestore';
|
||||||
|
import _ from 'lodash';
|
||||||
|
|
||||||
|
/**
 * Firestore record describing one image and its alt text, stored in the
 * `imgAlts` collection.
 */
@Also({ dictOf: Object })
export class ImgAlt extends FirestoreRecord {
    static override collectionName = 'imgAlts';

    /** Firestore document id. */
    override _id!: string;

    /** Image URL this record describes. */
    @Prop({ required: true })
    src!: string;

    /** Digest of the image URL. */
    @Prop({ required: true })
    urlDigest!: string;

    /** Image width (presumably in pixels — confirm against the writer). */
    @Prop()
    width?: number;

    /** Image height (presumably in pixels — confirm against the writer). */
    @Prop()
    height?: number;

    /** Machine-generated alt text, when one was produced. */
    @Prop()
    generatedAlt?: string;

    /** Alt text the image originally carried on the page, if any. */
    @Prop()
    originalAlt?: string;

    /** Time the record was written. */
    @Prop()
    createdAt!: Date;

    /** Optional expiry time. NOTE(review): nothing visible here sets it — confirm usage. */
    @Prop()
    expireAt?: Date;

    /** Any additional keys are allowed on the record. */
    [k: string]: any;
}
|
91
backend/functions/src/services/alt-text.ts
Normal file
91
backend/functions/src/services/alt-text.ts
Normal file
|
@ -0,0 +1,91 @@
|
||||||
|
import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
|
||||||
|
import { singleton } from 'tsyringe';
|
||||||
|
import { Logger } from '../shared/services/logger';
|
||||||
|
import { CanvasService } from '../shared/services/canvas';
|
||||||
|
import { ImageInterrogationManager } from '../shared/services/common-iminterrogate';
|
||||||
|
import { ImgBrief } from './puppeteer';
|
||||||
|
import { ImgAlt } from '../db/img-alt';
|
||||||
|
|
||||||
|
|
||||||
|
// Hex-encoded MD5 hasher used to derive a stable digest from an image URL.
const md5Hasher = new HashManager('md5', 'hex');

/**
 * Generates alt text (captions) for images and caches the result in the
 * `ImgAlt` Firestore collection, keyed by a short digest of the image URL.
 */
@singleton()
export class AltTextService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected imageInterrogator: ImageInterrogationManager,
        protected canvasService: CanvasService
    ) {
        super(...arguments);
    }

    /** Waits for injected dependencies, then signals readiness. */
    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

    /**
     * Produces a caption for the image at `url`.
     *
     * The image is loaded, fitted into a 1024 square box, exported as PNG and
     * sent to the `blip2` interrogator. Newlines, double quotes and a trailing
     * period are stripped from the answer before it is trimmed.
     *
     * @param url - Location of the image to caption.
     * @returns The cleaned caption text.
     * @throws AssertionFailureError wrapping any failure in the steps above.
     */
    async caption(url: string) {
        try {
            const source = await this.canvasService.loadImage(url);
            const fitted = this.canvasService.fitImageToSquareBox(source, 1024);
            const pngBuffer = await this.canvasService.canvasToBuffer(fitted, 'image/png');

            // No prompt is passed; the interrogator is called with the image only.
            const rawCaption = await this.imageInterrogator.interrogate('blip2', { image: pngBuffer });

            return rawCaption.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim();
        } catch (err) {
            throw new AssertionFailureError({ message: `Could not generate alt text for url ${url}`, cause: err });
        }
    }

    /**
     * Looks up, or creates, the alt-text record for an image.
     *
     * A previously stored non-empty generated caption is returned directly.
     * Otherwise a caption is generated only when the image has no alt text of
     * its own, and the record is written (merged) to Firestore either way.
     *
     * @param imgBrief - Image summary collected from the page.
     * @returns `undefined` when the image has no `src`; otherwise the short
     *   digest plus the generated caption (`alt` is `undefined` when none was
     *   generated).
     */
    async getAltTextAndShortDigest(imgBrief: ImgBrief) {
        const { src, alt: originalAlt } = imgBrief;
        if (!src) {
            return undefined;
        }

        // The document id is the MD5 of the URL, re-encoded from hex to base64url.
        const urlDigest = md5Hasher.hash(src);
        const shortDigest = Buffer.from(urlDigest, 'hex').toString('base64url');

        const cached = await ImgAlt.fromFirestore(shortDigest);
        if (cached?.generatedAlt) {
            return { shortDigest, alt: cached.generatedAlt };
        }

        // Captioning is best-effort: a failure is logged and the record is
        // still written with an empty generated caption.
        const generatedCaption = originalAlt
            ? undefined
            : await this.caption(src).catch((err) => {
                this.logger.warn(`Unable to generate alt text for ${src}`, { err });

                return undefined;
            });

        const record = {
            _id: shortDigest,
            src: src || '',
            width: imgBrief.naturalWidth || 0,
            height: imgBrief.naturalHeight || 0,
            urlDigest,
            originalAlt: originalAlt || '',
            generatedAlt: generatedCaption || '',
            createdAt: new Date()
        };
        await ImgAlt.COLLECTION.doc(shortDigest).set(record, { merge: true });

        return { shortDigest, alt: generatedCaption };
    }
}
|
|
@ -7,11 +7,19 @@ import os from 'os';
|
||||||
import fs from 'fs';
|
import fs from 'fs';
|
||||||
import { Crawled } from '../db/crawled';
|
import { Crawled } from '../db/crawled';
|
||||||
import puppeteer from 'puppeteer-extra';
|
import puppeteer from 'puppeteer-extra';
|
||||||
import puppeteerStealth from 'puppeteer-extra-plugin-stealth';
|
|
||||||
|
|
||||||
|
|
||||||
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
||||||
|
|
||||||
|
/**
 * Summary of one `<img>` element, as collected in the page by `briefImgs`.
 */
export interface ImgBrief {
    /** The element's `src` property. */
    src: string;
    /** The element's `complete` flag at snapshot time. */
    loaded: boolean;
    /** The element's `width` property. */
    width: number;
    /** The element's `height` property. */
    height: number;
    /** The element's `naturalWidth` property. */
    naturalWidth: number;
    /** The element's `naturalHeight` property. */
    naturalHeight: number;
    /** The element's `alt`, falling back to its `title`. */
    alt?: string;
}
|
||||||
|
|
||||||
export interface PageSnapshot {
|
export interface PageSnapshot {
|
||||||
title: string;
|
title: string;
|
||||||
href: string;
|
href: string;
|
||||||
|
@ -30,13 +38,16 @@ export interface PageSnapshot {
|
||||||
publishedTime: string;
|
publishedTime: string;
|
||||||
} | null;
|
} | null;
|
||||||
screenshot?: Buffer;
|
screenshot?: Buffer;
|
||||||
|
imgs?: ImgBrief[];
|
||||||
}
|
}
|
||||||
const md5Hasher = new HashManager('md5', 'hex');
|
const md5Hasher = new HashManager('md5', 'hex');
|
||||||
|
|
||||||
|
const puppeteerStealth = require('puppeteer-extra-plugin-stealth');
|
||||||
puppeteer.use(puppeteerStealth());
|
puppeteer.use(puppeteerStealth());
|
||||||
// const puppeteerUAOverride = require('puppeteer-extra-plugin-stealth/evasions/user-agent-override');
|
// const puppeteerUAOverride = require('puppeteer-extra-plugin-stealth/evasions/user-agent-override');
|
||||||
// puppeteer.use(puppeteerUAOverride({
|
// puppeteer.use(puppeteerUAOverride({
|
||||||
// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`
|
// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`,
|
||||||
|
// platform: `Linux`,
|
||||||
// }))
|
// }))
|
||||||
|
|
||||||
@singleton()
|
@singleton()
|
||||||
|
@ -84,7 +95,7 @@ export class PuppeteerControl extends AsyncService {
|
||||||
this.browser = await puppeteer.launch({
|
this.browser = await puppeteer.launch({
|
||||||
headless: true,
|
headless: true,
|
||||||
timeout: 10_000
|
timeout: 10_000
|
||||||
}).catch((err) => {
|
}).catch((err: any) => {
|
||||||
this.logger.error(`Unknown firebase issue, just die fast.`, { err });
|
this.logger.error(`Unknown firebase issue, just die fast.`, { err });
|
||||||
process.nextTick(() => {
|
process.nextTick(() => {
|
||||||
this.emit('error', err);
|
this.emit('error', err);
|
||||||
|
@ -117,23 +128,42 @@ export class PuppeteerControl extends AsyncService {
|
||||||
}));
|
}));
|
||||||
preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
|
preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
|
||||||
preparations.push(page.evaluateOnNewDocument(`
|
preparations.push(page.evaluateOnNewDocument(`
|
||||||
|
/**
 * Collects a plain-object summary of every img element that has a src
 * attribute under the given root.
 *
 * @param {Element|Document} [elem] - Root to search; the global document is
 *   used when omitted.
 * @returns {Array<object>} One summary per matching image, in query order.
 */
function briefImgs(elem) {
    const root = elem || document;
    const briefs = [];

    for (const img of Array.from(root.querySelectorAll('img[src]'))) {
        briefs.push({
            src: img.src,
            loaded: img.complete,
            width: img.width,
            height: img.height,
            naturalWidth: img.naturalWidth,
            naturalHeight: img.naturalHeight,
            // Use the title as a stand-in when the image has no alt text.
            alt: img.alt || img.title,
        });
    }

    return briefs;
}
|
||||||
function giveSnapshot() {
|
function giveSnapshot() {
|
||||||
let parsedContent;
|
let parsed;
|
||||||
try {
|
try {
|
||||||
// Attempt to parse the cloned document
|
parsed = new Readability(document.cloneNode(true)).parse();
|
||||||
parsedContent = new Readability(document.cloneNode(true)).parse();
|
} catch (err) {
|
||||||
} catch (error) {
|
void 0;
|
||||||
// If an error occurs, log it and set parsedContent to undefined
|
|
||||||
parsedContent = undefined;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
const r = {
|
||||||
title: document.title,
|
title: document.title,
|
||||||
href: document.location.href,
|
href: document.location.href,
|
||||||
html: document.documentElement.outerHTML,
|
html: document.documentElement.outerHTML,
|
||||||
text: document.body.innerText,
|
text: document.body.innerText,
|
||||||
parsed: parsedContent
|
parsed: parsed,
|
||||||
|
imgs: [],
|
||||||
};
|
};
|
||||||
|
if (parsed && parsed.content) {
|
||||||
|
const elem = document.createElement('div');
|
||||||
|
elem.innerHTML = parsed.content;
|
||||||
|
r.imgs = briefImgs(elem);
|
||||||
|
}
|
||||||
|
|
||||||
|
return r;
|
||||||
}
|
}
|
||||||
`));
|
`));
|
||||||
preparations.push(page.evaluateOnNewDocument(() => {
|
preparations.push(page.evaluateOnNewDocument(() => {
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
Subproject commit 9f0fa1dd7f8cfcea4c8d79252319b151fae6ed19
|
Subproject commit bea967a371581c1109dc0101dbcab196e9ed9ade
|
Loading…
Reference in New Issue
Block a user