mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 11:42:32 +08:00
remove alt-text service
This commit is contained in:
parent
4c957adbce
commit
cbe4fa94c1
|
@ -10,7 +10,7 @@ import _ from 'lodash';
|
|||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||
import { Request, Response } from 'express';
|
||||
const pNormalizeUrl = import("@esm2cjs/normalize-url");
|
||||
import { AltTextService } from '../services/alt-text';
|
||||
// import { AltTextService } from '../services/alt-text';
|
||||
import TurndownService from 'turndown';
|
||||
// import { Crawled } from '../db/crawled';
|
||||
import { cleanAttribute } from '../utils/misc';
|
||||
|
@ -74,7 +74,7 @@ export class CrawlerHost extends RPCHost {
|
|||
protected globalLogger: Logger,
|
||||
protected puppeteerControl: PuppeteerControl,
|
||||
protected jsdomControl: JSDomControl,
|
||||
protected altTextService: AltTextService,
|
||||
// protected altTextService: AltTextService,
|
||||
// protected pdfExtractor: PDFExtractor,
|
||||
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
||||
protected threadLocal: AsyncContext,
|
||||
|
@ -399,10 +399,7 @@ export class CrawlerHost extends RPCHost {
|
|||
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
||||
if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
|
||||
const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
|
||||
const r = await this.altTextService.getAltText(x).catch((err: any) => {
|
||||
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
||||
return undefined;
|
||||
});
|
||||
const r = "ALT TEXT!!!"
|
||||
if (r && x.src) {
|
||||
urlToAltMap[x.src.trim()] = r;
|
||||
}
|
||||
|
|
|
@ -1,90 +0,0 @@
|
|||
import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
|
||||
import { singleton } from 'tsyringe';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
import { CanvasService } from '../shared/services/canvas';
|
||||
import { ImageInterrogationManager } from '../shared/services/common-iminterrogate';
|
||||
import { ImgBrief } from './puppeteer';
|
||||
import { ImgAlt } from '../db/img-alt';
|
||||
|
||||
// Module-level HashManager configured with 'md5' and 'hex' (presumably the hash
// algorithm and the output encoding). getAltText() hashes image URLs with it
// and re-encodes the hex digest to build the cache document id.
const md5Hasher = new HashManager('md5', 'hex');
|
||||
|
||||
/**
 * Resolves alt text for images.
 *
 * An author-provided alt text is reused when it is not one of the generic
 * placeholders in `altsToIgnore`; otherwise a previously stored ImgAlt record
 * is consulted, and as a last resort a caption is generated and the outcome is
 * written to the ImgAlt collection.
 */
@singleton()
export class AltTextService extends AsyncService {

    /** Alt values (compared trimmed and lower-cased) that are treated as carrying no information. */
    altsToIgnore = 'image,img,photo,picture,pic,alt,figure,fig'.split(',');

    /** Child logger tagged with this service's class name. */
    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected imageInterrogator: ImageInterrogationManager,
        protected canvasService: CanvasService
    ) {
        super(...arguments);
    }

    /**
     * Waits for dependencies via `dependencyReady()` and then emits 'ready'.
     */
    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

    /**
     * Generates a caption for the image at `url`.
     *
     * The image is loaded through the canvas service, fitted into a 1024 square
     * box, exported as PNG, and passed to the image interrogator under the
     * 'blip2' identifier.
     *
     * @param url - Location of the image to caption.
     * @returns The interrogator's answer with newlines, double quotes and a
     * trailing period (plus any trailing whitespace) removed, then trimmed.
     * @throws AssertionFailureError wrapping any failure in the steps above as `cause`.
     */
    async caption(url: string) {
        try {
            const loaded = await this.canvasService.loadImage(url);
            const boxed = this.canvasService.fitImageToSquareBox(loaded, 1024);
            const pngBuffer = await this.canvasService.canvasToBuffer(boxed, 'image/png');

            const rawCaption = await this.imageInterrogator.interrogate('blip2', {
                image: pngBuffer,
                // prompt: `A formal caption in one sentence, concise and in the third person: HTML <img> alt text of this image. Return "**NSFW**" if you don't feel comfortable captioning it.`
            });

            const cleaned = rawCaption.replaceAll(/[\n\"]|(\.\s*$)/g, '');

            return cleaned.trim();
        } catch (err) {
            throw new AssertionFailureError({ message: `Could not generate alt text for url ${url}`, cause: err });
        }
    }

    /**
     * Returns alt text for the given image brief.
     *
     * Resolution order:
     *  1. no `src` -> `undefined`;
     *  2. a non-placeholder `alt` on the brief -> that `alt`, as-is;
     *  3. an existing ImgAlt record for the URL digest -> its generated alt,
     *     else its original alt, else '';
     *  4. otherwise a freshly generated caption ('' when generation fails),
     *     which is also written to the ImgAlt collection.
     *
     * @param imgBrief - Image description; `src`, `alt`, `naturalWidth` and
     * `naturalHeight` are read.
     * @returns The resolved alt text, '' when none could be produced, or
     * `undefined` when the brief has no `src`.
     */
    async getAltText(imgBrief: ImgBrief) {
        if (!imgBrief.src) {
            return undefined;
        }

        if (imgBrief.alt) {
            const normalizedAlt = imgBrief.alt.trim().toLowerCase();
            if (!this.altsToIgnore.includes(normalizedAlt)) {
                return imgBrief.alt;
            }
        }

        // The document id is the URL digest re-encoded from hex to base64url.
        const urlDigest = md5Hasher.hash(imgBrief.src);
        const docId = Buffer.from(urlDigest, 'hex').toString('base64url');

        const stored = await ImgAlt.fromFirestore(docId);
        if (stored) {
            return stored.generatedAlt || stored.originalAlt || '';
        }

        let generatedCaption = '';
        try {
            generatedCaption = await this.caption(imgBrief.src);
        } catch (err) {
            // A captioning failure is logged and the empty caption is recorded below.
            this.logger.warn(`Unable to generate alt text for ${imgBrief.src}`, { err });
        }

        // Don't try again until the next day: only records without a caption get an expiry.
        const oneDayMs = 1000 * 3600 * 24;
        const retryAfter = generatedCaption ? undefined : new Date(Date.now() + oneDayMs);

        const record = {
            _id: docId,
            src: imgBrief.src || '',
            width: imgBrief.naturalWidth || 0,
            height: imgBrief.naturalHeight || 0,
            urlDigest: urlDigest,
            originalAlt: imgBrief.alt || '',
            generatedAlt: generatedCaption || '',
            createdAt: new Date(),
            ...(retryAfter ? { expireAt: retryAfter } : {})
        };

        await ImgAlt.COLLECTION.doc(docId).set(record, { merge: true });

        return generatedCaption;
    }
}
|
Loading…
Reference in New Issue
Block a user