fix: consider image data-src and make generated alt text optional (#50)

* fix: image src and alt

* fix

* docs: doc about x-with-generated-alt

* fix: deps
This commit is contained in:
Yanlong Wang 2024-05-08 18:29:11 +08:00 committed by GitHub
parent 8cfd0d67dc
commit 62dc75f78e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 42 additions and 15 deletions

View File

@ -72,6 +72,7 @@ As you have already seen above, one can control the behavior of the Reader API u
- `x-respond-with: screenshot` returns the URL of the webpage's screenshot
- You can specify a proxy server via the `x-proxy-url` header.
- You can bypass the cached page (lifetime 300s) via the `x-no-cache` header.
- You can enable the alt-text generation feature via the `x-with-generated-alt` header.
### JSON mode (super early beta)

View File

@ -5,7 +5,7 @@ import {
AssertionFailureError, ParamValidationError,
} from 'civkit';
import { singleton } from 'tsyringe';
import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
import { RateLimitControl } from '../shared/services/rate-limit';
import _ from 'lodash';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
@ -41,6 +41,7 @@ export class CrawlerHost extends RPCHost {
protected altTextService: AltTextService,
protected firebaseObjectStorage: FirebaseStorageBucketControl,
protected rateLimitControl: RateLimitControl,
protected threadLocal: AsyncContext,
) {
super(...arguments);
@ -123,8 +124,8 @@ export class CrawlerHost extends RPCHost {
turnDownService = turnDownService.use(plugin);
}
const urlToAltMap: { [k: string]: string | undefined; } = {};
if (snapshot.imgs?.length) {
const tasks = (snapshot.imgs || []).map(async (x) => {
if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
const r = await this.altTextService.getAltText(x).catch((err: any) => {
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
return undefined;
@ -140,7 +141,15 @@ export class CrawlerHost extends RPCHost {
turnDownService.addRule('img-generated-alt', {
filter: 'img',
replacement: (_content, node) => {
const src = (node.getAttribute('src') || '').trim();
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
const dataSrc = (node.getAttribute('data-src') || '').trim();
if (dataSrc && !dataSrc.startsWith('data:')) {
linkPreferredSrc = dataSrc;
}
}
const src = linkPreferredSrc;
const alt = cleanAttribute(node.getAttribute('alt'));
if (!src) {
return '';
@ -285,6 +294,11 @@ ${this.content}
in: 'header',
schema: { type: 'string' }
},
'X-With-Generated-Alt': {
description: `Enable automatic alt-text generating for images without an meaningful alt-text.`,
in: 'header',
schema: { type: 'string' }
},
}
}
},
@ -365,6 +379,7 @@ ${authMixin}`,
}
const customMode = ctx.req.get('x-respond-with') || 'default';
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
const noCache = Boolean(ctx.req.get('x-no-cache'));
const cookies: CookieParam[] = [];
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
@ -381,6 +396,7 @@ ${authMixin}`,
domain: urlToCrawl.hostname,
});
}
this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
const crawlOpts: ScrappingOptions = {
proxyUrl: ctx.req.get('x-proxy-url'),

View File

@ -12,6 +12,7 @@ const md5Hasher = new HashManager('md5', 'hex');
@singleton()
export class AltTextService extends AsyncService {
altsToIgnore = 'image,img,photo,picture,pic,alt,figure,fig'.split(',');
logger = this.globalLogger.child({ service: this.constructor.name });
constructor(
@ -48,7 +49,7 @@ export class AltTextService extends AsyncService {
if (!imgBrief.src) {
return undefined;
}
if (imgBrief.alt) {
if (imgBrief.alt && !this.altsToIgnore.includes(imgBrief.alt.trim().toLowerCase())) {
return imgBrief.alt;
}
const digest = md5Hasher.hash(imgBrief.src);

View File

@ -193,17 +193,26 @@ export class PuppeteerControl extends AsyncService {
preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
preparations.push(page.evaluateOnNewDocument(`
function briefImgs(elem) {
const imageTags = Array.from((elem || document).querySelectorAll('img[src]'));
const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));
return imageTags.map((x)=> ({
src: x.src,
loaded: x.complete,
width: x.width,
height: x.height,
naturalWidth: x.naturalWidth,
naturalHeight: x.naturalHeight,
alt: x.alt || x.title,
}));
return imageTags.map((x)=> {
let linkPreferredSrc = x.src;
if (linkPreferredSrc.startsWith('data:')) {
if (typeof x.dataset?.src === 'string' && !x.dataset.src.startsWith('data:')) {
linkPreferredSrc = x.dataset.src;
}
}
return {
src: linkPreferredSrc,
loaded: x.complete,
width: x.width,
height: x.height,
naturalWidth: x.naturalWidth,
naturalHeight: x.naturalHeight,
alt: x.alt || x.title,
};
});
}
function giveSnapshot() {
let parsed;