mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 11:42:32 +08:00
fix: consider image data-src and make generated alt text optional (#50)
* fix: image src and alt * fix * docs: doc about x-with-generated-alt * fix: deps
This commit is contained in:
parent
8cfd0d67dc
commit
62dc75f78e
|
@ -72,6 +72,7 @@ As you have already seen above, one can control the behavior of the Reader API u
|
|||
- `x-respond-with: screenshot` returns the URL of the webpage's screenshot
|
||||
- You can specify a proxy server via the `x-proxy-url` header.
|
||||
- You can bypass the cached page (lifetime 300s) via the `x-no-cache` header.
|
||||
- You can enable the alt-text generation feature via the `x-with-generated-alt` header.
|
||||
|
||||
### JSON mode (super early beta)
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ import {
|
|||
AssertionFailureError, ParamValidationError,
|
||||
} from 'civkit';
|
||||
import { singleton } from 'tsyringe';
|
||||
import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
||||
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
||||
import { RateLimitControl } from '../shared/services/rate-limit';
|
||||
import _ from 'lodash';
|
||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||
|
@ -41,6 +41,7 @@ export class CrawlerHost extends RPCHost {
|
|||
protected altTextService: AltTextService,
|
||||
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
||||
protected rateLimitControl: RateLimitControl,
|
||||
protected threadLocal: AsyncContext,
|
||||
) {
|
||||
super(...arguments);
|
||||
|
||||
|
@ -123,8 +124,8 @@ export class CrawlerHost extends RPCHost {
|
|||
turnDownService = turnDownService.use(plugin);
|
||||
}
|
||||
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
||||
if (snapshot.imgs?.length) {
|
||||
const tasks = (snapshot.imgs || []).map(async (x) => {
|
||||
if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
|
||||
const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
|
||||
const r = await this.altTextService.getAltText(x).catch((err: any) => {
|
||||
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
||||
return undefined;
|
||||
|
@ -140,7 +141,15 @@ export class CrawlerHost extends RPCHost {
|
|||
turnDownService.addRule('img-generated-alt', {
|
||||
filter: 'img',
|
||||
replacement: (_content, node) => {
|
||||
const src = (node.getAttribute('src') || '').trim();
|
||||
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
|
||||
if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
|
||||
const dataSrc = (node.getAttribute('data-src') || '').trim();
|
||||
if (dataSrc && !dataSrc.startsWith('data:')) {
|
||||
linkPreferredSrc = dataSrc;
|
||||
}
|
||||
}
|
||||
|
||||
const src = linkPreferredSrc;
|
||||
const alt = cleanAttribute(node.getAttribute('alt'));
|
||||
if (!src) {
|
||||
return '';
|
||||
|
@ -285,6 +294,11 @@ ${this.content}
|
|||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-With-Generated-Alt': {
|
||||
description: `Enable automatic alt-text generating for images without an meaningful alt-text.`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
}
|
||||
}
|
||||
},
|
||||
|
@ -365,6 +379,7 @@ ${authMixin}`,
|
|||
}
|
||||
|
||||
const customMode = ctx.req.get('x-respond-with') || 'default';
|
||||
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
|
||||
const noCache = Boolean(ctx.req.get('x-no-cache'));
|
||||
const cookies: CookieParam[] = [];
|
||||
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
||||
|
@ -381,6 +396,7 @@ ${authMixin}`,
|
|||
domain: urlToCrawl.hostname,
|
||||
});
|
||||
}
|
||||
this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
|
||||
|
||||
const crawlOpts: ScrappingOptions = {
|
||||
proxyUrl: ctx.req.get('x-proxy-url'),
|
||||
|
|
|
@ -12,6 +12,7 @@ const md5Hasher = new HashManager('md5', 'hex');
|
|||
@singleton()
|
||||
export class AltTextService extends AsyncService {
|
||||
|
||||
altsToIgnore = 'image,img,photo,picture,pic,alt,figure,fig'.split(',');
|
||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||
|
||||
constructor(
|
||||
|
@ -48,7 +49,7 @@ export class AltTextService extends AsyncService {
|
|||
if (!imgBrief.src) {
|
||||
return undefined;
|
||||
}
|
||||
if (imgBrief.alt) {
|
||||
if (imgBrief.alt && !this.altsToIgnore.includes(imgBrief.alt.trim().toLowerCase())) {
|
||||
return imgBrief.alt;
|
||||
}
|
||||
const digest = md5Hasher.hash(imgBrief.src);
|
||||
|
|
|
@ -193,17 +193,26 @@ export class PuppeteerControl extends AsyncService {
|
|||
preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
|
||||
preparations.push(page.evaluateOnNewDocument(`
|
||||
function briefImgs(elem) {
|
||||
const imageTags = Array.from((elem || document).querySelectorAll('img[src]'));
|
||||
const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));
|
||||
|
||||
return imageTags.map((x)=> ({
|
||||
src: x.src,
|
||||
loaded: x.complete,
|
||||
width: x.width,
|
||||
height: x.height,
|
||||
naturalWidth: x.naturalWidth,
|
||||
naturalHeight: x.naturalHeight,
|
||||
alt: x.alt || x.title,
|
||||
}));
|
||||
return imageTags.map((x)=> {
|
||||
let linkPreferredSrc = x.src;
|
||||
if (linkPreferredSrc.startsWith('data:')) {
|
||||
if (typeof x.dataset?.src === 'string' && !x.dataset.src.startsWith('data:')) {
|
||||
linkPreferredSrc = x.dataset.src;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
src: linkPreferredSrc,
|
||||
loaded: x.complete,
|
||||
width: x.width,
|
||||
height: x.height,
|
||||
naturalWidth: x.naturalWidth,
|
||||
naturalHeight: x.naturalHeight,
|
||||
alt: x.alt || x.title,
|
||||
};
|
||||
});
|
||||
}
|
||||
function giveSnapshot() {
|
||||
let parsed;
|
||||
|
|
Loading…
Reference in New Issue
Block a user