From a8e0628460e695c0eb8f3940051806ea633d62d6 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Tue, 21 May 2024 17:34:19 +0800 Subject: [PATCH] feat: links and images summary (#63) * wip: dedicated link and image summary * fix * fix * fix * fix: docs * fix * fix * fix --- .../functions/src/cloud-functions/crawler.ts | 137 +++++++++++++++--- .../functions/src/cloud-functions/searcher.ts | 42 +++++- backend/functions/src/services/puppeteer.ts | 86 +++++++++-- backend/functions/src/types.d.ts | 5 + thinapps-shared | 2 +- 5 files changed, 239 insertions(+), 33 deletions(-) diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index d83e7b7..ae109f6 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -38,6 +38,8 @@ export interface FormattedPage { text?: string; screenshotUrl?: string; screenshot?: Buffer; + links?: { [k: string]: string; }; + images?: { [k: string]: string; }; toString: () => string; } @@ -135,9 +137,43 @@ export class CrawlerHost extends RPCHost { return turnDownService; } + getGeneralSnapshotMixins(snapshot: PageSnapshot) { + const inferred = this.puppeteerControl.inferSnapshot(snapshot); + const mixin: any = {}; + if (this.threadLocal.get('withImagesSummary')) { + const imageSummary = {} as { [k: string]: string; }; + const imageIdxTrack = new Map(); + + let imgIdx = 0; + + for (const img of inferred.imgs) { + const imgSerial = ++imgIdx; + const idxArr = imageIdxTrack.has(img.src) ? imageIdxTrack.get(img.src)! : []; + idxArr.push(imgSerial); + imageIdxTrack.set(img.src, idxArr); + imageSummary[img.src] = img.alt || ''; + } + + mixin.images = + _(imageSummary) + .toPairs() + .map( + ([url, alt], i) => { + return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url]; + } + ).fromPairs() + .value(); + } + if (this.threadLocal.get('withLinksSummary')) { + mixin.links = _.invert(inferred.links || {}); + } + + return mixin; + } + async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & { screenshotUrl?: string; - }, nominalUrl?: URL){ + }, nominalUrl?: URL) { if (mode === 'screenshot') { if (snapshot.screenshot && !snapshot.screenshotUrl) { const fid = `instant-screenshots/${randomUUID()}`; @@ -150,6 +186,7 @@ export class CrawlerHost extends RPCHost { } return { + ...this.getGeneralSnapshotMixins(snapshot), screenshotUrl: snapshot.screenshotUrl, toString() { return this.screenshotUrl; @@ -158,6 +195,7 @@ export class CrawlerHost extends RPCHost { } if (mode === 'html') { return { + ...this.getGeneralSnapshotMixins(snapshot), html: snapshot.html, toString() { return this.html; @@ -166,6 +204,7 @@ export class CrawlerHost extends RPCHost { } if (mode === 'text') { return { + ...this.getGeneralSnapshotMixins(snapshot), text: snapshot.text, toString() { return this.text; @@ -193,6 +232,8 @@ export class CrawlerHost extends RPCHost { await Promise.all(tasks); } let imgIdx = 0; + const imageSummary = {} as { [k: string]: string; }; + const imageIdxTrack = new Map(); turnDownService.addRule('img-generated-alt', { filter: 'img', replacement: (_content, node) => { @@ -215,10 +256,19 @@ export class CrawlerHost extends RPCHost { return ''; } const mapped = urlToAltMap[src]; - imgIdx++; + const imgSerial = ++imgIdx; + const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : []; + idxArr.push(imgSerial); + imageIdxTrack.set(src, idxArr); + if (mapped) { + imageSummary[src] = mapped || alt; + return `![Image ${imgIdx}: ${mapped || alt}](${src})`; } + + imageSummary[src] = alt || ''; + return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`; } }); @@ -260,20 +310,41 @@ export class CrawlerHost extends RPCHost { const cleanText = (contentText || '').trim(); - const formatted = { + const formatted: FormattedPage = { title: (snapshot.parsed?.title || snapshot.title || '').trim(), url: nominalUrl?.toString() || snapshot.href?.trim(), content: cleanText, publishedTime: snapshot.parsed?.publishedTime || undefined, toString() { + if (mode === 'markdown') { + return this.content as string; + } + const mixins = []; if (this.publishedTime) { mixins.push(`Published Time: ${this.publishedTime}`); } - - if (mode === 'markdown') { - return this.content; + const suffixMixins = []; + if (this.images) { + const imageSummaryChunks = ['Images:']; + for (const [k, v] of Object.entries(this.images)) { + imageSummaryChunks.push(`- ![${k}](${v})`); + } + if (imageSummaryChunks.length === 1) { + imageSummaryChunks.push('This page does not seem to contain any images.'); + } + suffixMixins.push(imageSummaryChunks.join('\n')); + } + if (this.links) { + const linkSummaryChunks = ['Links/Buttons:']; + for (const [k, v] of Object.entries(this.links)) { + linkSummaryChunks.push(`- [${k}](${v})`); + } + if (linkSummaryChunks.length === 1) { + linkSummaryChunks.push('This page does not seem to contain any buttons/links.'); + } + suffixMixins.push(linkSummaryChunks.join('\n')); } return `Title: ${this.title} @@ -282,10 +353,25 @@ URL Source: ${this.url} ${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''} Markdown Content: ${this.content} -`; +${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; } }; + if (this.threadLocal.get('withImagesSummary')) { + formatted.images = + _(imageSummary) + .toPairs() + .map( + ([url, alt], i) => { + return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url]; + } + ).fromPairs() + .value(); + } + if (this.threadLocal.get('withLinksSummary')) { + formatted.links = _.invert(this.puppeteerControl.inferSnapshot(snapshot).links || {}); + } + return formatted as FormattedPage; } @@ -313,9 +399,9 @@ ${this.content} operation: { parameters: { 'Accept': { - description: `Specifies your preference for the response format. \n\n` + - `Supported formats:\n` + - `- text/event-stream\n` + + description: `Specifies your preference for the response format.\n\n` + + `Supported formats: \n` + + `- text/event - stream\n` + `- application/json or text/json\n` + `- text/plain` , @@ -333,8 +419,8 @@ ${this.content} schema: { type: 'string' } }, 'X-Respond-With': { - description: `Specifies the (non-default) form factor of the crawled data you prefer. \n\n` + - `Supported formats:\n` + + description: `Specifies the (non-default) form factor of the crawled data you prefer.\n\n` + + `Supported formats: \n` + `- markdown\n` + `- html\n` + `- text\n` + @@ -344,22 +430,22 @@ ${this.content} schema: { type: 'string' } }, 'X-Wait-For-Selector': { - description: `Specifies a CSS selector to wait for the appearance of such an element before returning. \n\n` + + description: `Specifies a CSS selector to wait for the appearance of such an element before returning.\n\n` + 'Example: `X-Wait-For-Selector: .content-block`\n' , in: 'header', schema: { type: 'string' } }, 'X-Target-Selector': { - description: `Specifies a CSS selector for return target instead of the full html. \n\n` + + description: `Specifies a CSS selector for return target instead of the full html.\n\n` + 'Implies `X-Wait-For-Selector: (same selector)`' , in: 'header', schema: { type: 'string' } }, 'X-Proxy-Url': { - description: `Specifies your custom proxy if you prefer to use one. \n\n` + - `Supported protocols:\n` + + description: `Specifies your custom proxy if you prefer to use one.\n\n` + + `Supported protocols: \n` + `- http\n` + `- https\n` + `- socks4\n` + @@ -375,7 +461,18 @@ ${this.content} schema: { type: 'string' } }, 'X-With-Generated-Alt': { - description: `Enable automatic alt-text generating for images without an meaningful alt-text.`, + description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` + + `Note: Does not work when \`X-Respond-With\` is specified`, + in: 'header', + schema: { type: 'string' } + }, + 'X-With-Images-Summary': { + description: `Enable dedicated summary section for images on the page.`, + in: 'header', + schema: { type: 'string' } + }, + 'X-With-links-Summary': { + description: `Enable dedicated summary section for hyper links on the page.`, in: 'header', schema: { type: 'string' } }, @@ -465,6 +562,8 @@ ${this.content} const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default'; const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt')); + const withLinksSummary = Boolean(ctx.req.get('x-with-links-summary')); + const withImagesSummary = Boolean(ctx.req.get('x-with-images-summary')); const noCache = Boolean(ctx.req.get('x-no-cache')); let cacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000; if (isNaN(cacheTolerance)) { @@ -491,6 +590,8 @@ ${this.content} }); } this.threadLocal.set('withGeneratedAlt', withGeneratedAlt); + this.threadLocal.set('withLinksSummary', withLinksSummary); + this.threadLocal.set('withImagesSummary', withImagesSummary); const crawlOpts: ExtraScrappingOptions = { proxyUrl: ctx.req.get('x-proxy-url'), @@ -729,7 +830,7 @@ ${this.content} return undefined; } - const textContent = formatted?.content || formatted?.description || formatted?.text || formatted?.html; + const textContent = formatted?.content || formatted?.description || formatted?.text || formatted?.html; if (typeof textContent === 'string') { return estimateToken(textContent); diff --git a/backend/functions/src/cloud-functions/searcher.ts b/backend/functions/src/cloud-functions/searcher.ts index bfe9190..126226d 100644 --- a/backend/functions/src/cloud-functions/searcher.ts +++ b/backend/functions/src/cloud-functions/searcher.ts @@ -116,7 +116,18 @@ export class SearcherHost extends RPCHost { schema: { type: 'string' } }, 'X-With-Generated-Alt': { - description: `Enable automatic alt-text generating for images without an meaningful alt-text.`, + description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` + + `Note: Does not work when \`X-Respond-With\` is specified`, + in: 'header', + schema: { type: 'string' } + }, + 'X-With-Images-Summary': { + description: `Enable dedicated summary section for images on the page.`, + in: 'header', + schema: { type: 'string' } + }, + 'X-With-links-Summary': { + description: `Enable dedicated summary section for hyper links on the page.`, in: 'header', schema: { type: 'string' } }, @@ -189,6 +200,8 @@ export class SearcherHost extends RPCHost { const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default'; const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt')); + const withLinksSummary = Boolean(ctx.req.get('x-with-links-summary')); + const withImagesSummary = Boolean(ctx.req.get('x-with-images-summary')); const noCache = Boolean(ctx.req.get('x-no-cache')); let pageCacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000; if (isNaN(pageCacheTolerance)) { @@ -211,6 +224,9 @@ export class SearcherHost extends RPCHost { }); } this.threadLocal.set('withGeneratedAlt', withGeneratedAlt); + this.threadLocal.set('withLinksSummary', withLinksSummary); + this.threadLocal.set('withImagesSummary', withImagesSummary); + const crawlOpts: ScrappingOptions = { proxyUrl: ctx.req.get('x-proxy-url'), cookies, @@ -395,11 +411,33 @@ export class SearcherHost extends RPCHost { mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`); } + const suffixMixins = []; + if (this.images) { + const imageSummaryChunks = [`[${i + 1}] Images:`]; + for (const [k, v] of Object.entries(this.images)) { + imageSummaryChunks.push(`- ![${k}](${v})`); + } + if (imageSummaryChunks.length === 1) { + imageSummaryChunks.push('This page does not seem to contain any images.'); + } + suffixMixins.push(imageSummaryChunks.join('\n')); + } + if (this.links) { + const linkSummaryChunks = [`[${i + 1}] Links/Buttons:`]; + for (const [k, v] of Object.entries(this.links)) { + linkSummaryChunks.push(`- [${k}](${v})`); + } + if (linkSummaryChunks.length === 1) { + linkSummaryChunks.push('This page does not seem to contain any buttons/links.'); + } + suffixMixins.push(linkSummaryChunks.join('\n')); + } + return `[${i + 1}] Title: ${this.title} [${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''} [${i + 1}] Markdown Content: ${this.content} -`; +${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`; } }; }); diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index 148eab4..fa89eef 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -3,7 +3,7 @@ import fs from 'fs'; import { container, singleton } from 'tsyringe'; import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit'; import { Logger } from '../shared/services/logger'; -import { JSDOM } from 'jsdom'; +import { JSDOM, VirtualConsole } from 'jsdom'; import type { Browser, CookieParam, Page } from 'puppeteer'; import puppeteer from 'puppeteer-extra'; @@ -15,13 +15,17 @@ import { Readability } from '@mozilla/readability'; const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8'); + +const virtualConsole = new VirtualConsole(); +virtualConsole.on('error', () => void 0); + export interface ImgBrief { src: string; - loaded: boolean; - width: number; - height: number; - naturalWidth: number; - naturalHeight: number; + loaded?: boolean; + width?: number; + height?: number; + naturalWidth?: number; + naturalHeight?: number; alt?: string; } @@ -48,6 +52,11 @@ export interface PageSnapshot { imgs?: ImgBrief[]; } +export interface ExtendedSnapshot extends PageSnapshot { + links: { [url: string]: string; }; + imgs: ImgBrief[]; +} + export interface ScrappingOptions { proxyUrl?: string; cookies?: CookieParam[]; @@ -100,7 +109,6 @@ export class PuppeteerControl extends AsyncService { briefPages() { this.logger.info(`Status: ${this.livePages.size} pages alive: ${Array.from(this.livePages).map((x) => this.snMap.get(x)).sort().join(', ')}; ${this.__loadedPage.length} idle pages: ${this.__loadedPage.map((x) => this.snMap.get(x)).sort().join(', ')}`); - this.logger.info(``); } override async init() { @@ -304,7 +312,7 @@ document.addEventListener('load', handlePageLoad); } async getNextPage() { - let thePage; + let thePage: Page | undefined; if (this.__loadedPage.length) { thePage = this.__loadedPage.shift(); if (this.__loadedPage.length <= 1) { @@ -321,8 +329,8 @@ document.addEventListener('load', handlePageLoad); } const timer = setTimeout(() => { - this.logger.warn(`Page is not allowed to live past 5 minutes, ditching page ${this.snMap.get(thePage)}...`); - this.ditchPage(thePage); + this.logger.warn(`Page is not allowed to live past 5 minutes, ditching page ${this.snMap.get(thePage!)}...`); + this.ditchPage(thePage!); }, 300 * 1000); this.finalizerMap.set(thePage, timer); @@ -487,14 +495,14 @@ document.addEventListener('load', handlePageLoad); return snapshot; } - const jsdom = new JSDOM(snapshot.html, { url: snapshot.href }); + const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole }); const elem = jsdom.window.document.querySelector(targetSelect); if (!elem) { return snapshot; } - const selectedJsDom = new JSDOM(elem.outerHTML, { url: snapshot.href }); + const selectedJsDom = new JSDOM(elem.outerHTML, { url: snapshot.href, virtualConsole }); let parsed; try { parsed = new Readability(selectedJsDom.window.document).parse(); @@ -531,6 +539,60 @@ document.addEventListener('load', handlePageLoad); return r; } + + inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot { + const extendedSnapshot = { ...snapshot } as ExtendedSnapshot; + try { + const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole }); + const links = Array.from(jsdom.window.document.querySelectorAll('a[href]')) + .map((x: any) => [x.getAttribute('href'), x.textContent.replace(/\s+/g, ' ').trim()]) + .map(([href, text]) => { + if (!text) { + return undefined; + } + try { + const parsed = new URL(href, snapshot.href); + if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') { + return undefined; + } + return [parsed.toString(), text] as const; + } catch (err) { + return undefined; + } + }) + .filter(Boolean) + .reduce((acc, pair) => { + acc[pair![0]] = pair![1]; + return acc; + }, {} as { [k: string]: string; }); + + extendedSnapshot.links = links; + + const imgs = Array.from(jsdom.window.document.querySelectorAll('img[src],img[data-src]')) + .map((x: any) => { + let linkPreferredSrc = x.getAttribute('src') || ''; + if (linkPreferredSrc.startsWith('data:')) { + const dataSrc = x.getAttribute('data-src') || ''; + if (dataSrc && !dataSrc.startsWith('data:')) { + linkPreferredSrc = dataSrc; + } + } + + return { + src: new URL(linkPreferredSrc, snapshot.href).toString(), + width: parseInt(x.getAttribute('width') || '0'), + height: parseInt(x.getAttribute('height') || '0'), + alt: x.getAttribute('alt') || x.getAttribute('title'), + }; + }); + + extendedSnapshot.imgs = imgs as any; + } catch (_err) { + void 0; + } + + return extendedSnapshot; + } } const puppeteerControl = container.resolve(PuppeteerControl); diff --git a/backend/functions/src/types.d.ts b/backend/functions/src/types.d.ts index 796e64d..7919e03 100644 --- a/backend/functions/src/types.d.ts +++ b/backend/functions/src/types.d.ts @@ -9,8 +9,13 @@ declare module 'langdetect' { } declare module 'jsdom' { + import EventEmitter from 'events'; export class JSDOM { constructor(html: string, options?: any); window: typeof window; } + export class VirtualConsole extends EventEmitter{ + constructor(); + sendTo(console: any, options?: any); + } } diff --git a/thinapps-shared b/thinapps-shared index fc3545e..1b28100 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit fc3545e3a7ae27968e69f351f109d3ffb535f963 +Subproject commit 1b28100c71b3c7e37669fa98756affbac3095ced