fix: add cache tolerance

Yanlong Wang 2024-05-15 08:06:35 +08:00
parent d100c3fc5f
commit 1cf8e83857
2 changed files with 41 additions and 26 deletions

Changed file 1 of 2:

@@ -426,6 +426,7 @@ ${this.content}
const customMode = ctx.req.get('x-respond-with') || 'default';
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
const noCache = Boolean(ctx.req.get('x-no-cache'));
+ const cacheTolerance = noCache ? 0 : this.cacheValidMs;
const cookies: CookieParam[] = [];
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
if (Array.isArray(setCookieHeaders)) {
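
The hunk above maps the x-no-cache request header onto the new cacheTolerance parameter: any non-empty header value collapses the tolerance to 0, otherwise the host's default applies. A minimal client-side sketch of opting out of the cache; the endpoint origin here is an assumption for illustration, only the header semantics come from this diff:

// Hedged sketch: force a fresh crawl by sending x-no-cache. Any truthy
// string works, since the server only checks Boolean(ctx.req.get('x-no-cache')).
(async () => {
    const res = await fetch('https://reader.example.com/https://example.org/', {
        headers: {
            'x-respond-with': 'markdown',
            'x-no-cache': '1', // non-empty value => cacheTolerance = 0
        },
    });
    console.log(await res.text());
})();
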
@@ -454,7 +455,7 @@ ${this.content}
rpcReflect.return(sseStream);
try {
- for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
+ for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
if (!scrapped) {
continue;
}
@@ -481,7 +482,7 @@ ${this.content}
let lastScrapped;
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
- for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
+ for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
lastScrapped = scrapped;
if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
continue;
@@ -503,7 +504,7 @@ ${this.content}
return formatted;
}
- for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
+ for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
lastScrapped = scrapped;
if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
continue;
@@ -546,7 +547,7 @@ ${this.content}
return digest;
}
- async queryCache(urlToCrawl: URL) {
+ async queryCache(urlToCrawl: URL, cacheTolerance: number) {
const digest = this.getUrlDigest(urlToCrawl);
const cache = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
@@ -556,9 +557,9 @@ ${this.content}
}
const age = Date.now() - cache.createdAt.valueOf();
- const stale = cache.createdAt.valueOf() < (Date.now() - this.cacheValidMs);
- this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old`, {
- url: urlToCrawl, digest, age, stale
+ const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
+ this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
+ url: urlToCrawl, digest, age, stale, cacheTolerance
});
let snapshot: PageSnapshot | undefined;
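
The staleness predicate above is plain arithmetic on the entry's age. The same check as a self-contained sketch, with sample values assumed for illustration:

// An entry is stale once its age exceeds the caller-supplied tolerance.
function isStale(createdAtMs: number, cacheToleranceMs: number, nowMs = Date.now()): boolean {
    return createdAtMs < nowMs - cacheToleranceMs;
}

isStale(Date.now() - 30 * 60 * 1000, 3600 * 1000); // false: 30 min old, 1 h tolerance
isStale(Date.now() - 30 * 60 * 1000, 0);           // true: zero tolerance, everything is stale

A stale entry is still returned (the caller later checks cache?.isFresh), so the generator can decide to re-crawl rather than serve it.
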
@@ -641,10 +642,10 @@ ${this.content}
return r;
}
- async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, noCache: boolean = false) {
+ async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
let cache;
- if (!noCache && !crawlOpts?.cookies?.length) {
- cache = await this.queryCache(urlToCrawl);
+ if (cacheTolerance && !crawlOpts?.cookies?.length) {
+ cache = await this.queryCache(urlToCrawl, cacheTolerance);
}
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
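
Because of the `if (cacheTolerance && ...)` guard above, queryCache is skipped entirely when the tolerance is 0, so the old noCache boolean is subsumed: 0 means bypass the cache, any positive value bounds how old a hit may be. A hedged sketch of a consumer of the updated generator; the CrawlerHost name is assumed, the loop shape mirrors the handler code in this diff:

// Drains cachedScrap with a 1-hour tolerance, keeping the last
// (progressively more complete) snapshot it yields.
async function crawlOnce(crawler: CrawlerHost, url: URL): Promise<PageSnapshot | undefined> {
    let last: PageSnapshot | undefined;
    for await (const snapshot of crawler.cachedScrap(url, undefined, 3600 * 1000)) {
        if (!snapshot) {
            continue;
        }
        last = snapshot;
    }
    return last;
}
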
@@ -687,10 +688,10 @@ ${this.content}
}
- async *scrapMany(urls: URL[], options?: ScrappingOptions, noCache = false) {
- const iterators = urls.map((url) => this.cachedScrap(url, options, noCache));
+ async *scrapMany(urls: URL[], options?: ScrappingOptions, cacheTolerance?: number) {
+ const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));
- const results: (PageSnapshot | undefined)[] = iterators.map((_x)=> undefined);
+ const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
let nextDeferred = Defer();
let concluded = false;
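
scrapMany fans several cachedScrap generators into one stream of snapshot arrays, waking the consumer whenever any slot updates. A simplified stand-in for that pattern, assuming Defer() is a plain deferred-promise helper (the project imports its own; this is not the actual implementation):

// Deferred promise: resolvable from outside the executor.
function Defer<T = void>() {
    let resolve!: (v: T) => void;
    const promise = new Promise<T>((res) => { resolve = res; });
    return { promise, resolve };
}

// Advance every iterator concurrently; yield the latest value per slot
// each time anything changes, until all iterators conclude.
async function* fanIn<T>(iterators: AsyncIterable<T>[]): AsyncGenerator<(T | undefined)[]> {
    const latest: (T | undefined)[] = iterators.map(() => undefined);
    let pending = iterators.length;
    let wake = Defer();
    for (const [i, it] of iterators.entries()) {
        (async () => {
            for await (const v of it) {
                latest[i] = v;
                wake.resolve();
            }
        })().catch(() => void 0).finally(() => {
            pending -= 1;
            wake.resolve();
        });
    }
    while (pending > 0) {
        await wake.promise;
        wake = Defer();
        yield [...latest];
    }
}
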

Changed file 2 of 2:

@@ -18,7 +18,7 @@ import { CookieParam } from 'puppeteer';
import { parseString as parseSetCookieString } from 'set-cookie-parser';
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
import { SearchResult } from '../db/searched';
- import { WebSearchApiResponse } from '../shared/3rd-party/brave-types';
+ import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
@singleton()
@@ -27,6 +27,9 @@ export class SearcherHost extends RPCHost {
cacheRetentionMs = 1000 * 3600 * 24 * 7;
cacheValidMs = 1000 * 3600;
+ pageCacheToleranceMs = 1000 * 3600 * 24;
+ reasonableDelayMs = 10_000;
constructor(
protected globalLogger: Logger,
@@ -178,6 +181,7 @@ export class SearcherHost extends RPCHost {
const customMode = ctx.req.get('x-respond-with') || 'default';
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
const noCache = Boolean(ctx.req.get('x-no-cache'));
+ const pageCacheTolerance = noCache ? 0 : this.pageCacheToleranceMs;
const cookies: CookieParam[] = [];
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
if (Array.isArray(setCookieHeaders)) {
@@ -204,8 +208,7 @@ ${this.content}
count: 5
});
- const urls = r.web.results.map((x) => new URL(x.url));
- const it = this.fetchSearchResults(customMode, urls, crawlOpts, noCache);
+ const it = this.fetchSearchResults(customMode, r.web.results, crawlOpts, pageCacheTolerance);
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
const sseStream = new OutputServerEventStream();
@@ -238,12 +241,14 @@ ${this.content}
return sseStream;
}
+ const t0 = Date.now();
let lastScrapped;
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
for await (const scrapped of it) {
lastScrapped = scrapped;
- if (!this.qualified(scrapped)) {
+ if (!this.qualified(scrapped) && ((Date.now() - t0) < this.reasonableDelayMs)) {
continue;
}
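
The new t0 / reasonableDelayMs pair time-boxes the quality gate: while under the 10 s budget the handler keeps waiting for a fully qualified snapshot, after that it accepts whatever has arrived, so one slow page cannot stall the whole request. The same pattern as a standalone sketch; qualified() and the element type are placeholders, not the project's real signatures:

// Prefer qualified results, but stop being picky once a deadline passes.
async function firstGoodEnough<T>(
    source: AsyncIterable<T>,
    qualified: (x: T) => boolean,
    reasonableDelayMs = 10_000,
): Promise<T | undefined> {
    const t0 = Date.now();
    let last: T | undefined;
    for await (const item of source) {
        last = item;
        if (qualified(item) || Date.now() - t0 >= reasonableDelayMs) {
            return item; // qualified, or out of time
        }
    }
    return last; // source ended before anything qualified
}
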
@@ -264,7 +269,7 @@ ${this.content}
for await (const scrapped of it) {
lastScrapped = scrapped;
- if (!this.qualified(scrapped)) {
+ if (!this.qualified(scrapped) && ((Date.now() - t0) < this.reasonableDelayMs)) {
continue;
}
chargeAmount = this.getChargeAmount(scrapped);
@@ -282,18 +287,27 @@ ${this.content}
}
async *fetchSearchResults(mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
- urls: URL[], options?: ScrappingOptions, noCache = false) {
- for await (const scrapped of this.crawler.scrapMany(urls, options, noCache)) {
+ searchResults: WebSearchResult[], options?: ScrappingOptions, pageCacheTolerance?: number) {
+ const urls = searchResults.map((x) => new URL(x.url));
+ for await (const scrapped of this.crawler.scrapMany(urls, options, pageCacheTolerance)) {
const mapped = scrapped.map((x, i) => {
- if (!x) {
+ const upstreamSearchResult = searchResults[i];
+ if (!x || (!x.parsed && mode !== 'markdown')) {
const p = {
- toString() {
- return `[${i + 1}] No content available for ${urls[i]}`;
+ toString(this: any) {
+ if (this.title && this.description) {
+ return `[${i + 1}] Title: ${this.title}
+ [${i + 1}] URL Source: ${this.url}
+ [${i + 1}] Description: ${this.description}
+ `;
+ }
+ return `[${i + 1}] No content available for ${this.url}`;
}
};
const r = Object.create(p);
- r.url = urls[i].toString();
+ r.url = upstreamSearchResult.url;
+ r.title = upstreamSearchResult.title;
+ r.description = upstreamSearchResult.description;
return r;
}
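
The placeholder objects above keep toString() on a prototype and the per-result fields on the instance, so a page with no scrapable content now degrades to the search engine's own title and description instead of a bare "No content available" line. The pattern in isolation; the WebSearchResult shape (url/title/description) is assumed from the import at the top of this file:

// Builds the fallback entry for result slot i from upstream search metadata.
function placeholderFor(i: number, upstream: { url: string; title?: string; description?: string }) {
    const proto = {
        toString(this: any) {
            if (this.title && this.description) {
                return `[${i + 1}] Title: ${this.title}\n[${i + 1}] URL Source: ${this.url}\n[${i + 1}] Description: ${this.description}\n`;
            }
            return `[${i + 1}] No content available for ${this.url}`;
        },
    };
    const r = Object.create(proto);
    r.url = upstream.url;
    r.title = upstream.title;
    r.description = upstream.description;
    return r;
}
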
@@ -317,7 +331,7 @@ ${this.content}
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
[${i + 1}] Markdown Content:
${this.content}
- `;
+ `;
};
}
}