Mirror of https://github.com/intergalacticalvariable/reader.git
Synced 2024-11-16 11:42:32 +08:00

fix: add cache tolerance

This commit is contained in:
  parent d100c3fc5f
  commit 1cf8e83857
@@ -426,6 +426,7 @@ ${this.content}
         const customMode = ctx.req.get('x-respond-with') || 'default';
         const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
         const noCache = Boolean(ctx.req.get('x-no-cache'));
+        const cacheTolerance = noCache ? 0 : this.cacheValidMs;
         const cookies: CookieParam[] = [];
         const setCookieHeaders = ctx.req.headers['x-set-cookie'];
         if (Array.isArray(setCookieHeaders)) {
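The added line folds the `x-no-cache` request header into a millisecond tolerance: any value on the header yields 0, otherwise the host's default `cacheValidMs` applies. A minimal sketch of that mapping, using a hypothetical `resolveCacheTolerance` helper (not part of the codebase):

// Hypothetical helper mirroring the added line: any 'x-no-cache' header value
// collapses to a zero tolerance, otherwise the default window applies.
function resolveCacheTolerance(noCacheHeader: string | undefined, defaultValidMs: number): number {
    const noCache = Boolean(noCacheHeader);
    return noCache ? 0 : defaultValidMs;
}

// resolveCacheTolerance('1', 1000 * 3600)       -> 0
// resolveCacheTolerance(undefined, 1000 * 3600) -> 3600000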
@@ -454,7 +455,7 @@ ${this.content}
             rpcReflect.return(sseStream);

             try {
-                for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
+                for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
                     if (!scrapped) {
                         continue;
                     }
@@ -481,7 +482,7 @@ ${this.content}

         let lastScrapped;
         if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
-            for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
+            for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
                 lastScrapped = scrapped;
                 if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
                     continue;
@@ -503,7 +504,7 @@ ${this.content}
             return formatted;
         }

-        for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
+        for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
             lastScrapped = scrapped;
             if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
                 continue;
@@ -546,7 +547,7 @@ ${this.content}
         return digest;
     }

-    async queryCache(urlToCrawl: URL) {
+    async queryCache(urlToCrawl: URL, cacheTolerance: number) {
         const digest = this.getUrlDigest(urlToCrawl);

         const cache = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
@@ -556,9 +557,9 @@ ${this.content}
         }

         const age = Date.now() - cache.createdAt.valueOf();
-        const stale = cache.createdAt.valueOf() < (Date.now() - this.cacheValidMs);
-        this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old`, {
-            url: urlToCrawl, digest, age, stale
+        const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
+        this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
+            url: urlToCrawl, digest, age, stale, cacheTolerance
         });

         let snapshot: PageSnapshot | undefined;
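With the tolerance threaded into `queryCache`, staleness is judged against the caller-supplied window instead of the fixed `this.cacheValidMs`. A minimal, self-contained sketch of the check with invented sample values:

// Sketch of the tolerance-based staleness test from the hunk above.
// A record created `age` ms ago is stale once age exceeds cacheTolerance.
const createdAt = Date.now() - 90 * 60 * 1000; // assume a 90-minute-old cache entry
const cacheTolerance = 1000 * 3600;            // a 1-hour tolerance window

const age = Date.now() - createdAt;
const stale = createdAt < (Date.now() - cacheTolerance); // equivalent to: age > cacheTolerance

console.log({ age, stale }); // stale === true for the values above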
@@ -641,10 +642,10 @@ ${this.content}
         return r;
     }

-    async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, noCache: boolean = false) {
+    async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
         let cache;
-        if (!noCache && !crawlOpts?.cookies?.length) {
-            cache = await this.queryCache(urlToCrawl);
+        if (cacheTolerance && !crawlOpts?.cookies?.length) {
+            cache = await this.queryCache(urlToCrawl, cacheTolerance);
         }

         if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
@@ -687,10 +688,10 @@ ${this.content}
     }


-    async *scrapMany(urls: URL[], options?: ScrappingOptions, noCache = false) {
-        const iterators = urls.map((url) => this.cachedScrap(url, options, noCache));
+    async *scrapMany(urls: URL[], options?: ScrappingOptions, cacheTolerance?: number) {
+        const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));

-        const results: (PageSnapshot | undefined)[] = iterators.map((_x)=> undefined);
+        const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);

         let nextDeferred = Defer();
         let concluded = false;
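Taken together, the crawler-side hunks replace the boolean `noCache` flag with a millisecond `cacheTolerance` across `cachedScrap`, `queryCache`, and `scrapMany`. A hedged usage sketch of the new `cachedScrap` signature (the narrowed `crawler` type and the URL are placeholders, not the real class):

// Assumed call sites; the crawler type here is a narrowed stand-in, not the real class.
async function demo(crawler: {
    cachedScrap(url: URL, opts?: unknown, cacheTolerance?: number): AsyncIterable<unknown>;
}) {
    const url = new URL('https://example.com/');

    // Tolerance 0 reproduces the old noCache === true path: `if (cacheTolerance && ...)`
    // is falsy for 0, so the cache lookup is skipped entirely.
    for await (const snapshot of crawler.cachedScrap(url, undefined, 0)) {
        void snapshot;
    }

    // Omitting the argument falls back to the default tolerance (this.cacheValidMs).
    for await (const snapshot of crawler.cachedScrap(url)) {
        void snapshot;
    }
}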
@@ -18,7 +18,7 @@ import { CookieParam } from 'puppeteer';
 import { parseString as parseSetCookieString } from 'set-cookie-parser';
 import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
 import { SearchResult } from '../db/searched';
-import { WebSearchApiResponse } from '../shared/3rd-party/brave-types';
+import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';


 @singleton()
@@ -27,6 +27,9 @@ export class SearcherHost extends RPCHost {

     cacheRetentionMs = 1000 * 3600 * 24 * 7;
     cacheValidMs = 1000 * 3600;
+    pageCacheToleranceMs = 1000 * 3600 * 24;
+
+    reasonableDelayMs = 10_000;

     constructor(
         protected globalLogger: Logger,
@@ -178,6 +181,7 @@ export class SearcherHost extends RPCHost {
         const customMode = ctx.req.get('x-respond-with') || 'default';
         const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
         const noCache = Boolean(ctx.req.get('x-no-cache'));
+        const pageCacheTolerance = noCache ? 0 : this.pageCacheToleranceMs;
         const cookies: CookieParam[] = [];
         const setCookieHeaders = ctx.req.headers['x-set-cookie'];
         if (Array.isArray(setCookieHeaders)) {
@@ -204,8 +208,7 @@ export class SearcherHost extends RPCHost {
             count: 5
         });

-        const urls = r.web.results.map((x) => new URL(x.url));
-        const it = this.fetchSearchResults(customMode, urls, crawlOpts, noCache);
+        const it = this.fetchSearchResults(customMode, r.web.results, crawlOpts, pageCacheTolerance);

         if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
             const sseStream = new OutputServerEventStream();
@@ -238,12 +241,14 @@ ${this.content}
             return sseStream;
         }

+        const t0 = Date.now();
+
         let lastScrapped;
         if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
             for await (const scrapped of it) {
                 lastScrapped = scrapped;

-                if (!this.qualified(scrapped)) {
+                if (!this.qualified(scrapped) && ((Date.now() - t0) < this.reasonableDelayMs)) {
                     continue;
                 }
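The searcher now records a start time `t0` and only keeps skipping under-qualified results while less than `reasonableDelayMs` (10 seconds) has elapsed; after that, whatever has been scraped is returned. A small sketch of that cutoff, with `qualified` standing in for `this.qualified(scrapped)`:

// Sketch of the early-exit condition added in the hunk above.
// `qualified` is a stand-in for this.qualified(scrapped).
function shouldKeepWaiting(qualified: boolean, t0: number, reasonableDelayMs = 10_000): boolean {
    // Keep waiting (i.e. `continue`) only while the batch is not yet qualified
    // AND we are still inside the reasonable-delay window.
    return !qualified && (Date.now() - t0) < reasonableDelayMs;
}

// After 10 seconds, even an under-qualified batch of results is accepted and returned.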
@@ -264,7 +269,7 @@ ${this.content}
             for await (const scrapped of it) {
                 lastScrapped = scrapped;

-                if (!this.qualified(scrapped)) {
+                if (!this.qualified(scrapped) && ((Date.now() - t0) < this.reasonableDelayMs)) {
                     continue;
                 }
                 chargeAmount = this.getChargeAmount(scrapped);
@@ -282,18 +287,27 @@ ${this.content}
     }

     async *fetchSearchResults(mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
-        urls: URL[], options?: ScrappingOptions, noCache = false) {
-
-        for await (const scrapped of this.crawler.scrapMany(urls, options, noCache)) {
+        searchResults: WebSearchResult[], options?: ScrappingOptions, pageCacheTolerance?: number) {
+        const urls = searchResults.map((x) => new URL(x.url));
+        for await (const scrapped of this.crawler.scrapMany(urls, options, pageCacheTolerance)) {
             const mapped = scrapped.map((x, i) => {
-                if (!x) {
+                const upstreamSearchResult = searchResults[i];
+                if (!x || (!x.parsed && mode !== 'markdown')) {
                     const p = {
-                        toString() {
-                            return `[${i + 1}] No content available for ${urls[i]}`;
+                        toString(this: any) {
+                            if (this.title && this.description) {
+                                return `[${i + 1}] Title: ${this.title}
+[${i + 1}] URL Source: ${this.url}
+[${i + 1}] Description: ${this.description}
+`;
+                            }
+                            return `[${i + 1}] No content available for ${this.url}`;
                         }
                     };
                     const r = Object.create(p);
-                    r.url = urls[i].toString();
+                    r.url = upstreamSearchResult.url;
+                    r.title = upstreamSearchResult.title;
+                    r.description = upstreamSearchResult.description;

                     return r;
                 }
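When a page yields no usable snapshot, the mapper now falls back to the upstream Brave search result: an object built with `Object.create` whose prototype `toString` prefers the engine-supplied title and description over a bare "no content" line. A self-contained sketch of that pattern (field names follow the diff; the sample data is invented):

// Sketch of the fallback-result pattern used above: Object.create with a
// prototype that formats whatever fields the instance happens to carry.
interface UpstreamResult { url: string; title?: string; description?: string; }

function makeFallback(upstream: UpstreamResult, i: number) {
    const p = {
        toString(this: any) {
            if (this.title && this.description) {
                return `[${i + 1}] Title: ${this.title}\n[${i + 1}] URL Source: ${this.url}\n[${i + 1}] Description: ${this.description}\n`;
            }
            return `[${i + 1}] No content available for ${this.url}`;
        }
    };
    const r = Object.create(p);
    r.url = upstream.url;
    r.title = upstream.title;
    r.description = upstream.description;
    return r;
}

// Example with invented data:
console.log(`${makeFallback({ url: 'https://example.com', title: 'Example', description: 'A sample page.' }, 0)}`);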
@@ -317,7 +331,7 @@ ${this.content}
 [${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
 [${i + 1}] Markdown Content:
 ${this.content}
-`;
+`;
                 };
             }
         }