Mirror of https://github.com/intergalacticalvariable/reader.git
Synced 2024-11-16 11:42:32 +08:00

fix: add cache tolerance

This commit is contained in:
  parent d100c3fc5f
  commit 1cf8e83857
@@ -426,6 +426,7 @@ ${this.content}
         const customMode = ctx.req.get('x-respond-with') || 'default';
         const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
         const noCache = Boolean(ctx.req.get('x-no-cache'));
+        const cacheTolerance = noCache ? 0 : this.cacheValidMs;
         const cookies: CookieParam[] = [];
         const setCookieHeaders = ctx.req.headers['x-set-cookie'];
         if (Array.isArray(setCookieHeaders)) {
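The added line folds the `x-no-cache` request header into a millisecond tolerance: any value on the header yields 0, otherwise the host's default `cacheValidMs` applies. A minimal sketch of that mapping, using a hypothetical `resolveCacheTolerance` helper (not part of the codebase):

// Hypothetical helper mirroring the added line: any 'x-no-cache' header value
// collapses to a zero tolerance, otherwise the default window applies.
function resolveCacheTolerance(noCacheHeader: string | undefined, defaultValidMs: number): number {
    const noCache = Boolean(noCacheHeader);
    return noCache ? 0 : defaultValidMs;
}

// resolveCacheTolerance('1', 1000 * 3600)       -> 0
// resolveCacheTolerance(undefined, 1000 * 3600) -> 3600000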
@@ -454,7 +455,7 @@ ${this.content}
             rpcReflect.return(sseStream);

             try {
-                for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
+                for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
                     if (!scrapped) {
                         continue;
                     }
@@ -481,7 +482,7 @@ ${this.content}

         let lastScrapped;
         if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
-            for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
+            for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
                 lastScrapped = scrapped;
                 if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
                     continue;
@@ -503,7 +504,7 @@ ${this.content}
             return formatted;
         }

-        for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
+        for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
             lastScrapped = scrapped;
             if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
                 continue;
@@ -546,7 +547,7 @@ ${this.content}
         return digest;
     }

-    async queryCache(urlToCrawl: URL) {
+    async queryCache(urlToCrawl: URL, cacheTolerance: number) {
         const digest = this.getUrlDigest(urlToCrawl);

         const cache = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
@@ -556,9 +557,9 @@ ${this.content}
         }

         const age = Date.now() - cache.createdAt.valueOf();
-        const stale = cache.createdAt.valueOf() < (Date.now() - this.cacheValidMs);
-        this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old`, {
-            url: urlToCrawl, digest, age, stale
+        const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
+        this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
+            url: urlToCrawl, digest, age, stale, cacheTolerance
         });

         let snapshot: PageSnapshot | undefined;
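With the tolerance threaded into `queryCache`, staleness is judged against the caller-supplied window instead of the fixed `this.cacheValidMs`. A minimal, self-contained sketch of the check with invented sample values:

// Sketch of the tolerance-based staleness test from the hunk above.
// A record created `age` ms ago is stale once age exceeds cacheTolerance.
const createdAt = Date.now() - 90 * 60 * 1000; // assume a 90-minute-old cache entry
const cacheTolerance = 1000 * 3600;            // a 1-hour tolerance window

const age = Date.now() - createdAt;
const stale = createdAt < (Date.now() - cacheTolerance); // equivalent to: age > cacheTolerance

console.log({ age, stale }); // stale === true for the values above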
@@ -641,10 +642,10 @@ ${this.content}
         return r;
     }

-    async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, noCache: boolean = false) {
+    async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
         let cache;
-        if (!noCache && !crawlOpts?.cookies?.length) {
-            cache = await this.queryCache(urlToCrawl);
+        if (cacheTolerance && !crawlOpts?.cookies?.length) {
+            cache = await this.queryCache(urlToCrawl, cacheTolerance);
         }

         if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
@@ -687,10 +688,10 @@ ${this.content}
     }


-    async *scrapMany(urls: URL[], options?: ScrappingOptions, noCache = false) {
-        const iterators = urls.map((url) => this.cachedScrap(url, options, noCache));
+    async *scrapMany(urls: URL[], options?: ScrappingOptions, cacheTolerance?: number) {
+        const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));

-        const results: (PageSnapshot | undefined)[] = iterators.map((_x)=> undefined);
+        const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);

         let nextDeferred = Defer();
         let concluded = false;
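Taken together, the crawler-side hunks replace the boolean `noCache` flag with a millisecond `cacheTolerance` across `cachedScrap`, `queryCache`, and `scrapMany`. A hedged usage sketch of the new `cachedScrap` signature (the narrowed `crawler` type and the URL are placeholders, not the real class):

// Assumed call sites; the crawler type here is a narrowed stand-in, not the real class.
async function demo(crawler: {
    cachedScrap(url: URL, opts?: unknown, cacheTolerance?: number): AsyncIterable<unknown>;
}) {
    const url = new URL('https://example.com/');

    // Tolerance 0 reproduces the old noCache === true path: `if (cacheTolerance && ...)`
    // is falsy for 0, so the cache lookup is skipped entirely.
    for await (const snapshot of crawler.cachedScrap(url, undefined, 0)) {
        void snapshot;
    }

    // Omitting the argument falls back to the default tolerance (this.cacheValidMs).
    for await (const snapshot of crawler.cachedScrap(url)) {
        void snapshot;
    }
}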
@@ -18,7 +18,7 @@ import { CookieParam } from 'puppeteer';
 import { parseString as parseSetCookieString } from 'set-cookie-parser';
 import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
 import { SearchResult } from '../db/searched';
-import { WebSearchApiResponse } from '../shared/3rd-party/brave-types';
+import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';


 @singleton()
@@ -27,6 +27,9 @@ export class SearcherHost extends RPCHost {

     cacheRetentionMs = 1000 * 3600 * 24 * 7;
     cacheValidMs = 1000 * 3600;
+    pageCacheToleranceMs = 1000 * 3600 * 24;
+
+    reasonableDelayMs = 10_000;

     constructor(
         protected globalLogger: Logger,
@@ -178,6 +181,7 @@ export class SearcherHost extends RPCHost {
         const customMode = ctx.req.get('x-respond-with') || 'default';
         const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
         const noCache = Boolean(ctx.req.get('x-no-cache'));
+        const pageCacheTolerance = noCache ? 0 : this.pageCacheToleranceMs;
         const cookies: CookieParam[] = [];
         const setCookieHeaders = ctx.req.headers['x-set-cookie'];
         if (Array.isArray(setCookieHeaders)) {
@@ -204,8 +208,7 @@ export class SearcherHost extends RPCHost {
             count: 5
         });

-        const urls = r.web.results.map((x) => new URL(x.url));
-        const it = this.fetchSearchResults(customMode, urls, crawlOpts, noCache);
+        const it = this.fetchSearchResults(customMode, r.web.results, crawlOpts, pageCacheTolerance);

         if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
             const sseStream = new OutputServerEventStream();
@@ -238,12 +241,14 @@ ${this.content}
             return sseStream;
         }

+        const t0 = Date.now();
+
         let lastScrapped;
         if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
             for await (const scrapped of it) {
                 lastScrapped = scrapped;

-                if (!this.qualified(scrapped)) {
+                if (!this.qualified(scrapped) && ((Date.now() - t0) < this.reasonableDelayMs)) {
                     continue;
                 }
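The searcher now records a start time `t0` and only keeps skipping under-qualified results while less than `reasonableDelayMs` (10 seconds) has elapsed; after that, whatever has been scraped is returned. A small sketch of that cutoff, with `qualified` standing in for `this.qualified(scrapped)`:

// Sketch of the early-exit condition added in the hunk above.
// `qualified` is a stand-in for this.qualified(scrapped).
function shouldKeepWaiting(qualified: boolean, t0: number, reasonableDelayMs = 10_000): boolean {
    // Keep waiting (i.e. `continue`) only while the batch is not yet qualified
    // AND we are still inside the reasonable-delay window.
    return !qualified && (Date.now() - t0) < reasonableDelayMs;
}

// After 10 seconds, even an under-qualified batch of results is accepted and returned.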
@@ -264,7 +269,7 @@ ${this.content}
             for await (const scrapped of it) {
                 lastScrapped = scrapped;

-                if (!this.qualified(scrapped)) {
+                if (!this.qualified(scrapped) && ((Date.now() - t0) < this.reasonableDelayMs)) {
                     continue;
                 }
                 chargeAmount = this.getChargeAmount(scrapped);
@@ -282,18 +287,27 @@ ${this.content}
     }

     async *fetchSearchResults(mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
-        urls: URL[], options?: ScrappingOptions, noCache = false) {
-
-        for await (const scrapped of this.crawler.scrapMany(urls, options, noCache)) {
+        searchResults: WebSearchResult[], options?: ScrappingOptions, pageCacheTolerance?: number) {
+        const urls = searchResults.map((x) => new URL(x.url));
+        for await (const scrapped of this.crawler.scrapMany(urls, options, pageCacheTolerance)) {
             const mapped = scrapped.map((x, i) => {
-                if (!x) {
+                const upstreamSearchResult = searchResults[i];
+                if (!x || (!x.parsed && mode !== 'markdown')) {
                     const p = {
-                        toString() {
-                            return `[${i + 1}] No content available for ${urls[i]}`;
+                        toString(this: any) {
+                            if (this.title && this.description) {
+                                return `[${i + 1}] Title: ${this.title}
+[${i + 1}] URL Source: ${this.url}
+[${i + 1}] Description: ${this.description}
+`;
+                            }
+                            return `[${i + 1}] No content available for ${this.url}`;
                         }
                     };
                     const r = Object.create(p);
-                    r.url = urls[i].toString();
+                    r.url = upstreamSearchResult.url;
+                    r.title = upstreamSearchResult.title;
+                    r.description = upstreamSearchResult.description;

                     return r;
                 }
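When a page yields no usable snapshot, the mapper now falls back to the upstream Brave search result: an object built with `Object.create` whose prototype `toString` prefers the engine-supplied title and description over a bare "no content" line. A self-contained sketch of that pattern (field names follow the diff; the sample data is invented):

// Sketch of the fallback-result pattern used above: Object.create with a
// prototype that formats whatever fields the instance happens to carry.
interface UpstreamResult { url: string; title?: string; description?: string; }

function makeFallback(upstream: UpstreamResult, i: number) {
    const p = {
        toString(this: any) {
            if (this.title && this.description) {
                return `[${i + 1}] Title: ${this.title}\n[${i + 1}] URL Source: ${this.url}\n[${i + 1}] Description: ${this.description}\n`;
            }
            return `[${i + 1}] No content available for ${this.url}`;
        }
    };
    const r = Object.create(p);
    r.url = upstream.url;
    r.title = upstream.title;
    r.description = upstream.description;
    return r;
}

// Example with invented data:
console.log(`${makeFallback({ url: 'https://example.com', title: 'Example', description: 'A sample page.' }, 0)}`);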
@@ -317,7 +331,7 @@ ${this.content}
 [${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
 [${i + 1}] Markdown Content:
 ${this.content}
-`;
+`;
                 };
             }
         }