mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 11:42:32 +08:00
feat: web search (#57)
This commit is contained in:
parent
f171e54ac9
commit
2e3c217479
3
backend/.gitignore
vendored
3
backend/.gitignore
vendored
|
@ -75,4 +75,5 @@ build/
|
||||||
.DS_Store
|
.DS_Store
|
||||||
|
|
||||||
*.local
|
*.local
|
||||||
.secret.*
|
.secret.*
|
||||||
|
licensed/
|
11
backend/functions/integrity-check.cjs
Executable file
11
backend/functions/integrity-check.cjs
Executable file
|
@ -0,0 +1,11 @@
|
||||||
|
#!/usr/bin/env node
|
||||||
|
|
||||||
|
const fs = require('fs');
|
||||||
|
const path = require('path');
|
||||||
|
|
||||||
|
// Build-time guard: the build script runs this file before `tsc`, so a missing
// GeoLite2 city database aborts the build with a non-zero exit code.
const geoLiteDbPath = path.resolve(__dirname, 'licensed/GeoLite2-City.mmdb');
const geoLiteDbPresent = fs.existsSync(geoLiteDbPath);

if (geoLiteDbPresent === false) {
    console.error(`Integrity check failed: ${geoLiteDbPath} does not exist.`);
    process.exit(1);
}
|
31
backend/functions/package-lock.json
generated
31
backend/functions/package-lock.json
generated
|
@ -24,6 +24,7 @@
|
||||||
"htmlparser2": "^9.0.0",
|
"htmlparser2": "^9.0.0",
|
||||||
"jose": "^5.1.0",
|
"jose": "^5.1.0",
|
||||||
"langdetect": "^0.2.1",
|
"langdetect": "^0.2.1",
|
||||||
|
"maxmind": "^4.3.18",
|
||||||
"minio": "^7.1.3",
|
"minio": "^7.1.3",
|
||||||
"openai": "^4.20.0",
|
"openai": "^4.20.0",
|
||||||
"puppeteer": "^22.7.1",
|
"puppeteer": "^22.7.1",
|
||||||
|
@ -8144,6 +8145,19 @@
|
||||||
"tmpl": "1.0.5"
|
"tmpl": "1.0.5"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/maxmind": {
|
||||||
|
"version": "4.3.18",
|
||||||
|
"resolved": "https://registry.npmjs.org/maxmind/-/maxmind-4.3.18.tgz",
|
||||||
|
"integrity": "sha512-5b9utU7ZxcGYTBaO7hCF0FXyfw3IpankLn+FnLW4RZS1zi97RBeSdfXJFJlk5UxNsMiFZlsdMT3lzvD+bD8MLQ==",
|
||||||
|
"dependencies": {
|
||||||
|
"mmdb-lib": "2.1.0",
|
||||||
|
"tiny-lru": "11.2.5"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=12",
|
||||||
|
"npm": ">=6"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/media-typer": {
|
"node_modules/media-typer": {
|
||||||
"version": "0.3.0",
|
"version": "0.3.0",
|
||||||
"resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
|
"resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
|
||||||
|
@ -8375,6 +8389,15 @@
|
||||||
"resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz",
|
"resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz",
|
||||||
"integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A=="
|
"integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A=="
|
||||||
},
|
},
|
||||||
|
"node_modules/mmdb-lib": {
|
||||||
|
"version": "2.1.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/mmdb-lib/-/mmdb-lib-2.1.0.tgz",
|
||||||
|
"integrity": "sha512-tdDTZmnI5G4UoSctv2KxM/3VQt2XRj4CmR5R4VsAWsOUcS3LysHR34wtixWm/pXxXdkBDuN92auxkC0T2+qd1Q==",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=10",
|
||||||
|
"npm": ">=6"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/mongodb": {
|
"node_modules/mongodb": {
|
||||||
"version": "5.9.2",
|
"version": "5.9.2",
|
||||||
"resolved": "https://registry.npmjs.org/mongodb/-/mongodb-5.9.2.tgz",
|
"resolved": "https://registry.npmjs.org/mongodb/-/mongodb-5.9.2.tgz",
|
||||||
|
@ -11059,6 +11082,14 @@
|
||||||
"resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.13.tgz",
|
"resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.13.tgz",
|
||||||
"integrity": "sha512-JaL9ZnvTbGFMDIBeGdVkLt4qWTeCPw+n7Ock+wceAGRenuHA6nOOvMJFliNDyXsjg2osGKJWsXtO2xc74VxyDw=="
|
"integrity": "sha512-JaL9ZnvTbGFMDIBeGdVkLt4qWTeCPw+n7Ock+wceAGRenuHA6nOOvMJFliNDyXsjg2osGKJWsXtO2xc74VxyDw=="
|
||||||
},
|
},
|
||||||
|
"node_modules/tiny-lru": {
|
||||||
|
"version": "11.2.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/tiny-lru/-/tiny-lru-11.2.5.tgz",
|
||||||
|
"integrity": "sha512-JpqM0K33lG6iQGKiigcwuURAKZlq6rHXfrgeL4/I8/REoyJTGU+tEMszvT/oTRVHG2OiylhGDjqPp1jWMlr3bw==",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=12"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/tld-extract": {
|
"node_modules/tld-extract": {
|
||||||
"version": "2.1.0",
|
"version": "2.1.0",
|
||||||
"resolved": "https://registry.npmjs.org/tld-extract/-/tld-extract-2.1.0.tgz",
|
"resolved": "https://registry.npmjs.org/tld-extract/-/tld-extract-2.1.0.tgz",
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
"name": "reader",
|
"name": "reader",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"lint": "eslint --ext .js,.ts .",
|
"lint": "eslint --ext .js,.ts .",
|
||||||
"build": "tsc -p .",
|
"build": "node ./integrity-check.cjs && tsc -p .",
|
||||||
"build:watch": "tsc --watch",
|
"build:watch": "tsc --watch",
|
||||||
"build:clean": "rm -rf ./build",
|
"build:clean": "rm -rf ./build",
|
||||||
"shell": "npm run build && firebase functions:shell",
|
"shell": "npm run build && firebase functions:shell",
|
||||||
|
@ -44,6 +44,7 @@
|
||||||
"htmlparser2": "^9.0.0",
|
"htmlparser2": "^9.0.0",
|
||||||
"jose": "^5.1.0",
|
"jose": "^5.1.0",
|
||||||
"langdetect": "^0.2.1",
|
"langdetect": "^0.2.1",
|
||||||
|
"maxmind": "^4.3.18",
|
||||||
"minio": "^7.1.3",
|
"minio": "^7.1.3",
|
||||||
"openai": "^4.20.0",
|
"openai": "^4.20.0",
|
||||||
"puppeteer": "^22.7.1",
|
"puppeteer": "^22.7.1",
|
||||||
|
|
|
@ -2,7 +2,7 @@ import {
|
||||||
assignTransferProtocolMeta, marshalErrorLike,
|
assignTransferProtocolMeta, marshalErrorLike,
|
||||||
RPCHost, RPCReflection,
|
RPCHost, RPCReflection,
|
||||||
HashManager,
|
HashManager,
|
||||||
AssertionFailureError, ParamValidationError,
|
AssertionFailureError, ParamValidationError, Defer,
|
||||||
} from 'civkit';
|
} from 'civkit';
|
||||||
import { singleton } from 'tsyringe';
|
import { singleton } from 'tsyringe';
|
||||||
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
||||||
|
@ -34,6 +34,12 @@ export class CrawlerHost extends RPCHost {
|
||||||
cacheValidMs = 1000 * 300;
|
cacheValidMs = 1000 * 300;
|
||||||
urlValidMs = 1000 * 3600 * 4;
|
urlValidMs = 1000 * 3600 * 4;
|
||||||
|
|
||||||
|
indexText = `[Usage1] https://r.jina.ai/YOUR_URL
|
||||||
|
[Usage2] https://s.jina.ai/YOUR_SEARCH_QUERY
|
||||||
|
[Homepage] https://jina.ai/reader
|
||||||
|
[Source code] https://github.com/jina-ai/reader
|
||||||
|
`;
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
protected globalLogger: Logger,
|
protected globalLogger: Logger,
|
||||||
protected puppeteerControl: PuppeteerControl,
|
protected puppeteerControl: PuppeteerControl,
|
||||||
|
@ -357,10 +363,7 @@ ${this.content}
|
||||||
[Balance left] ${latestUser.wallet.total_balance}
|
[Balance left] ${latestUser.wallet.total_balance}
|
||||||
` : '';
|
` : '';
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`[Usage] https://r.jina.ai/YOUR_URL
|
return assignTransferProtocolMeta(`${this.indexText}${authMixin}`,
|
||||||
[Homepage] https://jina.ai/reader
|
|
||||||
[Source code] https://github.com/jina-ai/reader
|
|
||||||
${authMixin}`,
|
|
||||||
{ contentType: 'text/plain', envelope: null }
|
{ contentType: 'text/plain', envelope: null }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -638,13 +641,13 @@ ${authMixin}`,
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
async *cachedScrap(urlToCrawl: URL, crawlOpts: ScrappingOptions, noCache: boolean = false) {
|
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, noCache: boolean = false) {
|
||||||
let cache;
|
let cache;
|
||||||
if (!noCache && !crawlOpts.cookies?.length) {
|
if (!noCache && !crawlOpts?.cookies?.length) {
|
||||||
cache = await this.queryCache(urlToCrawl);
|
cache = await this.queryCache(urlToCrawl);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cache?.isFresh && (!crawlOpts.favorScreenshot || (crawlOpts.favorScreenshot && cache?.screenshotAvailable))) {
|
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
|
||||||
yield cache.snapshot;
|
yield cache.snapshot;
|
||||||
|
|
||||||
return;
|
return;
|
||||||
|
@ -683,4 +686,47 @@ ${authMixin}`,
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Scrapes several URLs concurrently and streams the combined progress.
 *
 * One `cachedScrap` iterator is started per URL. The generator yields the
 * same `results` array repeatedly; slot `i` always holds the latest snapshot
 * seen for `urls[i]` (or `undefined` while nothing has arrived). The array
 * is mutated in place between yields, so consumers that need a stable view
 * must copy it.
 *
 * A failure of one URL is logged and leaves its slot as it was; it no longer
 * rejects the un-awaited aggregate promise (previously an unhandled
 * rejection) nor cuts the other URLs short.
 *
 * @param urls URLs to scrape; result slots follow this order.
 * @param options Scrapping options forwarded to every `cachedScrap` call.
 * @param noCache Forwarded to `cachedScrap` to bypass its cache lookup.
 */
async *scrapMany(urls: URL[], options?: ScrappingOptions, noCache = false) {
    const iterators = urls.map((url) => this.cachedScrap(url, options, noCache));

    const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);

    // Resolved (and immediately replaced) every time any slot gets a new snapshot.
    let nextDeferred = Defer();
    let concluded = false;

    const handler = async (it: AsyncGenerator<PageSnapshot | undefined>, idx: number) => {
        try {
            for await (const x of it) {
                results[idx] = x;

                if (x) {
                    // Wake the consumer loop below, then arm a fresh deferred for the next update.
                    nextDeferred.resolve();
                    nextDeferred = Defer();
                }
            }
        } catch (err: any) {
            // Contain the failure to this URL so the remaining iterators keep running.
            this.globalLogger.child({ service: this.constructor.name }).warn(
                `Failed to scrap ${urls[idx]}`, { err: marshalErrorLike(err) }
            );
        }
    };

    // Intentionally not awaited: progress is observed through `nextDeferred`.
    // `handler` never rejects, so this chain cannot produce an unhandled rejection.
    void Promise.all(
        iterators.map((it, idx) => handler(it, idx))
    ).finally(() => {
        concluded = true;
        nextDeferred.resolve();
    });

    yield results;

    try {
        while (!concluded) {
            await nextDeferred.promise;

            yield results;
        }
    } finally {
        // Runs on normal completion and when the consumer stops early:
        // ask every underlying iterator to finish.
        for (const x of iterators) {
            x.return();
        }
    }
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
389
backend/functions/src/cloud-functions/searcher.ts
Normal file
389
backend/functions/src/cloud-functions/searcher.ts
Normal file
|
@ -0,0 +1,389 @@
|
||||||
|
import {
|
||||||
|
assignTransferProtocolMeta, marshalErrorLike,
|
||||||
|
RPCHost, RPCReflection,
|
||||||
|
AssertionFailureError,
|
||||||
|
objHashMd5B64Of,
|
||||||
|
} from 'civkit';
|
||||||
|
import { singleton } from 'tsyringe';
|
||||||
|
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
||||||
|
import { RateLimitControl } from '../shared/services/rate-limit';
|
||||||
|
import _ from 'lodash';
|
||||||
|
import { ScrappingOptions } from '../services/puppeteer';
|
||||||
|
import { Request, Response } from 'express';
|
||||||
|
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
||||||
|
import { BraveSearchService } from '../services/brave-search';
|
||||||
|
import { CrawlerHost } from './crawler';
|
||||||
|
import { CookieParam } from 'puppeteer';
|
||||||
|
|
||||||
|
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||||
|
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
||||||
|
import { SearchResult } from '../db/searched';
|
||||||
|
import { WebSearchApiResponse } from '../shared/3rd-party/brave-types';
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * RPC host for the web search endpoint.
 *
 * A request path is treated as a search query: the query is sent to the
 * Brave search service (through a Firestore-backed cache), the resulting
 * URLs are scraped through {@link CrawlerHost}, and the formatted pages are
 * returned as an event stream, JSON, or plain text depending on `Accept`.
 */
@singleton()
export class SearcherHost extends RPCHost {
    logger = this.globalLogger.child({ service: this.constructor.name });

    /** How long a cached search response is retained (written to `expireAt`): 7 days. */
    cacheRetentionMs = 1000 * 3600 * 24 * 7;
    /** How long a cached search response is served without refetching: 1 hour. */
    cacheValidMs = 1000 * 3600;

    constructor(
        protected globalLogger: Logger,
        protected rateLimitControl: RateLimitControl,
        protected threadLocal: AsyncContext,
        protected braveSearchService: BraveSearchService,
        protected crawler: CrawlerHost,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Handles a search request.
     *
     * An empty path returns the usage text (plus account details when
     * authenticated). Otherwise the path is the search query: the caller is
     * balance-checked and rate limited (per uid, or per IP when anonymous),
     * the top 5 search results are scraped, and the outcome is returned as
     *  - an SSE stream when `text/event-stream` is accepted and `text/plain` is not,
     *  - the formatted result array when JSON is accepted and `text/plain` is not,
     *  - plain text otherwise.
     * For the non-streaming forms the first fully "qualified" batch is
     * returned; if none qualifies, the last batch seen is returned.
     *
     * @throws InsufficientBalanceError when an authenticated user has no balance left.
     * @throws AssertionFailureError when no result batch was produced at all.
     */
    @CloudHTTPv2({
        name: 'search2',
        runtime: {
            memory: '4GiB',
            timeoutSeconds: 300,
            concurrency: 4,
        },
        tags: ['Searcher'],
        httpMethod: ['get', 'post'],
        returnType: [String, OutputServerEventStream],
        exposeRoot: true,
    })
    @CloudHTTPv2({
        runtime: {
            memory: '8GiB',
            timeoutSeconds: 300,
            concurrency: 8,
            maxInstances: 200,
        },
        openapi: {
            operation: {
                parameters: {
                    'Accept': {
                        description: `Specifies your preference for the response format. \n\n` +
                            `Supported formats:\n` +
                            `- text/event-stream\n` +
                            `- application/json  or  text/json\n` +
                            `- text/plain`
                        ,
                        in: 'header',
                        schema: { type: 'string' }
                    },
                    'X-No-Cache': {
                        description: `Ignores internal cache if this header is specified with a value.`,
                        in: 'header',
                        schema: { type: 'string' }
                    },
                    'X-Respond-With': {
                        description: `Specifies the (non-default) form factor of the crawled data you prefer. \n\n` +
                            `Supported formats:\n` +
                            `- markdown\n` +
                            `- html\n` +
                            `- text\n` +
                            `- screenshot\n`
                        ,
                        in: 'header',
                        schema: { type: 'string' }
                    },
                    'X-Proxy-Url': {
                        description: `Specifies your custom proxy if you prefer to use one. \n\n` +
                            `Supported protocols:\n` +
                            `- http\n` +
                            `- https\n` +
                            `- socks4\n` +
                            `- socks5\n\n` +
                            `For authentication, https://user:pass@host:port`,
                        in: 'header',
                        schema: { type: 'string' }
                    },
                    'X-Set-Cookie': {
                        description: `Sets cookie(s) to the headless browser for your request. \n\n` +
                            `Syntax is the same with standard Set-Cookie`,
                        in: 'header',
                        schema: { type: 'string' }
                    },
                    'X-With-Generated-Alt': {
                        description: `Enable automatic alt-text generating for images without an meaningful alt-text.`,
                        in: 'header',
                        schema: { type: 'string' }
                    },
                }
            }
        },
        tags: ['Searcher'],
        httpMethod: ['get', 'post'],
        returnType: [String, OutputServerEventStream],
        exposeRoot: true,
    })
    async search(
        @RPCReflect() rpcReflect: RPCReflection,
        @Ctx() ctx: {
            req: Request,
            res: Response,
        },
        auth: JinaEmbeddingsAuthDTO
    ) {
        const uid = await auth.solveUID();
        // Reported by the `finally` hook below once the request is done.
        let chargeAmount = 0;
        const noSlashPath = ctx.req.url.slice(1);
        if (!noSlashPath) {
            // Bare root request: answer with the usage text instead of searching.
            const latestUser = uid ? await auth.assertUser() : undefined;
            const authMixin = latestUser ? `
[Authenticated as] ${latestUser.user_id} (${latestUser.full_name})
[Balance left] ${latestUser.wallet.total_balance}
` : '';

            return assignTransferProtocolMeta(`${this.crawler.indexText}${authMixin}`,
                { contentType: 'text/plain', envelope: null }
            );
        }

        if (uid) {
            const user = await auth.assertUser();
            if (!(user.wallet.total_balance > 0)) {
                throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
            }

            await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'],
                [
                    // 1000 requests per minute
                    new Date(Date.now() - 60 * 1000), 1000
                ]
            );

            rpcReflect.finally(() => {
                if (chargeAmount) {
                    auth.reportUsage(chargeAmount, 'reader-crawl').catch((err) => {
                        this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
                    });
                }
            });
        } else if (ctx.req.ip) {
            // Anonymous caller: remember the IP for downstream services and limit by it.
            this.threadLocal.set('ip', ctx.req.ip);
            await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'],
                [
                    // 100 requests per minute
                    new Date(Date.now() - 60 * 1000), 100
                ]
            );
        }

        const customMode = ctx.req.get('x-respond-with') || 'default';
        const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
        const noCache = Boolean(ctx.req.get('x-no-cache'));
        const cookies: CookieParam[] = [];
        const setCookieHeaders = ctx.req.headers['x-set-cookie'];
        if (Array.isArray(setCookieHeaders)) {
            for (const setCookie of setCookieHeaders) {
                cookies.push({
                    ...parseSetCookieString(setCookie, { decodeValues: false }) as CookieParam,
                });
            }
        } else if (setCookieHeaders) {
            cookies.push({
                ...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
            });
        }
        this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
        const crawlOpts: ScrappingOptions = {
            proxyUrl: ctx.req.get('x-proxy-url'),
            cookies,
            favorScreenshot: customMode === 'screenshot'
        };

        // `req.url` is still percent-encoded; decode it so that e.g.
        // `hello%20world` is searched as "hello world". Malformed escapes make
        // decodeURIComponent throw, in which case the raw text is used as-is.
        let searchQuery = noSlashPath;
        try {
            searchQuery = decodeURIComponent(noSlashPath);
        } catch {
            searchQuery = noSlashPath;
        }

        // X-No-Cache also bypasses the cached search response, not just page caches.
        const r = await this.cachedWebSearch({
            q: searchQuery,
            count: 5
        }, noCache);

        const urls = r.web.results.map((x) => new URL(x.url));
        const it = this.fetchSearchResults(customMode, urls, crawlOpts, noCache);

        if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
            const sseStream = new OutputServerEventStream();
            // Hand the stream to the transport right away; events are written as they arrive.
            rpcReflect.return(sseStream);

            try {
                for await (const scrapped of it) {
                    if (!scrapped) {
                        continue;
                    }

                    chargeAmount = this.getChargeAmount(scrapped);
                    sseStream.write({
                        event: 'data',
                        data: scrapped,
                    });
                }
            } catch (err: any) {
                this.logger.error(`Failed to collect search result for query ${searchQuery}`,
                    { err: marshalErrorLike(err) }
                );
                sseStream.write({
                    event: 'error',
                    data: marshalErrorLike(err),
                });
            }

            sseStream.end();

            return sseStream;
        }

        let lastScrapped;
        if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
            for await (const scrapped of it) {
                lastScrapped = scrapped;

                if (!this.qualified(scrapped)) {
                    continue;
                }

                chargeAmount = this.getChargeAmount(scrapped);

                return scrapped;
            }

            if (!lastScrapped) {
                throw new AssertionFailureError(`No content available for query ${searchQuery}`);
            }

            // Nothing fully qualified: fall back to the last batch seen.
            chargeAmount = this.getChargeAmount(lastScrapped);

            return lastScrapped;
        }

        for await (const scrapped of it) {
            lastScrapped = scrapped;

            if (!this.qualified(scrapped)) {
                continue;
            }
            chargeAmount = this.getChargeAmount(scrapped);

            return assignTransferProtocolMeta(`${scrapped}`, { contentType: 'text/plain', envelope: null });
        }

        if (!lastScrapped) {
            throw new AssertionFailureError(`No content available for query ${searchQuery}`);
        }

        // Nothing fully qualified: fall back to the last batch seen.
        chargeAmount = this.getChargeAmount(lastScrapped);

        return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
    }

    /**
     * Scrapes `urls` through the crawler and yields, for every progress
     * update, an array of formatted results in the same order as `urls`.
     *
     * Slots without a snapshot yet are placeholders carrying only `url` and an
     * inherited `toString()`. Formatted results that define their own
     * `toString` get it replaced with a numbered (`[n]`) variant, and the
     * array's `toString()` joins all entries into one text document.
     *
     * @param mode Output form factor passed to `crawler.formatSnapshot`.
     * @param urls URLs to scrape.
     * @param options Scrapping options forwarded to the crawler.
     * @param noCache Forwarded to the crawler to bypass its cache.
     */
    async *fetchSearchResults(mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
        urls: URL[], options?: ScrappingOptions, noCache = false) {

        for await (const scrapped of this.crawler.scrapMany(urls, options, noCache)) {
            const mapped = scrapped.map((x, i) => {
                if (!x) {
                    // toString lives on the prototype, so the placeholder's only own property is `url`.
                    const p = {
                        toString() {
                            return `[${i + 1}] No content available for ${urls[i]}`;
                        }
                    };
                    const r = Object.create(p);
                    r.url = urls[i].toString();

                    return r;
                }
                return this.crawler.formatSnapshot(mode, x, urls[i]);
            });

            const resultArray = await Promise.all(mapped);
            for (const [i, result] of resultArray.entries()) {
                // Only results with an own `toString` are re-labelled; placeholders inherit theirs.
                if (result && typeof result === 'object' && Object.hasOwn(result, 'toString')) {
                    result.toString = function (this: any) {
                        const mixins = [];
                        if (this.publishedTime) {
                            mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
                        }

                        if (mode === 'markdown') {
                            return `[${i + 1}]\n${this.content}`;
                        }

                        return `[${i + 1}] Title: ${this.title}
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
[${i + 1}] Markdown Content:
${this.content}
`;
                    };
                }
            }
            resultArray.toString = function () {
                return this.map((x, i) => x ? x.toString() : `[${i + 1}] No content available for ${urls[i]}`).join('\n\n').trimEnd() + '\n';
            };

            yield resultArray;
        }
    }

    /**
     * Sums the crawler's charge amount over every entry of a result batch,
     * counting entries without a charge as 0.
     */
    getChargeAmount(formatted: any[]) {
        return _.sum(
            formatted.map((x) => this.crawler.getChargeAmount(x) || 0)
        );
    }

    /**
     * Tells whether every entry of the batch has a title and at least one
     * kind of payload (content, screenshot URL, screenshot, text or html).
     */
    qualified(scrapped: any[]) {
        return _.every(scrapped, (x) =>
            (x as any)?.title &&
            (
                (x as any).content ||
                (x as any).screenShotUrl ||
                (x as any).screenshot ||
                (x as any).text ||
                (x as any).html
            )
        );
    }

    /**
     * Runs a web search, serving a cached response when one younger than
     * `cacheValidMs` exists for the same query digest.
     *
     * Fresh responses are saved in the background with an `expireAt` of
     * `cacheRetentionMs` from now; a failed save is only logged.
     *
     * @param query Search parameters; their MD5 digest is the cache key.
     * @param noCache When true the cache lookup is skipped (the result is still saved).
     */
    async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
        const queryDigest = objHashMd5B64Of(query);
        let cache;
        if (!noCache) {
            cache = (await SearchResult.fromFirestoreQuery(
                SearchResult.COLLECTION.where('queryDigest', '==', queryDigest)
                    .orderBy('createdAt', 'desc')
                    .limit(1)
            ))[0];
            if (cache) {
                const age = Date.now() - cache.createdAt.valueOf();
                const stale = cache.createdAt.valueOf() < (Date.now() - this.cacheValidMs);
                this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for search query "${query.q}", normalized digest: ${queryDigest}, ${age}ms old`, {
                    query, digest: queryDigest, age, stale
                });

                if (!stale) {
                    return cache.response as WebSearchApiResponse;
                }
            }
        }

        const r = await this.braveSearchService.webSearch(query);

        const nowDate = new Date();
        const record = SearchResult.from({
            query,
            queryDigest,
            response: r,
            createdAt: nowDate,
            expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
        });
        // Fire-and-forget: a cache write failure must not fail the search itself.
        SearchResult.save(record).catch((err) => {
            this.logger.warn(`Failed to cache search result`, { err: marshalErrorLike(err) });
        });

        return r;
    }
}
|
60
backend/functions/src/db/searched.ts
Normal file
60
backend/functions/src/db/searched.ts
Normal file
|
@ -0,0 +1,60 @@
|
||||||
|
import { Also, parseJSONText, Prop } from 'civkit';
|
||||||
|
import { FirestoreRecord } from '../shared/lib/firestore';
|
||||||
|
import _ from 'lodash';
|
||||||
|
|
||||||
|
/**
 * Firestore record of one cached web search: the query, its digest and the
 * search response.
 *
 * `query` and `response` are stored as JSON text in Firestore (see
 * {@link SearchResult.patchedFields}) and turned back into objects on load.
 */
@Also({
    dictOf: Object
})
export class SearchResult extends FirestoreRecord {
    static override collectionName = 'searchResults';

    override _id!: string;

    @Prop({
        required: true
    })
    query!: any;

    @Prop({
        required: true
    })
    queryDigest!: string;

    @Prop()
    response?: any;

    @Prop()
    createdAt!: Date;

    @Prop()
    expireAt?: Date;

    [k: string]: any;

    /** Fields serialized to JSON text on save and parsed back on load. */
    static patchedFields = [
        'query',
        'response',
    ];

    /**
     * Builds a record from raw data, parsing any patched field that arrives
     * as JSON text.
     *
     * Works on a shallow copy so the caller's object is never modified.
     */
    static override from(input: any) {
        const patched: any = { ...input };
        for (const field of this.patchedFields) {
            if (typeof patched[field] === 'string') {
                patched[field] = parseJSONText(patched[field]);
            }
        }

        return super.from(patched) as SearchResult;
    }

    /**
     * Returns a shallow copy of this record in which every patched field
     * holding an object has been replaced by its JSON text.
     */
    override degradeForFireStore() {
        const copy: any = { ...this };

        for (const field of (this.constructor as typeof SearchResult).patchedFields) {
            if (typeof copy[field] === 'object') {
                copy[field] = JSON.stringify(copy[field]) as any;
            }
        }

        return copy;
    }
}
|
71
backend/functions/src/services/brave-search.ts
Normal file
71
backend/functions/src/services/brave-search.ts
Normal file
|
@ -0,0 +1,71 @@
|
||||||
|
import { AsyncService, DownstreamServiceFailureError } from 'civkit';
|
||||||
|
import { singleton } from 'tsyringe';
|
||||||
|
import { Logger } from '../shared/services/logger';
|
||||||
|
import { SecretExposer } from '../shared/services/secrets';
|
||||||
|
import { BraveSearchHTTP, WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
||||||
|
import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
|
||||||
|
import { AsyncContext } from '../shared';
|
||||||
|
import { WebSearchOptionalHeaderOptions } from '../shared/3rd-party/brave-types';
|
||||||
|
|
||||||
|
/**
 * Service wrapper around the Brave search HTTP client that enriches each
 * request with location and user-agent headers taken from the async context.
 */
@singleton()
export class BraveSearchService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    braveSearchHTTP!: BraveSearchHTTP;

    constructor(
        protected globalLogger: Logger,
        protected secretExposer: SecretExposer,
        protected geoipControl: GeoIPService,
        protected threadLocal: AsyncContext,
    ) {
        super(...arguments);
    }

    /**
     * Waits for dependencies, creates the HTTP client, then signals readiness.
     *
     * The client is created before `'ready'` is emitted so that nothing
     * reacting to the event can observe an uninitialized `braveSearchHTTP`.
     */
    override async init() {
        await this.dependencyReady();

        this.braveSearchHTTP = new BraveSearchHTTP(this.secretExposer.BRAVE_SEARCH_API_KEY);

        this.emit('ready');
    }

    /**
     * Performs a web search.
     *
     * When the async context carries an `ip`, its GeoIP city lookup is turned
     * into `X-Loc-*` headers; a context `userAgent` is forwarded as
     * `User-Agent`.
     *
     * @param query Search parameters passed to the HTTP client.
     * @returns The parsed search response.
     * @throws DownstreamServiceFailureError when the HTTP call fails (original error in `cause`).
     */
    async webSearch(query: WebSearchQueryParams) {
        const ip = this.threadLocal.get('ip');
        const extraHeaders: WebSearchOptionalHeaderOptions = {};
        if (ip) {
            const geoip = await this.geoipControl.lookupCity(ip, GEOIP_SUPPORTED_LANGUAGES.EN);

            if (geoip?.city) {
                extraHeaders['X-Loc-City'] = geoip.city;
            }
            if (geoip?.country) {
                extraHeaders['X-Loc-Country'] = geoip.country.code;
            }
            if (geoip?.timezone) {
                extraHeaders['X-Loc-Timezone'] = geoip.timezone;
            }
            if (geoip?.coordinates) {
                // coordinates[0] / [1] are sent as latitude / longitude.
                extraHeaders['X-Loc-Lat'] = `${geoip.coordinates[0]}`;
                extraHeaders['X-Loc-Long'] = `${geoip.coordinates[1]}`;
            }
            if (geoip?.subdivisions?.length) {
                extraHeaders['X-Loc-State'] = geoip.subdivisions[0].code;
                extraHeaders['X-Loc-State-Name'] = geoip.subdivisions[0].name;
            }
        }
        const userAgent = this.threadLocal.get('userAgent');
        if (userAgent) {
            extraHeaders['User-Agent'] = userAgent;
        }

        try {
            const r = await this.braveSearchHTTP.webSearch(query, { headers: extraHeaders as Record<string, string> });

            return r.parsed;
        } catch (err) {
            throw new DownstreamServiceFailureError({ message: `Search failed`, cause: err });
        }

    }

}
|
123
backend/functions/src/services/geoip.ts
Normal file
123
backend/functions/src/services/geoip.ts
Normal file
|
@ -0,0 +1,123 @@
|
||||||
|
import { container, singleton } from 'tsyringe';
|
||||||
|
import fsp from 'fs/promises';
|
||||||
|
import { CityResponse, Reader } from 'maxmind';
|
||||||
|
import { AsyncService, AutoCastable, Prop, runOnce } from 'civkit';
|
||||||
|
import { Logger } from '../shared';
|
||||||
|
import path from 'path';
|
||||||
|
|
||||||
|
/**
 * Language keys accepted by `GeoIPService.lookupCity` for choosing which
 * localized name to read from a GeoIP record.
 */
export enum GEOIP_SUPPORTED_LANGUAGES {
    /** English */
    EN = 'en',
    /** Simplified Chinese */
    ZH_CN = 'zh-CN',
    /** Japanese */
    JA = 'ja',
    /** German */
    DE = 'de',
    /** French */
    FR = 'fr',
    /** Spanish */
    ES = 'es',
    /** Brazilian Portuguese */
    PT_BR = 'pt-BR',
    /** Russian */
    RU = 'ru',
}
|
||||||
|
|
||||||
|
/**
 * A code / display-name pair describing one geographic entity
 * (used for continents and subdivisions, and extended for countries).
 */
export class GeoIPInfo extends AutoCastable {
    /** Short code of the entity. */
    @Prop()
    code?: string;

    /** Localized display name of the entity. */
    @Prop()
    name?: string;
}
|
||||||
|
|
||||||
|
/** Country entry: a {@link GeoIPInfo} plus an EU membership flag. */
export class GeoIPCountryInfo extends GeoIPInfo {
    /** Whether the country is in the European Union. */
    @Prop()
    eu?: boolean;
}
|
||||||
|
|
||||||
|
/** Simplified result of a city-level GeoIP lookup. */
export class GeoIPCityResponse extends AutoCastable {
    /** Continent of the address, when known. */
    @Prop()
    continent?: GeoIPInfo;

    /** Country of the address, when known. */
    @Prop()
    country?: GeoIPCountryInfo;

    /** Subdivisions of the country for the address. */
    @Prop({
        arrayOf: GeoIPInfo
    })
    subdivisions?: GeoIPInfo[];

    /** Localized city name. */
    @Prop()
    city?: string;

    /** `[latitude, longitude, accuracy radius]` as filled in by `GeoIPService.lookupCity`. */
    @Prop({
        arrayOf: Number
    })
    coordinates?: [number, number, number];

    /** Time zone identifier of the location. */
    @Prop()
    timezone?: string;
}
|
||||||
|
|
||||||
|
/**
 * GeoIP lookup service backed by the GeoLite2 city database, which is read
 * from disk on first use.
 */
@singleton()
export class GeoIPService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    /** Database reader; assigned by {@link GeoIPService._lazyload}. */
    mmdbCity!: Reader<CityResponse>;

    constructor(
        protected globalLogger: Logger,
    ) {
        super(...arguments);
    }


    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Reads `licensed/GeoLite2-City.mmdb` (two directories above this
     * module's directory) into memory and creates the reader.
     * Decorated with `runOnce`, so it is meant to do its work a single time.
     */
    @runOnce()
    async _lazyload() {
        const mmdbPath = path.resolve(__dirname, '..', '..', 'licensed', 'GeoLite2-City.mmdb');

        // `encoding: null` makes readFile return a Buffer, which is what Reader takes.
        const dbBuff = await fsp.readFile(mmdbPath, { flag: 'r', encoding: null });

        this.mmdbCity = new Reader<CityResponse>(dbBuff);

        this.logger.info(`Loaded GeoIP database, ${dbBuff.byteLength} bytes`);
    }


    /**
     * Looks up city-level information for an IP address.
     *
     * Names are taken in `lang` when available, falling back to English.
     *
     * @param ip IP address to look up.
     * @param lang Preferred language for names; defaults to English.
     * @returns The lookup result, or `undefined` when the database has no record.
     */
    async lookupCity(ip: string, lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) {
        await this._lazyload();

        const r = this.mmdbCity.get(ip);

        if (!r) {
            return undefined;
        }

        return GeoIPCityResponse.from({
            continent: r.continent ? {
                code: r.continent.code,
                name: r.continent.names?.[lang] || r.continent.names?.en,
            } : undefined,
            country: r.country ? {
                code: r.country.iso_code,
                // `names?.en` (not `names.en`) so a record without names yields undefined instead of throwing.
                name: r.country.names?.[lang] || r.country.names?.en,
                eu: r.country.is_in_european_union,
            } : undefined,
            city: r.city?.names?.[lang] || r.city?.names?.en,
            subdivisions: r.subdivisions?.map((x) => ({
                code: x.iso_code,
                name: x.names?.[lang] || x.names?.en,
            })),
            coordinates: r.location ? [
                r.location.latitude, r.location.longitude, r.location.accuracy_radius
            ] : undefined,
            timezone: r.location?.time_zone,
        });
    }

}
|
||||||
|
|
||||||
|
// Resolve the GeoIPService singleton from the tsyringe container at module
// load and expose it as this module's default export.
const geoIPService = container.resolve(GeoIPService);

export default geoIPService;
|
|
@ -278,7 +278,7 @@ document.addEventListener('load', handlePageLoad);
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
async *scrap(parsedUrl: URL, options: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
|
async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
|
||||||
// parsedUrl.search = '';
|
// parsedUrl.search = '';
|
||||||
const url = parsedUrl.toString();
|
const url = parsedUrl.toString();
|
||||||
|
|
||||||
|
@ -287,10 +287,10 @@ document.addEventListener('load', handlePageLoad);
|
||||||
let screenshot: Buffer | undefined;
|
let screenshot: Buffer | undefined;
|
||||||
|
|
||||||
const page = await this.pagePool.acquire();
|
const page = await this.pagePool.acquire();
|
||||||
if (options.proxyUrl) {
|
if (options?.proxyUrl) {
|
||||||
await page.useProxy(options.proxyUrl);
|
await page.useProxy(options.proxyUrl);
|
||||||
}
|
}
|
||||||
if (options.cookies) {
|
if (options?.cookies) {
|
||||||
await page.setCookie(...options.cookies);
|
await page.setCookie(...options.cookies);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -353,7 +353,7 @@ document.addEventListener('load', handlePageLoad);
|
||||||
yield { ...snapshot, screenshot } as PageSnapshot;
|
yield { ...snapshot, screenshot } as PageSnapshot;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (options.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
||||||
screenshot = await page.screenshot();
|
screenshot = await page.screenshot();
|
||||||
lastHTML = snapshot.html;
|
lastHTML = snapshot.html;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
Subproject commit 584791b789cd483dab18735416744b4d10130993
|
Subproject commit 2f2cdcff7b2738be33ee5aca858ef2d65eba29ed
|
Loading…
Reference in New Issue
Block a user