feat: web search (#57)

This commit is contained in:
Yanlong Wang 2024-05-14 19:39:43 +08:00 committed by GitHub
parent f171e54ac9
commit 2e3c217479
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 748 additions and 15 deletions

3
backend/.gitignore vendored
View File

@ -75,4 +75,5 @@ build/
.DS_Store .DS_Store
*.local *.local
.secret.* .secret.*
licensed/

View File

@ -0,0 +1,11 @@
#!/usr/bin/env node
// Build-time guard: abort with a non-zero exit code when the licensed GeoIP
// database file is not present next to this script.
const { existsSync } = require('fs');
const { resolve } = require('path');

const file = resolve(__dirname, 'licensed/GeoLite2-City.mmdb');
const databasePresent = existsSync(file);

if (databasePresent === false) {
    console.error(`Integrity check failed: ${file} does not exist.`);
    process.exit(1);
}

View File

@ -24,6 +24,7 @@
"htmlparser2": "^9.0.0", "htmlparser2": "^9.0.0",
"jose": "^5.1.0", "jose": "^5.1.0",
"langdetect": "^0.2.1", "langdetect": "^0.2.1",
"maxmind": "^4.3.18",
"minio": "^7.1.3", "minio": "^7.1.3",
"openai": "^4.20.0", "openai": "^4.20.0",
"puppeteer": "^22.7.1", "puppeteer": "^22.7.1",
@ -8144,6 +8145,19 @@
"tmpl": "1.0.5" "tmpl": "1.0.5"
} }
}, },
"node_modules/maxmind": {
"version": "4.3.18",
"resolved": "https://registry.npmjs.org/maxmind/-/maxmind-4.3.18.tgz",
"integrity": "sha512-5b9utU7ZxcGYTBaO7hCF0FXyfw3IpankLn+FnLW4RZS1zi97RBeSdfXJFJlk5UxNsMiFZlsdMT3lzvD+bD8MLQ==",
"dependencies": {
"mmdb-lib": "2.1.0",
"tiny-lru": "11.2.5"
},
"engines": {
"node": ">=12",
"npm": ">=6"
}
},
"node_modules/media-typer": { "node_modules/media-typer": {
"version": "0.3.0", "version": "0.3.0",
"resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
@ -8375,6 +8389,15 @@
"resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz",
"integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==" "integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A=="
}, },
"node_modules/mmdb-lib": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/mmdb-lib/-/mmdb-lib-2.1.0.tgz",
"integrity": "sha512-tdDTZmnI5G4UoSctv2KxM/3VQt2XRj4CmR5R4VsAWsOUcS3LysHR34wtixWm/pXxXdkBDuN92auxkC0T2+qd1Q==",
"engines": {
"node": ">=10",
"npm": ">=6"
}
},
"node_modules/mongodb": { "node_modules/mongodb": {
"version": "5.9.2", "version": "5.9.2",
"resolved": "https://registry.npmjs.org/mongodb/-/mongodb-5.9.2.tgz", "resolved": "https://registry.npmjs.org/mongodb/-/mongodb-5.9.2.tgz",
@ -11059,6 +11082,14 @@
"resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.13.tgz", "resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.13.tgz",
"integrity": "sha512-JaL9ZnvTbGFMDIBeGdVkLt4qWTeCPw+n7Ock+wceAGRenuHA6nOOvMJFliNDyXsjg2osGKJWsXtO2xc74VxyDw==" "integrity": "sha512-JaL9ZnvTbGFMDIBeGdVkLt4qWTeCPw+n7Ock+wceAGRenuHA6nOOvMJFliNDyXsjg2osGKJWsXtO2xc74VxyDw=="
}, },
"node_modules/tiny-lru": {
"version": "11.2.5",
"resolved": "https://registry.npmjs.org/tiny-lru/-/tiny-lru-11.2.5.tgz",
"integrity": "sha512-JpqM0K33lG6iQGKiigcwuURAKZlq6rHXfrgeL4/I8/REoyJTGU+tEMszvT/oTRVHG2OiylhGDjqPp1jWMlr3bw==",
"engines": {
"node": ">=12"
}
},
"node_modules/tld-extract": { "node_modules/tld-extract": {
"version": "2.1.0", "version": "2.1.0",
"resolved": "https://registry.npmjs.org/tld-extract/-/tld-extract-2.1.0.tgz", "resolved": "https://registry.npmjs.org/tld-extract/-/tld-extract-2.1.0.tgz",

View File

@ -2,7 +2,7 @@
"name": "reader", "name": "reader",
"scripts": { "scripts": {
"lint": "eslint --ext .js,.ts .", "lint": "eslint --ext .js,.ts .",
"build": "tsc -p .", "build": "node ./integrity-check.cjs && tsc -p .",
"build:watch": "tsc --watch", "build:watch": "tsc --watch",
"build:clean": "rm -rf ./build", "build:clean": "rm -rf ./build",
"shell": "npm run build && firebase functions:shell", "shell": "npm run build && firebase functions:shell",
@ -44,6 +44,7 @@
"htmlparser2": "^9.0.0", "htmlparser2": "^9.0.0",
"jose": "^5.1.0", "jose": "^5.1.0",
"langdetect": "^0.2.1", "langdetect": "^0.2.1",
"maxmind": "^4.3.18",
"minio": "^7.1.3", "minio": "^7.1.3",
"openai": "^4.20.0", "openai": "^4.20.0",
"puppeteer": "^22.7.1", "puppeteer": "^22.7.1",

View File

@ -2,7 +2,7 @@ import {
assignTransferProtocolMeta, marshalErrorLike, assignTransferProtocolMeta, marshalErrorLike,
RPCHost, RPCReflection, RPCHost, RPCReflection,
HashManager, HashManager,
AssertionFailureError, ParamValidationError, AssertionFailureError, ParamValidationError, Defer,
} from 'civkit'; } from 'civkit';
import { singleton } from 'tsyringe'; import { singleton } from 'tsyringe';
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared'; import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
@ -34,6 +34,12 @@ export class CrawlerHost extends RPCHost {
cacheValidMs = 1000 * 300; cacheValidMs = 1000 * 300;
urlValidMs = 1000 * 3600 * 4; urlValidMs = 1000 * 3600 * 4;
indexText = `[Usage1] https://r.jina.ai/YOUR_URL
[Usage2] https://s.jina.ai/YOUR_SEARCH_QUERY
[Homepage] https://jina.ai/reader
[Source code] https://github.com/jina-ai/reader
`;
constructor( constructor(
protected globalLogger: Logger, protected globalLogger: Logger,
protected puppeteerControl: PuppeteerControl, protected puppeteerControl: PuppeteerControl,
@ -357,10 +363,7 @@ ${this.content}
[Balance left] ${latestUser.wallet.total_balance} [Balance left] ${latestUser.wallet.total_balance}
` : ''; ` : '';
return assignTransferProtocolMeta(`[Usage] https://r.jina.ai/YOUR_URL return assignTransferProtocolMeta(`${this.indexText}${authMixin}`,
[Homepage] https://jina.ai/reader
[Source code] https://github.com/jina-ai/reader
${authMixin}`,
{ contentType: 'text/plain', envelope: null } { contentType: 'text/plain', envelope: null }
); );
} }
@ -638,13 +641,13 @@ ${authMixin}`,
return r; return r;
} }
async *cachedScrap(urlToCrawl: URL, crawlOpts: ScrappingOptions, noCache: boolean = false) { async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, noCache: boolean = false) {
let cache; let cache;
if (!noCache && !crawlOpts.cookies?.length) { if (!noCache && !crawlOpts?.cookies?.length) {
cache = await this.queryCache(urlToCrawl); cache = await this.queryCache(urlToCrawl);
} }
if (cache?.isFresh && (!crawlOpts.favorScreenshot || (crawlOpts.favorScreenshot && cache?.screenshotAvailable))) { if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
yield cache.snapshot; yield cache.snapshot;
return; return;
@ -683,4 +686,47 @@ ${authMixin}`,
return undefined; return undefined;
} }
/**
 * Scrapes several URLs concurrently and streams the combined progress.
 *
 * Every yielded value is the SAME `results` array (one slot per input URL, in
 * input order). A slot stays `undefined` until its URL has produced a snapshot
 * and is overwritten whenever a newer snapshot for that URL arrives.
 *
 * The first yield happens immediately (all slots `undefined`); afterwards the
 * array is yielded again each time at least one slot received a snapshot since
 * the previous yield. The generator ends once every per-URL scrape has ended.
 *
 * A failure of one URL is logged and leaves that slot at its last value; it
 * does not abort the remaining scrapes.
 *
 * @param urls URLs to scrape.
 * @param options Scrapping options forwarded to `cachedScrap` for every URL.
 * @param noCache Forwarded to `cachedScrap` for every URL.
 */
async *scrapMany(urls: URL[], options?: ScrappingOptions, noCache = false) {
    const iterators = urls.map((url) => this.cachedScrap(url, options, noCache));

    const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);

    let nextDeferred = Defer();
    let concluded = false;
    // Set whenever a slot changed and the change has not been yielded yet.
    // Without it, an update arriving while the consumer is busy (right before
    // the scrapes conclude) would never be delivered.
    let dirty = false;

    const handler = async (it: AsyncGenerator<PageSnapshot | undefined>, idx: number) => {
        try {
            for await (const x of it) {
                results[idx] = x;

                if (x) {
                    dirty = true;
                    nextDeferred.resolve();
                    nextDeferred = Defer();
                }
            }
        } catch (err: any) {
            // Isolate per-URL failures: one failing page must neither end the
            // whole stream nor surface as an unhandled promise rejection.
            this.globalLogger.warn(`Failed to scrap ${urls[idx]}`, { err: marshalErrorLike(err) });
        }
    };

    // Runs detached from the consumer loop below; `handler` never rejects.
    void Promise.allSettled(
        iterators.map((it, idx) => handler(it, idx))
    ).then(() => {
        concluded = true;
        nextDeferred.resolve();
    });

    try {
        // Initial (empty) progress report.
        yield results;

        while (true) {
            if (dirty) {
                dirty = false;
                yield results;
                continue;
            }
            if (concluded) {
                break;
            }
            // The check above and this await run synchronously back to back,
            // so no wake-up can slip in between them.
            await nextDeferred.promise;
        }
    } finally {
        // Also reached when the consumer stops early: ask the underlying
        // generators to finish. Best-effort cleanup, errors are irrelevant here.
        for (const x of iterators) {
            x.return().catch(() => undefined);
        }
    }
}
} }

View File

@ -0,0 +1,389 @@
import {
assignTransferProtocolMeta, marshalErrorLike,
RPCHost, RPCReflection,
AssertionFailureError,
objHashMd5B64Of,
} from 'civkit';
import { singleton } from 'tsyringe';
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
import { RateLimitControl } from '../shared/services/rate-limit';
import _ from 'lodash';
import { ScrappingOptions } from '../services/puppeteer';
import { Request, Response } from 'express';
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
import { BraveSearchService } from '../services/brave-search';
import { CrawlerHost } from './crawler';
import { CookieParam } from 'puppeteer';
import { parseString as parseSetCookieString } from 'set-cookie-parser';
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
import { SearchResult } from '../db/searched';
import { WebSearchApiResponse } from '../shared/3rd-party/brave-types';
@singleton()
export class SearcherHost extends RPCHost {
logger = this.globalLogger.child({ service: this.constructor.name });
cacheRetentionMs = 1000 * 3600 * 24 * 7;
cacheValidMs = 1000 * 3600;
/**
 * All collaborators are received as constructor parameters and kept as
 * protected members; the full argument list is forwarded unchanged to the
 * RPCHost base constructor.
 */
constructor(
protected globalLogger: Logger,
protected rateLimitControl: RateLimitControl,
protected threadLocal: AsyncContext,
protected braveSearchService: BraveSearchService,
protected crawler: CrawlerHost,
) {
super(...arguments);
}
/**
 * Waits for `dependencyReady()` to settle, then emits the `ready` event.
 */
override async init() {
await this.dependencyReady();
this.emit('ready');
}
@CloudHTTPv2({
name: 'search2',
runtime: {
memory: '4GiB',
timeoutSeconds: 300,
concurrency: 4,
},
tags: ['Searcher'],
httpMethod: ['get', 'post'],
returnType: [String, OutputServerEventStream],
exposeRoot: true,
})
@CloudHTTPv2({
runtime: {
memory: '8GiB',
timeoutSeconds: 300,
concurrency: 8,
maxInstances: 200,
},
openapi: {
operation: {
parameters: {
'Accept': {
description: `Specifies your preference for the response format. \n\n` +
`Supported formats:\n` +
`- text/event-stream\n` +
`- application/json or text/json\n` +
`- text/plain`
,
in: 'header',
schema: { type: 'string' }
},
'X-No-Cache': {
description: `Ignores internal cache if this header is specified with a value.`,
in: 'header',
schema: { type: 'string' }
},
'X-Respond-With': {
description: `Specifies the (non-default) form factor of the crawled data you prefer. \n\n` +
`Supported formats:\n` +
`- markdown\n` +
`- html\n` +
`- text\n` +
`- screenshot\n`
,
in: 'header',
schema: { type: 'string' }
},
'X-Proxy-Url': {
description: `Specifies your custom proxy if you prefer to use one. \n\n` +
`Supported protocols:\n` +
`- http\n` +
`- https\n` +
`- socks4\n` +
`- socks5\n\n` +
`For authentication, https://user:pass@host:port`,
in: 'header',
schema: { type: 'string' }
},
'X-Set-Cookie': {
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
`Syntax is the same with standard Set-Cookie`,
in: 'header',
schema: { type: 'string' }
},
'X-With-Generated-Alt': {
description: `Enable automatic alt-text generating for images without an meaningful alt-text.`,
in: 'header',
schema: { type: 'string' }
},
}
}
},
tags: ['Searcher'],
httpMethod: ['get', 'post'],
returnType: [String, OutputServerEventStream],
exposeRoot: true,
})
/**
 * HTTP entry point of the searcher: runs a web search for the query found in
 * the request URL, scrapes the result pages and returns them formatted.
 *
 * - An empty path returns the usage/index text (plus account info when the
 *   caller is authenticated).
 * - Authenticated callers need a positive balance, are rate limited per uid
 *   and get their usage reported when the RPC finishes; anonymous callers are
 *   rate limited per IP.
 * - Response shape follows the `Accept` header: server-sent events, JSON
 *   (array of formatted pages) or plain text (default).
 *
 * @param rpcReflect Reflection handle of the current RPC call.
 * @param ctx Express request/response of the current call.
 * @param auth Auth DTO used to resolve the user and report usage.
 * @throws InsufficientBalanceError when the authenticated user has no balance.
 * @throws AssertionFailureError when the search yields no results or none of
 *         the result pages produced any content.
 */
async search(
    @RPCReflect() rpcReflect: RPCReflection,
    @Ctx() ctx: {
        req: Request,
        res: Response,
    },
    auth: JinaEmbeddingsAuthDTO
) {
    const uid = await auth.solveUID();
    let chargeAmount = 0;
    const noSlashPath = ctx.req.url.slice(1);
    if (!noSlashPath) {
        const latestUser = uid ? await auth.assertUser() : undefined;
        const authMixin = latestUser ? `
[Authenticated as] ${latestUser.user_id} (${latestUser.full_name})
[Balance left] ${latestUser.wallet.total_balance}
` : '';

        return assignTransferProtocolMeta(`${this.crawler.indexText}${authMixin}`,
            { contentType: 'text/plain', envelope: null }
        );
    }

    if (uid) {
        const user = await auth.assertUser();
        if (!(user.wallet.total_balance > 0)) {
            throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
        }

        await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'],
            [
                // 1000 requests per minute
                new Date(Date.now() - 60 * 1000), 1000
            ]
        );

        // chargeAmount is read lazily, so it reflects whatever was delivered last.
        rpcReflect.finally(() => {
            if (chargeAmount) {
                auth.reportUsage(chargeAmount, 'reader-crawl').catch((err) => {
                    this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
                });
            }
        });
    } else if (ctx.req.ip) {
        // Made available to downstream services through the async context.
        this.threadLocal.set('ip', ctx.req.ip);
        await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'],
            [
                // 100 requests per minute
                new Date(Date.now() - 60 * 1000), 100
            ]
        );
    }

    const customMode = ctx.req.get('x-respond-with') || 'default';
    const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
    const noCache = Boolean(ctx.req.get('x-no-cache'));

    // X-Set-Cookie may be sent once (string) or several times (string[]).
    const cookies: CookieParam[] = [];
    const setCookieHeaders = ctx.req.headers['x-set-cookie'];
    const rawCookies = Array.isArray(setCookieHeaders)
        ? setCookieHeaders
        : (setCookieHeaders ? [setCookieHeaders] : []);
    for (const setCookie of rawCookies) {
        cookies.push({
            ...parseSetCookieString(setCookie, { decodeValues: false }) as CookieParam,
        });
    }
    this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);

    const crawlOpts: ScrappingOptions = {
        proxyUrl: ctx.req.get('x-proxy-url'),
        cookies,
        favorScreenshot: customMode === 'screenshot'
    };

    // The request URL is still percent-encoded (e.g. `hello%20world`); decode
    // it so the search backend receives the text the user actually typed.
    let searchQuery = noSlashPath;
    try {
        searchQuery = decodeURIComponent(noSlashPath);
    } catch {
        // Malformed percent-encoding: fall back to the raw text.
    }

    // X-No-Cache bypasses the cached search response as well as the page cache.
    const r = await this.cachedWebSearch({
        q: searchQuery,
        count: 5
    }, noCache);

    const webResults = r.web?.results ?? [];
    if (!webResults.length) {
        throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
    }

    const urls = webResults.map((x) => new URL(x.url));
    const it = this.fetchSearchResults(customMode, urls, crawlOpts, noCache);

    if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
        const sseStream = new OutputServerEventStream();
        rpcReflect.return(sseStream);

        try {
            for await (const scrapped of it) {
                if (!scrapped) {
                    continue;
                }

                chargeAmount = this.getChargeAmount(scrapped);
                sseStream.write({
                    event: 'data',
                    data: scrapped,
                });
            }
        } catch (err: any) {
            this.logger.error(`Failed to collect search result for query ${searchQuery}`,
                { err: marshalErrorLike(err) }
            );
            sseStream.write({
                event: 'error',
                data: marshalErrorLike(err),
            });
        }

        sseStream.end();

        return sseStream;
    }

    let lastScrapped;
    if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
        for await (const scrapped of it) {
            lastScrapped = scrapped;

            // Keep waiting until every result has a title and some content.
            if (!this.qualified(scrapped)) {
                continue;
            }

            chargeAmount = this.getChargeAmount(scrapped);

            return scrapped;
        }

        if (!lastScrapped) {
            throw new AssertionFailureError(`No content available for query ${searchQuery}`);
        }

        // Scraping ended before everything qualified: return what we have.
        chargeAmount = this.getChargeAmount(lastScrapped);

        return lastScrapped;
    }

    for await (const scrapped of it) {
        lastScrapped = scrapped;

        if (!this.qualified(scrapped)) {
            continue;
        }

        chargeAmount = this.getChargeAmount(scrapped);

        return assignTransferProtocolMeta(`${scrapped}`, { contentType: 'text/plain', envelope: null });
    }

    if (!lastScrapped) {
        throw new AssertionFailureError(`No content available for query ${searchQuery}`);
    }

    chargeAmount = this.getChargeAmount(lastScrapped);

    return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
}
/**
 * Streams formatted search results while the result pages are being scraped.
 *
 * For every progress report of `crawler.scrapMany` an array is yielded with
 * one entry per URL: the snapshot formatted by `crawler.formatSnapshot`, or a
 * placeholder (carrying only `url`) while no snapshot is available. Entries
 * and the array itself get a `toString()` producing the numbered plain-text
 * rendition.
 */
async *fetchSearchResults(mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
    urls: URL[], options?: ScrappingOptions, noCache = false) {

    for await (const snapshots of this.crawler.scrapMany(urls, options, noCache)) {
        const pending = snapshots.map((snapshot, idx) => {
            if (snapshot) {
                return this.crawler.formatSnapshot(mode, snapshot, urls[idx]);
            }

            // toString lives on the prototype so it is not an own property.
            const placeholder = Object.create({
                toString() {
                    return `[${idx + 1}] No content available for ${urls[idx]}`;
                }
            });
            placeholder.url = urls[idx].toString();

            return placeholder;
        });

        const resultArray = await Promise.all(pending);

        resultArray.forEach((result, idx) => {
            const hasOwnRenderer = result && typeof result === 'object' && Object.hasOwn(result, 'toString');
            if (!hasOwnRenderer) {
                return;
            }

            // Replace the entry's own renderer with a numbered variant.
            result.toString = function (this: any) {
                if (mode === 'markdown') {
                    return `[${idx + 1}]\n${this.content}`;
                }

                const mixins = [];
                if (this.publishedTime) {
                    mixins.push(`[${idx + 1}] Published Time: ${this.publishedTime}`);
                }

                return `[${idx + 1}] Title: ${this.title}
[${idx + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
[${idx + 1}] Markdown Content:
${this.content}
`;
            };
        });

        resultArray.toString = function () {
            return this.map((x, i) => x ? x.toString() : `[${i + 1}] No content available for ${urls[i]}`).join('\n\n').trimEnd() + '\n';
        };

        yield resultArray;
    }
}
/**
 * Total charge for a list of formatted results: the sum of the crawler's
 * per-result charge, counting a falsy charge as 0.
 */
getChargeAmount(formatted: any[]) {
    const perResult = formatted.map((entry) => {
        const amount = this.crawler.getChargeAmount(entry);

        return amount || 0;
    });

    return _.sum(perResult);
}
/**
 * Tells whether every entry has a title and at least one kind of payload
 * (content, screenshot URL, screenshot, text or html).
 */
qualified(scrapped: any[]) {
    return _.every(scrapped, (x) => {
        const entry = x as any;
        if (!entry?.title) {
            return entry?.title;
        }

        return entry.content ||
            entry.screenShotUrl ||
            entry.screenshot ||
            entry.text ||
            entry.html;
    });
}
/**
 * Runs a web search through a Firestore-backed cache keyed by a digest of the
 * query parameters.
 *
 * The newest cached record is returned when it is younger than
 * `cacheValidMs`; otherwise the search service is queried and the response is
 * stored in the background (a failed save is only logged).
 *
 * @param query Search parameters; also the cache key material.
 * @param noCache Skip the cache lookup when true (the fresh response is still saved).
 */
async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
    const queryDigest = objHashMd5B64Of(query);

    if (!noCache) {
        const latestQuery = SearchResult.COLLECTION.where('queryDigest', '==', queryDigest)
            .orderBy('createdAt', 'desc')
            .limit(1);
        const hits = await SearchResult.fromFirestoreQuery(latestQuery);
        const cache = hits[0];

        if (cache) {
            const now = Date.now();
            const createdMs = cache.createdAt.valueOf();
            const age = now - createdMs;
            const stale = createdMs < (now - this.cacheValidMs);
            this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for search query "${query.q}", normalized digest: ${queryDigest}, ${age}ms old`, {
                query, digest: queryDigest, age, stale
            });

            if (!stale) {
                return cache.response as WebSearchApiResponse;
            }
        }
    }

    const r = await this.braveSearchService.webSearch(query);

    const createdAt = new Date();
    const expireAt = new Date(createdAt.valueOf() + this.cacheRetentionMs);
    const record = SearchResult.from({
        query,
        queryDigest,
        response: r,
        createdAt,
        expireAt
    });

    // Fire and forget: caching must not delay or fail the response.
    SearchResult.save(record).catch((err) => {
        this.logger.warn(`Failed to cache search result`, { err });
    });

    return r;
}
}

View File

@ -0,0 +1,60 @@
import { Also, parseJSONText, Prop } from 'civkit';
import { FirestoreRecord } from '../shared/lib/firestore';
import _ from 'lodash';
/**
 * Firestore record holding one cached search: the query, its digest, the
 * response and creation/expiry timestamps.
 *
 * The fields listed in `patchedFields` are JSON-stringified by
 * `degradeForFireStore()` and parsed back by `from()`.
 */
@Also({
    dictOf: Object
})
export class SearchResult extends FirestoreRecord {
    static override collectionName = 'searchResults';

    override _id!: string;

    @Prop({
        required: true
    })
    query!: any;

    @Prop({
        required: true
    })
    queryDigest!: string;

    @Prop()
    response?: any;

    @Prop()
    createdAt!: Date;

    @Prop()
    expireAt?: Date;

    [k: string]: any;

    /** Names of the fields that are stored as JSON text. */
    static patchedFields = [
        'query',
        'response',
    ];

    /**
     * Builds a record from raw input, parsing any patched field that is
     * still a JSON string. The parsed value is written back onto `input`.
     */
    static override from(input: any) {
        this.patchedFields
            .filter((name) => typeof input[name] === 'string')
            .forEach((name) => {
                input[name] = parseJSONText(input[name]);
            });

        return super.from(input) as SearchResult;
    }

    /**
     * Returns a shallow copy of this record in which every object-typed
     * patched field is replaced by its JSON string.
     */
    override degradeForFireStore() {
        const { patchedFields } = this.constructor as typeof SearchResult;
        const plain: any = { ...this };

        patchedFields.forEach((name) => {
            if (typeof plain[name] !== 'object') {
                return;
            }
            plain[name] = JSON.stringify(plain[name]) as any;
        });

        return plain;
    }
}

View File

@ -0,0 +1,71 @@
import { AsyncService, DownstreamServiceFailureError } from 'civkit';
import { singleton } from 'tsyringe';
import { Logger } from '../shared/services/logger';
import { SecretExposer } from '../shared/services/secrets';
import { BraveSearchHTTP, WebSearchQueryParams } from '../shared/3rd-party/brave-search';
import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
import { AsyncContext } from '../shared';
import { WebSearchOptionalHeaderOptions } from '../shared/3rd-party/brave-types';
/**
 * Thin service around the Brave Search HTTP client.
 *
 * When the async context carries the caller's `ip`, the request is enriched
 * with `X-Loc-*` location headers derived from a GeoIP lookup; a `userAgent`
 * found in the async context is forwarded as `User-Agent`.
 */
@singleton()
export class BraveSearchService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    braveSearchHTTP!: BraveSearchHTTP;

    constructor(
        protected globalLogger: Logger,
        protected secretExposer: SecretExposer,
        protected geoipControl: GeoIPService,
        protected threadLocal: AsyncContext,
    ) {
        super(...arguments);
    }

    /**
     * Waits for the dependencies, creates the HTTP client and only then
     * announces readiness, so `ready` listeners never see a missing client.
     */
    override async init() {
        await this.dependencyReady();

        this.braveSearchHTTP = new BraveSearchHTTP(this.secretExposer.BRAVE_SEARCH_API_KEY);

        this.emit('ready');
    }

    /**
     * Performs a web search.
     *
     * @param query Search parameters passed to the Brave client as-is.
     * @returns The parsed response of the Brave client.
     * @throws DownstreamServiceFailureError when the search request fails.
     */
    async webSearch(query: WebSearchQueryParams) {
        const ip = this.threadLocal.get('ip');
        const extraHeaders: WebSearchOptionalHeaderOptions = {};

        if (ip) {
            const geoip = await this.geoipControl.lookupCity(ip, GEOIP_SUPPORTED_LANGUAGES.EN);

            // Every location header is optional: set only what is actually known.
            if (geoip?.city) {
                extraHeaders['X-Loc-City'] = geoip.city;
            }
            if (geoip?.country?.code) {
                extraHeaders['X-Loc-Country'] = geoip.country.code;
            }
            if (geoip?.timezone) {
                extraHeaders['X-Loc-Timezone'] = geoip.timezone;
            }
            if (geoip?.coordinates) {
                extraHeaders['X-Loc-Lat'] = `${geoip.coordinates[0]}`;
                extraHeaders['X-Loc-Long'] = `${geoip.coordinates[1]}`;
            }

            // Only the first subdivision is reported.
            const state = geoip?.subdivisions?.[0];
            if (state?.code) {
                extraHeaders['X-Loc-State'] = state.code;
            }
            if (state?.name) {
                extraHeaders['X-Loc-State-Name'] = state.name;
            }
        }

        const userAgent = this.threadLocal.get('userAgent');
        if (userAgent) {
            extraHeaders['User-Agent'] = userAgent;
        }

        try {
            const r = await this.braveSearchHTTP.webSearch(query, { headers: extraHeaders as Record<string, string> });

            return r.parsed;
        } catch (err) {
            throw new DownstreamServiceFailureError({ message: `Search failed`, cause: err });
        }
    }

}

View File

@ -0,0 +1,123 @@
import { container, singleton } from 'tsyringe';
import fsp from 'fs/promises';
import { CityResponse, Reader } from 'maxmind';
import { AsyncService, AutoCastable, Prop, runOnce } from 'civkit';
import { Logger } from '../shared';
import path from 'path';
/**
 * Language keys accepted by `GeoIPService.lookupCity`; the value is used to
 * index the localized `names` maps of a GeoIP record.
 */
export enum GEOIP_SUPPORTED_LANGUAGES {
EN = 'en',
ZH_CN = 'zh-CN',
JA = 'ja',
DE = 'de',
FR = 'fr',
ES = 'es',
PT_BR = 'pt-BR',
RU = 'ru',
}
/**
 * Code/name pair describing a geographic entity; used for the continent and
 * for each subdivision of a lookup result.
 */
export class GeoIPInfo extends AutoCastable {
// Short code of the entity (continent code or ISO code in lookupCity).
@Prop()
code?: string;
// Display name of the entity, localized by lookupCity.
@Prop()
name?: string;
}
/**
 * Country entry of a lookup result: code/name plus an EU membership flag.
 */
export class GeoIPCountryInfo extends GeoIPInfo {
// Filled from the database record's `is_in_european_union` field.
@Prop()
eu?: boolean;
}
/**
 * Result of `GeoIPService.lookupCity`; every part is optional because the
 * database record may lack it.
 */
export class GeoIPCityResponse extends AutoCastable {
@Prop()
continent?: GeoIPInfo;
@Prop()
country?: GeoIPCountryInfo;
@Prop({
arrayOf: GeoIPInfo
})
subdivisions?: GeoIPInfo[];
// Localized city name.
@Prop()
city?: string;
// [latitude, longitude, accuracy radius] as filled in by lookupCity.
@Prop({
arrayOf: Number
})
coordinates?: [number, number, number];
@Prop()
timezone?: string;
}
/**
 * GeoIP lookups backed by the GeoLite2 City database file shipped in the
 * `licensed` directory. The database is read on first use, not at startup.
 */
@singleton()
export class GeoIPService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    mmdbCity!: Reader<CityResponse>;

    constructor(
        protected globalLogger: Logger,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Reads the database file into memory and creates the reader.
     * Decorated with `@runOnce()`; `lookupCity` awaits it before every lookup.
     */
    @runOnce()
    async _lazyload() {
        const mmdbPath = path.resolve(__dirname, '..', '..', 'licensed', 'GeoLite2-City.mmdb');

        const dbBuff = await fsp.readFile(mmdbPath, { flag: 'r', encoding: null });

        this.mmdbCity = new Reader<CityResponse>(dbBuff);

        this.logger.info(`Loaded GeoIP database, ${dbBuff.byteLength} bytes`);
    }

    /**
     * Looks up location information for an IP address.
     *
     * @param ip Address to look up.
     * @param lang Preferred language of the returned names; English is used
     *        when a name is not available in that language.
     * @returns The location info, or `undefined` when the database has no
     *          record for the address.
     */
    async lookupCity(ip: string, lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) {
        await this._lazyload();

        const r = this.mmdbCity.get(ip);

        if (!r) {
            return undefined;
        }

        return GeoIPCityResponse.from({
            continent: r.continent ? {
                code: r.continent?.code,
                name: r.continent?.names?.[lang] || r.continent?.names?.en,
            } : undefined,
            country: r.country ? {
                code: r.country?.iso_code,
                // `names` is accessed defensively, like every sibling lookup here.
                name: r.country?.names?.[lang] || r.country?.names?.en,
                eu: r.country?.is_in_european_union,
            } : undefined,
            city: r.city?.names?.[lang] || r.city?.names?.en,
            subdivisions: r.subdivisions?.map((x) => ({
                code: x.iso_code,
                name: x.names?.[lang] || x.names?.en,
            })),
            // Order: latitude, longitude, accuracy radius.
            coordinates: r.location ? [
                r.location.latitude, r.location.longitude, r.location.accuracy_radius
            ] : undefined,
            timezone: r.location?.time_zone,
        });
    }

}
// Default export: the GeoIPService resolved from the tsyringe container at module load.
export default container.resolve(GeoIPService);

View File

@ -278,7 +278,7 @@ document.addEventListener('load', handlePageLoad);
return page; return page;
} }
async *scrap(parsedUrl: URL, options: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> { async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
// parsedUrl.search = ''; // parsedUrl.search = '';
const url = parsedUrl.toString(); const url = parsedUrl.toString();
@ -287,10 +287,10 @@ document.addEventListener('load', handlePageLoad);
let screenshot: Buffer | undefined; let screenshot: Buffer | undefined;
const page = await this.pagePool.acquire(); const page = await this.pagePool.acquire();
if (options.proxyUrl) { if (options?.proxyUrl) {
await page.useProxy(options.proxyUrl); await page.useProxy(options.proxyUrl);
} }
if (options.cookies) { if (options?.cookies) {
await page.setCookie(...options.cookies); await page.setCookie(...options.cookies);
} }
@ -353,7 +353,7 @@ document.addEventListener('load', handlePageLoad);
yield { ...snapshot, screenshot } as PageSnapshot; yield { ...snapshot, screenshot } as PageSnapshot;
break; break;
} }
if (options.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) { if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
screenshot = await page.screenshot(); screenshot = await page.screenshot();
lastHTML = snapshot.html; lastHTML = snapshot.html;
} }

@ -1 +1 @@
Subproject commit 584791b789cd483dab18735416744b4d10130993 Subproject commit 2f2cdcff7b2738be33ee5aca858ef2d65eba29ed