#!/usr/bin/env node

/**
 * Pre-build integrity check.
 *
 * The GeoIP service loads `licensed/GeoLite2-City.mmdb` at runtime. That file
 * is licensed and git-ignored, so it has to be provisioned manually. This
 * script fails the build early (exit code 1) when the database is missing or
 * is obviously not a usable file.
 */

const fs = require('fs');
const path = require('path');

const file = path.resolve(__dirname, 'licensed/GeoLite2-City.mmdb');

// `throwIfNoEntry: false` makes statSync return undefined for a missing path
// instead of throwing, so every failure mode is reported with a clear message.
const stat = fs.statSync(file, { throwIfNoEntry: false });

if (!stat) {
    console.error(`Integrity check failed: ${file} does not exist.`);
    process.exit(1);
}

// A directory or an empty placeholder would pass a bare existence check and
// only blow up later when the database is actually read.
if (!stat.isFile()) {
    console.error(`Integrity check failed: ${file} is not a regular file.`);
    process.exit(1);
}

if (stat.size === 0) {
    console.error(`Integrity check failed: ${file} is empty.`);
    process.exit(1);
}
"https://registry.npmjs.org/mmdb-lib/-/mmdb-lib-2.1.0.tgz", + "integrity": "sha512-tdDTZmnI5G4UoSctv2KxM/3VQt2XRj4CmR5R4VsAWsOUcS3LysHR34wtixWm/pXxXdkBDuN92auxkC0T2+qd1Q==", + "engines": { + "node": ">=10", + "npm": ">=6" + } + }, "node_modules/mongodb": { "version": "5.9.2", "resolved": "https://registry.npmjs.org/mongodb/-/mongodb-5.9.2.tgz", @@ -11059,6 +11082,14 @@ "resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.13.tgz", "integrity": "sha512-JaL9ZnvTbGFMDIBeGdVkLt4qWTeCPw+n7Ock+wceAGRenuHA6nOOvMJFliNDyXsjg2osGKJWsXtO2xc74VxyDw==" }, + "node_modules/tiny-lru": { + "version": "11.2.5", + "resolved": "https://registry.npmjs.org/tiny-lru/-/tiny-lru-11.2.5.tgz", + "integrity": "sha512-JpqM0K33lG6iQGKiigcwuURAKZlq6rHXfrgeL4/I8/REoyJTGU+tEMszvT/oTRVHG2OiylhGDjqPp1jWMlr3bw==", + "engines": { + "node": ">=12" + } + }, "node_modules/tld-extract": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/tld-extract/-/tld-extract-2.1.0.tgz", diff --git a/backend/functions/package.json b/backend/functions/package.json index 979b34f..fb7418c 100644 --- a/backend/functions/package.json +++ b/backend/functions/package.json @@ -2,7 +2,7 @@ "name": "reader", "scripts": { "lint": "eslint --ext .js,.ts .", - "build": "tsc -p .", + "build": "node ./integrity-check.cjs && tsc -p .", "build:watch": "tsc --watch", "build:clean": "rm -rf ./build", "shell": "npm run build && firebase functions:shell", @@ -44,6 +44,7 @@ "htmlparser2": "^9.0.0", "jose": "^5.1.0", "langdetect": "^0.2.1", + "maxmind": "^4.3.18", "minio": "^7.1.3", "openai": "^4.20.0", "puppeteer": "^22.7.1", diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index bba8229..74a304f 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -2,7 +2,7 @@ import { assignTransferProtocolMeta, marshalErrorLike, RPCHost, RPCReflection, HashManager, - AssertionFailureError, 
/**
 * Scrapes several URLs concurrently and streams the combined progress.
 *
 * Every yielded value is the same `results` array (one slot per input URL, in
 * input order). A slot is `undefined` until the corresponding scrap produces
 * a snapshot and is overwritten whenever that scrap yields a newer one.
 *
 * The generator yields once immediately, then again after each burst of
 * updates, and finishes when every underlying scrap has finished. A failure
 * of one URL is logged and leaves that slot as it was; it does not abort the
 * other scraps.
 *
 * @param urls URLs to scrap.
 * @param options Scrapping options forwarded to `cachedScrap` for every URL.
 * @param noCache Forwarded to `cachedScrap`.
 */
async *scrapMany(urls: URL[], options?: ScrappingOptions, noCache = false) {
    const iterators = urls.map((url) => this.cachedScrap(url, options, noCache));

    const results: (PageSnapshot | undefined)[] = iterators.map(() => undefined);

    let nextDeferred = Defer();
    let concluded = false;
    // Set when `results` holds a snapshot the consumer has not been handed yet.
    // Needed because producers may signal while the consumer is suspended at
    // `yield`, i.e. while nobody is awaiting the current deferred.
    let dirty = false;

    const handler = async (it: AsyncGenerator<PageSnapshot | undefined>, idx: number) => {
        try {
            for await (const x of it) {
                results[idx] = x;

                if (x) {
                    dirty = true;
                    nextDeferred.resolve();
                    nextDeferred = Defer();
                }
            }
        } catch (err: any) {
            // One broken page must not take the other results down with it.
            this.globalLogger.warn(`Failed to scrap ${urls[idx]}`, { err: marshalErrorLike(err) });
        }
    };

    Promise.all(
        iterators.map((it, idx) => handler(it, idx))
    ).finally(() => {
        concluded = true;
        nextDeferred.resolve();
    }).catch(() => undefined); // handlers never reject; this only guards the floating chain

    yield results;

    try {
        while (!concluded) {
            if (!dirty) {
                // Nothing pending: sleep until a producer updates or everything concludes.
                await nextDeferred.promise;
            }
            dirty = false;

            yield results;
        }

        if (dirty) {
            // An update landed while the consumer was busy with the previous
            // yield and the stream concluded before the loop came around.
            dirty = false;

            yield results;
        }
    } finally {
        // Best-effort cleanup so abandoned scraps release their resources.
        for (const x of iterators) {
            x.return().catch(() => undefined);
        }
    }
}
/**
 * HTTP entry point of the searcher.
 *
 * An empty path returns the plain-text usage index. Otherwise the path is
 * used as a web search query: the query is searched (through the search
 * cache), the top results are scrapped via the crawler, and the formatted
 * pages are returned in the representation negotiated through `Accept`:
 * server-sent events, JSON, or plain text (the default).
 *
 * Authenticated callers are balance-checked, rate limited per uid and charged
 * when the request finishes; anonymous callers are rate limited per IP.
 *
 * @throws InsufficientBalanceError when an authenticated caller has no balance left.
 * @throws AssertionFailureError when the search or the scrapping yields nothing.
 */
@CloudHTTPv2({
    name: 'search2',
    runtime: {
        memory: '4GiB',
        timeoutSeconds: 300,
        concurrency: 4,
    },
    tags: ['Searcher'],
    httpMethod: ['get', 'post'],
    returnType: [String, OutputServerEventStream],
    exposeRoot: true,
})
@CloudHTTPv2({
    runtime: {
        memory: '8GiB',
        timeoutSeconds: 300,
        concurrency: 8,
        maxInstances: 200,
    },
    openapi: {
        operation: {
            parameters: {
                'Accept': {
                    description: `Specifies your preference for the response format. \n\n` +
                        `Supported formats:\n` +
                        `- text/event-stream\n` +
                        `- application/json or text/json\n` +
                        `- text/plain`
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-No-Cache': {
                    description: `Ignores internal cache if this header is specified with a value.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Respond-With': {
                    description: `Specifies the (non-default) form factor of the crawled data you prefer. \n\n` +
                        `Supported formats:\n` +
                        `- markdown\n` +
                        `- html\n` +
                        `- text\n` +
                        `- screenshot\n`
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Proxy-Url': {
                    description: `Specifies your custom proxy if you prefer to use one. \n\n` +
                        `Supported protocols:\n` +
                        `- http\n` +
                        `- https\n` +
                        `- socks4\n` +
                        `- socks5\n\n` +
                        `For authentication, https://user:pass@host:port`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Set-Cookie': {
                    description: `Sets cookie(s) to the headless browser for your request. \n\n` +
                        `Syntax is the same with standard Set-Cookie`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-With-Generated-Alt': {
                    description: `Enable automatic alt-text generating for images without an meaningful alt-text.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
            }
        }
    },
    tags: ['Searcher'],
    httpMethod: ['get', 'post'],
    returnType: [String, OutputServerEventStream],
    exposeRoot: true,
})
async search(
    @RPCReflect() rpcReflect: RPCReflection,
    @Ctx() ctx: {
        req: Request,
        res: Response,
    },
    auth: JinaEmbeddingsAuthDTO
) {
    const uid = await auth.solveUID();
    let chargeAmount = 0;
    const noSlashPath = ctx.req.url.slice(1);
    if (!noSlashPath) {
        // Bare root request: answer with the usage index instead of searching.
        const latestUser = uid ? await auth.assertUser() : undefined;
        const authMixin = latestUser ? `
[Authenticated as] ${latestUser.user_id} (${latestUser.full_name})
[Balance left] ${latestUser.wallet.total_balance}
` : '';

        return assignTransferProtocolMeta(`${this.crawler.indexText}${authMixin}`,
            { contentType: 'text/plain', envelope: null }
        );
    }

    if (uid) {
        const user = await auth.assertUser();
        if (!(user.wallet.total_balance > 0)) {
            throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
        }

        await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'],
            [
                // 1000 requests per minute
                new Date(Date.now() - 60 * 1000), 1000
            ]
        );

        // Charge whatever amount has been accumulated by the time the RPC finishes.
        rpcReflect.finally(() => {
            if (chargeAmount) {
                auth.reportUsage(chargeAmount, 'reader-crawl').catch((err) => {
                    this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
                });
            }
        });
    } else if (ctx.req.ip) {
        this.threadLocal.set('ip', ctx.req.ip);
        await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'],
            [
                // 100 requests per minute
                new Date(Date.now() - 60 * 1000), 100
            ]
        );
    }

    const customMode = ctx.req.get('x-respond-with') || 'default';
    const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
    const noCache = Boolean(ctx.req.get('x-no-cache'));
    const cookies: CookieParam[] = [];
    const setCookieHeaders = ctx.req.headers['x-set-cookie'];
    if (Array.isArray(setCookieHeaders)) {
        for (const setCookie of setCookieHeaders) {
            cookies.push({
                ...parseSetCookieString(setCookie, { decodeValues: false }) as CookieParam,
            });
        }
    } else if (setCookieHeaders) {
        cookies.push({
            ...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
        });
    }
    this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
    const crawlOpts: ScrappingOptions = {
        proxyUrl: ctx.req.get('x-proxy-url'),
        cookies,
        favorScreenshot: customMode === 'screenshot'
    };

    // The request URL arrives percent-encoded ("hello%20world"); search for
    // what the user actually typed. A malformed escape sequence makes
    // decodeURIComponent throw, in which case the raw text is used as-is.
    let searchQuery = noSlashPath;
    try {
        searchQuery = decodeURIComponent(noSlashPath);
    } catch {
        searchQuery = noSlashPath;
    }

    // X-No-Cache applies to the search cache as well as to the page cache.
    const r = await this.cachedWebSearch({
        q: searchQuery,
        count: 5
    }, noCache);

    const webResults = r.web?.results ?? [];
    if (!webResults.length) {
        throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
    }

    const urls = webResults.map((x) => new URL(x.url));
    const it = this.fetchSearchResults(customMode, urls, crawlOpts, noCache);

    if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
        // Streaming mode: push every intermediate state to the client.
        const sseStream = new OutputServerEventStream();
        rpcReflect.return(sseStream);

        try {
            for await (const scrapped of it) {
                if (!scrapped) {
                    continue;
                }

                chargeAmount = this.getChargeAmount(scrapped);
                sseStream.write({
                    event: 'data',
                    data: scrapped,
                });
            }
        } catch (err: any) {
            this.logger.error(`Failed to collect search result for query ${searchQuery}`,
                { err: marshalErrorLike(err) }
            );
            sseStream.write({
                event: 'error',
                data: marshalErrorLike(err),
            });
        }

        sseStream.end();

        return sseStream;
    }

    let lastScrapped;
    if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
        // JSON mode: return the first state in which every result is qualified,
        // otherwise the last state seen.
        for await (const scrapped of it) {
            lastScrapped = scrapped;

            if (!this.qualified(scrapped)) {
                continue;
            }

            chargeAmount = this.getChargeAmount(scrapped);

            return scrapped;
        }

        if (!lastScrapped) {
            throw new AssertionFailureError(`No content available for query ${searchQuery}`);
        }

        chargeAmount = this.getChargeAmount(lastScrapped);

        return lastScrapped;
    }

    // Plain-text mode (default): same selection as JSON mode, rendered via toString().
    for await (const scrapped of it) {
        lastScrapped = scrapped;

        if (!this.qualified(scrapped)) {
            continue;
        }
        chargeAmount = this.getChargeAmount(scrapped);

        return assignTransferProtocolMeta(`${scrapped}`, { contentType: 'text/plain', envelope: null });
    }

    if (!lastScrapped) {
        throw new AssertionFailureError(`No content available for query ${searchQuery}`);
    }

    chargeAmount = this.getChargeAmount(lastScrapped);

    return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
}
/**
 * Runs a web search through the Firestore-backed search cache.
 *
 * A cached response younger than `cacheValidMs` is returned directly.
 * Otherwise the search service is queried and the fresh response is stored
 * (fire-and-forget) with an expiry of `cacheRetentionMs`. When the search
 * service fails and an older cached response exists, that stale response is
 * returned instead of failing the request.
 *
 * @param query Search parameters; their digest is the cache key.
 * @param noCache Skip the cache lookup (the fresh response is still stored).
 * @returns The search response, fresh or cached.
 * @throws Whatever the search service throws when no cached response can stand in.
 */
async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
    const queryDigest = objHashMd5B64Of(query);
    let cache;
    if (!noCache) {
        cache = (await SearchResult.fromFirestoreQuery(
            SearchResult.COLLECTION.where('queryDigest', '==', queryDigest)
                .orderBy('createdAt', 'desc')
                .limit(1)
        ))[0];
        if (cache) {
            const age = Date.now() - cache.createdAt.valueOf();
            const stale = age > this.cacheValidMs;
            this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for search query "${query.q}", normalized digest: ${queryDigest}, ${age}ms old`, {
                query, digest: queryDigest, age, stale
            });

            if (!stale) {
                return cache.response as WebSearchApiResponse;
            }
        }
    }

    let r;
    try {
        r = await this.braveSearchService.webSearch(query);
    } catch (err: any) {
        if (cache) {
            // An outdated answer beats no answer while the upstream is failing.
            this.logger.warn(`Failed to fetch search result, falling back to stale cache`, { err: marshalErrorLike(err) });

            return cache.response as WebSearchApiResponse;
        }

        throw err;
    }

    const nowDate = new Date();
    const record = SearchResult.from({
        query,
        queryDigest,
        response: r,
        createdAt: nowDate,
        expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
    });
    // Caching is best-effort; a failed write must not fail the search itself.
    SearchResult.save(record).catch((err) => {
        this.logger.warn(`Failed to cache search result`, { err: marshalErrorLike(err) });
    });

    return r;
}
/**
 * Thin service wrapper around the Brave Search HTTP client.
 *
 * Besides forwarding the query, it enriches the request with optional
 * location headers derived from the caller IP stored in the async context,
 * and with the caller's user agent when one is stored there.
 */
@singleton()
export class BraveSearchService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    braveSearchHTTP!: BraveSearchHTTP;

    constructor(
        protected globalLogger: Logger,
        protected secretExposer: SecretExposer,
        protected geoipControl: GeoIPService,
        protected threadLocal: AsyncContext,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();

        // The client must exist before 'ready' is announced: listeners run
        // synchronously inside emit() and may call webSearch() right away.
        this.braveSearchHTTP = new BraveSearchHTTP(this.secretExposer.BRAVE_SEARCH_API_KEY);

        this.emit('ready');
    }

    /**
     * Performs a web search.
     *
     * @param query Search parameters forwarded to the Brave Search client.
     * @returns The parsed search response.
     * @throws DownstreamServiceFailureError when the search request fails.
     */
    async webSearch(query: WebSearchQueryParams) {
        const ip = this.threadLocal.get('ip');
        const extraHeaders: WebSearchOptionalHeaderOptions = {};
        if (ip) {
            // Location hints are optional; a failed lookup must not fail the search.
            const geoip = await this.geoipControl.lookupCity(ip, GEOIP_SUPPORTED_LANGUAGES.EN).catch((err) => {
                this.logger.warn(`Failed to look up geo location for ${ip}`, { err });

                return undefined;
            });

            if (geoip?.city) {
                extraHeaders['X-Loc-City'] = geoip.city;
            }
            if (geoip?.country?.code) {
                extraHeaders['X-Loc-Country'] = geoip.country.code;
            }
            if (geoip?.timezone) {
                extraHeaders['X-Loc-Timezone'] = geoip.timezone;
            }
            if (geoip?.coordinates) {
                // coordinates is [latitude, longitude, accuracy radius]
                extraHeaders['X-Loc-Lat'] = `${geoip.coordinates[0]}`;
                extraHeaders['X-Loc-Long'] = `${geoip.coordinates[1]}`;
            }
            const subdivision = geoip?.subdivisions?.[0];
            if (subdivision?.code) {
                extraHeaders['X-Loc-State'] = subdivision.code;
            }
            if (subdivision?.name) {
                extraHeaders['X-Loc-State-Name'] = subdivision.name;
            }
        }
        const userAgent = this.threadLocal.get('userAgent');
        if (userAgent) {
            extraHeaders['User-Agent'] = userAgent;
        }

        try {
            const r = await this.braveSearchHTTP.webSearch(query, { headers: extraHeaders as Record<string, string> });

            return r.parsed;
        } catch (err) {
            throw new DownstreamServiceFailureError({ message: `Search failed`, cause: err });
        }

    }

}
/**
 * City-level IP geolocation backed by the `licensed/GeoLite2-City.mmdb` file.
 *
 * The database is read from disk on the first lookup rather than at service
 * start.
 */
@singleton()
export class GeoIPService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    mmdbCity!: Reader<CityResponse>;

    constructor(
        protected globalLogger: Logger,
    ) {
        super(...arguments);
    }


    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Reads the city database into memory and creates the reader.
     * Decorated with `runOnce`; `lookupCity` awaits it before every lookup.
     */
    @runOnce()
    async _lazyload() {
        const mmdbPath = path.resolve(__dirname, '..', '..', 'licensed', 'GeoLite2-City.mmdb');

        const dbBuff = await fsp.readFile(mmdbPath, { flag: 'r', encoding: null });

        this.mmdbCity = new Reader<CityResponse>(dbBuff);

        this.logger.info(`Loaded GeoIP database, ${dbBuff.byteLength} bytes`);
    }


    /**
     * Looks up the city-level record for an IP address.
     *
     * @param ip IP address to look up.
     * @param lang Preferred language for place names; English is used when
     *   the preferred language has no name for a place.
     * @returns The simplified record, or `undefined` when the database has
     *   no entry for the address.
     */
    async lookupCity(ip: string, lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) {
        await this._lazyload();

        const r = this.mmdbCity.get(ip);

        if (!r) {
            return undefined;
        }

        return GeoIPCityResponse.from({
            continent: r.continent ? {
                code: r.continent.code,
                name: r.continent.names?.[lang] || r.continent.names?.en,
            } : undefined,
            country: r.country ? {
                code: r.country.iso_code,
                // `names` is optional-chained like every sibling lookup, so a
                // record without names yields `undefined` instead of throwing.
                name: r.country.names?.[lang] || r.country.names?.en,
                eu: r.country.is_in_european_union,
            } : undefined,
            city: r.city?.names?.[lang] || r.city?.names?.en,
            subdivisions: r.subdivisions?.map((x) => ({
                code: x.iso_code,
                name: x.names?.[lang] || x.names?.en,
            })),
            // [latitude, longitude, accuracy radius]
            coordinates: r.location ? [
                r.location.latitude, r.location.longitude, r.location.accuracy_radius
            ] : undefined,
            timezone: r.location?.time_zone,
        });
    }

}