From 8cfd0d67dca51eaea183f6b516e7cb6815c43a3c Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Wed, 8 May 2024 18:25:26 +0800 Subject: [PATCH] feat: jina paywall (#49) * feat: integrate with jina embeddings paywall --- backend/functions/package-lock.json | 42 +++++----- .../functions/src/cloud-functions/crawler.ts | 80 ++++++++++++++++--- thinapps-shared | 2 +- 3 files changed, 91 insertions(+), 33 deletions(-) diff --git a/backend/functions/package-lock.json b/backend/functions/package-lock.json index 4dda188..7cf5c17 100644 --- a/backend/functions/package-lock.json +++ b/backend/functions/package-lock.json @@ -178,6 +178,16 @@ "node": ">=6.9.0" } }, + "node_modules/@babel/helper-compilation-targets/node_modules/lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "dev": true, + "peer": true, + "dependencies": { + "yallist": "^3.0.2" + } + }, "node_modules/@babel/helper-compilation-targets/node_modules/semver": { "version": "6.3.1", "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", @@ -6251,6 +6261,17 @@ "node": ">=10.19.0" } }, + "node_modules/http2-wrapper/node_modules/quick-lru": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz", + "integrity": "sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/https-proxy-agent": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz", @@ -8059,16 +8080,6 @@ "node": ">=8" } }, - "node_modules/lru-cache": { - "version": "5.1.1", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", - "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", - "dev": true, - "peer": true, - "dependencies": { - "yallist": "^3.0.2" - } - }, "node_modules/lru-memoizer": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/lru-memoizer/-/lru-memoizer-2.2.0.tgz", @@ -9852,17 +9863,6 @@ "integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==", "optional": true }, - "node_modules/quick-lru": { - "version": "5.1.1", - "resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz", - "integrity": "sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==", - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/range-parser": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz", diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index c5394f3..b010f64 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -5,7 +5,7 @@ import { AssertionFailureError, ParamValidationError, } from 'civkit'; import { singleton } from 'tsyringe'; -import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect } from '../shared'; +import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared'; import { RateLimitControl } from '../shared/services/rate-limit'; import _ from 'lodash'; import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer'; @@ -19,6 +19,9 @@ import { Crawled } from '../db/crawled'; import { tidyMarkdown } from '../utils/markdown'; import { cleanAttribute } from '../utils/misc'; import { randomUUID } from 'crypto'; +import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth'; + +import { countGPTToken as estimateToken } from '../shared/utils/openai'; const md5Hasher = new HashManager('md5', 'hex'); @@ -296,23 +299,55 @@ ${this.content} req: Request, res: Response, }, + auth: JinaEmbeddingsAuthDTO ) { - if (ctx.req.ip) { - await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'], [ - // 100 requests per minute - new Date(Date.now() - 60 * 1000), 100 - ]); - } - + const uid = await auth.solveUID(); + let chargeAmount = 0; const noSlashURL = ctx.req.url.slice(1); if (!noSlashURL) { + const latestUser = uid ? await auth.assertUser() : undefined; + const authMixin = latestUser ? ` +[Authenticated as] ${latestUser.user_id} (${latestUser.full_name}) +[Balance left] ${latestUser.wallet.total_balance} +` : ''; + return assignTransferProtocolMeta(`[Usage] https://r.jina.ai/YOUR_URL [Homepage] https://jina.ai/reader [Source code] https://github.com/jina-ai/reader -`, +${authMixin}`, { contentType: 'text/plain', envelope: null } ); } + + if (uid) { + const user = await auth.assertUser(); + if (!(user.wallet.total_balance > 0)) { + throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`); + } + + await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'], + [ + // 1000 requests per minute + new Date(Date.now() - 60 * 1000), 1000 + ] + ); + + rpcReflect.finally(() => { + if (chargeAmount) { + auth.reportUsage(chargeAmount, 'reader-crawl').catch((err) => { + this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) }); + }); + } + }); + } else if (ctx.req.ip) { + await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'], + [ + // 100 requests per minute + new Date(Date.now() - 60 * 1000), 100 + ] + ); + } + let urlToCrawl; try { urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false })); @@ -364,7 +399,7 @@ ${this.content} } const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl); - + chargeAmount = this.getChargeAmount(formatted); sseStream.write({ event: 'data', data: formatted, @@ -392,6 +427,7 @@ ${this.content} } const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl); + chargeAmount = this.getChargeAmount(formatted); return formatted; } @@ -400,7 +436,10 @@ ${this.content} throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`); } - return await this.formatSnapshot(customMode, lastScrapped, urlToCrawl); + const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl); + chargeAmount = this.getChargeAmount(formatted); + + return formatted; } for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) { @@ -410,6 +449,7 @@ ${this.content} } const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl); + chargeAmount = this.getChargeAmount(formatted); if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { return assignTransferProtocolMeta(`${formatted}`, @@ -425,6 +465,7 @@ ${this.content} } const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl); + chargeAmount = this.getChargeAmount(formatted); if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { return assignTransferProtocolMeta(`${formatted}`, @@ -563,4 +604,21 @@ ${this.content} } } + getChargeAmount(formatted: { [k: string]: any; }) { + const textContent = formatted?.content || formatted?.text || formatted?.html; + + if (typeof textContent === 'string') { + return estimateToken(textContent); + } + + const imageContent = formatted.screenshotUrl || formatted?.screenshot; + + if (imageContent) { + // OpenAI image token count for 1024x1024 image + return 765; + } + + return undefined; + } + } diff --git a/thinapps-shared b/thinapps-shared index d3bb3a7..584791b 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit d3bb3a7335ec9d96c68d1edf1b66fdf5e2fe5b7c +Subproject commit 584791b789cd483dab18735416744b4d10130993