feat: jina paywall (#49)

* feat: integrate with jina embeddings paywall
This commit is contained in:
Yanlong Wang 2024-05-08 18:25:26 +08:00 committed by GitHub
parent 2e025d10cf
commit 8cfd0d67dc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 91 additions and 33 deletions

View File

@ -178,6 +178,16 @@
"node": ">=6.9.0"
}
},
"node_modules/@babel/helper-compilation-targets/node_modules/lru-cache": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
"integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
"dev": true,
"peer": true,
"dependencies": {
"yallist": "^3.0.2"
}
},
"node_modules/@babel/helper-compilation-targets/node_modules/semver": {
"version": "6.3.1",
"resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
@ -6251,6 +6261,17 @@
"node": ">=10.19.0"
}
},
"node_modules/http2-wrapper/node_modules/quick-lru": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz",
"integrity": "sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/https-proxy-agent": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
@ -8059,16 +8080,6 @@
"node": ">=8"
}
},
"node_modules/lru-cache": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
"integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
"dev": true,
"peer": true,
"dependencies": {
"yallist": "^3.0.2"
}
},
"node_modules/lru-memoizer": {
"version": "2.2.0",
"resolved": "https://registry.npmjs.org/lru-memoizer/-/lru-memoizer-2.2.0.tgz",
@ -9852,17 +9863,6 @@
"integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==",
"optional": true
},
"node_modules/quick-lru": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz",
"integrity": "sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/range-parser": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",

View File

@ -5,7 +5,7 @@ import {
AssertionFailureError, ParamValidationError,
} from 'civkit';
import { singleton } from 'tsyringe';
import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect } from '../shared';
import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
import { RateLimitControl } from '../shared/services/rate-limit';
import _ from 'lodash';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
@ -19,6 +19,9 @@ import { Crawled } from '../db/crawled';
import { tidyMarkdown } from '../utils/markdown';
import { cleanAttribute } from '../utils/misc';
import { randomUUID } from 'crypto';
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
import { countGPTToken as estimateToken } from '../shared/utils/openai';
const md5Hasher = new HashManager('md5', 'hex');
@ -296,23 +299,55 @@ ${this.content}
req: Request,
res: Response,
},
auth: JinaEmbeddingsAuthDTO
) {
if (ctx.req.ip) {
await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'], [
// 100 requests per minute
new Date(Date.now() - 60 * 1000), 100
]);
}
const uid = await auth.solveUID();
let chargeAmount = 0;
const noSlashURL = ctx.req.url.slice(1);
if (!noSlashURL) {
const latestUser = uid ? await auth.assertUser() : undefined;
const authMixin = latestUser ? `
[Authenticated as] ${latestUser.user_id} (${latestUser.full_name})
[Balance left] ${latestUser.wallet.total_balance}
` : '';
return assignTransferProtocolMeta(`[Usage] https://r.jina.ai/YOUR_URL
[Homepage] https://jina.ai/reader
[Source code] https://github.com/jina-ai/reader
`,
${authMixin}`,
{ contentType: 'text/plain', envelope: null }
);
}
if (uid) {
const user = await auth.assertUser();
if (!(user.wallet.total_balance > 0)) {
throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
}
await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'],
[
// 1000 requests per minute
new Date(Date.now() - 60 * 1000), 1000
]
);
rpcReflect.finally(() => {
if (chargeAmount) {
auth.reportUsage(chargeAmount, 'reader-crawl').catch((err) => {
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
});
}
});
} else if (ctx.req.ip) {
await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'],
[
// 100 requests per minute
new Date(Date.now() - 60 * 1000), 100
]
);
}
let urlToCrawl;
try {
urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false }));
@ -364,7 +399,7 @@ ${this.content}
}
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
chargeAmount = this.getChargeAmount(formatted);
sseStream.write({
event: 'data',
data: formatted,
@ -392,6 +427,7 @@ ${this.content}
}
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
chargeAmount = this.getChargeAmount(formatted);
return formatted;
}
@ -400,7 +436,10 @@ ${this.content}
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
}
return await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
chargeAmount = this.getChargeAmount(formatted);
return formatted;
}
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
@ -410,6 +449,7 @@ ${this.content}
}
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
chargeAmount = this.getChargeAmount(formatted);
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
return assignTransferProtocolMeta(`${formatted}`,
@ -425,6 +465,7 @@ ${this.content}
}
const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
chargeAmount = this.getChargeAmount(formatted);
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
return assignTransferProtocolMeta(`${formatted}`,
@ -563,4 +604,21 @@ ${this.content}
}
}
/**
 * Works out the charge amount (in estimated tokens) for a formatted crawl result.
 *
 * The first truthy value among `content`, `text` and `html` is treated as the
 * textual payload and measured with `estimateToken`. If there is no string
 * payload but a screenshot (`screenshotUrl` or `screenshot`) is present, a
 * flat image cost is returned instead.
 *
 * @param formatted - The formatted result object to inspect.
 * @returns The estimated token count for text, `765` for an image-only result,
 *          or `undefined` when there is nothing to charge for.
 */
getChargeAmount(formatted: { [k: string]: any; }) {
    const textContent = formatted?.content || formatted?.text || formatted?.html;
    if (typeof textContent === 'string') {
        return estimateToken(textContent);
    }

    // Use optional chaining here as well: the lookups above already tolerate a
    // nullish `formatted`, so this one must not throw in that same situation.
    const imageContent = formatted?.screenshotUrl || formatted?.screenshot;
    if (imageContent) {
        // OpenAI image token count for 1024x1024 image
        return 765;
    }

    return undefined;
}
}

@ -1 +1 @@
Subproject commit d3bb3a7335ec9d96c68d1edf1b66fdf5e2fe5b7c
Subproject commit 584791b789cd483dab18735416744b4d10130993