mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 11:42:32 +08:00
feat: jina paywall (#49)
* feat: integrate with jina embeddings paywall
This commit is contained in:
parent
2e025d10cf
commit
8cfd0d67dc
42
backend/functions/package-lock.json
generated
42
backend/functions/package-lock.json
generated
|
@ -178,6 +178,16 @@
|
|||
"node": ">=6.9.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/helper-compilation-targets/node_modules/lru-cache": {
|
||||
"version": "5.1.1",
|
||||
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
|
||||
"integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
|
||||
"dev": true,
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"yallist": "^3.0.2"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/helper-compilation-targets/node_modules/semver": {
|
||||
"version": "6.3.1",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
|
||||
|
@ -6251,6 +6261,17 @@
|
|||
"node": ">=10.19.0"
|
||||
}
|
||||
},
|
||||
"node_modules/http2-wrapper/node_modules/quick-lru": {
|
||||
"version": "5.1.1",
|
||||
"resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz",
|
||||
"integrity": "sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==",
|
||||
"engines": {
|
||||
"node": ">=10"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/https-proxy-agent": {
|
||||
"version": "5.0.1",
|
||||
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
|
||||
|
@ -8059,16 +8080,6 @@
|
|||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/lru-cache": {
|
||||
"version": "5.1.1",
|
||||
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
|
||||
"integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
|
||||
"dev": true,
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"yallist": "^3.0.2"
|
||||
}
|
||||
},
|
||||
"node_modules/lru-memoizer": {
|
||||
"version": "2.2.0",
|
||||
"resolved": "https://registry.npmjs.org/lru-memoizer/-/lru-memoizer-2.2.0.tgz",
|
||||
|
@ -9852,17 +9863,6 @@
|
|||
"integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==",
|
||||
"optional": true
|
||||
},
|
||||
"node_modules/quick-lru": {
|
||||
"version": "5.1.1",
|
||||
"resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz",
|
||||
"integrity": "sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==",
|
||||
"engines": {
|
||||
"node": ">=10"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/range-parser": {
|
||||
"version": "1.2.1",
|
||||
"resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
|
||||
|
|
|
@ -5,7 +5,7 @@ import {
|
|||
AssertionFailureError, ParamValidationError,
|
||||
} from 'civkit';
|
||||
import { singleton } from 'tsyringe';
|
||||
import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
||||
import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
||||
import { RateLimitControl } from '../shared/services/rate-limit';
|
||||
import _ from 'lodash';
|
||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||
|
@ -19,6 +19,9 @@ import { Crawled } from '../db/crawled';
|
|||
import { tidyMarkdown } from '../utils/markdown';
|
||||
import { cleanAttribute } from '../utils/misc';
|
||||
import { randomUUID } from 'crypto';
|
||||
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
||||
|
||||
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
||||
|
||||
const md5Hasher = new HashManager('md5', 'hex');
|
||||
|
||||
|
@ -296,23 +299,55 @@ ${this.content}
|
|||
req: Request,
|
||||
res: Response,
|
||||
},
|
||||
auth: JinaEmbeddingsAuthDTO
|
||||
) {
|
||||
if (ctx.req.ip) {
|
||||
await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'], [
|
||||
// 100 requests per minute
|
||||
new Date(Date.now() - 60 * 1000), 100
|
||||
]);
|
||||
}
|
||||
|
||||
const uid = await auth.solveUID();
|
||||
let chargeAmount = 0;
|
||||
const noSlashURL = ctx.req.url.slice(1);
|
||||
if (!noSlashURL) {
|
||||
const latestUser = uid ? await auth.assertUser() : undefined;
|
||||
const authMixin = latestUser ? `
|
||||
[Authenticated as] ${latestUser.user_id} (${latestUser.full_name})
|
||||
[Balance left] ${latestUser.wallet.total_balance}
|
||||
` : '';
|
||||
|
||||
return assignTransferProtocolMeta(`[Usage] https://r.jina.ai/YOUR_URL
|
||||
[Homepage] https://jina.ai/reader
|
||||
[Source code] https://github.com/jina-ai/reader
|
||||
`,
|
||||
${authMixin}`,
|
||||
{ contentType: 'text/plain', envelope: null }
|
||||
);
|
||||
}
|
||||
|
||||
if (uid) {
|
||||
const user = await auth.assertUser();
|
||||
if (!(user.wallet.total_balance > 0)) {
|
||||
throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
|
||||
}
|
||||
|
||||
await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'],
|
||||
[
|
||||
// 1000 requests per minute
|
||||
new Date(Date.now() - 60 * 1000), 1000
|
||||
]
|
||||
);
|
||||
|
||||
rpcReflect.finally(() => {
|
||||
if (chargeAmount) {
|
||||
auth.reportUsage(chargeAmount, 'reader-crawl').catch((err) => {
|
||||
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
|
||||
});
|
||||
}
|
||||
});
|
||||
} else if (ctx.req.ip) {
|
||||
await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'],
|
||||
[
|
||||
// 100 requests per minute
|
||||
new Date(Date.now() - 60 * 1000), 100
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
let urlToCrawl;
|
||||
try {
|
||||
urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false }));
|
||||
|
@ -364,7 +399,7 @@ ${this.content}
|
|||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
||||
|
||||
chargeAmount = this.getChargeAmount(formatted);
|
||||
sseStream.write({
|
||||
event: 'data',
|
||||
data: formatted,
|
||||
|
@ -392,6 +427,7 @@ ${this.content}
|
|||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
||||
chargeAmount = this.getChargeAmount(formatted);
|
||||
|
||||
return formatted;
|
||||
}
|
||||
|
@ -400,7 +436,10 @@ ${this.content}
|
|||
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
||||
}
|
||||
|
||||
return await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
|
||||
const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
|
||||
chargeAmount = this.getChargeAmount(formatted);
|
||||
|
||||
return formatted;
|
||||
}
|
||||
|
||||
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
|
||||
|
@ -410,6 +449,7 @@ ${this.content}
|
|||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
||||
chargeAmount = this.getChargeAmount(formatted);
|
||||
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted}`,
|
||||
|
@ -425,6 +465,7 @@ ${this.content}
|
|||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
|
||||
chargeAmount = this.getChargeAmount(formatted);
|
||||
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted}`,
|
||||
|
@ -563,4 +604,21 @@ ${this.content}
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Work out how many tokens to bill for a formatted crawl result.
 *
 * Text results are charged by the token estimate of the first non-empty
 * field among `content`, `text` and `html`. Results that only carry an
 * image (`screenshotUrl` or `screenshot`) are charged a flat amount.
 *
 * @param formatted - The formatted snapshot produced for the response.
 * @returns The token amount to charge, or `undefined` when the result has
 * neither a textual field nor an image field (callers treat a falsy
 * amount as "nothing to report").
 */
getChargeAmount(formatted: { [k: string]: any; }) {
    const textContent = formatted?.content || formatted?.text || formatted?.html;

    if (typeof textContent === 'string') {
        return estimateToken(textContent);
    }

    // Use optional chaining here as well: the text lookup above already
    // tolerates a nullish `formatted`, so the image lookup must not throw
    // a TypeError in that same situation.
    const imageContent = formatted?.screenshotUrl || formatted?.screenshot;

    if (imageContent) {
        // OpenAI image token count for 1024x1024 image
        return 765;
    }

    return undefined;
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit d3bb3a7335ec9d96c68d1edf1b66fdf5e2fe5b7c
|
||||
Subproject commit 584791b789cd483dab18735416744b4d10130993
|
Loading…
Reference in New Issue
Block a user