mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 11:42:32 +08:00
parent
7c5712363c
commit
33e14e5404
|
@ -33,9 +33,6 @@
|
||||||
"functions": {
|
"functions": {
|
||||||
"port": 5001
|
"port": 5001
|
||||||
},
|
},
|
||||||
"auth": {
|
|
||||||
"port": 9099
|
|
||||||
},
|
|
||||||
"firestore": {
|
"firestore": {
|
||||||
"port": 9098
|
"port": 9098
|
||||||
},
|
},
|
||||||
|
|
24
backend/functions/package-lock.json
generated
24
backend/functions/package-lock.json
generated
|
@ -15,6 +15,7 @@
|
||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"civkit": "^0.6.5-047c0d8",
|
"civkit": "^0.6.5-047c0d8",
|
||||||
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
"express": "^4.19.2",
|
"express": "^4.19.2",
|
||||||
|
@ -27,6 +28,7 @@
|
||||||
"maxmind": "^4.3.18",
|
"maxmind": "^4.3.18",
|
||||||
"minio": "^7.1.3",
|
"minio": "^7.1.3",
|
||||||
"openai": "^4.20.0",
|
"openai": "^4.20.0",
|
||||||
|
"pdfjs-dist": "^4.2.67",
|
||||||
"puppeteer": "^22.7.1",
|
"puppeteer": "^22.7.1",
|
||||||
"puppeteer-extra": "^3.3.6",
|
"puppeteer-extra": "^3.3.6",
|
||||||
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
||||||
|
@ -3923,6 +3925,16 @@
|
||||||
"integrity": "sha512-3DdaFaU/Zf1AnpLiFDeNCD4TOWe3Zl2RZaTzUvWiIk5ERzcCodOE20Vqq4fzCbNoHURFHT4/us/Lfq+S2zyY4w==",
|
"integrity": "sha512-3DdaFaU/Zf1AnpLiFDeNCD4TOWe3Zl2RZaTzUvWiIk5ERzcCodOE20Vqq4fzCbNoHURFHT4/us/Lfq+S2zyY4w==",
|
||||||
"optional": true
|
"optional": true
|
||||||
},
|
},
|
||||||
|
"node_modules/core-js": {
|
||||||
|
"version": "3.37.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/core-js/-/core-js-3.37.1.tgz",
|
||||||
|
"integrity": "sha512-Xn6qmxrQZyB0FFY8E3bgRXei3lWDJHhvI+u0q9TKIYM49G8pAr0FgnnrFRAmsbptZL1yxRADVXn+x5AGsbBfyw==",
|
||||||
|
"hasInstallScript": true,
|
||||||
|
"funding": {
|
||||||
|
"type": "opencollective",
|
||||||
|
"url": "https://opencollective.com/core-js"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/core-util-is": {
|
"node_modules/core-util-is": {
|
||||||
"version": "1.0.3",
|
"version": "1.0.3",
|
||||||
"resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz",
|
"resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz",
|
||||||
|
@ -9208,6 +9220,18 @@
|
||||||
"node": ">=8"
|
"node": ">=8"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/pdfjs-dist": {
|
||||||
|
"version": "4.2.67",
|
||||||
|
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-4.2.67.tgz",
|
||||||
|
"integrity": "sha512-rJmuBDFpD7cqC8WIkQUEClyB4UAH05K4AsyewToMTp2gSy3Rrx8c1ydAVqlJlGv3yZSOrhEERQU/4ScQQFlLHA==",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
},
|
||||||
|
"optionalDependencies": {
|
||||||
|
"canvas": "^2.11.2",
|
||||||
|
"path2d": "^0.2.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/pend": {
|
"node_modules/pend": {
|
||||||
"version": "1.2.0",
|
"version": "1.2.0",
|
||||||
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
|
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
|
||||||
|
|
|
@ -35,6 +35,7 @@
|
||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"civkit": "^0.6.5-047c0d8",
|
"civkit": "^0.6.5-047c0d8",
|
||||||
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
"express": "^4.19.2",
|
"express": "^4.19.2",
|
||||||
|
@ -47,6 +48,7 @@
|
||||||
"maxmind": "^4.3.18",
|
"maxmind": "^4.3.18",
|
||||||
"minio": "^7.1.3",
|
"minio": "^7.1.3",
|
||||||
"openai": "^4.20.0",
|
"openai": "^4.20.0",
|
||||||
|
"pdfjs-dist": "^4.2.67",
|
||||||
"puppeteer": "^22.7.1",
|
"puppeteer": "^22.7.1",
|
||||||
"puppeteer-extra": "^3.3.6",
|
"puppeteer-extra": "^3.3.6",
|
||||||
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
||||||
|
|
|
@ -10,18 +10,18 @@ import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
||||||
import _ from 'lodash';
|
import _ from 'lodash';
|
||||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||||
import { Request, Response } from 'express';
|
import { Request, Response } from 'express';
|
||||||
import normalizeUrl from "@esm2cjs/normalize-url";
|
const pNormalizeUrl = import("@esm2cjs/normalize-url");
|
||||||
import { AltTextService } from '../services/alt-text';
|
import { AltTextService } from '../services/alt-text';
|
||||||
import TurndownService from 'turndown';
|
import TurndownService from 'turndown';
|
||||||
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
|
||||||
import type { CookieParam } from 'puppeteer';
|
|
||||||
import { Crawled } from '../db/crawled';
|
import { Crawled } from '../db/crawled';
|
||||||
import { cleanAttribute } from '../utils/misc';
|
import { cleanAttribute } from '../utils/misc';
|
||||||
import { randomUUID } from 'crypto';
|
import { randomUUID } from 'crypto';
|
||||||
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
||||||
|
|
||||||
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
||||||
|
import { CrawlerOptions } from '../dto/scrapping-options';
|
||||||
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
||||||
|
import { PDFExtractor } from '../services/pdf-extract';
|
||||||
|
|
||||||
const md5Hasher = new HashManager('md5', 'hex');
|
const md5Hasher = new HashManager('md5', 'hex');
|
||||||
|
|
||||||
|
@ -69,6 +69,7 @@ export class CrawlerHost extends RPCHost {
|
||||||
protected globalLogger: Logger,
|
protected globalLogger: Logger,
|
||||||
protected puppeteerControl: PuppeteerControl,
|
protected puppeteerControl: PuppeteerControl,
|
||||||
protected altTextService: AltTextService,
|
protected altTextService: AltTextService,
|
||||||
|
protected pdfExtractor: PDFExtractor,
|
||||||
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
||||||
protected rateLimitControl: RateLimitControl,
|
protected rateLimitControl: RateLimitControl,
|
||||||
protected threadLocal: AsyncContext,
|
protected threadLocal: AsyncContext,
|
||||||
|
@ -76,7 +77,7 @@ export class CrawlerHost extends RPCHost {
|
||||||
super(...arguments);
|
super(...arguments);
|
||||||
|
|
||||||
puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ScrappingOptions & { url: URL; }) => {
|
puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ScrappingOptions & { url: URL; }) => {
|
||||||
if (!snapshot.title?.trim()) {
|
if (!snapshot.title?.trim() && !snapshot.pdfs?.length) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (options.cookies?.length) {
|
if (options.cookies?.length) {
|
||||||
|
@ -138,14 +139,14 @@ export class CrawlerHost extends RPCHost {
|
||||||
});
|
});
|
||||||
turnDownService.addRule('improved-inline-link', {
|
turnDownService.addRule('improved-inline-link', {
|
||||||
filter: function (node, options) {
|
filter: function (node, options) {
|
||||||
return (
|
return Boolean(
|
||||||
options.linkStyle === 'inlined' &&
|
options.linkStyle === 'inlined' &&
|
||||||
node.nodeName === 'A' &&
|
node.nodeName === 'A' &&
|
||||||
node.getAttribute('href')
|
node.getAttribute('href')
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
|
|
||||||
replacement: function (content, node) {
|
replacement: function (content, node: any) {
|
||||||
let href = node.getAttribute('href');
|
let href = node.getAttribute('href');
|
||||||
if (href) href = href.replace(/([()])/g, '\\$1');
|
if (href) href = href.replace(/([()])/g, '\\$1');
|
||||||
let title = cleanAttribute(node.getAttribute('title'));
|
let title = cleanAttribute(node.getAttribute('title'));
|
||||||
|
@ -226,6 +227,26 @@ export class CrawlerHost extends RPCHost {
|
||||||
}
|
}
|
||||||
} as FormattedPage;
|
} as FormattedPage;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let pdfMode = false;
|
||||||
|
if (snapshot.pdfs?.length && !snapshot.title) {
|
||||||
|
const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0]);
|
||||||
|
if (pdf) {
|
||||||
|
pdfMode = true;
|
||||||
|
snapshot.title = pdf.meta?.Title;
|
||||||
|
snapshot.text = pdf.text || snapshot.text;
|
||||||
|
snapshot.parsed = {
|
||||||
|
content: pdf.content,
|
||||||
|
textContent: pdf.content,
|
||||||
|
length: pdf.content?.length,
|
||||||
|
byline: pdf.meta?.Author,
|
||||||
|
lang: pdf.meta?.Language || undefined,
|
||||||
|
title: pdf.meta?.Title,
|
||||||
|
publishedTime: this.pdfExtractor.parsePdfDate(pdf.meta?.ModDate || pdf.meta?.CreationDate)?.toISOString(),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (mode === 'text') {
|
if (mode === 'text') {
|
||||||
return {
|
return {
|
||||||
...this.getGeneralSnapshotMixins(snapshot),
|
...this.getGeneralSnapshotMixins(snapshot),
|
||||||
|
@ -236,101 +257,108 @@ export class CrawlerHost extends RPCHost {
|
||||||
} as FormattedPage;
|
} as FormattedPage;
|
||||||
}
|
}
|
||||||
|
|
||||||
const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
|
let contentText = '';
|
||||||
let turnDownService = mode === 'markdown' ? this.getTurndown() : this.getTurndown('without any rule');
|
const imageSummary = {} as { [k: string]: string; };
|
||||||
for (const plugin of this.turnDownPlugins) {
|
const imageIdxTrack = new Map<string, number[]>();
|
||||||
turnDownService = turnDownService.use(plugin);
|
do {
|
||||||
}
|
if (pdfMode) {
|
||||||
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
contentText = snapshot.parsed?.content || snapshot.text;
|
||||||
if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
|
break;
|
||||||
const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
|
}
|
||||||
const r = await this.altTextService.getAltText(x).catch((err: any) => {
|
|
||||||
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
|
||||||
return undefined;
|
let turnDownService = mode === 'markdown' ? this.getTurndown() : this.getTurndown('without any rule');
|
||||||
|
for (const plugin of this.turnDownPlugins) {
|
||||||
|
turnDownService = turnDownService.use(plugin);
|
||||||
|
}
|
||||||
|
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
||||||
|
if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
|
||||||
|
const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
|
||||||
|
const r = await this.altTextService.getAltText(x).catch((err: any) => {
|
||||||
|
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
||||||
|
return undefined;
|
||||||
|
});
|
||||||
|
if (r && x.src) {
|
||||||
|
urlToAltMap[x.src.trim()] = r;
|
||||||
|
}
|
||||||
});
|
});
|
||||||
if (r && x.src) {
|
|
||||||
urlToAltMap[x.src.trim()] = r;
|
await Promise.all(tasks);
|
||||||
|
}
|
||||||
|
let imgIdx = 0;
|
||||||
|
turnDownService.addRule('img-generated-alt', {
|
||||||
|
filter: 'img',
|
||||||
|
replacement: (_content, node: any) => {
|
||||||
|
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
|
||||||
|
if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
|
||||||
|
const dataSrc = (node.getAttribute('data-src') || '').trim();
|
||||||
|
if (dataSrc && !dataSrc.startsWith('data:')) {
|
||||||
|
linkPreferredSrc = dataSrc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let src;
|
||||||
|
try {
|
||||||
|
src = new URL(linkPreferredSrc, nominalUrl).toString();
|
||||||
|
} catch (_err) {
|
||||||
|
void 0;
|
||||||
|
}
|
||||||
|
const alt = cleanAttribute(node.getAttribute('alt'));
|
||||||
|
if (!src) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
const mapped = urlToAltMap[src];
|
||||||
|
const imgSerial = ++imgIdx;
|
||||||
|
const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
|
||||||
|
idxArr.push(imgSerial);
|
||||||
|
imageIdxTrack.set(src, idxArr);
|
||||||
|
|
||||||
|
if (mapped) {
|
||||||
|
imageSummary[src] = mapped || alt;
|
||||||
|
|
||||||
|
return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
|
||||||
|
}
|
||||||
|
|
||||||
|
imageSummary[src] = alt || '';
|
||||||
|
|
||||||
|
return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
await Promise.all(tasks);
|
if (toBeTurnedToMd) {
|
||||||
}
|
try {
|
||||||
let imgIdx = 0;
|
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
||||||
const imageSummary = {} as { [k: string]: string; };
|
} catch (err) {
|
||||||
const imageIdxTrack = new Map<string, number[]>();
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
turnDownService.addRule('img-generated-alt', {
|
const vanillaTurnDownService = this.getTurndown();
|
||||||
filter: 'img',
|
try {
|
||||||
replacement: (_content, node) => {
|
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
||||||
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
|
} catch (err2) {
|
||||||
if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
|
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
||||||
const dataSrc = (node.getAttribute('data-src') || '').trim();
|
|
||||||
if (dataSrc && !dataSrc.startsWith('data:')) {
|
|
||||||
linkPreferredSrc = dataSrc;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let src;
|
|
||||||
try {
|
|
||||||
src = new URL(linkPreferredSrc, nominalUrl).toString();
|
|
||||||
} catch (_err) {
|
|
||||||
void 0;
|
|
||||||
}
|
|
||||||
const alt = cleanAttribute(node.getAttribute('alt'));
|
|
||||||
if (!src) {
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
const mapped = urlToAltMap[src];
|
|
||||||
const imgSerial = ++imgIdx;
|
|
||||||
const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
|
|
||||||
idxArr.push(imgSerial);
|
|
||||||
imageIdxTrack.set(src, idxArr);
|
|
||||||
|
|
||||||
if (mapped) {
|
|
||||||
imageSummary[src] = mapped || alt;
|
|
||||||
|
|
||||||
return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
|
|
||||||
}
|
|
||||||
|
|
||||||
imageSummary[src] = alt || '';
|
|
||||||
|
|
||||||
return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
|
|
||||||
}
|
}
|
||||||
});
|
|
||||||
|
|
||||||
let contentText = '';
|
if (
|
||||||
if (toBeTurnedToMd) {
|
!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
|
||||||
try {
|
&& toBeTurnedToMd !== snapshot.html
|
||||||
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
) {
|
||||||
} catch (err) {
|
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
|
||||||
const vanillaTurnDownService = this.getTurndown();
|
|
||||||
try {
|
try {
|
||||||
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
contentText = turnDownService.turndown(snapshot.html);
|
||||||
} catch (err2) {
|
} catch (err) {
|
||||||
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
|
const vanillaTurnDownService = this.getTurndown();
|
||||||
|
try {
|
||||||
|
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
||||||
|
} catch (err2) {
|
||||||
|
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
|
||||||
|
contentText = snapshot.text;
|
||||||
if (
|
|
||||||
!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
|
|
||||||
&& toBeTurnedToMd !== snapshot.html
|
|
||||||
) {
|
|
||||||
try {
|
|
||||||
contentText = turnDownService.turndown(snapshot.html);
|
|
||||||
} catch (err) {
|
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
|
||||||
const vanillaTurnDownService = this.getTurndown();
|
|
||||||
try {
|
|
||||||
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
|
||||||
} catch (err2) {
|
|
||||||
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
} while (false);
|
||||||
if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
|
|
||||||
contentText = snapshot.text;
|
|
||||||
}
|
|
||||||
|
|
||||||
const cleanText = (contentText || '').trim();
|
const cleanText = (contentText || '').trim();
|
||||||
|
|
||||||
|
@ -514,7 +542,8 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||||
req: Request,
|
req: Request,
|
||||||
res: Response,
|
res: Response,
|
||||||
},
|
},
|
||||||
auth: JinaEmbeddingsAuthDTO
|
auth: JinaEmbeddingsAuthDTO,
|
||||||
|
crawlerOptions: CrawlerOptions,
|
||||||
) {
|
) {
|
||||||
const uid = await auth.solveUID();
|
const uid = await auth.solveUID();
|
||||||
let chargeAmount = 0;
|
let chargeAmount = 0;
|
||||||
|
@ -571,6 +600,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
let urlToCrawl;
|
let urlToCrawl;
|
||||||
|
const normalizeUrl = (await pNormalizeUrl).default;
|
||||||
try {
|
try {
|
||||||
urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false }));
|
urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false }));
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
|
@ -586,58 +616,19 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
|
const crawlOpts = this.configure(crawlerOptions);
|
||||||
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
|
|
||||||
const withLinksSummary = Boolean(ctx.req.get('x-with-links-summary'));
|
|
||||||
const withImagesSummary = Boolean(ctx.req.get('x-with-images-summary'));
|
|
||||||
const noCache = Boolean(ctx.req.get('x-no-cache'));
|
|
||||||
let cacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000;
|
|
||||||
if (isNaN(cacheTolerance)) {
|
|
||||||
cacheTolerance = this.cacheValidMs;
|
|
||||||
if (noCache) {
|
|
||||||
cacheTolerance = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
const targetSelector = ctx.req.get('x-target-selector') || undefined;
|
|
||||||
const waitForSelector = ctx.req.get('x-wait-for-selector') || targetSelector;
|
|
||||||
const cookies: CookieParam[] = [];
|
|
||||||
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
|
||||||
if (Array.isArray(setCookieHeaders)) {
|
|
||||||
for (const setCookie of setCookieHeaders) {
|
|
||||||
cookies.push({
|
|
||||||
...parseSetCookieString(setCookie, { decodeValues: false }) as CookieParam,
|
|
||||||
domain: urlToCrawl.hostname,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
} else if (setCookieHeaders) {
|
|
||||||
cookies.push({
|
|
||||||
...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
|
|
||||||
domain: urlToCrawl.hostname,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
|
|
||||||
this.threadLocal.set('withLinksSummary', withLinksSummary);
|
|
||||||
this.threadLocal.set('withImagesSummary', withImagesSummary);
|
|
||||||
|
|
||||||
const crawlOpts: ExtraScrappingOptions = {
|
|
||||||
proxyUrl: ctx.req.get('x-proxy-url'),
|
|
||||||
cookies,
|
|
||||||
favorScreenshot: customMode === 'screenshot',
|
|
||||||
waitForSelector,
|
|
||||||
targetSelector,
|
|
||||||
};
|
|
||||||
|
|
||||||
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
||||||
const sseStream = new OutputServerEventStream();
|
const sseStream = new OutputServerEventStream();
|
||||||
rpcReflect.return(sseStream);
|
rpcReflect.return(sseStream);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
|
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
|
||||||
if (!scrapped) {
|
if (!scrapped) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
|
||||||
chargeAmount = this.getChargeAmount(formatted);
|
chargeAmount = this.getChargeAmount(formatted);
|
||||||
sseStream.write({
|
sseStream.write({
|
||||||
event: 'data',
|
event: 'data',
|
||||||
|
@ -659,13 +650,13 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||||
|
|
||||||
let lastScrapped;
|
let lastScrapped;
|
||||||
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
||||||
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
|
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
|
||||||
lastScrapped = scrapped;
|
lastScrapped = scrapped;
|
||||||
if (waitForSelector || !scrapped?.parsed?.content || !(scrapped.title?.trim())) {
|
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
|
||||||
chargeAmount = this.getChargeAmount(formatted);
|
chargeAmount = this.getChargeAmount(formatted);
|
||||||
|
|
||||||
return formatted;
|
return formatted;
|
||||||
|
@ -675,21 +666,21 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||||
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
|
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, urlToCrawl);
|
||||||
chargeAmount = this.getChargeAmount(formatted);
|
chargeAmount = this.getChargeAmount(formatted);
|
||||||
|
|
||||||
return formatted;
|
return formatted;
|
||||||
}
|
}
|
||||||
|
|
||||||
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
|
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
|
||||||
lastScrapped = scrapped;
|
lastScrapped = scrapped;
|
||||||
if (waitForSelector || !scrapped?.parsed?.content || !(scrapped.title?.trim())) {
|
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
|
||||||
chargeAmount = this.getChargeAmount(formatted);
|
chargeAmount = this.getChargeAmount(formatted);
|
||||||
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted}`,
|
return assignTransferProtocolMeta(`${formatted}`,
|
||||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
||||||
|
@ -703,9 +694,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||||
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
|
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, urlToCrawl);
|
||||||
chargeAmount = this.getChargeAmount(formatted);
|
chargeAmount = this.getChargeAmount(formatted);
|
||||||
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted}`,
|
return assignTransferProtocolMeta(`${formatted}`,
|
||||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
||||||
|
@ -915,4 +906,55 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
configure(opts: CrawlerOptions) {
|
||||||
|
|
||||||
|
this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
|
||||||
|
this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
|
||||||
|
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
|
||||||
|
|
||||||
|
const crawlOpts: ExtraScrappingOptions = {
|
||||||
|
proxyUrl: opts.proxyUrl,
|
||||||
|
cookies: opts.setCookies,
|
||||||
|
favorScreenshot: opts.respondWith === 'screenshot',
|
||||||
|
waitForSelector: opts.waitForSelector,
|
||||||
|
targetSelector: opts.targetSelector,
|
||||||
|
};
|
||||||
|
|
||||||
|
return crawlOpts;
|
||||||
|
}
|
||||||
|
|
||||||
|
async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
|
||||||
|
const it = this.cachedScrap(url, { ...opts, minIntervalMs: 500 });
|
||||||
|
|
||||||
|
let lastSnapshot;
|
||||||
|
let goodEnough = false;
|
||||||
|
try {
|
||||||
|
for await (const x of it) {
|
||||||
|
lastSnapshot = x;
|
||||||
|
|
||||||
|
if (goodEnough) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lastSnapshot?.parsed?.content) {
|
||||||
|
// After it's good enough, wait for next snapshot;
|
||||||
|
goodEnough = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (err) {
|
||||||
|
if (lastSnapshot) {
|
||||||
|
return this.formatSnapshot(mode, lastSnapshot, url);
|
||||||
|
}
|
||||||
|
|
||||||
|
throw err;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!lastSnapshot) {
|
||||||
|
throw new AssertionFailureError(`No content available`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return this.formatSnapshot(mode, lastSnapshot, url);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
289
backend/functions/src/cloud-functions/data-crunching.ts
Normal file
289
backend/functions/src/cloud-functions/data-crunching.ts
Normal file
|
@ -0,0 +1,289 @@
|
||||||
|
import {
|
||||||
|
Defer,
|
||||||
|
PromiseThrottle,
|
||||||
|
RPCHost,
|
||||||
|
RPCReflection,
|
||||||
|
} from 'civkit';
|
||||||
|
import { singleton } from 'tsyringe';
|
||||||
|
import { CloudScheduleV2, CloudTaskV2, FirebaseStorageBucketControl, Logger, OutputServerEventStream, Param, RPCReflect, TempFileManager } from '../shared';
|
||||||
|
import _ from 'lodash';
|
||||||
|
import { CrawlerHost } from './crawler';
|
||||||
|
|
||||||
|
import { Crawled } from '../db/crawled';
|
||||||
|
import dayjs from 'dayjs';
|
||||||
|
import { createReadStream } from 'fs';
|
||||||
|
import { appendFile } from 'fs/promises';
|
||||||
|
import { createGzip } from 'zlib';
|
||||||
|
import { getFunctions } from 'firebase-admin/functions';
|
||||||
|
import { GoogleAuth } from 'google-auth-library';
|
||||||
|
|
||||||
|
dayjs.extend(require('dayjs/plugin/utc'));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the URL of a given v2 cloud function.
|
||||||
|
*
|
||||||
|
* @param {string} name the function's name
|
||||||
|
* @param {string} location the function's location
|
||||||
|
* @return {Promise<string>} The URL of the function
|
||||||
|
*/
|
||||||
|
async function getFunctionUrl(name: string, location = "us-central1") {
|
||||||
|
const projectId = `reader-6b7dc`;
|
||||||
|
const url = "https://cloudfunctions.googleapis.com/v2beta/" +
|
||||||
|
`projects/${projectId}/locations/${location}/functions/${name}`;
|
||||||
|
const auth = new GoogleAuth({
|
||||||
|
scopes: 'https://www.googleapis.com/auth/cloud-platform',
|
||||||
|
});
|
||||||
|
const client = await auth.getClient();
|
||||||
|
const res = await client.request<any>({ url });
|
||||||
|
const uri = res.data?.serviceConfig?.uri;
|
||||||
|
if (!uri) {
|
||||||
|
throw new Error(`Unable to retreive uri for function at ${url}`);
|
||||||
|
}
|
||||||
|
return uri;
|
||||||
|
}
|
||||||
|
|
||||||
|
@singleton()
|
||||||
|
export class DataCrunchingHost extends RPCHost {
|
||||||
|
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||||
|
|
||||||
|
pageCacheCrunchingPrefix = 'crunched-pages';
|
||||||
|
pageCacheCrunchingBatchSize = 5000;
|
||||||
|
pageCacheCrunchingTMinus = 6 * 24 * 60 * 60 * 1000;
|
||||||
|
rev = 7;
|
||||||
|
|
||||||
|
constructor(
|
||||||
|
protected globalLogger: Logger,
|
||||||
|
|
||||||
|
protected crawler: CrawlerHost,
|
||||||
|
protected tempFileManager: TempFileManager,
|
||||||
|
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
||||||
|
) {
|
||||||
|
super(..._.without(arguments, crawler));
|
||||||
|
}
|
||||||
|
|
||||||
|
override async init() {
|
||||||
|
await this.dependencyReady();
|
||||||
|
|
||||||
|
this.emit('ready');
|
||||||
|
}
|
||||||
|
|
||||||
|
@CloudTaskV2({
|
||||||
|
runtime: {
|
||||||
|
cpu: 2,
|
||||||
|
memory: '4GiB',
|
||||||
|
timeoutSeconds: 3600,
|
||||||
|
concurrency: 2,
|
||||||
|
maxInstances: 200,
|
||||||
|
retryConfig: {
|
||||||
|
maxAttempts: 3,
|
||||||
|
minBackoffSeconds: 60,
|
||||||
|
},
|
||||||
|
rateLimits: {
|
||||||
|
maxConcurrentDispatches: 150,
|
||||||
|
maxDispatchesPerSecond: 2,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
tags: ['DataCrunching'],
|
||||||
|
})
|
||||||
|
async crunchPageCacheWorker(
|
||||||
|
@Param('date') date: string,
|
||||||
|
@Param('offset', { default: 0 }) offset: number
|
||||||
|
) {
|
||||||
|
this.logger.info(`Crunching page cache @${date}+${offset}...`);
|
||||||
|
for await (const { fileName, records } of this.iterPageCacheRecords(date, offset)) {
|
||||||
|
this.logger.info(`Crunching ${fileName}...`);
|
||||||
|
const fileOnDrive = await this.crunchCacheRecords(records);
|
||||||
|
const fstream = createReadStream(fileOnDrive.path);
|
||||||
|
const gzipStream = createGzip();
|
||||||
|
fstream.pipe(gzipStream, { end: true });
|
||||||
|
await this.firebaseObjectStorage.bucket.file(fileName).save(gzipStream, {
|
||||||
|
contentType: 'application/jsonl+gzip',
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
this.logger.info(`Crunching page cache @${date}+${offset} done.`);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@CloudScheduleV2('2 0 * * *', {
|
||||||
|
name: 'crunchPageCacheEveryday',
|
||||||
|
runtime: {
|
||||||
|
cpu: 2,
|
||||||
|
memory: '4GiB',
|
||||||
|
timeoutSeconds: 1800,
|
||||||
|
timeZone: 'UTC',
|
||||||
|
retryCount: 3,
|
||||||
|
minBackoffSeconds: 60,
|
||||||
|
},
|
||||||
|
tags: ['DataCrunching'],
|
||||||
|
})
|
||||||
|
// @CloudHTTPv2({
|
||||||
|
// runtime: {
|
||||||
|
// cpu: 2,
|
||||||
|
// memory: '4GiB',
|
||||||
|
// timeoutSeconds: 3600,
|
||||||
|
// concurrency: 2,
|
||||||
|
// maxInstances: 200,
|
||||||
|
// },
|
||||||
|
// tags: ['DataCrunching'],
|
||||||
|
// })
|
||||||
|
async dispatchPageCacheCrunching(
|
||||||
|
@RPCReflect() rpcReflect: RPCReflection,
|
||||||
|
) {
|
||||||
|
const sse = new OutputServerEventStream({ highWaterMark: 4096 });
|
||||||
|
rpcReflect.return(sse);
|
||||||
|
rpcReflect.catch((err) => {
|
||||||
|
sse.end({ data: `Error: ${err.message}` });
|
||||||
|
});
|
||||||
|
for await (const { fileName, date, offset } of this.iterPageCacheChunks()) {
|
||||||
|
this.logger.info(`Dispatching ${fileName}...`);
|
||||||
|
sse.write({ data: `Dispatching ${fileName}...` });
|
||||||
|
|
||||||
|
await getFunctions().taskQueue('crunchPageCacheWorker').enqueue({ date, offset }, {
|
||||||
|
dispatchDeadlineSeconds: 1800,
|
||||||
|
uri: await getFunctionUrl('crunchPageCacheWorker'),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
sse.end({ data: 'done' });
|
||||||
|
sse.resume();
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
async* iterPageCacheRecords(date?: string, inputOffset?: number | string) {
|
||||||
|
const startOfToday = dayjs().utc().startOf('day');
|
||||||
|
const startingPoint = dayjs().utc().subtract(this.pageCacheCrunchingTMinus, 'ms').startOf('day');
|
||||||
|
let theDay = startingPoint;
|
||||||
|
|
||||||
|
if (date) {
|
||||||
|
theDay = dayjs(date).utc().startOf('day');
|
||||||
|
}
|
||||||
|
|
||||||
|
let counter = 0;
|
||||||
|
if (inputOffset) {
|
||||||
|
counter = parseInt(inputOffset as string, 10);
|
||||||
|
}
|
||||||
|
|
||||||
|
while (theDay.isBefore(startOfToday)) {
|
||||||
|
const fileName = `${this.pageCacheCrunchingPrefix}/r${this.rev}/${theDay.format('YYYY-MM-DD')}/${counter}.jsonl.gz`;
|
||||||
|
const offset = counter;
|
||||||
|
counter += this.pageCacheCrunchingBatchSize;
|
||||||
|
const fileExists = (await this.firebaseObjectStorage.bucket.file(fileName).exists())[0];
|
||||||
|
if (fileExists) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const records = await Crawled.fromFirestoreQuery(Crawled.COLLECTION
|
||||||
|
.where('createdAt', '>=', theDay.toDate())
|
||||||
|
.where('createdAt', '<', theDay.add(1, 'day').toDate())
|
||||||
|
.orderBy('createdAt', 'asc')
|
||||||
|
.offset(offset)
|
||||||
|
.limit(this.pageCacheCrunchingBatchSize)
|
||||||
|
);
|
||||||
|
|
||||||
|
this.logger.info(`Found ${records.length} records for ${theDay.format('YYYY-MM-DD')} at offset ${offset}`, { fileName, counter });
|
||||||
|
|
||||||
|
if (!records.length) {
|
||||||
|
if (date) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
theDay = theDay.add(1, 'day');
|
||||||
|
counter = 0;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
yield { fileName, records };
|
||||||
|
|
||||||
|
if (offset) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async* iterPageCacheChunks() {
|
||||||
|
const startOfToday = dayjs().utc().startOf('day');
|
||||||
|
const startingPoint = dayjs().utc().subtract(this.pageCacheCrunchingTMinus, 'ms').startOf('day');
|
||||||
|
let theDay = startingPoint;
|
||||||
|
|
||||||
|
let counter = 0;
|
||||||
|
|
||||||
|
while (theDay.isBefore(startOfToday)) {
|
||||||
|
const fileName = `${this.pageCacheCrunchingPrefix}/r${this.rev}/${theDay.format('YYYY-MM-DD')}/${counter}.jsonl.gz`;
|
||||||
|
const offset = counter;
|
||||||
|
counter += this.pageCacheCrunchingBatchSize;
|
||||||
|
const fileExists = (await this.firebaseObjectStorage.bucket.file(fileName).exists())[0];
|
||||||
|
if (fileExists) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const nRecords = (await Crawled.COLLECTION
|
||||||
|
.where('createdAt', '>=', theDay.toDate())
|
||||||
|
.where('createdAt', '<', theDay.add(1, 'day').toDate())
|
||||||
|
.orderBy('createdAt', 'asc')
|
||||||
|
.offset(offset)
|
||||||
|
.limit(this.pageCacheCrunchingBatchSize)
|
||||||
|
.count().get()).data().count;
|
||||||
|
|
||||||
|
this.logger.info(`Found ${nRecords} records for ${theDay.format('YYYY-MM-DD')} at offset ${offset}`, { fileName, counter });
|
||||||
|
if (nRecords < this.pageCacheCrunchingBatchSize) {
|
||||||
|
theDay = theDay.add(1, 'day');
|
||||||
|
counter = 0;
|
||||||
|
}
|
||||||
|
if (nRecords) {
|
||||||
|
yield { fileName, date: theDay.toISOString(), offset };
|
||||||
|
}
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async crunchCacheRecords(records: Crawled[]) {
|
||||||
|
const throttle = new PromiseThrottle(30);
|
||||||
|
const localFilePath = this.tempFileManager.alloc();
|
||||||
|
let nextDrainDeferred = Defer();
|
||||||
|
nextDrainDeferred.resolve();
|
||||||
|
|
||||||
|
for (const record of records) {
|
||||||
|
await throttle.acquire();
|
||||||
|
this.firebaseObjectStorage.downloadFile(`snapshots/${record._id}`)
|
||||||
|
.then(async (snapshotTxt) => {
|
||||||
|
try {
|
||||||
|
const snapshot = JSON.parse(snapshotTxt.toString('utf-8'));
|
||||||
|
|
||||||
|
let formatted = await this.crawler.formatSnapshot('default', snapshot);
|
||||||
|
if (!formatted.content) {
|
||||||
|
formatted = await this.crawler.formatSnapshot('markdown', snapshot);
|
||||||
|
}
|
||||||
|
|
||||||
|
await nextDrainDeferred.promise;
|
||||||
|
await appendFile(localFilePath, JSON.stringify({
|
||||||
|
url: snapshot.href,
|
||||||
|
title: snapshot.title || '',
|
||||||
|
html: snapshot.html || '',
|
||||||
|
text: snapshot.text || '',
|
||||||
|
content: formatted.content || '',
|
||||||
|
}) + '\n', { encoding: 'utf-8' });
|
||||||
|
|
||||||
|
} catch (err) {
|
||||||
|
this.logger.warn(`Failed to parse snapshot for ${record._id}`, { err });
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.finally(() => {
|
||||||
|
throttle.release();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
await throttle.nextDrain();
|
||||||
|
|
||||||
|
|
||||||
|
const ro = {
|
||||||
|
path: localFilePath
|
||||||
|
};
|
||||||
|
|
||||||
|
this.tempFileManager.bindPathTo(ro, localFilePath);
|
||||||
|
|
||||||
|
return ro;
|
||||||
|
}
|
||||||
|
}
|
|
@ -19,6 +19,7 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||||
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
||||||
import { SearchResult } from '../db/searched';
|
import { SearchResult } from '../db/searched';
|
||||||
import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
|
import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
|
||||||
|
import { CrawlerOptions } from '../dto/scrapping-options';
|
||||||
|
|
||||||
|
|
||||||
@singleton()
|
@singleton()
|
||||||
|
@ -145,7 +146,8 @@ export class SearcherHost extends RPCHost {
|
||||||
req: Request,
|
req: Request,
|
||||||
res: Response,
|
res: Response,
|
||||||
},
|
},
|
||||||
auth: JinaEmbeddingsAuthDTO
|
auth: JinaEmbeddingsAuthDTO,
|
||||||
|
crawlerOptions: CrawlerOptions,
|
||||||
) {
|
) {
|
||||||
const uid = await auth.solveUID();
|
const uid = await auth.solveUID();
|
||||||
let chargeAmount = 0;
|
let chargeAmount = 0;
|
||||||
|
@ -201,18 +203,7 @@ export class SearcherHost extends RPCHost {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
|
const crawlOpts = this.crawler.configure(crawlerOptions);
|
||||||
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
|
|
||||||
const withLinksSummary = Boolean(ctx.req.get('x-with-links-summary'));
|
|
||||||
const withImagesSummary = Boolean(ctx.req.get('x-with-images-summary'));
|
|
||||||
const noCache = Boolean(ctx.req.get('x-no-cache'));
|
|
||||||
let pageCacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000;
|
|
||||||
if (isNaN(pageCacheTolerance)) {
|
|
||||||
pageCacheTolerance = this.pageCacheToleranceMs;
|
|
||||||
if (noCache) {
|
|
||||||
pageCacheTolerance = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
const cookies: CookieParam[] = [];
|
const cookies: CookieParam[] = [];
|
||||||
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
||||||
if (Array.isArray(setCookieHeaders)) {
|
if (Array.isArray(setCookieHeaders)) {
|
||||||
|
@ -226,27 +217,19 @@ export class SearcherHost extends RPCHost {
|
||||||
...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
|
...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
|
|
||||||
this.threadLocal.set('withLinksSummary', withLinksSummary);
|
|
||||||
this.threadLocal.set('withImagesSummary', withImagesSummary);
|
|
||||||
|
|
||||||
const crawlOpts: ScrappingOptions = {
|
|
||||||
proxyUrl: ctx.req.get('x-proxy-url'),
|
|
||||||
cookies,
|
|
||||||
favorScreenshot: customMode === 'screenshot'
|
|
||||||
};
|
|
||||||
|
|
||||||
const searchQuery = noSlashPath;
|
const searchQuery = noSlashPath;
|
||||||
const r = await this.cachedWebSearch({
|
const r = await this.cachedWebSearch({
|
||||||
q: searchQuery,
|
q: searchQuery,
|
||||||
count: 10
|
count: 10
|
||||||
}, noCache);
|
}, crawlerOptions.noCache);
|
||||||
|
|
||||||
if (!r.web?.results.length) {
|
if (!r.web?.results.length) {
|
||||||
throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
|
throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const it = this.fetchSearchResults(customMode, r.web?.results, crawlOpts, pageCacheTolerance);
|
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
||||||
|
crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
|
||||||
|
);
|
||||||
|
|
||||||
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
||||||
const sseStream = new OutputServerEventStream();
|
const sseStream = new OutputServerEventStream();
|
||||||
|
|
65
backend/functions/src/db/pdf.ts
Normal file
65
backend/functions/src/db/pdf.ts
Normal file
|
@ -0,0 +1,65 @@
|
||||||
|
import { Also, Prop, parseJSONText } from 'civkit';
|
||||||
|
import { FirestoreRecord } from '../shared/lib/firestore';
|
||||||
|
import _ from 'lodash';
|
||||||
|
|
||||||
|
@Also({
|
||||||
|
dictOf: Object
|
||||||
|
})
|
||||||
|
export class PDFContent extends FirestoreRecord {
|
||||||
|
static override collectionName = 'pdfs';
|
||||||
|
|
||||||
|
override _id!: string;
|
||||||
|
|
||||||
|
@Prop({
|
||||||
|
required: true
|
||||||
|
})
|
||||||
|
src!: string;
|
||||||
|
|
||||||
|
@Prop({
|
||||||
|
required: true
|
||||||
|
})
|
||||||
|
urlDigest!: string;
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
meta?: { [k: string]: any; };
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
text?: string;
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
content?: string;
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
createdAt!: Date;
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
expireAt?: Date;
|
||||||
|
|
||||||
|
static patchedFields = [
|
||||||
|
'meta'
|
||||||
|
];
|
||||||
|
|
||||||
|
static override from(input: any) {
|
||||||
|
for (const field of this.patchedFields) {
|
||||||
|
if (typeof input[field] === 'string') {
|
||||||
|
input[field] = parseJSONText(input[field]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return super.from(input) as PDFContent;
|
||||||
|
}
|
||||||
|
|
||||||
|
override degradeForFireStore() {
|
||||||
|
const copy: any = { ...this };
|
||||||
|
|
||||||
|
for (const field of (this.constructor as typeof PDFContent).patchedFields) {
|
||||||
|
if (typeof copy[field] === 'object') {
|
||||||
|
copy[field] = JSON.stringify(copy[field]) as any;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return copy;
|
||||||
|
}
|
||||||
|
|
||||||
|
[k: string]: any;
|
||||||
|
}
|
110
backend/functions/src/dto/scrapping-options.ts
Normal file
110
backend/functions/src/dto/scrapping-options.ts
Normal file
|
@ -0,0 +1,110 @@
|
||||||
|
import { AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
||||||
|
import type { Request, Response } from 'express';
|
||||||
|
import type { CookieParam } from 'puppeteer';
|
||||||
|
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||||
|
|
||||||
|
export class CrawlerOptions extends AutoCastable {
|
||||||
|
|
||||||
|
@Prop({
|
||||||
|
default: 'default',
|
||||||
|
})
|
||||||
|
respondWith!: string;
|
||||||
|
|
||||||
|
@Prop({
|
||||||
|
default: false,
|
||||||
|
})
|
||||||
|
withGeneratedAlt!: boolean;
|
||||||
|
|
||||||
|
@Prop({
|
||||||
|
default: false,
|
||||||
|
})
|
||||||
|
withLinksSummary!: boolean;
|
||||||
|
|
||||||
|
@Prop({
|
||||||
|
default: false,
|
||||||
|
})
|
||||||
|
withImagesSummary!: boolean;
|
||||||
|
|
||||||
|
@Prop({
|
||||||
|
default: false,
|
||||||
|
})
|
||||||
|
noCache!: boolean;
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
cacheTolerance?: number;
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
targetSelector?: string;
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
waitForSelector?: string;
|
||||||
|
|
||||||
|
@Prop({
|
||||||
|
arrayOf: String,
|
||||||
|
})
|
||||||
|
setCookies?: CookieParam[];
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
proxyUrl?: string;
|
||||||
|
|
||||||
|
static override from(input: any) {
|
||||||
|
const instance = super.from(input) as CrawlerOptions;
|
||||||
|
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
||||||
|
req: Request,
|
||||||
|
res: Response,
|
||||||
|
};
|
||||||
|
|
||||||
|
const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format');
|
||||||
|
if (customMode !== undefined) {
|
||||||
|
instance.respondWith = customMode;
|
||||||
|
}
|
||||||
|
|
||||||
|
const withGeneratedAlt = ctx.req.get('x-with-generated-alt');
|
||||||
|
if (withGeneratedAlt !== undefined) {
|
||||||
|
instance.withGeneratedAlt = Boolean(withGeneratedAlt);
|
||||||
|
}
|
||||||
|
const withLinksSummary = ctx.req.get('x-with-links-summary');
|
||||||
|
if (withLinksSummary !== undefined) {
|
||||||
|
instance.withLinksSummary = Boolean(withLinksSummary);
|
||||||
|
}
|
||||||
|
const withImagesSummary = ctx.req.get('x-with-images-summary');
|
||||||
|
if (withImagesSummary !== undefined) {
|
||||||
|
instance.withImagesSummary = Boolean(withImagesSummary);
|
||||||
|
}
|
||||||
|
const noCache = ctx.req.get('x-no-cache');
|
||||||
|
if (noCache !== undefined) {
|
||||||
|
instance.noCache = Boolean(noCache);
|
||||||
|
if (instance.noCache && instance.cacheTolerance === undefined) {
|
||||||
|
instance.cacheTolerance = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let cacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '');
|
||||||
|
if (!isNaN(cacheTolerance)) {
|
||||||
|
instance.cacheTolerance = cacheTolerance;
|
||||||
|
}
|
||||||
|
|
||||||
|
const targetSelector = ctx.req.get('x-target-selector');
|
||||||
|
instance.targetSelector ??= targetSelector;
|
||||||
|
const waitForSelector = ctx.req.get('x-wait-for-selector');
|
||||||
|
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
|
||||||
|
|
||||||
|
const cookies: CookieParam[] = [];
|
||||||
|
const setCookieHeaders = ctx.req.headers['x-set-cookie'] || (instance.setCookies as any as string[]);
|
||||||
|
if (Array.isArray(setCookieHeaders)) {
|
||||||
|
for (const setCookie of setCookieHeaders) {
|
||||||
|
cookies.push({
|
||||||
|
...parseSetCookieString(setCookie, { decodeValues: false }) as CookieParam,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} else if (setCookieHeaders && typeof setCookieHeaders === 'string') {
|
||||||
|
cookies.push({
|
||||||
|
...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
const proxyUrl = ctx.req.get('x-proxy-url');
|
||||||
|
instance.proxyUrl ??= proxyUrl;
|
||||||
|
|
||||||
|
return instance;
|
||||||
|
}
|
||||||
|
}
|
|
@ -6,7 +6,6 @@ import { ImageInterrogationManager } from '../shared/services/common-iminterroga
|
||||||
import { ImgBrief } from './puppeteer';
|
import { ImgBrief } from './puppeteer';
|
||||||
import { ImgAlt } from '../db/img-alt';
|
import { ImgAlt } from '../db/img-alt';
|
||||||
|
|
||||||
|
|
||||||
const md5Hasher = new HashManager('md5', 'hex');
|
const md5Hasher = new HashManager('md5', 'hex');
|
||||||
|
|
||||||
@singleton()
|
@singleton()
|
||||||
|
|
298
backend/functions/src/services/pdf-extract.ts
Normal file
298
backend/functions/src/services/pdf-extract.ts
Normal file
|
@ -0,0 +1,298 @@
|
||||||
|
import 'core-js/actual/promise/with-resolvers';
|
||||||
|
import { singleton } from 'tsyringe';
|
||||||
|
import _ from 'lodash';
|
||||||
|
import { TextItem } from 'pdfjs-dist/types/src/display/api';
|
||||||
|
import { AsyncService, HashManager } from 'civkit';
|
||||||
|
import { Logger } from '../shared/services/logger';
|
||||||
|
import { PDFContent } from '../db/pdf';
|
||||||
|
import dayjs from 'dayjs';
|
||||||
|
const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
|
||||||
|
dayjs.extend(utc); // Extend dayjs with the UTC plugin
|
||||||
|
const timezone = require('dayjs/plugin/timezone');
|
||||||
|
dayjs.extend(timezone);
|
||||||
|
|
||||||
|
const pPdfjs = import('pdfjs-dist');
|
||||||
|
|
||||||
|
|
||||||
|
const md5Hasher = new HashManager('md5', 'hex');
|
||||||
|
|
||||||
|
function stdDev(numbers: number[]) {
|
||||||
|
const mean = _.mean(numbers);
|
||||||
|
const squareDiffs = numbers.map((num) => Math.pow(num - mean, 2));
|
||||||
|
const avgSquareDiff = _.mean(squareDiffs);
|
||||||
|
return Math.sqrt(avgSquareDiff);
|
||||||
|
}
|
||||||
|
|
||||||
|
function isRotatedByAtLeast35Degrees(transform: [number, number, number, number, number, number]): boolean {
|
||||||
|
const [a, b, c, d, _e, _f] = transform;
|
||||||
|
|
||||||
|
// Calculate the rotation angles using arctan(b/a) and arctan(-c/d)
|
||||||
|
const angle1 = Math.atan2(b, a) * (180 / Math.PI); // from a, b
|
||||||
|
const angle2 = Math.atan2(-c, d) * (180 / Math.PI); // from c, d
|
||||||
|
|
||||||
|
// Either angle1 or angle2 can be used to determine the rotation, they should be equivalent
|
||||||
|
const rotationAngle1 = Math.abs(angle1);
|
||||||
|
const rotationAngle2 = Math.abs(angle2);
|
||||||
|
|
||||||
|
// Check if the absolute rotation angle is greater than or equal to 35 degrees
|
||||||
|
return rotationAngle1 >= 35 || rotationAngle2 >= 35;
|
||||||
|
}
|
||||||
|
|
||||||
|
@singleton()
|
||||||
|
export class PDFExtractor extends AsyncService {
|
||||||
|
|
||||||
|
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||||
|
pdfjs!: Awaited<typeof pPdfjs>;
|
||||||
|
|
||||||
|
constructor(
|
||||||
|
protected globalLogger: Logger,
|
||||||
|
) {
|
||||||
|
super(...arguments);
|
||||||
|
}
|
||||||
|
|
||||||
|
override async init() {
|
||||||
|
await this.dependencyReady();
|
||||||
|
this.pdfjs = await pPdfjs;
|
||||||
|
|
||||||
|
this.emit('ready');
|
||||||
|
}
|
||||||
|
|
||||||
|
async extract(url: string | URL) {
|
||||||
|
const loadingTask = this.pdfjs.getDocument({
|
||||||
|
url,
|
||||||
|
disableFontFace: true,
|
||||||
|
verbosity: 0
|
||||||
|
});
|
||||||
|
|
||||||
|
const doc = await loadingTask.promise;
|
||||||
|
const meta = await doc.getMetadata();
|
||||||
|
|
||||||
|
const textItems: TextItem[][] = [];
|
||||||
|
|
||||||
|
for (const pg of _.range(0, doc.numPages)) {
|
||||||
|
const page = await doc.getPage(pg + 1);
|
||||||
|
const textContent = await page.getTextContent();
|
||||||
|
textItems.push((textContent.items as TextItem[]));
|
||||||
|
}
|
||||||
|
|
||||||
|
const articleCharHeights: number[] = [];
|
||||||
|
for (const textItem of textItems.flat()) {
|
||||||
|
if (textItem.height) {
|
||||||
|
articleCharHeights.push(...Array(textItem.str.length).fill(textItem.height));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const articleAvgHeight = _.mean(articleCharHeights);
|
||||||
|
const articleStdDevHeight = stdDev(articleCharHeights);
|
||||||
|
// const articleMedianHeight = articleCharHeights.sort()[Math.floor(articleCharHeights.length / 2)];
|
||||||
|
const mdOps: Array<{
|
||||||
|
text: string;
|
||||||
|
op?: 'new' | 'append';
|
||||||
|
mode: 'h1' | 'h2' | 'p' | 'appendix' | 'space';
|
||||||
|
}> = [];
|
||||||
|
|
||||||
|
const rawChunks: string[] = [];
|
||||||
|
|
||||||
|
let op: 'append' | 'new' = 'new';
|
||||||
|
let mode: 'h1' | 'h2' | 'p' | 'space' | 'appendix' = 'p';
|
||||||
|
for (const pageTextItems of textItems) {
|
||||||
|
const charHeights = [];
|
||||||
|
for (const textItem of pageTextItems as TextItem[]) {
|
||||||
|
if (textItem.height) {
|
||||||
|
charHeights.push(...Array(textItem.str.length).fill(textItem.height));
|
||||||
|
}
|
||||||
|
rawChunks.push(`${textItem.str}${textItem.hasEOL ? '\n' : ''}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const avgHeight = _.mean(charHeights);
|
||||||
|
const stdDevHeight = stdDev(charHeights);
|
||||||
|
// const medianHeight = charHeights.sort()[Math.floor(charHeights.length / 2)];
|
||||||
|
|
||||||
|
for (const textItem of pageTextItems) {
|
||||||
|
if (textItem.height > articleAvgHeight + 3 * articleStdDevHeight) {
|
||||||
|
mode = 'h1';
|
||||||
|
} else if (textItem.height > articleAvgHeight + 2 * articleStdDevHeight) {
|
||||||
|
mode = 'h2';
|
||||||
|
} else if (textItem.height && textItem.height < avgHeight - stdDevHeight) {
|
||||||
|
mode = 'appendix';
|
||||||
|
} else if (textItem.height) {
|
||||||
|
mode = 'p';
|
||||||
|
} else {
|
||||||
|
mode = 'space';
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isRotatedByAtLeast35Degrees(textItem.transform as any)) {
|
||||||
|
mode = 'appendix';
|
||||||
|
}
|
||||||
|
|
||||||
|
mdOps.push({
|
||||||
|
op,
|
||||||
|
mode,
|
||||||
|
text: textItem.str
|
||||||
|
});
|
||||||
|
|
||||||
|
if (textItem.hasEOL && !textItem.str) {
|
||||||
|
op = 'new';
|
||||||
|
} else {
|
||||||
|
op = 'append';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const mdChunks = [];
|
||||||
|
const appendixChunks = [];
|
||||||
|
mode = 'space';
|
||||||
|
for (const x of mdOps) {
|
||||||
|
const previousMode: string = mode;
|
||||||
|
const changeToMdChunks = [];
|
||||||
|
|
||||||
|
const isNewStart = x.mode !== 'space' && (x.op === 'new' || (previousMode === 'appendix' && x.mode !== previousMode));
|
||||||
|
|
||||||
|
if (isNewStart) {
|
||||||
|
switch (x.mode) {
|
||||||
|
case 'h1': {
|
||||||
|
changeToMdChunks.push(`\n\n# `);
|
||||||
|
mode = x.mode;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case 'h2': {
|
||||||
|
changeToMdChunks.push(`\n\n## `);
|
||||||
|
mode = x.mode;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case 'p': {
|
||||||
|
changeToMdChunks.push(`\n\n`);
|
||||||
|
mode = x.mode;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case 'appendix': {
|
||||||
|
mode = x.mode;
|
||||||
|
appendixChunks.push(`\n\n`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
default: {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (x.mode === 'appendix' && appendixChunks.length) {
|
||||||
|
const lastChunk = appendixChunks[appendixChunks.length - 1];
|
||||||
|
if (!lastChunk.match(/(\s+|-)$/) && lastChunk.length !== 1) {
|
||||||
|
appendixChunks.push(' ');
|
||||||
|
}
|
||||||
|
} else if (mdChunks.length) {
|
||||||
|
const lastChunk = mdChunks[mdChunks.length - 1];
|
||||||
|
if (!lastChunk.match(/(\s+|-)$/) && lastChunk.length !== 1) {
|
||||||
|
changeToMdChunks.push(' ');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (x.text) {
|
||||||
|
if (x.mode == 'appendix') {
|
||||||
|
if (appendixChunks.length || isNewStart) {
|
||||||
|
appendixChunks.push(x.text);
|
||||||
|
} else {
|
||||||
|
changeToMdChunks.push(x.text);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
changeToMdChunks.push(x.text);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isNewStart && x.mode !== 'appendix' && appendixChunks.length) {
|
||||||
|
const appendix = appendixChunks.join('').split(/\r?\n/).map((x) => x.trim()).filter(Boolean).map((x) => `> ${x}`).join('\n');
|
||||||
|
changeToMdChunks.unshift(appendix);
|
||||||
|
changeToMdChunks.unshift(`\n\n`);
|
||||||
|
appendixChunks.length = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (x.mode === 'space' && changeToMdChunks.length) {
|
||||||
|
changeToMdChunks.length = 1;
|
||||||
|
}
|
||||||
|
if (changeToMdChunks.length) {
|
||||||
|
mdChunks.push(...changeToMdChunks);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (mdChunks.length) {
|
||||||
|
mdChunks[0] = mdChunks[0].trimStart();
|
||||||
|
}
|
||||||
|
|
||||||
|
return { meta: meta.info as Record<string, any>, content: mdChunks.join(''), text: rawChunks.join('') };
|
||||||
|
}
|
||||||
|
|
||||||
|
async cachedExtract(url: string | URL) {
|
||||||
|
if (!url) {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
|
||||||
|
const digest = md5Hasher.hash(url.toString());
|
||||||
|
const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
|
||||||
|
|
||||||
|
const existing = await PDFContent.fromFirestore(shortDigest);
|
||||||
|
|
||||||
|
if (existing) {
|
||||||
|
return {
|
||||||
|
meta: existing.meta,
|
||||||
|
content: existing.content,
|
||||||
|
text: existing.text
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
let extracted;
|
||||||
|
|
||||||
|
try {
|
||||||
|
extracted = await this.extract(url);
|
||||||
|
} catch (err) {
|
||||||
|
this.logger.warn(`Unable to extract from pdf ${url}`, { err });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Don't try again until the next day
|
||||||
|
const expireMixin = extracted ? {} : { expireAt: new Date(Date.now() + 1000 * 3600 * 24) };
|
||||||
|
|
||||||
|
await PDFContent.COLLECTION.doc(shortDigest).set(
|
||||||
|
{
|
||||||
|
_id: shortDigest,
|
||||||
|
src: url.toString(),
|
||||||
|
meta: extracted?.meta || {},
|
||||||
|
content: extracted?.content || '',
|
||||||
|
text: extracted?.text || '',
|
||||||
|
urlDigest: digest,
|
||||||
|
createdAt: new Date(),
|
||||||
|
...expireMixin
|
||||||
|
}, { merge: true }
|
||||||
|
);
|
||||||
|
|
||||||
|
return extracted;
|
||||||
|
}
|
||||||
|
|
||||||
|
parsePdfDate(pdfDate: string | undefined) {
|
||||||
|
if (!pdfDate) {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
// Remove the 'D:' prefix
|
||||||
|
const cleanedDate = pdfDate.slice(2);
|
||||||
|
|
||||||
|
// Define the format without the timezone part first
|
||||||
|
const dateTimePart = cleanedDate.slice(0, 14);
|
||||||
|
const timezonePart = cleanedDate.slice(14);
|
||||||
|
|
||||||
|
// Construct the full date string in a standard format
|
||||||
|
const formattedDate = `${dateTimePart}${timezonePart.replace("'", "").replace("'", "")}`;
|
||||||
|
|
||||||
|
// Parse the date with timezone
|
||||||
|
const parsedDate = dayjs(formattedDate, "YYYYMMDDHHmmssZ");
|
||||||
|
|
||||||
|
const date = parsedDate.toDate();
|
||||||
|
|
||||||
|
if (!date.valueOf()) {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
|
||||||
|
return date;
|
||||||
|
}
|
||||||
|
}
|
|
@ -50,6 +50,7 @@ export interface PageSnapshot {
|
||||||
parsed?: Partial<ReadabilityParsed> | null;
|
parsed?: Partial<ReadabilityParsed> | null;
|
||||||
screenshot?: Buffer;
|
screenshot?: Buffer;
|
||||||
imgs?: ImgBrief[];
|
imgs?: ImgBrief[];
|
||||||
|
pdfs?: string[];
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ExtendedSnapshot extends PageSnapshot {
|
export interface ExtendedSnapshot extends PageSnapshot {
|
||||||
|
@ -62,6 +63,7 @@ export interface ScrappingOptions {
|
||||||
cookies?: CookieParam[];
|
cookies?: CookieParam[];
|
||||||
favorScreenshot?: boolean;
|
favorScreenshot?: boolean;
|
||||||
waitForSelector?: string;
|
waitForSelector?: string;
|
||||||
|
minIntervalMs?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -97,7 +99,9 @@ export class PuppeteerControl extends AsyncService {
|
||||||
livePages = new Set<Page>();
|
livePages = new Set<Page>();
|
||||||
lastPageCratedAt: number = 0;
|
lastPageCratedAt: number = 0;
|
||||||
|
|
||||||
constructor(protected globalLogger: Logger) {
|
constructor(
|
||||||
|
protected globalLogger: Logger,
|
||||||
|
) {
|
||||||
super(...arguments);
|
super(...arguments);
|
||||||
this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
|
this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
|
||||||
|
|
||||||
|
@ -219,7 +223,17 @@ function briefImgs(elem) {
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
function giveSnapshot() {
|
function briefPDFs() {
|
||||||
|
const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]'));
|
||||||
|
|
||||||
|
return pdfTags.map((x)=> {
|
||||||
|
return x.src === 'about:blank' ? document.location.href : x.src;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function giveSnapshot(stopActiveSnapshot) {
|
||||||
|
if (stopActiveSnapshot) {
|
||||||
|
window.haltSnapshot = true;
|
||||||
|
}
|
||||||
let parsed;
|
let parsed;
|
||||||
try {
|
try {
|
||||||
parsed = new Readability(document.cloneNode(true)).parse();
|
parsed = new Readability(document.cloneNode(true)).parse();
|
||||||
|
@ -234,6 +248,7 @@ function giveSnapshot() {
|
||||||
text: document.body?.innerText,
|
text: document.body?.innerText,
|
||||||
parsed: parsed,
|
parsed: parsed,
|
||||||
imgs: [],
|
imgs: [],
|
||||||
|
pdfs: briefPDFs(),
|
||||||
};
|
};
|
||||||
if (parsed && parsed.content) {
|
if (parsed && parsed.content) {
|
||||||
const elem = document.createElement('div');
|
const elem = document.createElement('div');
|
||||||
|
@ -277,7 +292,7 @@ function giveSnapshot() {
|
||||||
});
|
});
|
||||||
|
|
||||||
await page.evaluateOnNewDocument(`
|
await page.evaluateOnNewDocument(`
|
||||||
let aftershot = undefined;
|
let lastTextLength = 0;
|
||||||
const handlePageLoad = () => {
|
const handlePageLoad = () => {
|
||||||
if (window.haltSnapshot) {
|
if (window.haltSnapshot) {
|
||||||
return;
|
return;
|
||||||
|
@ -285,26 +300,23 @@ const handlePageLoad = () => {
|
||||||
if (document.readyState !== 'complete') {
|
if (document.readyState !== 'complete') {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const parsed = giveSnapshot();
|
const thisTextLength = (document.body.innerText || '').length;
|
||||||
window.reportSnapshot(parsed);
|
const deltaLength = Math.abs(thisTextLength - lastTextLength);
|
||||||
if (!parsed.text) {
|
if (10 * deltaLength < lastTextLength) {
|
||||||
if (aftershot) {
|
// Change is not significant
|
||||||
clearTimeout(aftershot);
|
return;
|
||||||
}
|
|
||||||
aftershot = setTimeout(() => {
|
|
||||||
const r = giveSnapshot();
|
|
||||||
if (r && r.text) {
|
|
||||||
window.reportSnapshot(r);
|
|
||||||
}
|
|
||||||
}, 500);
|
|
||||||
}
|
}
|
||||||
|
const r = giveSnapshot();
|
||||||
|
window.reportSnapshot(r);
|
||||||
|
lastTextLength = thisTextLength;
|
||||||
};
|
};
|
||||||
|
setInterval(handlePageLoad, 500);
|
||||||
document.addEventListener('readystatechange', handlePageLoad);
|
document.addEventListener('readystatechange', handlePageLoad);
|
||||||
document.addEventListener('load', handlePageLoad);
|
document.addEventListener('load', handlePageLoad);
|
||||||
`);
|
`);
|
||||||
|
|
||||||
this.snMap.set(page, sn);
|
this.snMap.set(page, sn);
|
||||||
this.logger.warn(`Page ${sn} created.`);
|
this.logger.info(`Page ${sn} created.`);
|
||||||
this.lastPageCratedAt = Date.now();
|
this.lastPageCratedAt = Date.now();
|
||||||
this.livePages.add(page);
|
this.livePages.add(page);
|
||||||
|
|
||||||
|
@ -409,12 +421,12 @@ document.addEventListener('load', handlePageLoad);
|
||||||
finalized = true;
|
finalized = true;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
screenshot = await page.screenshot();
|
screenshot = await page.screenshot();
|
||||||
if (!snapshot.title || !snapshot.parsed?.content) {
|
if ((!snapshot.title || !snapshot.parsed?.content) && !(snapshot.pdfs?.length)) {
|
||||||
const salvaged = await this.salvage(url, page);
|
const salvaged = await this.salvage(url, page);
|
||||||
if (salvaged) {
|
if (salvaged) {
|
||||||
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
screenshot = await page.screenshot();
|
screenshot = await page.screenshot();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -429,7 +441,7 @@ document.addEventListener('load', handlePageLoad);
|
||||||
if (options?.waitForSelector) {
|
if (options?.waitForSelector) {
|
||||||
page.waitForSelector(options.waitForSelector)
|
page.waitForSelector(options.waitForSelector)
|
||||||
.then(async () => {
|
.then(async () => {
|
||||||
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
screenshot = await page.screenshot();
|
screenshot = await page.screenshot();
|
||||||
finalized = true;
|
finalized = true;
|
||||||
nextSnapshotDeferred.resolve(snapshot);
|
nextSnapshotDeferred.resolve(snapshot);
|
||||||
|
@ -442,7 +454,11 @@ document.addEventListener('load', handlePageLoad);
|
||||||
try {
|
try {
|
||||||
let lastHTML = snapshot?.html;
|
let lastHTML = snapshot?.html;
|
||||||
while (true) {
|
while (true) {
|
||||||
await Promise.race([nextSnapshotDeferred.promise, gotoPromise]);
|
const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
|
||||||
|
if (options?.minIntervalMs) {
|
||||||
|
ckpt.push(delay(options.minIntervalMs));
|
||||||
|
}
|
||||||
|
await Promise.race(ckpt);
|
||||||
if (finalized) {
|
if (finalized) {
|
||||||
yield { ...snapshot, screenshot } as PageSnapshot;
|
yield { ...snapshot, screenshot } as PageSnapshot;
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
{
|
{
|
||||||
"compilerOptions": {
|
"compilerOptions": {
|
||||||
"module": "commonjs",
|
"module": "node16",
|
||||||
|
|
||||||
"noImplicitReturns": true,
|
"noImplicitReturns": true,
|
||||||
"noUnusedLocals": true,
|
"noUnusedLocals": true,
|
||||||
"outDir": "build",
|
"outDir": "build",
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
Subproject commit 17ee85dd08becd8a86acf993ae7952d8f911b05e
|
Subproject commit b0b597800a36e2aa8ee3d52715aa7c998b388f47
|
Loading…
Reference in New Issue
Block a user