Mirror of https://github.com/intergalacticalvariable/reader.git (synced 2024-11-16 11:42:32 +08:00)
fix: Remove unused code and dependencies
commit 80547abf38 · parent c33929afb2
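
In short: this commit strips the remaining Firebase/Firestore plumbing out of the fork. The Crawled cache model and the queryCache/setToCache methods are deleted, the three cachedScrap call sites fall back to plain scrap, the stale placeholder comments for the already-removed auth and rate-limit checks go away, and the dynamic module registry in the entrypoint is replaced by a single exported HTTPS function. Condensed from the index.ts hunk at the bottom of this diff, the new entrypoint amounts to:

    import 'reflect-metadata';
    import { initializeApp } from 'firebase-admin/app';
    import { https } from 'firebase-functions';
    import { CrawlerHost } from './cloud-functions/crawler';

    initializeApp();

    // The whole service is now a single HTTPS-triggered function:
    export const crawler = https.onRequest(CrawlerHost);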
backend/functions/src/cloud-functions/crawler.ts
@@ -12,7 +12,7 @@ import { Request, Response } from 'express';
 const pNormalizeUrl = import("@esm2cjs/normalize-url");
 import { AltTextService } from '../services/alt-text';
 import TurndownService from 'turndown';
-import { Crawled } from '../db/crawled';
+// import { Crawled } from '../db/crawled';
 import { cleanAttribute } from '../utils/misc';
 import { randomUUID } from 'crypto';
 
@@ -89,8 +89,6 @@ export class CrawlerHost extends RPCHost {
                 // Potential privacy issue, dont cache if cookies are used
                 return;
             }
-
-            await this.setToCache(options.url, snapshot);
         });
 
         puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
@@ -596,7 +594,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
     })
     async crawl(
         @RPCReflect() rpcReflect: RPCReflection,
-        @Ctx() ctx: {
+        ctx: {
             req: Request,
             res: Response,
         },
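
Note the signature change in the hunk above: @RPCReflect() is kept, but the @Ctx() decorator is dropped, so the Express context is no longer injected by the RPC layer and has to arrive as an ordinary argument. A hypothetical call shape, assuming the caller holds the Express objects (not part of this commit):

    // ctx is now passed explicitly rather than injected by a decorator:
    await crawlerHost.crawl(rpcReflect, { req, res });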
@@ -620,8 +618,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
             ctx.req.hostname.toLowerCase()
         );
 
-        // Rate limiting code removed
-
         let urlToCrawl;
         const normalizeUrl = (await pNormalizeUrl).default;
         try {
@@ -649,16 +645,14 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
                 });
             }
 
-            // Remove auth check
             const crawlOpts = this.configure(crawlerOptions);
 
-
             if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
                 const sseStream = new OutputServerEventStream();
                 rpcReflect.return(sseStream);
 
                 try {
-                    for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
+                    for await (const scrapped of this.scrap(urlToCrawl, crawlOpts, crawlerOptions)) {
                         if (!scrapped) {
                             continue;
                         }
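
The try block above is the first of three call sites switching from cachedScrap to scrap; combined with the deletion of queryCache/setToCache further down, every request now performs a live scrape. For orientation, a hypothetical sketch of what the removed cachedScrap wrapper plausibly did, inferred from the deleted cache methods rather than recovered from the repository:

    // Hypothetical reconstruction: yield a fresh-enough cached snapshot, else scrape live.
    async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
        // cacheTolerance and its default are assumptions, not from this commit:
        const cached = await this.queryCache(urlToCrawl, crawlerOpts?.cacheTolerance ?? 300_000);
        if (cached?.isFresh) {
            yield cached.snapshot;
            return;
        }
        yield* this.scrap(urlToCrawl, crawlOpts, crawlerOpts);
    }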
@@ -684,7 +678,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
 
         let lastScrapped;
         if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
-            for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
+            for await (const scrapped of this.scrap(urlToCrawl, crawlOpts, crawlerOptions)) {
                 lastScrapped = scrapped;
                 if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
                     continue;
@@ -706,7 +700,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
                 return formatted;
             }
 
-            for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
+            for await (const scrapped of this.scrap(urlToCrawl, crawlOpts, crawlerOptions)) {
                 lastScrapped = scrapped;
                 if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
                     continue;
@@ -716,13 +710,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
 
             if (crawlerOptions.timeout === undefined) {
                 if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
-
                     return assignTransferProtocolMeta(`${formatted}`,
                         { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
                     );
                 }
                 if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
-
                     return assignTransferProtocolMeta(`${formatted}`,
                         { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
                     );
@@ -738,13 +730,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
 
         const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, urlToCrawl);
         if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
-
             return assignTransferProtocolMeta(`${formatted}`,
                 { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
             );
         }
         if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
-
             return assignTransferProtocolMeta(`${formatted}`,
                 { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
             );
@@ -764,117 +754,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
         return digest;
     }
 
-    async queryCache(urlToCrawl: URL, cacheTolerance: number) {
-        const digest = this.getUrlDigest(urlToCrawl);
-
-        const cache = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
-
-        if (!cache) {
-            return undefined;
-        }
-
-        const age = Date.now() - cache.createdAt.valueOf();
-        const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
-        this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
-            url: urlToCrawl, digest, age, stale, cacheTolerance
-        });
-
-        let snapshot: PageSnapshot | undefined;
-        let screenshotUrl: string | undefined;
-        let pageshotUrl: string | undefined;
-        const preparations = [
-            this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => {
-                snapshot = JSON.parse(r.toString('utf-8'));
-            }),
-            cache.screenshotAvailable ?
-                this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
-                    screenshotUrl = r;
-                }) :
-                Promise.resolve(undefined),
-            cache.pageshotAvailable ?
-                this.firebaseObjectStorage.signDownloadUrl(`pageshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
-                    pageshotUrl = r;
-                }) :
-                Promise.resolve(undefined)
-        ];
-        try {
-            await Promise.all(preparations);
-        } catch (_err) {
-            // Swallow cache errors.
-            return undefined;
-        }
-
-        return {
-            isFresh: !stale,
-            ...cache,
-            snapshot: {
-                ...snapshot,
-                screenshot: undefined,
-                pageshot: undefined,
-                screenshotUrl,
-                pageshotUrl,
-            } as PageSnapshot & { screenshotUrl?: string; pageshotUrl?: string; }
-        };
-    }
-
-    async setToCache(urlToCrawl: URL, snapshot: PageSnapshot) {
-        const digest = this.getUrlDigest(urlToCrawl);
-
-        this.logger.info(`Caching snapshot of ${urlToCrawl}...`, { url: urlToCrawl, digest, title: snapshot?.title, href: snapshot?.href });
-        const nowDate = new Date();
-
-        const cache = Crawled.from({
-            _id: randomUUID(),
-            url: urlToCrawl.toString(),
-            createdAt: nowDate,
-            expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
-            urlPathDigest: digest,
-        });
-
-        const savingOfSnapshot = this.firebaseObjectStorage.saveFile(`snapshots/${cache._id}`,
-            Buffer.from(
-                JSON.stringify({
-                    ...snapshot,
-                    screenshot: undefined
-                }),
-                'utf-8'
-            ),
-            {
-                metadata: {
-                    contentType: 'application/json',
-                }
-            }
-        ).then((r) => {
-            cache.snapshotAvailable = true;
-            return r;
-        });
-
-        if (snapshot.screenshot) {
-            await this.firebaseObjectStorage.saveFile(`screenshots/${cache._id}`, snapshot.screenshot, {
-                metadata: {
-                    contentType: 'image/png',
-                }
-            });
-            cache.screenshotAvailable = true;
-        }
-        if (snapshot.pageshot) {
-            await this.firebaseObjectStorage.saveFile(`pageshots/${cache._id}`, snapshot.pageshot, {
-                metadata: {
-                    contentType: 'image/png',
-                }
-            });
-            cache.pageshotAvailable = true;
-        }
-        await savingOfSnapshot;
-        const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
-            this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
-
-            return undefined;
-        });
-
-        return r;
-    }
-
     async *scrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
         if (crawlerOpts?.html) {
             const fakeSnapshot = {
backend/functions/src/index.ts
@@ -1,21 +1,25 @@
 import 'reflect-metadata';
-import './shared/lib/doom-domain';
+// import './shared/lib/doom-domain';
 import { initializeApp } from 'firebase-admin/app';
+import { CrawlerHost } from './cloud-functions/crawler';
+// import { functions } from 'firebase-admin/functions';
+import { https } from 'firebase-functions'
 
 initializeApp();
 
-import { loadModulesDynamically, registry } from './shared';
-import path from 'path';
-loadModulesDynamically(path.resolve(__dirname, 'cloud-functions'));
-
-Object.assign(exports, registry.exportAll());
-Object.assign(exports, registry.exportGrouped({
-    memory: '4GiB',
-    timeoutSeconds: 540,
-}));
-registry.allHandsOnDeck().catch(() => void 0);
-registry.title = 'reader';
-registry.version = '0.1.0';
-
+export const crawler = https.onRequest(CrawlerHost);
+
+// import { loadModulesDynamically, registry } from './shared';
+// import path from 'path';
+// loadModulesDynamically(path.resolve(__dirname, 'cloud-functions'));
+
+// Object.assign(exports, registry.exportAll());
+// Object.assign(exports, registry.exportGrouped({
+//     memory: '4GiB',
+//     timeoutSeconds: 540,
+// }));
+// registry.allHandsOnDeck().catch(() => void 0);
+// registry.title = 'reader';
+// registry.version = '0.1.0';
 
 process.on('unhandledRejection', (_err) => `Somehow is false alarm in firebase`);
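
With the registry and per-module export wiring gone, the deployable surface is just the crawler function above. A hypothetical smoke test against a local Firebase emulator, where the port, the <project-id> placeholder, and the target-URL-in-path convention are assumptions, not part of this commit:

    // Request a page through the emulated function; 'text/plain' steers
    // crawl() into its plain (non-SSE, non-JSON) response branch.
    const res = await fetch(
        'http://127.0.0.1:5001/<project-id>/us-central1/crawler/https://example.com',
        { headers: { Accept: 'text/plain' } },
    );
    console.log(await res.text());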