fix: Remove unused code and dependencies

Harsh Gupta, 2024-08-14 14:30:07 +05:30; committed by Harsh Gupta (aider)
parent c33929afb2
commit 80547abf38
2 changed files with 23 additions and 140 deletions

File 1 of 2: crawler cloud function

@@ -12,7 +12,7 @@ import { Request, Response } from 'express';
 const pNormalizeUrl = import("@esm2cjs/normalize-url");
 import { AltTextService } from '../services/alt-text';
 import TurndownService from 'turndown';
-import { Crawled } from '../db/crawled';
+// import { Crawled } from '../db/crawled';
 import { cleanAttribute } from '../utils/misc';
 import { randomUUID } from 'crypto';
@@ -89,8 +89,6 @@ export class CrawlerHost extends RPCHost {
                 // Potential privacy issue, dont cache if cookies are used
                 return;
             }
-            await this.setToCache(options.url, snapshot);
         });

         puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
@@ -596,7 +594,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
     })
     async crawl(
         @RPCReflect() rpcReflect: RPCReflection,
-        @Ctx() ctx: {
+        ctx: {
             req: Request,
             res: Response,
         },
@@ -620,8 +618,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
             ctx.req.hostname.toLowerCase()
         );
+        // Rate limiting code removed
         let urlToCrawl;
         const normalizeUrl = (await pNormalizeUrl).default;
         try {
@@ -649,16 +645,14 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
             });
         }
+        // Remove auth check
         const crawlOpts = this.configure(crawlerOptions);
         if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
             const sseStream = new OutputServerEventStream();
             rpcReflect.return(sseStream);
             try {
-                for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
+                for await (const scrapped of this.scrap(urlToCrawl, crawlOpts, crawlerOptions)) {
                     if (!scrapped) {
                         continue;
                     }
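The content negotiation in this hunk is untouched; only cachedScrap becomes scrap. For illustration, a hypothetical client exercising the event-stream branch could look like the sketch below (not part of this commit; Node 18+ fetch is assumed, and READER_URL is a placeholder for wherever the function is deployed):

    // Hypothetical client sketch: sending only Accept: text/event-stream makes
    // req.accepts('text/plain') fail and req.accepts('text/event-stream') match,
    // which selects the SSE branch shown in the hunk above.
    const READER_URL = process.env.READER_URL ?? 'http://127.0.0.1:5001'; // placeholder

    async function streamCrawl(target: string): Promise<void> {
        const res = await fetch(`${READER_URL}/${target}`, {
            headers: { Accept: 'text/event-stream' },
        });
        const reader = res.body!.getReader();
        const decoder = new TextDecoder();
        for (;;) {
            const { done, value } = await reader.read();
            if (done) break;
            process.stdout.write(decoder.decode(value, { stream: true }));
        }
    }

    streamCrawl('https://example.com').catch(console.error);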
@@ -684,7 +678,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
         let lastScrapped;
         if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
-            for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
+            for await (const scrapped of this.scrap(urlToCrawl, crawlOpts, crawlerOptions)) {
                 lastScrapped = scrapped;
                 if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
                     continue;
@@ -706,7 +700,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
             return formatted;
         }
-        for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
+        for await (const scrapped of this.scrap(urlToCrawl, crawlOpts, crawlerOptions)) {
             lastScrapped = scrapped;
             if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
                 continue;
@@ -716,13 +710,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
             if (crawlerOptions.timeout === undefined) {
                 if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
                     return assignTransferProtocolMeta(`${formatted}`,
                         { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
                     );
                 }
                 if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
                     return assignTransferProtocolMeta(`${formatted}`,
                         { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
                     );
@@ -738,13 +730,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
         const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, urlToCrawl);
         if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
             return assignTransferProtocolMeta(`${formatted}`,
                 { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
             );
         }
         if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
             return assignTransferProtocolMeta(`${formatted}`,
                 { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
             );
@@ -764,117 +754,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
         return digest;
     }
-    async queryCache(urlToCrawl: URL, cacheTolerance: number) {
-        const digest = this.getUrlDigest(urlToCrawl);
-        const cache = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
-        if (!cache) {
-            return undefined;
-        }
-        const age = Date.now() - cache.createdAt.valueOf();
-        const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
-        this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
-            url: urlToCrawl, digest, age, stale, cacheTolerance
-        });
-        let snapshot: PageSnapshot | undefined;
-        let screenshotUrl: string | undefined;
-        let pageshotUrl: string | undefined;
-        const preparations = [
-            this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => {
-                snapshot = JSON.parse(r.toString('utf-8'));
-            }),
-            cache.screenshotAvailable ?
-                this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
-                    screenshotUrl = r;
-                }) :
-                Promise.resolve(undefined),
-            cache.pageshotAvailable ?
-                this.firebaseObjectStorage.signDownloadUrl(`pageshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
-                    pageshotUrl = r;
-                }) :
-                Promise.resolve(undefined)
-        ];
-        try {
-            await Promise.all(preparations);
-        } catch (_err) {
-            // Swallow cache errors.
-            return undefined;
-        }
-        return {
-            isFresh: !stale,
-            ...cache,
-            snapshot: {
-                ...snapshot,
-                screenshot: undefined,
-                pageshot: undefined,
-                screenshotUrl,
-                pageshotUrl,
-            } as PageSnapshot & { screenshotUrl?: string; pageshotUrl?: string; }
-        };
-    }
-
-    async setToCache(urlToCrawl: URL, snapshot: PageSnapshot) {
-        const digest = this.getUrlDigest(urlToCrawl);
-        this.logger.info(`Caching snapshot of ${urlToCrawl}...`, { url: urlToCrawl, digest, title: snapshot?.title, href: snapshot?.href });
-        const nowDate = new Date();
-        const cache = Crawled.from({
-            _id: randomUUID(),
-            url: urlToCrawl.toString(),
-            createdAt: nowDate,
-            expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
-            urlPathDigest: digest,
-        });
-        const savingOfSnapshot = this.firebaseObjectStorage.saveFile(`snapshots/${cache._id}`,
-            Buffer.from(
-                JSON.stringify({
-                    ...snapshot,
-                    screenshot: undefined
-                }),
-                'utf-8'
-            ),
-            {
-                metadata: {
-                    contentType: 'application/json',
-                }
-            }
-        ).then((r) => {
-            cache.snapshotAvailable = true;
-            return r;
-        });
-        if (snapshot.screenshot) {
-            await this.firebaseObjectStorage.saveFile(`screenshots/${cache._id}`, snapshot.screenshot, {
-                metadata: {
-                    contentType: 'image/png',
-                }
-            });
-            cache.screenshotAvailable = true;
-        }
-        if (snapshot.pageshot) {
-            await this.firebaseObjectStorage.saveFile(`pageshots/${cache._id}`, snapshot.pageshot, {
-                metadata: {
-                    contentType: 'image/png',
-                }
-            });
-            cache.pageshotAvailable = true;
-        }
-        await savingOfSnapshot;
-        const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
-            this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
-            return undefined;
-        });
-        return r;
-    }

     async *scrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
         if (crawlerOpts?.html) {
             const fakeSnapshot = {
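The deleted queryCache/setToCache pair was the only Firestore-backed caching in this class; with it gone, every call site streams straight from scrap. For a self-hosted build that still wants some caching, a rough in-process stand-in mirroring the deleted signatures could look like the following sketch (hypothetical, not part of this commit; PageSnapshot and getUrlDigest are the existing members used above):

    // Sketch only: process-local replacement for the removed Firestore cache.
    // Assumes it lives inside CrawlerHost, alongside getUrlDigest().
    private memCache = new Map<string, { snapshot: PageSnapshot; createdAt: number }>();

    async queryCache(urlToCrawl: URL, cacheTolerance: number) {
        const hit = this.memCache.get(this.getUrlDigest(urlToCrawl));
        if (!hit) {
            return undefined;
        }
        // Same staleness rule as the deleted code: older than cacheTolerance ms.
        const stale = hit.createdAt < Date.now() - cacheTolerance;
        return { isFresh: !stale, snapshot: hit.snapshot };
    }

    async setToCache(urlToCrawl: URL, snapshot: PageSnapshot) {
        // Strip binary payloads before storing, as the deleted code did.
        this.memCache.set(this.getUrlDigest(urlToCrawl), {
            snapshot: { ...snapshot, screenshot: undefined, pageshot: undefined },
            createdAt: Date.now(),
        });
    }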

File 2 of 2: Firebase Functions entry point

@@ -1,21 +1,25 @@
 import 'reflect-metadata';
-import './shared/lib/doom-domain';
+// import './shared/lib/doom-domain';
 import { initializeApp } from 'firebase-admin/app';
+import { CrawlerHost } from './cloud-functions/crawler';
+// import { functions } from 'firebase-admin/functions';
+import { https } from 'firebase-functions'

 initializeApp();

-import { loadModulesDynamically, registry } from './shared';
-import path from 'path';
-loadModulesDynamically(path.resolve(__dirname, 'cloud-functions'));
-Object.assign(exports, registry.exportAll());
-Object.assign(exports, registry.exportGrouped({
-    memory: '4GiB',
-    timeoutSeconds: 540,
-}));
-registry.allHandsOnDeck().catch(() => void 0);
-registry.title = 'reader';
-registry.version = '0.1.0';
+export const crawler = https.onRequest(CrawlerHost);
+
+// import { loadModulesDynamically, registry } from './shared';
+// import path from 'path';
+// loadModulesDynamically(path.resolve(__dirname, 'cloud-functions'));
+// Object.assign(exports, registry.exportAll());
+// Object.assign(exports, registry.exportGrouped({
+//     memory: '4GiB',
+//     timeoutSeconds: 540,
+// }));
+// registry.allHandsOnDeck().catch(() => void 0);
+// registry.title = 'reader';
+// registry.version = '0.1.0';

 process.on('unhandledRejection', (_err) => `Somehow is false alarm in firebase`);
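Collecting only the live (non-commented) lines from the new side of this hunk, the entry point after this commit reduces to:

    import 'reflect-metadata';
    import { initializeApp } from 'firebase-admin/app';
    import { CrawlerHost } from './cloud-functions/crawler';
    import { https } from 'firebase-functions';

    initializeApp();

    // The registry/dynamic-module machinery is retired; the crawler is
    // exported as a single Firebase HTTPS function instead.
    export const crawler = https.onRequest(CrawlerHost);

    process.on('unhandledRejection', (_err) => `Somehow is false alarm in firebase`);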