diff --git a/Dockerfile b/Dockerfile index 3d2d444..e9ee3c8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,3 +30,6 @@ RUN npm run build EXPOSE 3000 # Start the application CMD ["node", "build/server.js"] + +# Create local storage directory and set permissions +RUN mkdir -p /app/local-storage && chmod 777 /app/local-storage diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index e1dd1e5..5831b6d 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -29,6 +29,8 @@ const md5Hasher = new HashManager('md5', 'hex'); // const logger = new Logger('Crawler'); import { TransferProtocolMetadata } from 'civkit'; +import * as fs from 'fs'; +import * as path from 'path'; function sendResponse(res: Response, data: T, meta: TransferProtocolMetadata): T { if (meta.code) { @@ -328,22 +330,19 @@ export class CrawlerHost extends RPCHost { pageshotUrl?: string; }, nominalUrl?: URL) { console.log('Formatting snapshot', { mode, url: nominalUrl?.toString() }); + const host = this.threadLocal.get('host') || '192.168.178.100:1337'; + if (mode === 'screenshot') { if (snapshot.screenshot && !snapshot.screenshotUrl) { console.log('Saving screenshot'); - const fid = `instant-screenshots/${randomUUID()}`; - await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, { - metadata: { - contentType: 'image/png', - } - }); - snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs); + const fileName = `screenshot-${randomUUID()}.png`; + const filePath = await this.saveFileLocally(fileName, snapshot.screenshot); + snapshot.screenshotUrl = `http://${host}/instant-screenshots/${fileName}`; console.log('Screenshot saved and URL generated', { screenshotUrl: snapshot.screenshotUrl }); } return { ...this.getGeneralSnapshotMixins(snapshot), - // html: snapshot.html, screenshotUrl: snapshot.screenshotUrl, toString() { return this.screenshotUrl; @@ -353,13 +352,9 @@ export class CrawlerHost extends RPCHost { if (mode === 'pageshot') { if (snapshot.pageshot && !snapshot.pageshotUrl) { console.log('Saving pageshot'); - const fid = `instant-screenshots/${randomUUID()}`; - await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, { - metadata: { - contentType: 'image/png', - } - }); - snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs); + const fileName = `pageshot-${randomUUID()}.png`; + const filePath = await this.saveFileLocally(fileName, snapshot.pageshot); + snapshot.pageshotUrl = `http://${host}/instant-screenshots/${fileName}`; console.log('Pageshot saved and URL generated', { pageshotUrl: snapshot.pageshotUrl }); } @@ -647,24 +642,28 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; let urlToCrawl; const normalizeUrl = (await pNormalizeUrl).default; try { - urlToCrawl = new URL( - normalizeUrl( - (crawlerOptions.url || noSlashURL).trim(), - { - stripWWW: false, - removeTrailingSlash: false, - removeSingleSlash: false, - sortQueryParameters: false, - } - ) - ); - console.log('Normalized URL to crawl:', urlToCrawl.toString()); + const urlParam = req.query.url || req.url.slice(1); + const urlToNormalize = Array.isArray(urlParam) ? urlParam[0] : urlParam; + if (typeof urlToNormalize === 'string' && !urlToNormalize.startsWith('favicon.ico')) { + urlToCrawl = new URL( + normalizeUrl( + urlToNormalize.trim(), + { + stripWWW: false, + removeTrailingSlash: false, + removeSingleSlash: false, + sortQueryParameters: false, + } + ) + ); + console.log('Normalized URL to crawl:', urlToCrawl.toString()); + } else { + console.log('Skipping invalid or favicon URL:', urlToNormalize); + return sendResponse(res, 'Skipped', { contentType: 'text/plain', envelope: null }); + } } catch (err) { console.error('Error normalizing URL:', err); - throw new ParamValidationError({ - message: `${err}`, - path: 'url' - }); + return sendResponse(res, 'Invalid URL', { contentType: 'text/plain', envelope: null, code: 400 }); } if (urlToCrawl.protocol !== 'http:' && urlToCrawl.protocol !== 'https:') { console.error('Invalid protocol:', urlToCrawl.protocol); @@ -873,19 +872,17 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl); this.threadLocal.set('cacheTolerance', opts.cacheTolerance); this.threadLocal.set('userAgent', opts.userAgent); + this.threadLocal.set('host', req.headers.host || '192.168.178.100:1337'); if (opts.timeout) { this.threadLocal.set('timeout', opts.timeout * 1000); } const cookies = req.headers['x-set-cookie'] ? (Array.isArray(req.headers['x-set-cookie']) ? req.headers['x-set-cookie'] : [req.headers['x-set-cookie']]) - .flatMap(cookieString => - cookieString.split(';').map(cookie => { - const [name, ...valueParts] = cookie.trim().split('='); - const value = valueParts.join('='); - return { name, value, url: urlToCrawl.toString() }; - }) - ) + .map(cookie => { + const [name, value] = cookie.split('='); + return { name, value, url: urlToCrawl.toString() }; + }) : []; console.log('Cookies:', cookies); @@ -937,4 +934,23 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; return this.formatSnapshot(mode, lastSnapshot, url); } + + async saveFileLocally(fileName: string, content: Buffer): Promise { + const localDir = path.join('/app', 'local-storage', 'instant-screenshots'); + console.log(`Attempting to save file in directory: ${localDir}`); + try { + if (!fs.existsSync(localDir)) { + console.log(`Directory ${localDir} does not exist. Creating it.`); + fs.mkdirSync(localDir, { recursive: true }); + } + const filePath = path.join(localDir, fileName); + console.log(`Writing file to: ${filePath}`); + await fs.promises.writeFile(filePath, content); + console.log(`File successfully written to: ${filePath}`); + return filePath; + } catch (error) { + console.error(`Error saving file locally: ${error}`); + throw error; + } + } } diff --git a/backend/functions/src/server.ts b/backend/functions/src/server.ts index 8e583ec..a295242 100644 --- a/backend/functions/src/server.ts +++ b/backend/functions/src/server.ts @@ -1,22 +1,21 @@ -import "reflect-metadata" +import 'reflect-metadata'; import express from 'express'; import { container } from 'tsyringe'; import { CrawlerHost } from './cloud-functions/crawler'; +import path from 'path'; const app = express(); const port = process.env.PORT || 3000; -container.registerSingleton(CrawlerHost); - const crawlerHost = container.resolve(CrawlerHost); app.use(express.json()); -// Example curl for /crawl: -// curl -X GET "http://localhost:3000/https://example.com" -app.get('/:url(*)', async (req, res) => { +// Serve static files from the local-storage directory +app.use('/instant-screenshots', express.static(path.join('/app', 'local-storage', 'instant-screenshots'))); + +app.all('*', async (req, res) => { try { - const url = req.params.url; await crawlerHost.crawl(req, res); } catch (error) { console.error('Error during crawl:', error); @@ -24,14 +23,8 @@ app.get('/:url(*)', async (req, res) => { } }); -// Example curl for /hello: -// curl -X GET "http://localhost:3000/hello" -app.get('/hello', (req, res) => { - res.json({ message: 'Hello, World!' }); -}); - app.listen(port, () => { console.log(`Server is running on port ${port}`); }); -export default app; \ No newline at end of file +export default app; diff --git a/backend/functions/src/shared/index.ts b/backend/functions/src/shared/index.ts index 5395bd6..1fae215 100644 --- a/backend/functions/src/shared/index.ts +++ b/backend/functions/src/shared/index.ts @@ -4,11 +4,12 @@ import { Logger } from './logger'; import { OutputServerEventStream } from './output-stream'; import { RPCReflect } from './rpc-reflect'; import { injectable } from 'tsyringe'; +import * as fs from 'fs'; +import * as path from 'path'; @injectable() export class AsyncContext { private storage: Map = new Map(); - set(key: string, value: any) { this.storage.set(key, value); } @@ -33,41 +34,44 @@ export function Param(name: string, options?: any): ParameterDecorator { @injectable() export class FirebaseStorageBucketControl { - bucket: any; + private localStorageDir: string; constructor() { - this.bucket = { - file: (fileName: string) => ({ - exists: async () => [true] - }) - }; + this.localStorageDir = path.join('/app', 'local-storage'); + if (!fs.existsSync(this.localStorageDir)) { + fs.mkdirSync(this.localStorageDir, { recursive: true }); + } } async uploadFile(filePath: string, destination: string): Promise { - console.log(`Mock: Uploading file from ${filePath} to ${destination}`); - return `https://storage.googleapis.com/mock-bucket/${destination}`; + const destPath = path.join(this.localStorageDir, destination); + await fs.promises.copyFile(filePath, destPath); + return `file://${destPath}`; } async downloadFile(filePath: string, destination: string): Promise { - console.log(`Mock: Downloading file from ${filePath} to ${destination}`); + const sourcePath = path.join(this.localStorageDir, filePath); + await fs.promises.copyFile(sourcePath, destination); } async deleteFile(filePath: string): Promise { - console.log(`Mock: Deleting file ${filePath}`); + const fullPath = path.join(this.localStorageDir, filePath); + await fs.promises.unlink(fullPath); } async fileExists(filePath: string): Promise { - console.log(`Mock: Checking if file ${filePath} exists`); - return true; + const fullPath = path.join(this.localStorageDir, filePath); + return fs.existsSync(fullPath); } async saveFile(filePath: string, content: Buffer, options?: any): Promise { - console.log(`Mock: Saving file ${filePath}`); + const fullPath = path.join(this.localStorageDir, filePath); + await fs.promises.writeFile(fullPath, content); } async signDownloadUrl(filePath: string, expirationTime: number): Promise { - console.log(`Mock: Signing download URL for ${filePath}`); - return `https://storage.googleapis.com/mock-bucket/${filePath}?token=mock-signed-url`; + const fullPath = path.join(this.localStorageDir, filePath); + return `file://${fullPath}`; } } diff --git a/local-storage/instant-screenshots/pageshot-2497b5f8-9560-42ec-9b23-3272b0d0f2dc.png b/local-storage/instant-screenshots/pageshot-2497b5f8-9560-42ec-9b23-3272b0d0f2dc.png new file mode 100644 index 0000000..2885066 Binary files /dev/null and b/local-storage/instant-screenshots/pageshot-2497b5f8-9560-42ec-9b23-3272b0d0f2dc.png differ diff --git a/local-storage/instant-screenshots/pageshot-44dd86cc-8257-4659-b4a7-80f11b5d7b0a.png b/local-storage/instant-screenshots/pageshot-44dd86cc-8257-4659-b4a7-80f11b5d7b0a.png new file mode 100644 index 0000000..2885066 Binary files /dev/null and b/local-storage/instant-screenshots/pageshot-44dd86cc-8257-4659-b4a7-80f11b5d7b0a.png differ