Commit Graph

208 Commits

Author SHA1 Message Date
Harsh Gupta
c77135490b feat: Add logging for scrapping options context 2024-08-15 14:59:49 +05:30
Harsh Gupta
77be0d08ff more console logs 2024-08-14 20:44:25 +05:30
Harsh Gupta
3e2bf6d39d add an express endpoint to run the crawl endpoint 2024-08-14 19:34:53 +05:30
Harsh Gupta (aider)
57b07507d1 feat: Add Express server with crawl endpoint 2024-08-14 19:17:25 +05:30
Harsh Gupta
32263c7e9e feat: Add express server for crawling functionality 2024-08-14 19:17:23 +05:30
Harsh Gupta
1d8b3eae0d increase memory limit 2024-08-14 16:59:51 +05:30
Harsh Gupta (aider)
b682ee5bb5 feat: add hello world endpoint in firebase cloud functions 2024-08-14 16:42:03 +05:30
Harsh Gupta
f927aab144 update deps 2024-08-14 16:40:13 +05:30
Harsh Gupta
54aae972ae fix the puppeteer thingy 2024-08-14 16:39:59 +05:30
Harsh Gupta (aider)
a72373f815 fix: Add try-catch block to handle errors in salvage method 2024-08-14 16:04:57 +05:30
Harsh Gupta
888546e064 fix: Make the salvage method private in the puppeteer service 2024-08-14 16:04:56 +05:30
Harsh Gupta (aider)
ef138360c2 fix: Remove private modifier from salvage method 2024-08-14 16:01:00 +05:30
Harsh Gupta (aider)
f6f3fc5bea fix: Improve error handling and add retry mechanism in PuppeteerControl 2024-08-14 15:49:13 +05:30
Harsh Gupta (aider)
a3a299fb38 fix: Implement retry mechanism and improve error handling for scraping function 2024-08-14 15:46:41 +05:30
Harsh Gupta
ddbf0030b4 fix the logger thingy 2024-08-14 15:35:20 +05:30
Harsh Gupta (aider)
a3f222638e feat: Add shared module dependencies and exports 2024-08-14 15:15:07 +05:30
Harsh Gupta (aider)
02abc2aaaa fix: Register Logger class with dependency injection container 2024-08-14 15:11:14 +05:30
Harsh Gupta
2d6447e8fc add mock shared libraries 2024-08-14 14:53:52 +05:30
Harsh Gupta
88a6bd7131 remove submodule shared 2024-08-14 14:53:22 +05:30
Harsh Gupta
cbe4fa94c1 remove alt-text service 2024-08-14 14:46:37 +05:30
Harsh Gupta
4c957adbce remove PDF extraction functionality 2024-08-14 14:44:01 +05:30
Harsh Gupta
db6cd7d76c fixes 2024-08-14 14:41:02 +05:30
Harsh Gupta (aider)
e9ac98a628 fix: Remove new keyword when using RPCReflection 2024-08-14 14:39:32 +05:30
Harsh Gupta
4e9b6b7ca5 fix: Update type annotations for mixins and suffixMixins arrays in crawler.ts 2024-08-14 14:39:31 +05:30
Harsh Gupta
87d9f772c1 more fixes 2024-08-14 14:35:07 +05:30
Harsh Gupta (aider)
2343c1d28b feat: Modify crawler.ts and index.ts to make crawl function usable as a Firebase function 2024-08-14 14:30:08 +05:30
Harsh Gupta
80547abf38 fix: Remove unused code and dependencies 2024-08-14 14:30:07 +05:30
Harsh Gupta (aider)
c33929afb2 refactor: remove usage of cache 2024-08-14 13:51:35 +05:30
Harsh Gupta
127c32abc9 fix: Remove unnecessary code for calculating charge amount 2024-08-14 13:51:33 +05:30
Harsh Gupta (aider)
6804b99533 fix: Remove billing and related flow 2024-08-14 13:49:46 +05:30
Harsh Gupta
6c17175c43 fix: Remove unused getChargeAmount function 2024-08-14 13:49:44 +05:30
Harsh Gupta (aider)
fbdc266660 fix: Remove auth and user info 2024-08-14 13:48:29 +05:30
Harsh Gupta
d380599986 strip more stuff 2024-08-14 13:47:25 +05:30
Harsh Gupta (aider)
bf27d39f1b fix: Replace estimateToken with a mock implementation 2024-08-14 13:45:02 +05:30
Harsh Gupta
aeb6ebed67 fix: Remove unnecessary SecurityCompromiseError import and usage
diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts
index ec00c1d..bb8ba1a 100644
--- a/backend/functions/src/cloud-functions/crawler.ts
+++ b/backend/functions/src/cloud-functions/crawler.ts
@@ -3,10 +3,10 @@ import {
     RPCHost, RPCReflection,
     HashManager,
     AssertionFailureError, ParamValidationError, Defer,
-    SecurityCompromiseError
+
 } from 'civkit';
 import { singleton } from 'tsyringe';
-import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
+import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect } from '../shared';
 import _ from 'lodash';
 import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
 import { Request, Response } from 'express';
@@ -660,7 +660,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
         if (!uid) {
             if (urlToCrawl.protocol === 'http:' && (!urlToCrawl.pathname || urlToCrawl.pathname === '/') &&
                 crawlerOptions.respondWith !== 'default') {
-                throw new SecurityCompromiseError(`Your request is categorized as abuse. Please don't abuse our service. If you are sure you are not abusing, please authenticate yourself with an API key.`);
+                throw new Error(`Your request is categorized as abuse. Please don't abuse our service. If you are sure you are not abusing, please authenticate yourself with an API key.`);
             }
             const blockade = (await DomainBlockade.fromFirestoreQuery(
                 DomainBlockade.COLLECTION
@@ -669,7 +669,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
                     .limit(1)
             ))[0];
             if (blockade) {
-                throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
+                throw new Error(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
             }

         }
@@ -940,7 +940,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;

             yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
         } catch (err: any) {
-            if (cache && !(err instanceof SecurityCompromiseError)) {
+            if (cache && !(err instanceof Error)) {
                 this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
                 yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
                 return;
2024-08-14 13:45:01 +05:30
Harsh Gupta (aider)
d15b721bfa refactor: Remove rate limiting from crawler.ts 2024-08-14 13:43:04 +05:30
Harsh Gupta
aa862d4247 fix: Refactor crawler.ts by removing unused imports and code 2024-08-14 13:43:03 +05:30
Yanlong Wang
df58fcb3fa
fix: alleviate search performance issue 2024-08-09 15:03:24 +08:00
Yanlong Wang
eb74e9c6f8
fix: remove select element from markdown to work around turndown performance issue 2024-08-09 10:55:36 +08:00
Yanlong Wang
e4ef6cb0f9
chore: reduce fetch count in search 2024-08-09 10:29:50 +08:00
Yanlong Wang
e529369ba6
fix: search with failed pages 2024-08-08 15:49:23 +08:00
Yanlong Wang
0dd05b5dab
chore: tweak concurrency 2024-08-06 17:58:27 +08:00
Yanlong Wang
7af2bde01f
fix: html rebasing with <base> tag 2024-08-06 13:15:10 +08:00
Yanlong Wang
40e91853e2 fix 2024-08-02 20:10:17 +08:00
Yanlong Wang
cda0f371e1
feat: updated rate policy 2024-08-02 19:39:51 +08:00
Yanlong Wang
0a2c0932fd
fix 2024-08-02 17:13:50 +08:00
Yanlong Wang
ee632199df
fix 2024-08-02 17:12:10 +08:00
Yanlong Wang
0a33207f8f
fix: another approach to suspected DoS abuse 2024-08-02 17:04:13 +08:00
yanlong.wang
e658e8102c
fix 2024-08-01 20:07:39 +08:00
yanlong.wang
f4f189c8e6
fix 2024-08-01 19:51:53 +08:00