Merge pull request #34 from mendableai/nsc/returnOnlyUrls

Implements the ability for the crawler to output all the links it found, without scraping
2024-11-16 11:42:24 +08:00 · 2024-04-24 10:34:42 -03:00 · 2024-04-24 10:34:42 -03:00 · f189589da4
commit f189589da4
parent 523dd15aac 07e93ee5fd
4 changed files with 35 additions and 9 deletions
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@@ -13,15 +13,24 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1


 ### Scrape Website
-POST https://api.firecrawl.dev/v0/scrape HTTP/1.1
+POST http://localhost:3002/v0/crawl HTTP/1.1
 Authorization: Bearer 
 content-type: application/json

 {
-    "url":"https://www.mendable.ai"
+    "url":"https://www.mendable.ai",
+    "crawlerOptions": {
+        "returnOnlyUrls": true
+    }
 }


+
+
+
+
+
+
 ### Scrape Website
 POST http://localhost:3002/v0/scrape HTTP/1.1
 Authorization: Bearer 
@@ -34,7 +43,7 @@ content-type: application/json


 ### Check Job Status
-GET http://localhost:3002/v0/crawl/status/333ab225-dc3e-418b-9d4b-8fb833cbaf89 HTTP/1.1
+GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1
 Authorization: Bearer 

 ### Get Job Result
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -38,6 +38,10 @@ export type WebScraperOptions = {
  concurrentRequests?: number;
 };

+export interface DocumentUrl {
+  url: string;
+}
+
 export class Document {
  id?: string;
  url?: string; // Used only in /search for now
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -1,9 +1,10 @@
 import { Job } from "bull";
 import { CrawlResult, WebScraperOptions } from "../types";
 import { WebScraperDataProvider } from "../scraper/WebScraper";
-import { Progress } from "../lib/entities";
+import { DocumentUrl, Progress } from "../lib/entities";
 import { billTeam } from "../services/billing/credit_billing";
 import { Document } from "../lib/entities";
+
 export async function startWebScraperPipeline({
  job,
 }: {
@@ -47,7 +48,7 @@ export async function runWebScraper({
 }): Promise<{
  success: boolean;
  message: string;
-  docs: CrawlResult[];
+  docs: Document[] | DocumentUrl[];
 }> {
  try {
    const provider = new WebScraperDataProvider();
@@ -68,7 +69,7 @@ export async function runWebScraper({
    }
    const docs = (await provider.getDocuments(false, (progress: Progress) => {
      inProgress(progress);
-    })) as CrawlResult[];
+    })) as Document[];

    if (docs.length === 0) {
      return {
@@ -79,7 +80,14 @@ export async function runWebScraper({
    }

    // remove docs with empty content
-    const filteredDocs = docs.filter((doc) => doc.content.trim().length > 0);
+    const filteredDocs = crawlerOptions.returnOnlyUrls
+      ? docs.map((doc) => {
+          if (doc.metadata.sourceURL) {
+            return { url: doc.metadata.sourceURL };
+          }
+        })
+      : docs.filter((doc) => doc.content.trim().length > 0);
+

    const { success, credit_usage } = await billTeam(
      team_id,
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -80,11 +80,16 @@ export class WebScraperDataProvider {
        });
        let links = await crawler.start(inProgress, 5, this.limit);
        if (this.returnOnlyUrls) {
+          inProgress({
+            current: links.length,
+            total: links.length,
+            status: "COMPLETED",
+            currentDocumentUrl: this.urls[0],
+          });
          return links.map((url) => ({
            content: "",
+            markdown: "",
            metadata: { sourceURL: url },
-            provider: "web",
-            type: "text",
          }));
        }