Merge branch 'main' into v1-webscraper

2024-11-16 03:32:22 +08:00 · 2024-08-28 12:42:23 -03:00 · 2024-08-28 12:42:23 -03:00 · d872bf0c4c
commit d872bf0c4c
parent 9a43c6cda1 7565c2fc21
14 changed files with 558 additions and 18 deletions
--- a/.github/workflows/fly-direct.yml
+++ b/.github/workflows/fly-direct.yml
@ -28,6 +28,7 @@ jobs:
  deploy:
    name: Deploy app
    runs-on: ubuntu-latest
+    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v3
      - uses: superfly/flyctl-actions/setup-flyctl@master
--- a/apps/api/src/controllers/auth.ts
+++ b/apps/api/src/controllers/auth.ts
@ -3,6 +3,7 @@ import { getRateLimiter } from "../services/rate-limiter";
 import {
  AuthResponse,
  NotificationType,
+  PlanType,
  RateLimiterMode,
 } from "../types";
 import { supabase_service } from "../services/supabase";
@ -101,7 +102,7 @@ export async function supaAuthenticateUser(
  team_id?: string;
  error?: string;
  status?: number;
-  plan?: string;
+  plan?: PlanType;
 }> {
  const authHeader = req.headers.authorization;
  if (!authHeader) {
@ -349,10 +350,10 @@ export async function supaAuthenticateUser(
  return {
    success: true,
    team_id: subscriptionData.team_id,
-    plan: subscriptionData.plan ?? "",
+    plan: (subscriptionData.plan ?? "") as PlanType,
  };
 }
-function getPlanByPriceId(price_id: string) {
+function getPlanByPriceId(price_id: string): PlanType {
  switch (price_id) {
    case process.env.STRIPE_PRICE_ID_STARTER:
      return "starter";
--- a/apps/api/src/controllers/v0/crawl.ts
+++ b/apps/api/src/controllers/v0/crawl.ts
@ -14,10 +14,11 @@ import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl
 import { getScrapeQueue } from "../../../src/services/queue-service";
 import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
 import * as Sentry from "@sentry/node";
+import { getJobPriority } from "../../lib/job-priority";

 export async function crawlController(req: Request, res: Response) {
  try {
-    const { success, team_id, error, status } = await authenticateUser(
+    const { success, team_id, error, status, plan } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Crawl
@ -136,6 +137,7 @@ export async function crawlController(req: Request, res: Response) {
      crawlerOptions,
      pageOptions,
      team_id,
+      plan,
      createdAt: Date.now(),
    };

@ -151,7 +153,15 @@ export async function crawlController(req: Request, res: Response) {
      ? null
      : await crawler.tryGetSitemap();

+
    if (sitemap !== null && sitemap.length > 0) {
+      let jobPriority = 20;
+      // If the sitemap has more than 1000 URLs, we need to compute the job priority;
+      // otherwise we can use the default priority of 20
+      if(sitemap.length > 1000){
+        // set base to 21
+        jobPriority = await getJobPriority({plan, team_id, basePriority: 21})
+      }
      const jobs = sitemap.map((x) => {
        const url = x.url;
        const uuid = uuidv4();
@ -169,7 +179,7 @@ export async function crawlController(req: Request, res: Response) {
          },
          opts: {
            jobId: uuid,
-            priority: 20,
+            priority: jobPriority,
          },
        };
      });
@ -192,6 +202,10 @@ export async function crawlController(req: Request, res: Response) {
      }
    } else {
      await lockURL(id, sc, url);
+
+      // Not needed, first one should be 15.
+      // const jobPriority = await getJobPriority({plan, team_id, basePriority: 10})
+
      const job = await addScrapeJob(
        {
          url,
--- a/apps/api/src/controllers/v0/crawlPreview.ts
+++ b/apps/api/src/controllers/v0/crawlPreview.ts
@ -11,7 +11,7 @@ import * as Sentry from "@sentry/node";

 export async function crawlPreviewController(req: Request, res: Response) {
  try {
-    const { success, error, status } = await authenticateUser(
+    const { success, error, status, team_id:a, plan } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Preview
@ -89,6 +89,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
      crawlerOptions,
      pageOptions,
      team_id,
+      plan,
      robots,
      createdAt: Date.now(),
    };
--- a/apps/api/src/controllers/v0/scrape.ts
+++ b/apps/api/src/controllers/v0/scrape.ts
@ -2,7 +2,7 @@ import { ExtractorOptions, PageOptions } from './../../lib/entities';
 import { Request, Response } from "express";
 import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
 import { authenticateUser } from "../auth";
-import { RateLimiterMode } from "../../types";
+import { PlanType, RateLimiterMode } from "../../types";
 import { logJob } from "../../services/logging/log_job";
 import { Document } from "../../lib/entities";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
@ -13,6 +13,7 @@ import { getScrapeQueue } from '../../services/queue-service';
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from '../../lib/logger';
 import * as Sentry from "@sentry/node";
+import { getJobPriority } from '../../lib/job-priority';

 export async function scrapeHelper(
  jobId: string,
@ -22,7 +23,7 @@ export async function scrapeHelper(
  pageOptions: PageOptions,
  extractorOptions: ExtractorOptions,
  timeout: number,
-  plan?: string
+  plan?: PlanType
 ): Promise<{
  success: boolean;
  error?: string;
@ -38,6 +39,8 @@ export async function scrapeHelper(
    return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
  }

+  const jobPriority = await getJobPriority({plan, team_id, basePriority: 10})
+
  const job = await addScrapeJob({
    url,
    mode: "single_urls",
@ -46,7 +49,7 @@ export async function scrapeHelper(
    pageOptions,
    extractorOptions,
    origin: req.body.origin ?? defaultOrigin,
-  }, {}, jobId);
+  }, {}, jobId, jobPriority);

  let doc;

--- a/apps/api/src/controllers/v0/search.ts
+++ b/apps/api/src/controllers/v0/search.ts
@ -2,7 +2,7 @@ import { Request, Response } from "express";
 import { WebScraperDataProvider } from "../../scraper/WebScraper";
 import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
 import { authenticateUser } from "../auth";
-import { RateLimiterMode } from "../../types";
+import { PlanType, RateLimiterMode } from "../../types";
 import { logJob } from "../../services/logging/log_job";
 import { PageOptions, SearchOptions } from "../../lib/entities";
 import { search } from "../../search";
@ -12,6 +12,7 @@ import { Logger } from "../../lib/logger";
 import { getScrapeQueue } from "../../services/queue-service";
 import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
 import * as Sentry from "@sentry/node";
+import { getJobPriority } from "../../lib/job-priority";

 export async function searchHelper(
  jobId: string,
@ -20,6 +21,7 @@ export async function searchHelper(
  crawlerOptions: any,
  pageOptions: PageOptions,
  searchOptions: SearchOptions,
+  plan: PlanType
 ): Promise<{
  success: boolean;
  error?: string;
@ -76,6 +78,8 @@ export async function searchHelper(
    return { success: true, error: "No search results found", returnCode: 200 };
  }

+  const jobPriority = await getJobPriority({plan, team_id, basePriority: 20});
+  
  // filter out social media links

  const jobDatas = res.map(x => {
@ -92,7 +96,7 @@ export async function searchHelper(
      },
      opts: {
        jobId: uuid,
-        priority: 20,
+        priority: jobPriority,
      }
    };
  })
@ -135,7 +139,7 @@ export async function searchHelper(
 export async function searchController(req: Request, res: Response) {
  try {
    // make sure to authenticate user first, Bearer <token>
-    const { success, team_id, error, status } = await authenticateUser(
+    const { success, team_id, error, status, plan } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Search
@ -176,6 +180,7 @@ export async function searchController(req: Request, res: Response) {
      crawlerOptions,
      pageOptions,
      searchOptions,
+      plan
    );
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;
--- a/apps/api/src/index.ts
+++ b/apps/api/src/index.ts
@ -216,6 +216,8 @@ if (cluster.isMaster) {
  Logger.info(`Worker ${process.pid} started`);
 }

+
+
 // const sq = getScrapeQueue();

 // sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
--- a/apps/api/src/lib/tests/job-priority.test.ts
+++ b/apps/api/src/lib/tests/job-priority.test.ts
@ -0,0 +1,134 @@
+import {
+  getJobPriority,
+  addJobPriority,
+  deleteJobPriority,
+} from "../job-priority";
+import { redisConnection } from "../../services/queue-service";
+import { PlanType } from "../../types";
+
+jest.mock("../../services/queue-service", () => ({
+  redisConnection: {
+    sadd: jest.fn(),
+    srem: jest.fn(),
+    scard: jest.fn(),
+    expire: jest.fn(),
+  },
+}));
+
+describe("Job Priority Tests", () => {
+  afterEach(() => {
+    jest.clearAllMocks();
+  });
+
+  test("addJobPriority should add job_id to the set and set expiration", async () => {
+    const team_id = "team1";
+    const job_id = "job1";
+    await addJobPriority(team_id, job_id);
+    expect(redisConnection.sadd).toHaveBeenCalledWith(
+      `limit_team_id:${team_id}`,
+      job_id
+    );
+    expect(redisConnection.expire).toHaveBeenCalledWith(
+      `limit_team_id:${team_id}`,
+      60
+    );
+  });
+
+  test("deleteJobPriority should remove job_id from the set", async () => {
+    const team_id = "team1";
+    const job_id = "job1";
+    await deleteJobPriority(team_id, job_id);
+    expect(redisConnection.srem).toHaveBeenCalledWith(
+      `limit_team_id:${team_id}`,
+      job_id
+    );
+  });
+
+  test("getJobPriority should return correct priority based on plan and set length", async () => {
+    const team_id = "team1";
+    const plan: PlanType = "standard";
+    (redisConnection.scard as jest.Mock).mockResolvedValue(150);
+
+    const priority = await getJobPriority({ plan, team_id });
+    expect(priority).toBe(10);
+
+    (redisConnection.scard as jest.Mock).mockResolvedValue(250);
+    const priorityExceeded = await getJobPriority({ plan, team_id });
+    expect(priorityExceeded).toBe(20); // basePriority + Math.ceil((250 - 200) * 0.2)
+  });
+
+  test("getJobPriority should handle different plans correctly", async () => {
+    const team_id = "team1";
+
+    (redisConnection.scard as jest.Mock).mockResolvedValue(50);
+    let plan: PlanType = "hobby";
+    let priority = await getJobPriority({ plan, team_id });
+    expect(priority).toBe(10);
+
+    (redisConnection.scard as jest.Mock).mockResolvedValue(150);
+    plan = "hobby";
+    priority = await getJobPriority({ plan, team_id });
+    expect(priority).toBe(25); // basePriority + Math.ceil((150 - 100) * 0.3)
+
+    (redisConnection.scard as jest.Mock).mockResolvedValue(25);
+    plan = "free";
+    priority = await getJobPriority({ plan, team_id });
+    expect(priority).toBe(10);
+
+    (redisConnection.scard as jest.Mock).mockResolvedValue(60);
+    plan = "free";
+    priority = await getJobPriority({ plan, team_id });
+    expect(priority).toBe(28); // basePriority + Math.ceil((60 - 25) * 0.5)
+  });
+
+  test("addJobPriority should reset expiration time when adding new job", async () => {
+    const team_id = "team1";
+    const job_id1 = "job1";
+    const job_id2 = "job2";
+
+    await addJobPriority(team_id, job_id1);
+    expect(redisConnection.expire).toHaveBeenCalledWith(
+      `limit_team_id:${team_id}`,
+      60
+    );
+
+    // Clear the mock calls
+    (redisConnection.expire as jest.Mock).mockClear();
+
+    // Add another job
+    await addJobPriority(team_id, job_id2);
+    expect(redisConnection.expire).toHaveBeenCalledWith(
+      `limit_team_id:${team_id}`,
+      60
+    );
+  });
+
+  test("Set should expire after 60 seconds", async () => {
+    const team_id = "team1";
+    const job_id = "job1";
+
+    jest.useFakeTimers();
+
+    await addJobPriority(team_id, job_id);
+    expect(redisConnection.expire).toHaveBeenCalledWith(
+      `limit_team_id:${team_id}`,
+      60
+    );
+
+    // Fast-forward time by 59 seconds
+    jest.advanceTimersByTime(59000);
+
+    // The set should still exist
+    expect(redisConnection.scard).not.toHaveBeenCalled();
+
+    // Fast-forward time by 2 more seconds (total 61 seconds)
+    jest.advanceTimersByTime(2000);
+
+    // Check if the set has been removed (scard should return 0)
+    (redisConnection.scard as jest.Mock).mockResolvedValue(0);
+    const setSize = await redisConnection.scard(`limit_team_id:${team_id}`);
+    expect(setSize).toBe(0);
+
+    jest.useRealTimers();
+  });
+});
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@ -6,6 +6,7 @@ export type StoredCrawl = {
    crawlerOptions: any;
    pageOptions: any;
    team_id: string;
+    plan: string;
    robots?: string;
    cancelled?: boolean;
    createdAt: number;
--- a/apps/api/src/lib/job-priority.ts
+++ b/apps/api/src/lib/job-priority.ts
@ -0,0 +1,91 @@
+import { redisConnection } from "../../src/services/queue-service";
+import { PlanType } from "../../src/types";
+import { Logger } from "./logger";
+
+const SET_KEY_PREFIX = "limit_team_id:";
+export async function addJobPriority(team_id, job_id) {
+  try {
+    const setKey = SET_KEY_PREFIX + team_id;
+
+    // Add scrape job id to the set
+    await redisConnection.sadd(setKey, job_id);
+
+    // This approach will reset the expiration time to 60 seconds every time a new job is added to the set.
+    await redisConnection.expire(setKey, 60);
+  } catch (e) {
+    Logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`);
+  }
+}
+
+export async function deleteJobPriority(team_id, job_id) {
+  try {
+    const setKey = SET_KEY_PREFIX + team_id;
+
+    // remove job_id from the set
+    await redisConnection.srem(setKey, job_id);
+  } catch (e) {
+    Logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`);
+  }
+}
+
+export async function getJobPriority({
+  plan,
+  team_id,
+  basePriority = 10,
+}: {
+  plan: PlanType;
+  team_id: string;
+  basePriority?: number;
+}): Promise<number> {
+  try {
+    const setKey = SET_KEY_PREFIX + team_id;
+
+    // Get the length of the set
+    const setLength = await redisConnection.scard(setKey);
+
+    // Determine the priority based on the plan and set length
+    let planModifier = 1;
+    let bucketLimit = 0;
+
+    switch (plan) {
+      case "free":
+        bucketLimit = 25;
+        planModifier = 0.5;
+        break;
+      case "hobby":
+        bucketLimit = 100;
+        planModifier = 0.3;
+        break;
+      case "standard":
+      case "standardnew":
+        bucketLimit = 200;
+        planModifier = 0.2;
+        break;
+      case "growth":
+      case "growthdouble":
+        bucketLimit = 400;
+        planModifier = 0.1;
+        break;
+
+      default:
+        bucketLimit = 25;
+        planModifier = 1;
+        break;
+    }
+
+    // If the set length does not exceed the bucket limit, just return the base priority
+    if (setLength <= bucketLimit) {
+      return basePriority;
+    } else {
+      // Otherwise, add the overflow beyond the bucket limit, scaled by planModifier, to the base priority
+      return Math.ceil(
+        basePriority + Math.ceil((setLength - bucketLimit) * planModifier)
+      );
+    }
+  } catch (e) {
+    Logger.error(
+      `Get job priority failed: ${team_id}, ${plan}, ${basePriority}`
+    );
+    return basePriority;
+  }
+}
--- a/apps/api/src/services/queue-jobs.ts
+++ b/apps/api/src/services/queue-jobs.ts
@ -8,10 +8,11 @@ async function addScrapeJobRaw(
  webScraperOptions: any,
  options: any,
  jobId: string,
+  jobPriority: number = 10
 ): Promise<Job> {
  return await getScrapeQueue().add(jobId, webScraperOptions, {
    ...options,
-    priority: webScraperOptions.crawl_id ? 20 : 10,
+    priority: jobPriority,
    jobId,
  });
 }
@ -20,7 +21,9 @@ export async function addScrapeJob(
  webScraperOptions: WebScraperOptions,
  options: any = {},
  jobId: string = uuidv4(),
+  jobPriority: number = 10
 ): Promise<Job> {
+  
  if (Sentry.isInitialized()) {
    const size = JSON.stringify(webScraperOptions).length;
    return await Sentry.startSpan({
@ -39,10 +42,10 @@ export async function addScrapeJob(
          baggage: Sentry.spanToBaggageHeader(span),
          size,
        },
-      }, options, jobId);
+      }, options, jobId, jobPriority);
    });
  } else {
-    return await addScrapeJobRaw(webScraperOptions, options, jobId);
+    return await addScrapeJobRaw(webScraperOptions, options, jobId, jobPriority);
  }
 }

--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@ -21,6 +21,8 @@ import { addCrawlJob, addCrawlJobDone, crawlToCrawler, finishCrawl, getCrawl, ge
 import { StoredCrawl } from "../lib/crawl-redis";
 import { addScrapeJob } from "./queue-jobs";
 import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
+import { addJobPriority, deleteJobPriority, getJobPriority } from "../../src/lib/job-priority";
+import { PlanType } from "../types";

 if (process.env.ENV === "production") {
  initSDK({
@ -50,6 +52,7 @@ const processJobInternal = async (token: string, job: Job) => {
    await job.extendLock(token, jobLockExtensionTime);
  }, jobLockExtendInterval);

+  await addJobPriority(job.data.team_id, job.id );
  let err = null;
  try {
    const result = await processJob(job, token);
@ -67,6 +70,7 @@ const processJobInternal = async (token: string, job: Job) => {
    err = error;
    await job.moveToFailed(error, token, false);
  } finally {
+    await deleteJobPriority(job.data.team_id, job.id );
    clearInterval(extendLockInterval);
  }

@ -251,6 +255,16 @@ async function processJob(job: Job, token: string) {
          
          for (const link of links) {
            if (await lockURL(job.data.crawl_id, sc, link)) {
+              
+              // This seems to work really well
+              const jobPriority = await getJobPriority({plan:sc.plan as PlanType, team_id: sc.team_id, basePriority: job.data.crawl_id ? 20 : 10})
+              const jobId = uuidv4();
+
+              // console.log("plan: ",  sc.plan);
+              // console.log("team_id: ", sc.team_id)
+              // console.log("base priority: ", job.data.crawl_id ? 20 : 10)
+              // console.log("job priority: " , jobPriority, "\n\n\n")
+
              const newJob = await addScrapeJob({
                url: link,
                mode: "single_urls",
@ -260,7 +274,7 @@ async function processJob(job: Job, token: string) {
                origin: job.data.origin,
                crawl_id: job.data.crawl_id,
                v1: job.data.v1,
-              });
+              }, {}, jobId, jobPriority);

              await addCrawlJob(job.data.crawl_id, newJob.id);
            }
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@ -116,8 +116,8 @@ export interface AuthResponse {
  team_id?: string;
  error?: string;
  status?: number;
-  plan?: string;
  api_key?: string;
+  plan?: PlanType;
 }
  

@ -140,4 +140,15 @@ export type ScrapeLog = {
  html?: string;
  ipv4_support?: boolean | null;
  ipv6_support?: boolean | null;
-};
+};
+
+export type PlanType = 
+  | "starter"
+  | "standard"
+  | "scale"
+  | "hobby"
+  | "standardnew"
+  | "growth"
+  | "growthdouble"
+  | "free"
+  | "";
--- a/examples/simple_web_data_extraction_with_claude/simple_web_data_extraction_with_claude.ipynb
+++ b/examples/simple_web_data_extraction_with_claude/simple_web_data_extraction_with_claude.ipynb
@ -0,0 +1,259 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Web Scraping and Extraction with Firecrawl and Claude\n",
+    "\n",
+    "This notebook demonstrates how to use Firecrawl to scrape web content and Claude to extract structured data from it."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 1: Import Required Libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "from firecrawl import FirecrawlApp\n",
+    "from anthropic import Anthropic\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "# Load environment variables\n",
+    "load_dotenv()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 2: Set Up API Keys and URL"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "URL to scrape: https://mendable.ai\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Retrieve API keys from environment variables\n",
+    "anthropic_api_key = os.getenv(\"ANTHROPIC_API_KEY\")\n",
+    "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
+    "\n",
+    "# Set the URL to scrape\n",
+    "url = \"https://mendable.ai\"  # Replace with the actual URL you want to scrape\n",
+    "\n",
+    "print(f\"URL to scrape: {url}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 3: Initialize Firecrawl and Anthropic Clients"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Firecrawl and Anthropic clients initialized.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Initialize FirecrawlApp and Anthropic client\n",
+    "firecrawl_app = FirecrawlApp(api_key=firecrawl_api_key)\n",
+    "anthropic_client = Anthropic(api_key=anthropic_api_key)\n",
+    "\n",
+    "print(\"Firecrawl and Anthropic clients initialized.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 4: Scrape the URL using Firecrawl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Page content scraped. Length: 16199 characters\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Scrape the URL using Firecrawl\n",
+    "page_content = firecrawl_app.scrape_url(url, params={\"pageOptions\": {\"onlyMainContent\": True}})\n",
+    "\n",
+    "print(f\"Page content scraped. Length: {len(page_content['content'])} characters\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 5: Prepare the Prompt for Claude"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Prompt prepared for Claude.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Prepare the prompt for Claude\n",
+    "prompt = f\"\"\"Analyze the following webpage content and extract the following information:\n",
+    "1. The title of the page\n",
+    "2. Whether the company is part of Y Combinator (YC)\n",
+    "3. Whether the company/product is open source\n",
+    "\n",
+    "Return the information in JSON format with the following schema:\n",
+    "{{\n",
+    "    \"main_header_title\": string,\n",
+    "    \"is_yc_company\": boolean,\n",
+    "    \"is_open_source\": boolean\n",
+    "}}\n",
+    "\n",
+    "Webpage content:\n",
+    "{page_content['content']}\n",
+    "\n",
+    "Return only the JSON, nothing else.\"\"\"\n",
+    "\n",
+    "print(\"Prompt prepared for Claude.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 6: Query Claude"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Claude response received.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Query Claude\n",
+    "response = anthropic_client.messages.create(\n",
+    "    model=\"claude-3-opus-20240229\",\n",
+    "    max_tokens=1000,\n",
+    "    messages=[\n",
+    "        {\"role\": \"user\", \"content\": prompt}\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "print(\"Claude response received.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 7: Parse and Display the Result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\n",
+      "  \"title\": \"Just in time answers for Sales and Support\",\n",
+      "  \"is_yc_company\": true,\n",
+      "  \"is_open_source\": false\n",
+      "}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Parse and print the result\n",
+    "result = json.loads(response.content[0].text)\n",
+    "print(json.dumps(result, indent=2))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}