diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 0e2caeb7..e21e07de 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -146,7 +146,238 @@ describe("E2E Tests for API Routes", () => {
     );
   });
 
-  // Additional tests for insufficient credits?
+  it("should return a successful response with a valid API key and valid includes option", async () => {
+    const crawlResponse = await request(TEST_URL)
+      .post("/v0/crawl")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send({
+        url: "https://mendable.ai",
+        limit: 10,
+        crawlerOptions: {
+          includes: ["/blog/*"],
+        },
+      });
+
+    const response = await request(TEST_URL)
+      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+    expect(response.statusCode).toBe(200);
+    expect(response.body).toHaveProperty("status");
+    expect(response.body.status).toBe("active");
+
+    // wait for 30 seconds
+    await new Promise((r) => setTimeout(r, 30000));
+
+    const completedResponse = await request(TEST_URL)
+      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+    const urls = completedResponse.body.data.map(
+      (item: any) => item.metadata?.sourceURL
+    );
+    expect(urls.length).toBeGreaterThan(5);
+    urls.forEach((url: string) => {
+      expect(url.startsWith("https://mendable.ai/blog/")).toBeTruthy();
+    });
+
+    expect(completedResponse.statusCode).toBe(200);
+    expect(completedResponse.body).toHaveProperty("status");
+    expect(completedResponse.body.status).toBe("completed");
+    expect(completedResponse.body).toHaveProperty("data");
+    expect(completedResponse.body.data[0]).toHaveProperty("content");
+    expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+    expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+    expect(completedResponse.body.data[0].content).toContain("Mendable");
+  }, 60000); // 60 seconds
+
+  it("should return a successful response with a valid API key and valid excludes option", async () => {
+    const crawlResponse = await request(TEST_URL)
+      .post("/v0/crawl")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send({
+        url: "https://mendable.ai",
+        limit: 10,
+        crawlerOptions: {
+          excludes: ["/blog/*"],
+        },
+      });
+
+    const response = await request(TEST_URL)
+      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+    expect(response.statusCode).toBe(200);
+    expect(response.body).toHaveProperty("status");
+    expect(response.body.status).toBe("active");
+
+    // wait for 30 seconds
+    await new Promise((r) => setTimeout(r, 30000));
+
+    const completedResponse = await request(TEST_URL)
+      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+    const urls = completedResponse.body.data.map(
+      (item: any) => item.metadata?.sourceURL
+    );
+    expect(urls.length).toBeGreaterThan(5);
+    urls.forEach((url: string) => {
+      expect(url.startsWith("https://mendable.ai/blog/")).toBeFalsy();
+    });
+  }, 60000); // 60 seconds
+
+  it("should return a successful response with a valid API key and valid limit option", async () => {
+    const crawlResponse = await request(TEST_URL)
+      .post("/v0/crawl")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send({
+        url: "https://mendable.ai",
+        limit: 3,
+      });
+
+    const response = await request(TEST_URL)
+      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+    expect(response.statusCode).toBe(200);
+    expect(response.body).toHaveProperty("status");
+    expect(response.body.status).toBe("active");
+
+    // wait for 30 seconds
+    await new Promise((r) => setTimeout(r, 30000));
+
+    const completedResponse = await request(TEST_URL)
+      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+    expect(completedResponse.statusCode).toBe(200);
+    expect(completedResponse.body).toHaveProperty("status");
+    expect(completedResponse.body.status).toBe("completed");
+    expect(completedResponse.body).toHaveProperty("data");
+    expect(completedResponse.body.data.length).toBe(3);
+    expect(completedResponse.body.data[0]).toHaveProperty("content");
+    expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+    expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+    expect(completedResponse.body.data[0].content).toContain("Mendable");
+  }, 60000); // 60 seconds
+
+  it("should return a successful response with max depth option for a valid crawl job", async () => {
+    const crawlResponse = await request(TEST_URL)
+      .post("/v0/crawl")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send({
+        url: "https://www.scrapethissite.com",
+        crawlerOptions: { maxDepth: 2 },
+      });
+    expect(crawlResponse.statusCode).toBe(200);
+
+    const response = await request(TEST_URL)
+      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+    expect(response.statusCode).toBe(200);
+    expect(response.body).toHaveProperty("status");
+    expect(response.body.status).toBe("active");
+    // wait for 60 seconds
+    await new Promise((r) => setTimeout(r, 60000));
+    const completedResponse = await request(TEST_URL)
+      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+    expect(completedResponse.statusCode).toBe(200);
+    expect(completedResponse.body).toHaveProperty("status");
+    expect(completedResponse.body.status).toBe("completed");
+    expect(completedResponse.body).toHaveProperty("data");
+    expect(completedResponse.body.data[0]).toHaveProperty("content");
+    expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+    expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+    const urls = completedResponse.body.data.map(
+      (item: any) => item.metadata?.sourceURL
+    );
+    expect(urls.length).toBeGreaterThan(1);
+
+    // Check if all URLs have a maximum depth of 1
+    urls.forEach((url: string) => {
+      const depth = new URL(url).pathname.split("/").filter(Boolean).length;
+      expect(depth).toBeLessThanOrEqual(1);
+    });
+  }, 120000);
+
+  it("should return a successful response with a valid API key and valid onlyMainContent option", async () => {
+    const crawlResponse = await request(TEST_URL)
+      .post("/v0/crawl")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send({
+        url: "https://mendable.ai",
+        limit: 10,
+        pageOptions: { onlyMainContent: true },
+      });
+
+    const response = await request(TEST_URL)
+      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+    expect(response.statusCode).toBe(200);
+    expect(response.body).toHaveProperty("status");
+    expect(response.body.status).toBe("active");
+
+    // wait for 30 seconds
+    await new Promise((r) => setTimeout(r, 30000));
+
+    const completedResponse = await request(TEST_URL)
+      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+    expect(completedResponse.statusCode).toBe(200);
+    expect(completedResponse.body).toHaveProperty("status");
+    expect(completedResponse.body.status).toBe("completed");
+    expect(completedResponse.body).toHaveProperty("data");
+    expect(completedResponse.body.data.length).toBeLessThanOrEqual(10);
+    expect(completedResponse.body.data[0]).toHaveProperty("content");
+    expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+    expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+    expect(completedResponse.body.data[0].content).toContain("Mendable");
+    expect(completedResponse.body.data[0].content).not.toContain("main menu");
+  }, 60000); // 60 seconds
+
+  it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
+    const crawlResponse = await request(TEST_URL)
+      .post("/v0/crawl")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send({
+        url: "https://firecrawl.dev",
+        pageOptions: { includeHtml: true },
+      });
+    expect(crawlResponse.statusCode).toBe(200);
+
+    const response = await request(TEST_URL)
+      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+    expect(response.statusCode).toBe(200);
+    expect(response.body).toHaveProperty("status");
+    expect(response.body.status).toBe("active");
+
+    // wait for 30 seconds
+    await new Promise((r) => setTimeout(r, 30000));
+
+    const completedResponse = await request(TEST_URL)
+      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+    expect(completedResponse.statusCode).toBe(200);
+    expect(completedResponse.body).toHaveProperty("status");
+    expect(completedResponse.body.status).toBe("completed");
+    expect(completedResponse.body).toHaveProperty("data");
+    expect(completedResponse.body.data[0]).toHaveProperty("content");
+    expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+    expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+    expect(completedResponse.body.data[0]).toHaveProperty("html");
+    expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
+    expect(completedResponse.body.data[0].markdown).toContain("FireCrawl");
+    expect(completedResponse.body.data[0].html).toContain("<h1");
+  }, 60000); // 60 seconds
@@ -248,7 +482,7 @@ describe("E2E Tests for API Routes", () => {
       expect(response.statusCode).toBe(404);
     });
 
-    it("should return a successful response for a valid crawl job", async () => {
+    it("should return a successful crawl status response for a valid crawl job", async () => {
       const crawlResponse = await request(TEST_URL)
         .post("/v0/crawl")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@@ -278,90 +512,7 @@ describe("E2E Tests for API Routes", () => {
       expect(completedResponse.body.data[0]).toHaveProperty("metadata");
       expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
     }, 60000); // 60 seconds
-
-  it("should return a successful response with max depth option for a valid crawl job", async () => {
-    const crawlResponse = await request(TEST_URL)
-      .post("/v0/crawl")
-      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-      .set("Content-Type", "application/json")
-      .send({
-        url: "https://www.scrapethissite.com",
-        crawlerOptions: { maxDepth: 2 },
-      });
-    expect(crawlResponse.statusCode).toBe(200);
-
-    const response = await request(TEST_URL)
-      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-    expect(response.statusCode).toBe(200);
-    expect(response.body).toHaveProperty("status");
-    expect(response.body.status).toBe("active");
-    // wait for 60 seconds
-    await new Promise((r) => setTimeout(r, 60000));
-    const completedResponse = await request(TEST_URL)
-      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-
-    expect(completedResponse.statusCode).toBe(200);
-    expect(completedResponse.body).toHaveProperty("status");
-    expect(completedResponse.body.status).toBe("completed");
-    expect(completedResponse.body).toHaveProperty("data");
-    expect(completedResponse.body.data[0]).toHaveProperty("content");
-    expect(completedResponse.body.data[0]).toHaveProperty("markdown");
-    expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-    const urls = completedResponse.body.data.map(
-      (item: any) => item.metadata?.sourceURL
-    );
-    expect(urls.length).toBeGreaterThan(1);
-
-    // Check if all URLs have a maximum depth of 1
-    urls.forEach((url) => {
-      const depth = new URL(url).pathname.split("/").filter(Boolean).length;
-      expect(depth).toBeLessThanOrEqual(1);
-    });
-  }, 120000);
-
-  it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
-    const crawlResponse = await request(TEST_URL)
-      .post("/v0/crawl")
-      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-      .set("Content-Type", "application/json")
-      .send({
-        url: "https://firecrawl.dev",
-        pageOptions: { includeHtml: true },
-      });
-    expect(crawlResponse.statusCode).toBe(200);
-
-    const response = await request(TEST_URL)
-      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-    expect(response.statusCode).toBe(200);
-    expect(response.body).toHaveProperty("status");
-    expect(response.body.status).toBe("active");
-
-    // wait for 30 seconds
-    await new Promise((r) => setTimeout(r, 30000));
-
-    const completedResponse = await request(TEST_URL)
-      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-
-    expect(completedResponse.statusCode).toBe(200);
-    expect(completedResponse.body).toHaveProperty("status");
-    expect(completedResponse.body.status).toBe("completed");
-    expect(completedResponse.body).toHaveProperty("data");
-    expect(completedResponse.body.data[0]).toHaveProperty("content");
-    expect(completedResponse.body.data[0]).toHaveProperty("markdown");
-    expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-
-    // 120 seconds
-    expect(completedResponse.body.data[0]).toHaveProperty("html");
-    expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-    expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
-    expect(completedResponse.body.data[0].markdown).toContain("FireCrawl");
-    expect(completedResponse.body.data[0].html).toContain("<h1");
-  }, 60000);
 
   it("If someone cancels a crawl job, it should turn into failed status", async () => {
     const crawlResponse = await request(TEST_URL)
@@ -371,8 +522,6 @@ describe("E2E Tests for API Routes", () => {
       .send({ url: "https://jestjs.io" });
 
     expect(crawlResponse.statusCode).toBe(200);
-
-    // wait for 30 seconds
 
     await new Promise((r) => setTimeout(r, 10000));
diff --git a/apps/test-suite/data/crawl.json b/apps/test-suite/data/crawl.json
new file mode 100644
index 00000000..8577a6e2
--- /dev/null
+++ b/apps/test-suite/data/crawl.json
@@ -0,0 +1,226 @@
+[
+  {
+    "website": "https://www.anthropic.com/claude",
+    "expected_min_num_of_pages": 0,
+    "expected_crawled_pages": [""]
+  },
+  {
+    "website": "https://mendable.ai/pricing",
+    "expected_min_num_of_pages": 29,
+    "expected_crawled_pages": [
+      "https://mendable.ai/",
+      "https://mendable.ai/blog",
+      "https://mendable.ai/signin",
+      "https://mendable.ai/signup",
+      "https://mendable.ai",
+      "https://mendable.ai/usecases/sales-enablement",
+      "https://mendable.ai/usecases/documentation",
+      "https://mendable.ai/usecases/cs-enablement",
+      "https://mendable.ai/usecases/productcopilot",
+      "https://mendable.ai/security"
+    ],
+    "notes": "This one should not go backwards, but it does!"
+  },
+  {
+    "website": "https://openai.com/news",
+    "expected_min_num_of_pages": 59,
+    "expected_crawled_pages": [
+      "https://openai.com/news/company/",
+      "https://openai.com/news/research/",
+      "https://openai.com/news/safety-and-alignment/",
+      "https://openai.com/news/stories/"
+    ]
+  },
+  {
+    "website": "https://agentops.ai",
+    "expected_min_num_of_pages": 7,
+    "expected_crawled_pages": [
+      "https://www.agentops.ai/blog/effortless-hr-management-with-saas",
+      "https://www.agentops.ai/blog/streamlining-hr-with-saas",
+      "https://www.agentops.ai/blog/simplify-hr-with-modern-saas-solutions",
+      "https://www.agentops.ai/blog/efficient-hr-operations-with-saas",
+      "https://www.agentops.ai/blog/hr-made-simple-with-saas",
+      "https://www.agentops.ai/about-us",
+      "https://www.agentops.ai/contact-us"
+    ]
+  },
+  {
+    "website": "https://ycombinator.com/companies",
+    "expected_min_num_of_pages": 45,
+    "expected_crawled_pages": [
+      "https://www.ycombinator.com/companies/industry/elearning",
+      "https://www.ycombinator.com/companies/industry/computer-vision",
+      "https://www.ycombinator.com/companies/industry/health-tech",
+      "https://www.ycombinator.com/companies/industry/education",
+      "https://www.ycombinator.com/companies/industry/robotics",
+      "https://www.ycombinator.com/companies/industry/hardware",
+      "https://www.ycombinator.com/companies/industry/saas",
+      "https://www.ycombinator.com/companies/industry/hard-tech",
+      "https://www.ycombinator.com/companies/industry/developer-tools",
+      "https://www.ycombinator.com/companies/industry/entertainment",
+      "https://www.ycombinator.com/companies/industry/finance",
+      "https://www.ycombinator.com/companies/industry/generative-ai",
+      "https://www.ycombinator.com/companies/industry/machine-learning"
+    ]
+  },
+  {
+    "website": "https://firecrawl.dev",
+    "expected_min_num_of_pages": 2,
+    "expected_crawled_pages": [
+      "https://firecrawl.dev/",
+      "https://firecrawl.dev/pricing"
+    ]
+  },
+  {
+    "website": "https://en.wikipedia.org/wiki/T._N._Seshan",
+    "expected_min_num_of_pages": 100,
+    "expected_crawled_pages": [
+      "https://en.wikipedia.org/wiki/Wikipedia:Contents",
+      "https://en.wikipedia.org/wiki/Wikipedia:Contact_us",
+      "https://en.wikipedia.org/wiki/V._S._Ramadevi",
+      "https://en.wikipedia.org/wiki/Wikipedia:About",
+      "https://en.wikipedia.org/wiki/Help:Introduction",
+      "https://en.wikipedia.org/wiki/H._D._Deve_Gowda",
+      "https://en.wikipedia.org/wiki/File:T.N._Seshan_in_1994.jpg"
+    ]
+  },
+  {
+    "website": "https://mendable.ai/blog",
+    "expected_min_num_of_pages": 0,
"expected_crawled_pages": [""] + }, + { + "website": "https://www.framer.com/pricing", + "expected_min_num_of_pages": 58, + "expected_crawled_pages": [ + "https://www.framer.com/features/navigation/", + "https://www.framer.com/contact/", + "https://www.framer.com/add-ons/", + "https://www.framer.com/free-saas-ui-kit/", + "https://www.framer.com/help/", + "https://www.framer.com/features/effects/", + "https://www.framer.com/enterprise/", + "https://www.framer.com/templates/" + ] + }, + { + "website": "https://fly.io/docs/gpus/gpu-quickstart", + "expected_min_num_of_pages": 39, + "expected_crawled_pages": [ + "https://fly.io/docs/getting-started/", + "https://fly.io/docs/hands-on/", + "https://fly.io/docs/about/support/", + "https://fly.io/docs/blueprints/going-to-production-with-healthcare-apps/", + "https://fly.io/docs/machines/flyctl/fly-machine-update/", + "https://fly.io/docs/blueprints/review-apps-guide/", + "https://fly.io/docs/blueprints/supercronic/" + ], + "notes": "This one should not go backwards, but it does!" + }, + { + "website": "https://news.ycombinator.com/", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://www.vellum.ai/llm-leaderboard", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://www.bigbadtoystore.com", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://www.instructables.com", + "expected_min_num_of_pages": 78, + "expected_crawled_pages": [ + "https://www.instructables.com/circuits/", + "https://www.instructables.com/circuits/apple/projects/", + "https://www.instructables.com/circuits/art/projects/", + "https://www.instructables.com/circuits/electronics/projects/", + "https://www.instructables.com/circuits/microsoft/projects/", + "https://www.instructables.com/circuits/microcontrollers/projects/", + "https://www.instructables.com/circuits/community/", + "https://www.instructables.com/circuits/leds/projects/", + "https://www.instructables.com/circuits/gadgets/projects/", + "https://www.instructables.com/circuits/arduino/projects/", + "https://www.instructables.com/circuits/lasers/projects/", + "https://www.instructables.com/circuits/clocks/projects/" + ] + }, + { + "website": "https://www.powells.com", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://www.royalacademy.org.uk", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://www.eastbaytimes.com", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://www.manchestereveningnews.co.uk", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://physicsworld.com", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://richmondconfidential.org", + "expected_min_num_of_pages": 50, + "expected_crawled_pages": [ + "https://richmondconfidential.org/2009/10/13/salesians-star-guard-has-a-big-impact/", + "https://richmondconfidential.org/2009/10/13/on-team-of-beginners-oilers-old-hand-stands-out/", + "https://richmondconfidential.org/2009/10/19/point-richmond-clockmaker-turns-clutter-into-crafts/", + "https://richmondconfidential.org/2009/10/13/profile-maurice-cathy/", + "https://richmondconfidential.org/2009/10/13/soul-food-rescue-mission-rebuilds-diets-and-lives/", + "https://richmondconfidential.org/2009/10/21/in-tough-economy-pain-trickles-to-the-bottom/", + 
"https://richmondconfidential.org/2009/10/19/richmond-homicide-map/", + "https://richmondconfidential.org/2009/10/13/rough-roads-for-richmonds-cab-drivers/", + "https://richmondconfidential.org/2009/10/13/before-napa-there-was-winehaven/", + "https://richmondconfidential.org/2009/10/13/family-calls-for-end-to-violence-at-memorial-for-slain-woman-friend/" + ] + }, + { + "website": "https://www.techinasia.com", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""], + "notes": "The website has a paywall and bot detectors." + }, + { + "website": "https://www.boardgamegeek.com", + "expected_min_num_of_pages": 15, + "expected_crawled_pages": [ + "https://www.boardgamegeek.com/browse/boardgameartist", + "https://www.boardgamegeek.com/browse/boardgamehonor", + "https://www.boardgamegeek.com/browse/boardgamepublisher", + "https://www.boardgamegeek.com/browse/boardgamepodcast", + "https://www.boardgamegeek.com/wiki/page/Index", + "https://www.boardgamegeek.com/browse/boardgamecategory", + "https://www.boardgamegeek.com/boardgame/random", + "https://www.boardgamegeek.com/browse/boardgamemechanic", + "https://www.boardgamegeek.com/forums", + "https://www.boardgamegeek.com/gonecardboard", + "https://www.boardgamegeek.com/browse/boardgameaccessory", + "https://www.boardgamegeek.com/browse/boardgamedesigner", + "https://www.boardgamegeek.com/", + "https://www.boardgamegeek.com/previews", + "https://www.boardgamegeek.com/browse/boardgame" + ] + }, + { + "website": "https://www.mountainproject.com", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + } +] diff --git a/apps/test-suite/data/websites.json b/apps/test-suite/data/scrape.json similarity index 100% rename from apps/test-suite/data/websites.json rename to apps/test-suite/data/scrape.json diff --git a/apps/test-suite/package.json b/apps/test-suite/package.json index 74ab7a6b..33aa2cd9 100644 --- a/apps/test-suite/package.json +++ b/apps/test-suite/package.json @@ -3,7 +3,9 @@ "version": "1.0.0", "description": "", "scripts": { - "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false" + "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false", + "test:scrape": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/scrape.test.ts", + "test:crawl": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/crawl.test.ts" }, "author": "", "license": "ISC", diff --git a/apps/test-suite/tests/crawl.test.ts b/apps/test-suite/tests/crawl.test.ts new file mode 100644 index 00000000..b56a76e1 --- /dev/null +++ b/apps/test-suite/tests/crawl.test.ts @@ -0,0 +1,148 @@ +import request from "supertest"; +import dotenv from "dotenv"; +import { WebsiteScrapeError } from "../utils/types"; +import { logErrors } from "../utils/log"; + +import websitesData from "../data/crawl.json"; +import "dotenv/config"; + +import fs from 'fs'; +dotenv.config(); + +interface WebsiteData { + website: string; + expected_min_num_of_pages: number; + expected_crawled_pages: string[]; +} + +const TEST_URL = "http://127.0.0.1:3002"; + +describe("Crawling Checkup (E2E)", () => { + beforeAll(() => { + if (!process.env.TEST_API_KEY) { + throw new Error("TEST_API_KEY is not set"); + } + }); + + describe("Crawling website tests with a dataset", () => { + it("Should crawl the website and verify the response", async () => { + let passedTests = 0; + const batchSize = 15; + const 
batchPromises = []; + const startTime = new Date().getTime(); + const date = new Date(); + const logsDir = `logs/${date.getMonth() + 1}-${date.getDate()}-${date.getFullYear()}`; + + let errorLogFileName = `${logsDir}/run.log_${new Date().toTimeString().split(' ')[0]}`; + const errorLog: WebsiteScrapeError[] = []; + + for (let i = 0; i < websitesData.length; i += batchSize) { + await new Promise(resolve => setTimeout(resolve, 10000)); + + const batch = websitesData.slice(i, i + batchSize); + const batchPromise = Promise.all( + batch.map(async (websiteData: WebsiteData) => { + try { + const crawlResponse = await request(TEST_URL || "") + .post("/v0/crawl") + .set("Content-Type", "application/json") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .send({ url: websiteData.website, pageOptions: { onlyMainContent: true }, crawlerOptions: { limit: 100 }}); + + await new Promise(resolve => setTimeout(resolve, 300000)); // wait for 300 seconds + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + console.log('-------------------') + console.log(websiteData.website); + + if (!completedResponse.body.data) { + console.log(completedResponse.body.partial_data.length); + const urls = completedResponse.body.partial_data.map((page: any) => page.metadata?.sourceURL); + console.log(urls); + } else { + console.log(completedResponse.body.data.length); + const urls = completedResponse.body.data.map((page: any) => page.metadata?.sourceURL); + console.log(urls); + } + + console.log('-------------------') + + // if (!completedResponse.body || completedResponse.body.status !== "completed") { + // errorLog.push({ + // website: websiteData.website, + // prompt: 'CRAWL', + // expected_output: 'SUCCESS', + // actual_output: 'FAILURE', + // error: `Crawl job did not complete successfully.` + // }); + // return null; + // } + + // // check how many webpages were crawled successfully + // // compares with expected_num_of_pages + // if (completedResponse.body.data.length < websiteData.expected_min_num_of_pages) { + // errorLog.push({ + // website: websiteData.website, + // prompt: 'CRAWL', + // expected_output: `SUCCESS: ${websiteData.expected_min_num_of_pages}`, + // actual_output: `FAILURE: ${completedResponse.body.data.length}`, + // error: `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${completedResponse.body.data.length}` + // }); + // return null; + // } + + // // checks if crawled pages contain expected_crawled_pages + // if (websiteData.expected_crawled_pages.some(page => !completedResponse.body.data.includes(page))) { + // errorLog.push({ + // website: websiteData.website, + // prompt: 'CRAWL', + // expected_output: `SUCCESS: ${websiteData.expected_crawled_pages}`, + // actual_output: `FAILURE: ${completedResponse.body.data}`, + // error: `Expected crawled pages to contain ${websiteData.expected_crawled_pages}, but got ${completedResponse.body.data}` + // }); + // return null; + // } + + passedTests++; + return { + website: websiteData.website, + statusCode: completedResponse.statusCode, + }; + } catch (error) { + console.error(`Error processing ${websiteData.website}: ${error}`); + errorLog.push({ + website: websiteData.website, + prompt: 'CRAWL', + expected_output: 'SUCCESS', + actual_output: 'FAILURE', + error: `Error processing ${websiteData.website}: ${error}` + }); + return null; + } + }) + ); + batchPromises.push(batchPromise); + } + + 
+      (await Promise.all(batchPromises)).flat();
+      const score = (passedTests / websitesData.length) * 100;
+      const endTime = new Date().getTime();
+      const timeTaken = (endTime - startTime) / 1000;
+      console.log(`Score: ${score}%`);
+
+      await logErrors(errorLog, timeTaken, 0, score, websitesData.length);
+
+      if (process.env.ENV === "local" && errorLog.length > 0) {
+        if (!fs.existsSync(logsDir)) {
+          fs.mkdirSync(logsDir, { recursive: true });
+        }
+        fs.writeFileSync(errorLogFileName, JSON.stringify(errorLog, null, 2));
+      }
+
+      expect(score).toBeGreaterThanOrEqual(95);
+    }, 350000); // 350 seconds timeout
+  });
+});
\ No newline at end of file
diff --git a/apps/test-suite/index.test.ts b/apps/test-suite/tests/scrape.test.ts
similarity index 93%
rename from apps/test-suite/index.test.ts
rename to apps/test-suite/tests/scrape.test.ts
index 8d6c31f5..3f421dc4 100644
--- a/apps/test-suite/index.test.ts
+++ b/apps/test-suite/tests/scrape.test.ts
@@ -1,16 +1,14 @@
 import request from "supertest";
 import dotenv from "dotenv";
-import Anthropic from "@anthropic-ai/sdk";
-import { numTokensFromString } from "./utils/tokens";
+import { numTokensFromString } from "../utils/tokens";
 import OpenAI from "openai";
-import { WebsiteScrapeError } from "./utils/types";
-import { logErrors } from "./utils/log";
+import { WebsiteScrapeError } from "../utils/types";
+import { logErrors } from "../utils/log";
 
-const websitesData = require("./data/websites.json");
+import websitesData from "../data/scrape.json";
 import "dotenv/config";
 
-const fs = require('fs');
-
+import fs from 'fs';
 dotenv.config();
 
 interface WebsiteData {
@@ -21,8 +19,7 @@ interface WebsiteData {
 
 const TEST_URL = "http://127.0.0.1:3002";
 
-
-describe("Scraping/Crawling Checkup (E2E)", () => {
+describe("Scraping Checkup (E2E)", () => {
   beforeAll(() => {
     if (!process.env.TEST_API_KEY) {
       throw new Error("TEST_API_KEY is not set");
@@ -72,10 +69,6 @@ describe("Scraping/Crawling Checkup (E2E)", () => {
       return null;
     }
 
-    const anthropic = new Anthropic({
-      apiKey: process.env.ANTHROPIC_API_KEY,
-    });
-
     const openai = new OpenAI({
       apiKey: process.env.OPENAI_API_KEY,
     });
diff --git a/apps/test-suite/tsconfig.json b/apps/test-suite/tsconfig.json
index e075f973..afa29e71 100644
--- a/apps/test-suite/tsconfig.json
+++ b/apps/test-suite/tsconfig.json
@@ -39,7 +39,7 @@
     // "resolvePackageJsonExports": true,  /* Use the package.json 'exports' field when resolving package imports. */
     // "resolvePackageJsonImports": true,  /* Use the package.json 'imports' field when resolving imports. */
     // "customConditions": [],             /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
-    // "resolveJsonModule": true,          /* Enable importing .json files. */
+    "resolveJsonModule": true,             /* Enable importing .json files. */
     // "allowArbitraryExtensions": true,   /* Enable importing files with any extension, provided a declaration file is present. */
     // "noResolve": true,                  /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */