diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index eef65125..a4163472 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -121,6 +121,49 @@ describe("E2E Tests for v1 API Routes", () => { }, 30000 ); // 30 seconds timeout + + it.concurrent( + "should return a successful response with a valid API key", + async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://arxiv.org/abs/2410.04840", + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).not.toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.markdown).toContain("Strong Model Collapse"); + expect(response.body.data.metadata.error).toBeUndefined(); + expect(response.body.data.metadata.description).toContain("Abstract page for arXiv paper 2410.04840: Strong Model Collapse"); + expect(response.body.data.metadata.citation_title).toBe("Strong Model Collapse"); + expect(response.body.data.metadata.citation_author).toEqual([ + "Dohmatob, Elvis", + "Feng, Yunzhen", + "Subramonian, Arjun", + "Kempe, Julia" + ]); + expect(response.body.data.metadata.citation_date).toBe("2024/10/07"); + expect(response.body.data.metadata.citation_online_date).toBe("2024/10/08"); + expect(response.body.data.metadata.citation_pdf_url).toBe("http://arxiv.org/pdf/2410.04840"); + expect(response.body.data.metadata.citation_arxiv_id).toBe("2410.04840"); + 
expect(response.body.data.metadata.citation_abstract).toContain("Within the scaling laws paradigm"); + expect(response.body.data.metadata.sourceURL).toBe("https://arxiv.org/abs/2410.04840"); + expect(response.body.data.metadata.statusCode).toBe(200); + }, + 30000 + ); it.concurrent( "should return a successful response with a valid API key and includeHtml set to true", async () => { diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index e8520ccc..033de6e0 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -259,6 +259,8 @@ export type Document = { sourceURL?: string; statusCode?: number; error?: string; + [key: string]: string | string[] | number | undefined; + }; }; diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index fac53b38..531dc17c 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -34,6 +34,7 @@ interface Metadata { sourceURL?: string; pageStatusCode?: number; pageError?: string; + [key: string]: string | string[] | number | undefined; } export function extractMetadata(soup: CheerioAPI, url: string): Metadata { @@ -70,40 +71,78 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { let pageStatusCode: number | null = null; let pageError: string | null = null; + const customMetadata: Record<string, string | string[]> = {}; + try { + // TODO: remove this as it is redundant with the below implementation title = soup("title").text() || null; description = soup('meta[name="description"]').attr("content") || null; - - // Assuming the language is part of the URL as per the regex pattern - language = soup('html').attr('lang') || null; + + language = soup("html").attr("lang") || null; keywords = soup('meta[name="keywords"]').attr("content") || null; robots = soup('meta[name="robots"]').attr("content") || null; ogTitle = 
soup('meta[property="og:title"]').attr("content") || null; - ogDescription = soup('meta[property="og:description"]').attr("content") || null; + ogDescription = + soup('meta[property="og:description"]').attr("content") || null; ogUrl = soup('meta[property="og:url"]').attr("content") || null; ogImage = soup('meta[property="og:image"]').attr("content") || null; ogAudio = soup('meta[property="og:audio"]').attr("content") || null; - ogDeterminer = soup('meta[property="og:determiner"]').attr("content") || null; + ogDeterminer = + soup('meta[property="og:determiner"]').attr("content") || null; ogLocale = soup('meta[property="og:locale"]').attr("content") || null; - ogLocaleAlternate = soup('meta[property="og:locale:alternate"]').map((i, el) => soup(el).attr("content")).get() || null; + ogLocaleAlternate = + soup('meta[property="og:locale:alternate"]') + .map((i, el) => soup(el).attr("content")) + .get() || null; ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null; ogVideo = soup('meta[property="og:video"]').attr("content") || null; - articleSection = soup('meta[name="article:section"]').attr("content") || null; + articleSection = + soup('meta[name="article:section"]').attr("content") || null; articleTag = soup('meta[name="article:tag"]').attr("content") || null; - publishedTime = soup('meta[property="article:published_time"]').attr("content") || null; - modifiedTime = soup('meta[property="article:modified_time"]').attr("content") || null; - dctermsKeywords = soup('meta[name="dcterms.keywords"]').attr("content") || null; + publishedTime = + soup('meta[property="article:published_time"]').attr("content") || null; + modifiedTime = + soup('meta[property="article:modified_time"]').attr("content") || null; + dctermsKeywords = + soup('meta[name="dcterms.keywords"]').attr("content") || null; dcDescription = soup('meta[name="dc.description"]').attr("content") || null; dcSubject = soup('meta[name="dc.subject"]').attr("content") || null; - dctermsSubject = 
soup('meta[name="dcterms.subject"]').attr("content") || null; - dctermsAudience = soup('meta[name="dcterms.audience"]').attr("content") || null; + dctermsSubject = + soup('meta[name="dcterms.subject"]').attr("content") || null; + dctermsAudience = + soup('meta[name="dcterms.audience"]').attr("content") || null; dcType = soup('meta[name="dc.type"]').attr("content") || null; dctermsType = soup('meta[name="dcterms.type"]').attr("content") || null; dcDate = soup('meta[name="dc.date"]').attr("content") || null; - dcDateCreated = soup('meta[name="dc.date.created"]').attr("content") || null; - dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null; + dcDateCreated = + soup('meta[name="dc.date.created"]').attr("content") || null; + dctermsCreated = + soup('meta[name="dcterms.created"]').attr("content") || null; + try { + // Extract all meta tags for custom metadata + soup("meta").each((i, elem) => { + try { + const name = soup(elem).attr("name") || soup(elem).attr("property"); + const content = soup(elem).attr("content"); + + if (name && content) { + if (customMetadata[name] === undefined) { + customMetadata[name] = content; + } else if (Array.isArray(customMetadata[name])) { + (customMetadata[name] as string[]).push(content); + } else { + customMetadata[name] = [customMetadata[name] as string, content]; + } + } + } catch (error) { + Logger.error(`Error extracting custom metadata (in): ${error}`); + } + }); + } catch (error) { + Logger.error(`Error extracting custom metadata: ${error}`); + } } catch (error) { Logger.error(`Error extracting metadata: ${error}`); } @@ -141,5 +180,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { ...(sourceURL ? { sourceURL } : {}), ...(pageStatusCode ? { pageStatusCode } : {}), ...(pageError ? { pageError } : {}), + ...customMetadata, }; }