fix crawl option conversion

commit cd534326ba
parent 2a96717f67
Móricz Gergő, 2024-11-05 12:28:44 +01:00

4 changed files with 24 additions and 9 deletions
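
From the hunks below, the fix draws a clean line between the two API versions: the v0 crawl and crawl-preview controllers stop running their already-legacy-shaped crawlerOptions through fromLegacyCrawlerOptions before storing them, while the v1 crawl controller now converts its CrawlerOptions into the legacy shape with the new toLegacyCrawlerOptions helper. A sketch of the two shapes involved, with field names taken from the toLegacyCrawlerOptions body in the types diff at the bottom; these declarations are illustrative stubs, not the project's real ones:

// Illustrative stubs only; the real declarations live in the v1 types module.
// v1 shape (the fields toLegacyCrawlerOptions reads):
type CrawlerOptions = {
  includePaths?: string[];
  excludePaths?: string[];
  limit?: number;
  maxDepth?: number;
  allowBackwardLinks?: boolean;
  allowExternalLinks?: boolean;
  ignoreSitemap?: boolean;
};

// Legacy v0 shape (the fields toLegacyCrawlerOptions writes):
type LegacyCrawlerOptions = {
  includes?: string[];
  excludes?: string[];
  maxCrawledLinks?: number;
  maxDepth?: number;
  limit?: number;
  generateImgAltText?: boolean;
  allowBackwardCrawling?: boolean;
  allowExternalContentLinks?: boolean;
  ignoreSitemap?: boolean;
};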


@@ -15,7 +15,7 @@ import { getScrapeQueue } from "../../../src/services/queue-service";
 import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
 import * as Sentry from "@sentry/node";
 import { getJobPriority } from "../../lib/job-priority";
-import { fromLegacyCrawlerOptions, fromLegacyScrapeOptions, url as urlSchema } from "../v1/types";
+import { fromLegacyScrapeOptions, url as urlSchema } from "../v1/types";
 import { ZodError } from "zod";
 
 export async function crawlController(req: Request, res: Response) {
@@ -140,7 +140,7 @@ export async function crawlController(req: Request, res: Response) {
     const sc: StoredCrawl = {
       originUrl: url,
-      crawlerOptions: fromLegacyCrawlerOptions(crawlerOptions),
+      crawlerOptions,
       scrapeOptions,
       internalOptions,
       team_id,
@@ -177,7 +177,7 @@ export async function crawlController(req: Request, res: Response) {
         data: {
           url,
           mode: "single_urls",
-          crawlerOptions: crawlerOptions,
+          crawlerOptions,
           team_id,
           plan,
           pageOptions: pageOptions,
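
Worth spelling out why dropping the fromLegacyCrawlerOptions call in the sc assignment above is a fix rather than a cleanup: per its signature in the types diff at the bottom, fromLegacyCrawlerOptions returns a { crawlOptions, internalOptions } wrapper, so the old code stored that whole wrapper under sc.crawlerOptions where, judging by this commit, the raw legacy-shaped options belong. A hedged sketch with invented option values:

// fromLegacyCrawlerOptions is stubbed here with the return shape from its
// signature in the types diff below; the option values are invented.
declare function fromLegacyCrawlerOptions(x: any): {
  crawlOptions: unknown;
  internalOptions: unknown;
};

const crawlerOptions = { includes: ["/docs/*"], limit: 100 }; // already legacy-shaped in a v0 request

// Old: a { crawlOptions, internalOptions } wrapper landed in sc.crawlerOptions.
const storedBefore = fromLegacyCrawlerOptions(crawlerOptions);

// New: the legacy-shaped options are stored untouched.
const storedAfter = crawlerOptions;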


@@ -8,7 +8,7 @@ import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "..
 import { addScrapeJob } from "../../../src/services/queue-jobs";
 import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
 import * as Sentry from "@sentry/node";
-import { fromLegacyCrawlerOptions, fromLegacyScrapeOptions } from "../v1/types";
+import { fromLegacyScrapeOptions } from "../v1/types";
 
 export async function crawlPreviewController(req: Request, res: Response) {
   try {
@@ -91,7 +91,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
     const sc: StoredCrawl = {
       originUrl: url,
-      crawlerOptions: fromLegacyCrawlerOptions(crawlerOptions),
+      crawlerOptions,
       scrapeOptions,
       internalOptions,
       team_id,


@@ -5,6 +5,7 @@ import {
   crawlRequestSchema,
   CrawlResponse,
   RequestWithAuth,
+  toLegacyCrawlerOptions,
 } from "./types";
 import {
   addCrawlJob,
@@ -70,7 +71,7 @@ export async function crawlController(
   const sc: StoredCrawl = {
     originUrl: req.body.url,
-    crawlerOptions,
+    crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
     scrapeOptions,
     internalOptions: {},
     team_id: req.auth.team_id,


@@ -440,6 +440,20 @@ export interface ResponseWithSentry<
   sentry?: string,
 }
 
+export function toLegacyCrawlerOptions(x: CrawlerOptions) {
+  return {
+    includes: x.includePaths,
+    excludes: x.excludePaths,
+    maxCrawledLinks: x.limit,
+    maxDepth: x.maxDepth,
+    limit: x.limit,
+    generateImgAltText: false,
+    allowBackwardCrawling: x.allowBackwardLinks,
+    allowExternalContentLinks: x.allowExternalLinks,
+    ignoreSitemap: x.ignoreSitemap,
+  };
+}
+
 export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions; internalOptions: InternalOptions } {
   return {
     crawlOptions: crawlerOptions.parse({
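
A quick input/output sketch of the new helper; the values are invented, but the mapping follows the function body above:

const v1Options = {
  includePaths: ["/blog/*"],
  excludePaths: ["/blog/drafts/*"],
  limit: 50,
  maxDepth: 3,
  allowBackwardLinks: false,
  allowExternalLinks: false,
  ignoreSitemap: true,
};

// toLegacyCrawlerOptions(v1Options) yields:
// {
//   includes: ["/blog/*"],
//   excludes: ["/blog/drafts/*"],
//   maxCrawledLinks: 50,        // copied from limit; no separate v1 field is read for it
//   maxDepth: 3,
//   limit: 50,
//   generateImgAltText: false,  // hard-coded to false for v1-originated crawls
//   allowBackwardCrawling: false,
//   allowExternalContentLinks: false,
//   ignoreSitemap: true,
// }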
@@ -493,10 +507,10 @@ export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptio
   }
 }
 
-export function fromLegacyCombo(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, crawlerOptions: any): { scrapeOptions: ScrapeOptions, crawlOptions: CrawlerOptions, internalOptions: InternalOptions} {
+export function fromLegacyCombo(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, crawlerOptions: any): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions} {
   const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout);
-  const { crawlOptions, internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions);
-  return { scrapeOptions, crawlOptions, internalOptions: Object.assign(i1, i2) };
+  const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions);
+  return { scrapeOptions, internalOptions: Object.assign(i1, i2) };
 }
 
 export function toLegacyDocument(document: Document, internalOptions: InternalOptions): V0Document | { url: string; } {
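
Since fromLegacyCombo no longer returns crawlOptions, any caller that destructured it needs a matching one-line change. A hedged migration sketch; the call site is assumed, as this diff shows none, and the stub declarations only keep the fragment self-contained:

declare function fromLegacyCombo(
  pageOptions: unknown,
  extractorOptions: unknown,
  timeout: number | undefined,
  crawlerOptions: any,
): { scrapeOptions: unknown; internalOptions: unknown };
declare const pageOptions: unknown, extractorOptions: unknown,
  timeout: number | undefined, crawlerOptions: any;

// Before (no longer compiles; crawlOptions is gone from the return type):
// const { scrapeOptions, crawlOptions, internalOptions } =
//   fromLegacyCombo(pageOptions, extractorOptions, timeout, crawlerOptions);

// After: drop crawlOptions and keep using the legacy-shaped input directly.
const { scrapeOptions, internalOptions } =
  fromLegacyCombo(pageOptions, extractorOptions, timeout, crawlerOptions);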