refactor: options dto

This commit is contained in:
yanlong.wang 2024-06-05 18:55:40 +08:00
parent f0668a96b4
commit 165cce6c91
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
5 changed files with 105 additions and 151 deletions

View File

@ -470,90 +470,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
concurrency: 22,
maxInstances: 455,
},
openapi: {
operation: {
parameters: {
'Accept': {
description: `Specifies your preference for the response format.\n\n` +
`Supported formats: \n` +
`- text/event-stream\n` +
`- application/json or text/json\n` +
`- text/plain`
,
in: 'header',
schema: { type: 'string' }
},
'X-Cache-Tolerance': {
description: `Sets internal cache tolerance in seconds if this header is specified with a integer.`,
in: 'header',
schema: { type: 'string' }
},
'X-No-Cache': {
description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`,
in: 'header',
schema: { type: 'string' }
},
'X-Respond-With': {
description: `Specifies the (non-default) form factor of the crawled data you prefer.\n\n` +
`Supported formats: \n` +
`- markdown\n` +
`- html\n` +
`- text\n` +
`- screenshot\n`
,
in: 'header',
schema: { type: 'string' }
},
'X-Wait-For-Selector': {
description: `Specifies a CSS selector to wait for the appearance of such an element before returning.\n\n` +
'Example: `X-Wait-For-Selector: .content-block`\n'
,
in: 'header',
schema: { type: 'string' }
},
'X-Target-Selector': {
description: `Specifies a CSS selector for return target instead of the full html.\n\n` +
'Implies `X-Wait-For-Selector: (same selector)`'
,
in: 'header',
schema: { type: 'string' }
},
'X-Proxy-Url': {
description: `Specifies your custom proxy if you prefer to use one.\n\n` +
`Supported protocols: \n` +
`- http\n` +
`- https\n` +
`- socks4\n` +
`- socks5\n\n` +
`For authentication, https://user:pass@host:port`,
in: 'header',
schema: { type: 'string' }
},
'X-Set-Cookie': {
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
`Syntax is the same with standard Set-Cookie`,
in: 'header',
schema: { type: 'string' }
},
'X-With-Generated-Alt': {
description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` +
`Note: Does not work when \`X-Respond-With\` is specified`,
in: 'header',
schema: { type: 'string' }
},
'X-With-Images-Summary': {
description: `Enable dedicated summary section for images on the page.`,
in: 'header',
schema: { type: 'string' }
},
'X-With-links-Summary': {
description: `Enable dedicated summary section for hyper links on the page.`,
in: 'header',
schema: { type: 'string' }
},
}
}
},
tags: ['Crawler'],
httpMethod: ['get', 'post'],
returnType: [String, OutputServerEventStream],
@ -953,6 +869,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
this.threadLocal.set('userAgent', opts.userAgent);
const crawlOpts: ExtraScrappingOptions = {
proxyUrl: opts.proxyUrl,
@ -960,6 +877,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
favorScreenshot: opts.respondWith === 'screenshot',
waitForSelector: opts.waitForSelector,
targetSelector: opts.targetSelector,
overrideUserAgent: opts.userAgent,
};
return crawlOpts;

View File

@ -71,71 +71,6 @@ export class SearcherHost extends RPCHost {
concurrency: 6,
maxInstances: 200,
},
openapi: {
operation: {
parameters: {
'Accept': {
description: `Specifies your preference for the response format. \n\n` +
`Supported formats:\n` +
`- text/event-stream\n` +
`- application/json or text/json\n` +
`- text/plain`
,
in: 'header',
schema: { type: 'string' }
},
'X-No-Cache': {
description: `Ignores internal cache if this header is specified with a value.`,
in: 'header',
schema: { type: 'string' }
},
'X-Respond-With': {
description: `Specifies the (non-default) form factor of the crawled data you prefer. \n\n` +
`Supported formats:\n` +
`- markdown\n` +
`- html\n` +
`- text\n` +
`- screenshot\n`
,
in: 'header',
schema: { type: 'string' }
},
'X-Proxy-Url': {
description: `Specifies your custom proxy if you prefer to use one. \n\n` +
`Supported protocols:\n` +
`- http\n` +
`- https\n` +
`- socks4\n` +
`- socks5\n\n` +
`For authentication, https://user:pass@host:port`,
in: 'header',
schema: { type: 'string' }
},
'X-Set-Cookie': {
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
`Syntax is the same with standard Set-Cookie`,
in: 'header',
schema: { type: 'string' }
},
'X-With-Generated-Alt': {
description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` +
`Note: Does not work when \`X-Respond-With\` is specified`,
in: 'header',
schema: { type: 'string' }
},
'X-With-Images-Summary': {
description: `Enable dedicated summary section for images on the page.`,
in: 'header',
schema: { type: 'string' }
},
'X-With-links-Summary': {
description: `Enable dedicated summary section for hyper links on the page.`,
in: 'header',
schema: { type: 'string' }
},
}
}
},
tags: ['Searcher'],
httpMethod: ['get', 'post'],
returnType: [String, OutputServerEventStream],

View File

@ -1,8 +1,100 @@
import { AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
import { Also, AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
import type { Request, Response } from 'express';
import type { CookieParam } from 'puppeteer';
import { parseString as parseSetCookieString } from 'set-cookie-parser';
/**
 * OpenAPI operation metadata attached to `CrawlerOptions` through the `Also` decorator.
 *
 * Every entry under `parameters` documents one HTTP request header (`in: 'header'`) as a
 * plain string. The descriptions are Markdown-flavoured text surfaced in the generated
 * API documentation, so they are user-facing copy.
 */
@Also({
    openapi: {
        operation: {
            parameters: {
                'Accept': {
                    description: `Specifies your preference for the response format.\n\n` +
                        `Supported formats: \n` +
                        `- text/event-stream\n` +
                        `- application/json or text/json\n` +
                        `- text/plain`
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Cache-Tolerance': {
                    description: `Sets internal cache tolerance in seconds if this header is specified with an integer.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-No-Cache': {
                    description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Respond-With': {
                    description: `Specifies the (non-default) form factor of the crawled data you prefer.\n\n` +
                        `Supported formats: \n` +
                        `- markdown\n` +
                        `- html\n` +
                        `- text\n` +
                        `- screenshot\n`
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Wait-For-Selector': {
                    description: `Specifies a CSS selector to wait for the appearance of such an element before returning.\n\n` +
                        'Example: `X-Wait-For-Selector: .content-block`\n'
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Target-Selector': {
                    description: `Specifies a CSS selector for return target instead of the full html.\n\n` +
                        'Implies `X-Wait-For-Selector: (same selector)`'
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Proxy-Url': {
                    description: `Specifies your custom proxy if you prefer to use one.\n\n` +
                        `Supported protocols: \n` +
                        `- http\n` +
                        `- https\n` +
                        `- socks4\n` +
                        `- socks5\n\n` +
                        `For authentication, https://user:pass@host:port`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Set-Cookie': {
                    description: `Sets cookie(s) to the headless browser for your request. \n\n` +
                        `Syntax is the same as standard Set-Cookie`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-With-Generated-Alt': {
                    description: `Enable automatic alt-text generating for images without a meaningful alt-text.\n\n` +
                        `Note: Does not work when \`X-Respond-With\` is specified`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-With-Images-Summary': {
                    description: `Enable dedicated summary section for images on the page.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                // NOTE(review): the lower-case "links" is inconsistent with the sibling header
                // names. The key is kept byte-identical because other code may look this entry
                // up by its exact key; HTTP header names themselves are case-insensitive.
                'X-With-links-Summary': {
                    description: `Enable dedicated summary section for hyper links on the page.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-User-Agent': {
                    description: `Override User-Agent.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
            }
        }
    }
})
export class CrawlerOptions extends AutoCastable {
@Prop({
@ -47,6 +139,9 @@ export class CrawlerOptions extends AutoCastable {
@Prop()
proxyUrl?: string;
/**
 * Optional User-Agent override for the crawl.
 *
 * When the input does not provide a value, `from()` fills this in from the
 * `x-user-agent` request header (only if the property is still null/undefined).
 */
@Prop()
userAgent?: string;
static override from(input: any) {
const instance = super.from(input) as CrawlerOptions;
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
@ -87,6 +182,8 @@ export class CrawlerOptions extends AutoCastable {
instance.targetSelector ??= targetSelector;
const waitForSelector = ctx?.req.get('x-wait-for-selector');
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
const overrideUserAgent = ctx?.req.get('x-user-agent');
instance.userAgent ??= overrideUserAgent;
const cookies: CookieParam[] = [];
const setCookieHeaders = ctx?.req.headers['x-set-cookie'] || (instance.setCookies as any as string[]);

View File

@ -65,6 +65,7 @@ export interface ScrappingOptions {
favorScreenshot?: boolean;
waitForSelector?: string;
minIntervalMs?: number;
overrideUserAgent?: string;
}
@ -417,6 +418,9 @@ document.addEventListener('load', handlePageLoad);
if (options?.cookies) {
await page.setCookie(...options.cookies);
}
if (options?.overrideUserAgent) {
await page.setUserAgent(options.overrideUserAgent);
}
let nextSnapshotDeferred = Defer();
const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` }));

@ -1 +1 @@
Subproject commit d360d01c19b34499e564315b5b5935df17c62cc1
Subproject commit a6116b73e99e3d335b0cd4cfcae8f4f0c7e72f6d