mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 11:42:32 +08:00
refactor: options dto
This commit is contained in:
parent
f0668a96b4
commit
165cce6c91
|
@ -470,90 +470,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||
concurrency: 22,
|
||||
maxInstances: 455,
|
||||
},
|
||||
openapi: {
|
||||
operation: {
|
||||
parameters: {
|
||||
'Accept': {
|
||||
description: `Specifies your preference for the response format.\n\n` +
|
||||
`Supported formats: \n` +
|
||||
`- text/event-stream\n` +
|
||||
`- application/json or text/json\n` +
|
||||
`- text/plain`
|
||||
,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Cache-Tolerance': {
|
||||
description: `Sets internal cache tolerance in seconds if this header is specified with a integer.`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-No-Cache': {
|
||||
description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Respond-With': {
|
||||
description: `Specifies the (non-default) form factor of the crawled data you prefer.\n\n` +
|
||||
`Supported formats: \n` +
|
||||
`- markdown\n` +
|
||||
`- html\n` +
|
||||
`- text\n` +
|
||||
`- screenshot\n`
|
||||
,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Wait-For-Selector': {
|
||||
description: `Specifies a CSS selector to wait for the appearance of such an element before returning.\n\n` +
|
||||
'Example: `X-Wait-For-Selector: .content-block`\n'
|
||||
,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Target-Selector': {
|
||||
description: `Specifies a CSS selector for return target instead of the full html.\n\n` +
|
||||
'Implies `X-Wait-For-Selector: (same selector)`'
|
||||
,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Proxy-Url': {
|
||||
description: `Specifies your custom proxy if you prefer to use one.\n\n` +
|
||||
`Supported protocols: \n` +
|
||||
`- http\n` +
|
||||
`- https\n` +
|
||||
`- socks4\n` +
|
||||
`- socks5\n\n` +
|
||||
`For authentication, https://user:pass@host:port`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Set-Cookie': {
|
||||
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
|
||||
`Syntax is the same with standard Set-Cookie`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-With-Generated-Alt': {
|
||||
description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` +
|
||||
`Note: Does not work when \`X-Respond-With\` is specified`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-With-Images-Summary': {
|
||||
description: `Enable dedicated summary section for images on the page.`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-With-links-Summary': {
|
||||
description: `Enable dedicated summary section for hyper links on the page.`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
}
|
||||
}
|
||||
},
|
||||
tags: ['Crawler'],
|
||||
httpMethod: ['get', 'post'],
|
||||
returnType: [String, OutputServerEventStream],
|
||||
|
@ -953,6 +869,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||
this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
|
||||
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
|
||||
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
||||
this.threadLocal.set('userAgent', opts.userAgent);
|
||||
|
||||
const crawlOpts: ExtraScrappingOptions = {
|
||||
proxyUrl: opts.proxyUrl,
|
||||
|
@ -960,6 +877,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||
favorScreenshot: opts.respondWith === 'screenshot',
|
||||
waitForSelector: opts.waitForSelector,
|
||||
targetSelector: opts.targetSelector,
|
||||
overrideUserAgent: opts.userAgent,
|
||||
};
|
||||
|
||||
return crawlOpts;
|
||||
|
|
|
@ -71,71 +71,6 @@ export class SearcherHost extends RPCHost {
|
|||
concurrency: 6,
|
||||
maxInstances: 200,
|
||||
},
|
||||
openapi: {
|
||||
operation: {
|
||||
parameters: {
|
||||
'Accept': {
|
||||
description: `Specifies your preference for the response format. \n\n` +
|
||||
`Supported formats:\n` +
|
||||
`- text/event-stream\n` +
|
||||
`- application/json or text/json\n` +
|
||||
`- text/plain`
|
||||
,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-No-Cache': {
|
||||
description: `Ignores internal cache if this header is specified with a value.`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Respond-With': {
|
||||
description: `Specifies the (non-default) form factor of the crawled data you prefer. \n\n` +
|
||||
`Supported formats:\n` +
|
||||
`- markdown\n` +
|
||||
`- html\n` +
|
||||
`- text\n` +
|
||||
`- screenshot\n`
|
||||
,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Proxy-Url': {
|
||||
description: `Specifies your custom proxy if you prefer to use one. \n\n` +
|
||||
`Supported protocols:\n` +
|
||||
`- http\n` +
|
||||
`- https\n` +
|
||||
`- socks4\n` +
|
||||
`- socks5\n\n` +
|
||||
`For authentication, https://user:pass@host:port`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Set-Cookie': {
|
||||
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
|
||||
`Syntax is the same with standard Set-Cookie`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-With-Generated-Alt': {
|
||||
description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` +
|
||||
`Note: Does not work when \`X-Respond-With\` is specified`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-With-Images-Summary': {
|
||||
description: `Enable dedicated summary section for images on the page.`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-With-links-Summary': {
|
||||
description: `Enable dedicated summary section for hyper links on the page.`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
}
|
||||
}
|
||||
},
|
||||
tags: ['Searcher'],
|
||||
httpMethod: ['get', 'post'],
|
||||
returnType: [String, OutputServerEventStream],
|
||||
|
|
|
@ -1,8 +1,100 @@
|
|||
import { AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
||||
import { Also, AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
||||
import type { Request, Response } from 'express';
|
||||
import type { CookieParam } from 'puppeteer';
|
||||
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||
|
||||
|
||||
@Also({
|
||||
openapi: {
|
||||
operation: {
|
||||
parameters: {
|
||||
'Accept': {
|
||||
description: `Specifies your preference for the response format.\n\n` +
|
||||
`Supported formats: \n` +
|
||||
`- text/event-stream\n` +
|
||||
`- application/json or text/json\n` +
|
||||
`- text/plain`
|
||||
,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Cache-Tolerance': {
|
||||
description: `Sets internal cache tolerance in seconds if this header is specified with a integer.`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-No-Cache': {
|
||||
description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Respond-With': {
|
||||
description: `Specifies the (non-default) form factor of the crawled data you prefer.\n\n` +
|
||||
`Supported formats: \n` +
|
||||
`- markdown\n` +
|
||||
`- html\n` +
|
||||
`- text\n` +
|
||||
`- screenshot\n`
|
||||
,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Wait-For-Selector': {
|
||||
description: `Specifies a CSS selector to wait for the appearance of such an element before returning.\n\n` +
|
||||
'Example: `X-Wait-For-Selector: .content-block`\n'
|
||||
,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Target-Selector': {
|
||||
description: `Specifies a CSS selector for return target instead of the full html.\n\n` +
|
||||
'Implies `X-Wait-For-Selector: (same selector)`'
|
||||
,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Proxy-Url': {
|
||||
description: `Specifies your custom proxy if you prefer to use one.\n\n` +
|
||||
`Supported protocols: \n` +
|
||||
`- http\n` +
|
||||
`- https\n` +
|
||||
`- socks4\n` +
|
||||
`- socks5\n\n` +
|
||||
`For authentication, https://user:pass@host:port`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Set-Cookie': {
|
||||
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
|
||||
`Syntax is the same with standard Set-Cookie`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-With-Generated-Alt': {
|
||||
description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` +
|
||||
`Note: Does not work when \`X-Respond-With\` is specified`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-With-Images-Summary': {
|
||||
description: `Enable dedicated summary section for images on the page.`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-With-links-Summary': {
|
||||
description: `Enable dedicated summary section for hyper links on the page.`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-User-Agent': {
|
||||
description: `Override User-Agent.`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
export class CrawlerOptions extends AutoCastable {
|
||||
|
||||
@Prop({
|
||||
|
@ -47,6 +139,9 @@ export class CrawlerOptions extends AutoCastable {
|
|||
@Prop()
|
||||
proxyUrl?: string;
|
||||
|
||||
@Prop()
|
||||
userAgent?: string;
|
||||
|
||||
static override from(input: any) {
|
||||
const instance = super.from(input) as CrawlerOptions;
|
||||
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
||||
|
@ -87,6 +182,8 @@ export class CrawlerOptions extends AutoCastable {
|
|||
instance.targetSelector ??= targetSelector;
|
||||
const waitForSelector = ctx?.req.get('x-wait-for-selector');
|
||||
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
|
||||
const overrideUserAgent = ctx?.req.get('x-user-agent');
|
||||
instance.userAgent ??= overrideUserAgent;
|
||||
|
||||
const cookies: CookieParam[] = [];
|
||||
const setCookieHeaders = ctx?.req.headers['x-set-cookie'] || (instance.setCookies as any as string[]);
|
||||
|
|
|
@ -65,6 +65,7 @@ export interface ScrappingOptions {
|
|||
favorScreenshot?: boolean;
|
||||
waitForSelector?: string;
|
||||
minIntervalMs?: number;
|
||||
overrideUserAgent?: string;
|
||||
}
|
||||
|
||||
|
||||
|
@ -417,6 +418,9 @@ document.addEventListener('load', handlePageLoad);
|
|||
if (options?.cookies) {
|
||||
await page.setCookie(...options.cookies);
|
||||
}
|
||||
if (options?.overrideUserAgent) {
|
||||
await page.setUserAgent(options.overrideUserAgent);
|
||||
}
|
||||
|
||||
let nextSnapshotDeferred = Defer();
|
||||
const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` }));
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit d360d01c19b34499e564315b5b5935df17c62cc1
|
||||
Subproject commit a6116b73e99e3d335b0cd4cfcae8f4f0c7e72f6d
|
Loading…
Reference in New Issue
Block a user