Skip to content

Commit

Permalink
step on type
Browse files Browse the repository at this point in the history
  • Loading branch information
Miniast committed May 13, 2024
1 parent 1009d6a commit e6fd4cd
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 48 deletions.
4 changes: 2 additions & 2 deletions dist/types/crawler.d.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
declare global {
var mainModule: string;
}
type globalOnlyOptions = {
type GlobalOnlyOptions = {
maxConnections: number;
/**
* Global option.
Expand Down Expand Up @@ -129,6 +129,6 @@ type requestOptions = {
release?: () => void;
callback?: (error: any, response: unknown, done: unknown) => void;
};
type crawlerOptions = Partial<globalOnlyOptions> & requestOptions;
type CrawlerOptions = Partial<GlobalOnlyOptions> & RequestOptions;
export { crawlerOptions, requestOptions };
//# sourceMappingURL=crawler.d.ts.map
54 changes: 27 additions & 27 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { Cluster } from "./rateLimiter/index.js";
import { isBoolean, isFunction, setDefaults, flattenDeep, lowerObjectKeys, isNumber } from "./lib/utils.js";
import { getValidOptions, alignOptions, getCharset } from "./options.js";
import { logOptions } from "./logger.js";
import type { crawlerOptions, requestOptions } from "./types/crawler.js";
import type { CrawlerOptions, RequestOptions, RequestConfig, CrawlerResponse, GlobalOnlyOptions } from "./types/crawler.js";
import { load } from "cheerio";
import got from "got";
import seenreq from "seenreq";
Expand All @@ -21,13 +21,13 @@ class Crawler extends EventEmitter {
private _UAIndex = 0;
private _proxyIndex = 0;

public options: crawlerOptions;
public options: CrawlerOptions;
public globalOnlyOptions: string[];
public seen: any;

constructor(options?: crawlerOptions) {
constructor(options?: CrawlerOptions) {
super();
const defaultOptions: crawlerOptions = {
const defaultOptions: CrawlerOptions = {
maxConnections: 10,
rateLimit: 0,
priorityLevels: 10,
Expand All @@ -54,10 +54,10 @@ class Crawler extends EventEmitter {
];

this._limiters = new Cluster({
maxConnections: this.options.maxConnections as number,
rateLimit: this.options.rateLimit as number,
priorityLevels: this.options.priorityLevels as number,
defaultPriority: this.options.priority as number,
maxConnections: this.options.maxConnections!,
rateLimit: this.options.rateLimit!,
priorityLevels: this.options.priorityLevels!,
defaultPriority: this.options.priority!,
homogeneous: this.options.homogeneous,
});

Expand All @@ -76,13 +76,13 @@ class Crawler extends EventEmitter {
});
}

private _checkHtml = (headers: Record<string, string>): boolean => {
const contentType = headers["content-type"];
private _detectHtmlOnHeaders = (headers: Record<string, unknown>): boolean => {
const contentType = headers["content-type"] as string;
if (/xml|html/i.test(contentType)) return true;
return false;
};

private _schedule = (options: crawlerOptions): void => {
private _schedule = (options: CrawlerOptions): void => {
this.emit("schedule", options);
this._limiters.getRateLimiter(options.rateLimiterId).submit(options.priority as number, (done, rateLimiterId) => {
options.release = () => {
Expand Down Expand Up @@ -111,7 +111,7 @@ class Crawler extends EventEmitter {
});
};

private _execute = async (options: crawlerOptions): Promise<void> => {
private _execute = async (options: CrawlerOptions): Promise<CrawlerResponse> => {
if (options.proxy) log.debug(`Using proxy: ${options.proxy}`);
else if (options.proxies) log.debug(`Using proxies: ${options.proxies}`);

Expand Down Expand Up @@ -166,15 +166,15 @@ class Crawler extends EventEmitter {
}
};

private _handler = (error: any | null, options: requestOptions, response?: any): any => {
private _handler = (error: unknown, options: RequestOptions, response?: CrawlerResponse): CrawlerResponse => {
if (error) {
if (options.retries && options.retries > 0) {
log.warn(
`${error} when fetching ${options.url} ${options.retries ? `(${options.retries} retries left)` : ""}`
);
setTimeout(() => {
options.retries!--;
this._execute(options as crawlerOptions);
this._execute(options as CrawlerOptions);
}, options.retryInterval);
return;
}
Expand Down Expand Up @@ -217,7 +217,7 @@ class Crawler extends EventEmitter {
}

if (options.jQuery === true) {
if (response.body === "" || !this._checkHtml(response.headers)) {
if (response.body === "" || !this._detectHtmlOnHeaders(response.headers)) {
log.warn("response body is not HTML, skip injecting. Set jQuery to false to mute this warning.");
} else {
try {
Expand Down Expand Up @@ -251,13 +251,13 @@ class Crawler extends EventEmitter {
* crawler.setLimiter(0, "rateLimit", 1000);
* ```
*/
public setLimiter(rateLimiterId: number, property: string, value: any): void {
public setLimiter(rateLimiterId: number, property: string, value: unknown): void {
if (!isNumber(rateLimiterId)) {
log.error("rateLimiterId must be a number");
return;
}
if (property === "rateLimit") {
this._limiters.getRateLimiter(rateLimiterId).setRateLimit(value);
this._limiters.getRateLimiter(rateLimiterId).setRateLimit(value as number);
}
// @todo other properties
}
Expand All @@ -279,23 +279,23 @@ class Crawler extends EventEmitter {
* await crawler.send("https://example.com");
* ```
*/
public send = async (options: string | requestOptions): Promise<any> => {
options = getValidOptions(options) as requestOptions;
public send = async (options: RequestConfig): Promise<CrawlerResponse> => {
options = getValidOptions(options);
options.retries = options.retries ?? 0;
setDefaults(options, this.options);
this.globalOnlyOptions.forEach(globalOnlyOption => {
delete (options as any)[globalOnlyOption];
delete options[globalOnlyOption as keyof RequestOptions];
});
options.skipEventRequest = isBoolean(options.skipEventRequest) ? options.skipEventRequest : true;
delete options.preRequest;
return await this._execute(options as crawlerOptions);
return await this._execute(options);
};
/**
* @deprecated
* @description Old interface version. It is recommended to use `Crawler.send()` instead.
* @see Crawler.send
*/
public direct = async (options: string | requestOptions): Promise<any> => {
public direct = async (options: RequestConfig): Promise<CrawlerResponse> => {
return await this.send(options);
};

Expand All @@ -312,12 +312,12 @@ class Crawler extends EventEmitter {
* });
* ```
*/
public add = (options: string | requestOptions | requestOptions[]): void => {
public add = (options: RequestConfig): void => {
let optionsArray = Array.isArray(options) ? options : [options];
optionsArray = flattenDeep(optionsArray);
optionsArray.forEach(options => {
try {
options = getValidOptions(options) as requestOptions;
options = getValidOptions(options) as RequestOptions;
} catch (err) {
log.warn(err);
return;
Expand All @@ -329,7 +329,7 @@ class Crawler extends EventEmitter {
});
if (!this.options.skipDuplicates) {
try {
this._schedule(options as crawlerOptions);
this._schedule(options as CrawlerOptions);
} catch (err) { }
return;
}
Expand All @@ -339,7 +339,7 @@ class Crawler extends EventEmitter {
.then((rst: any) => {
if (!rst) {
try {
this._schedule(options as crawlerOptions);
this._schedule(options as CrawlerOptions);
} catch (err) { }
}
})
Expand All @@ -351,7 +351,7 @@ class Crawler extends EventEmitter {
* @description Old interface version. It is recommended to use `Crawler.add()` instead.
* @see Crawler.add
*/
public queue = (options: string | requestOptions | requestOptions[]): void => {
public queue = (options: RequestConfig): void => {
return this.add(options);
};
}
Expand Down
5 changes: 3 additions & 2 deletions src/options.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { HttpProxyAgent, HttpsProxyAgent } from "hpagent";
import http2Wrapper from "http2-wrapper";
import { cleanObject, getType, isValidUrl } from "./lib/utils.js";
import { RequestConfig, RequestOptions } from "./types/crawler.js";

export const getCharset = (headers: Record<string, string>): null | string => {
let charset = null;
Expand All @@ -14,11 +15,11 @@ export const getCharset = (headers: Record<string, string>): null | string => {
return charset;
};

export const getValidOptions = (options: unknown): Object => {
export const getValidOptions = (options: RequestConfig): RequestOptions => {
const type = getType(options);
if (type === "string") {
try {
if (isValidUrl(options as string)) return { url: options };
if (isValidUrl(options as string)) return { url: options } as RequestOptions;
options = JSON.parse(options as string);
return options as Object;
} catch (e) {
Expand Down
36 changes: 19 additions & 17 deletions src/types/crawler.ts
Original file line number Diff line number Diff line change
@@ -1,44 +1,47 @@
declare global {
var mainModule: string;
}

type globalOnlyOptions = {
export type GlobalOnlyOptions = {
/**
* Global Only option.
* @default 10
* @description The maximum number of requests that can be sent simultaneously.
* @example If the value is 10, the crawler will send at most 10 requests at the same time.
     * Note: maxConnections (> 1) takes effect only when the global rateLimit is set to 0.
*/
maxConnections: number;
/**
* Global option.
* Global Only option.
* @default 10
     * @description The number of levels of priority. Can only be assigned at the beginning.
*/
priorityLevels: number;
/**
* Global option.
* Global Only option.
     * @default 0
     * @description The default rate limit in milliseconds between tasks. Can only be assigned at the beginning.
     * @example 1000 means a 1000-millisecond delay between requests, starting after the first request.
*/
rateLimit: number;
/**
* Global option.
* Global Only option.
* @default false
* @description If true, the crawler will skip duplicate tasks.
* @example If the task is already in the queue, the crawler will not add it again.
*/
skipDuplicates: boolean;
/**
* Global option.
* Global Only option.
* @default false
     * @description If true, the crawler will dynamically reallocate tasks that are blocked in one queue (due to header blocking) to other, less busy queues.
*/
homogeneous: boolean;
/**
* Global option.
* Global Only option.
* @default undefined
* @description If passed, the crawler will rotate the user agent for each request. The "userAgents" option must be an array if activated.
*/
userAgents?: string | string[];
};

type requestOptions = {
export type RequestOptions = {
forceUTF8?: boolean;
/**
* crawlerOption
Expand Down Expand Up @@ -77,7 +80,6 @@ type requestOptions = {
http2?: boolean;
body?: string | Record<string, unknown>;
headers?: Record<string, unknown>;

agent?: any;

/**
Expand All @@ -91,7 +93,7 @@ type requestOptions = {
*/
qs?: Record<string, unknown>;
searchParams?: Record<string, unknown>;

/**
* @deprecated Please use "rejectUnauthorized" instead.
*/
Expand Down Expand Up @@ -137,11 +139,11 @@ type requestOptions = {
jsonReplacer?: Function;
stringifyJson?: Function;

preRequest?: (options: requestOptions, done?: (error?: Error | null) => void) => void;
preRequest?: (options: RequestOptions, done?: (error?: Error | null) => void) => void;
release?: () => void;
callback?: (error: any, response: unknown, done: unknown) => void;
};

type crawlerOptions = Partial<globalOnlyOptions> & requestOptions;

export { crawlerOptions, requestOptions };
export type RequestConfig = string | RequestOptions | RequestOptions[];
export type CrawlerOptions = Partial<GlobalOnlyOptions> & RequestOptions;
export type CrawlerResponse = any

0 comments on commit e6fd4cd

Please sign in to comment.