-
Notifications
You must be signed in to change notification settings - Fork 878
/
crawler.ts
167 lines (155 loc) · 4.92 KB
/
crawler.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
/**
* Crawler-wide settings. Every member below is labelled a "Global Only option".
* All members except `userAgents` and `silence` are required in this type;
* `CrawlerOptions` wraps the whole type in `Partial`, so callers may omit any of them there.
* NOTE(review): "Global Only" presumably means these cannot be overridden per request — confirm against the crawler implementation.
*/
export type GlobalOnlyOptions = {
/**
* Global Only option.
* @default 10
* @description The maximum number of requests that can be sent simultaneously.
* @example If the value is 10, the crawler will send at most 10 requests at the same time.
* Note: a maxConnections value greater than 1 only takes effect when the global rateLimit is 0.
*/
maxConnections: number;
/**
* Global Only option.
* @default 10
* @description The number of levels of priority. Can be only assigned at the beginning.
*/
priorityLevels: number;
/**
* Global Only option.
* @default 0
* @description The delay between requests, in milliseconds.
* @example 1000 means a delay of 1000 milliseconds between requests after the first one.
* NOTE(review): this description previously read "The default priority of the tasks", which
* contradicts the millisecond example above; confirm whether the value can only be assigned at the beginning.
*/
rateLimit: number;
/**
* Global Only option.
* @default false
* @description If true, the crawler will skip duplicate tasks.
* @example If the task is already in the queue, the crawler will not add it again.
*/
skipDuplicates: boolean;
/**
* Global Only option.
* @default false
* @description If true, the crawler will dynamically reallocate the tasks within the queue blocked due to header blocking to other queues.
*/
homogeneous: boolean;
/**
* Global Only option.
* @default undefined
* @description If passed, the crawler will rotate the user agent for each request. The "userAgents" option must be an array if activated.
* NOTE(review): the declared type also accepts a single string, which conflicts with "must be an array";
* confirm how a plain string is handled.
*/
userAgents?: string | string[];
/**
* Global Only option.
* @default false
* @description If true, the crawler will mute all warning and error messages. The request error will be still thrown.
*/
silence?: boolean;
};
/**
* Options describing a request. Every member is optional.
* Several members exist only as deprecated aliases; each deprecated member names its replacement.
* These members carry no "Global Only" label, and `CrawlerOptions` also includes them.
*/
export type RequestOptions = {
/**
* NOTE(review): undocumented in the original; presumably forces the response body to be treated as UTF-8 — confirm.
*/
forceUTF8?: boolean;
/**
* crawlerOption
* @default true
* @description If true, the crawler will use the cheerio library to parse the HTML content.
* @see cheerio.load()
* @example If injected successfully, the response object will have a "$" property, which is a function to use jQuery.
*/
jQuery?: boolean;
/**
* @deprecated
* @description Please use "encoding" instead.
*/
incomingEncoding?: string | null;
/**
* @default "utf8"
* @description The encoding of the response body.
*/
encoding?: string | null;
/**
* @default 0
* @description rateLimiter ID
*/
rateLimiterId?: number;
/**
* @default 2
* @description The retry count of the request.
*/
retries?: number;
/**
* @default 3000
* @description The interval between retries in milliseconds.
*/
retryInterval?: number;
/**
* @default 20000
* @description The global timeout of the request in milliseconds.
*/
timeout?: number;
/**
* Priority of the request.
* NOTE(review): ordering direction and default are not stated in this file — confirm; see also "priorityLevels".
*/
priority?: number;
/**
* NOTE(review): typed as `any` and undocumented; presumably related to duplicate detection ("skipDuplicates") — confirm.
*/
// eslint-disable-next-line @typescript-eslint/no-explicit-any
seenreq?: any;
/**
* Request method, as a free-form string.
* NOTE(review): presumably an HTTP verb such as "GET"; accepted values are not constrained by this type.
*/
method?: string;
/**
* NOTE(review): undocumented; presumably suppresses emission of a "request" event for this request — confirm.
*/
skipEventRequest?: boolean;
/**
* NOTE(review): undocumented boolean flag; its effect cannot be determined from this file — confirm.
*/
html?: boolean;
/**
* List of proxy strings.
* NOTE(review): presumably one entry is selected per request; selection strategy is not shown here — confirm.
*/
proxies?: string[];
/**
* A single proxy string.
* NOTE(review): precedence relative to "proxies" is not shown here — confirm.
*/
proxy?: string;
/**
* NOTE(review): undocumented; presumably enables HTTP/2 for the request — confirm.
*/
http2?: boolean;
/**
* Request body, either a string or a plain key/value object.
* NOTE(review): how an object body is serialized is not shown here — confirm.
*/
body?: string | Record<string, unknown>;
/**
* Request headers as a key/value object.
*/
headers?: Record<string, unknown>;
/**
* NOTE(review): typed as `any` and undocumented; presumably an HTTP(S) agent passed to the underlying client — confirm.
*/
// eslint-disable-next-line @typescript-eslint/no-explicit-any
agent?: any;
/**
* @deprecated Please use "url" instead.
*/
uri?: string | ((urlFn: (url: string) => void) => void);
/**
* Target of the request; replaces the deprecated "uri".
* Either a string, or a function that receives a `urlFn` callback which accepts the URL string.
* NOTE(review): the function form presumably allows the URL to be supplied lazily — confirm when it is invoked.
*/
url?: string | ((urlFn: (url: string) => void) => void);
/**
* @deprecated Please use "searchParams" instead.
*/
qs?: string | Record<string, unknown>;
/**
* Query parameters as a key/value object; replaces the deprecated "qs".
* Note that, unlike "qs", a plain string is not accepted by this type.
*/
searchParams?: Record<string, unknown>;
/**
* @deprecated Please use "rejectUnauthorized" instead.
*/
strictSSL?: boolean;
/**
* @description If false, the crawler will ignore SSL certificate errors.
* @default true
*/
rejectUnauthorized?: boolean;
/**
* @deprecated Please use "decompress" instead.
*/
gzip?: boolean;
/**
* Replaces the deprecated "gzip".
* NOTE(review): presumably toggles automatic decompression of the response body; default is not stated here — confirm.
*/
decompress?: boolean;
/**
* @deprecated Please use "cookieJar" instead.
* @see tough-cookie https://github.com/sindresorhus/got/blob/main/documentation/migration-guides/request.md
*/
jar?: object;
/**
* Cookie jar object; replaces the deprecated "jar".
* NOTE(review): typed only as `object`; presumably a tough-cookie jar, per the reference on "jar" — confirm.
*/
cookieJar?: object;
/**
* @default false
*
* If true, the crawler will parse the response body as JSON.
* This will set 'jQuery' to false.
*/
isJson?: boolean;
/**
* NOTE(review): undocumented; presumably the value sent as the Referer header — confirm.
*/
referer?: string;
/**
* Arbitrary caller-supplied value; typed as `unknown`, so it must be narrowed before use.
* NOTE(review): presumably carried along with the request and made available to the callback — confirm.
*/
userParams?: unknown;
/**
* @deprecated Please use "parseJson" instead.
*/
jsonReviver?: (text: string) => unknown;
/**
* Function that turns a text string into a parsed value; replaces the deprecated "jsonReviver".
*/
parseJson?: (text: string) => unknown;
/**
* @deprecated Please use "stringifyJson" instead.
*/
jsonReplacer?: (object: unknown) => string;
/**
* Function that turns a value into its string form; replaces the deprecated "jsonReplacer".
*/
stringifyJson?: (object: unknown) => string;
/**
* Hook that receives the request options and an optional `done` callback, which accepts an optional error.
* NOTE(review): presumably runs before the request is sent, and `done` must be called to continue — confirm.
*/
preRequest?: (options: RequestOptions, done?: (error?: Error | null) => void) => void;
/**
* Zero-argument function returning nothing.
* NOTE(review): presumably frees the slot held by this request; who sets and who calls it is not shown here — confirm.
*/
release?: () => void;
/**
* Completion callback receiving an error value, the response, and an optional `done` argument.
* The error is `unknown` and must be narrowed before use; `done` is also typed `unknown`.
* NOTE(review): `done` is presumably a function to call once processing has finished — confirm.
*/
callback?: (error: unknown, response: CrawlerResponse, done?: unknown) => void;
};
/**
* Accepted shapes for describing requests: a plain string, a single `RequestOptions`
* object, or an array of `RequestOptions` objects.
* NOTE(review): the string form is presumably the request URL — confirm against the consumer.
*/
export type RequestConfig = string | RequestOptions | RequestOptions[];
/**
* Combined options type: every `GlobalOnlyOptions` member made optional via `Partial`,
* intersected with all `RequestOptions` members.
*/
export type CrawlerOptions = Partial<GlobalOnlyOptions> & RequestOptions;
/**
* Response value passed to `RequestOptions.callback`.
* Currently aliased to `any`, so the type checker enforces no shape on it.
* NOTE(review): consider introducing a concrete response type; narrowing this alias would affect existing callers.
*/
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export type CrawlerResponse = any;