Skip to content

Commit

Permalink
fix global userAgents
Browse files · Browse the repository at this point in the history
  • Loading branch information
Miniast committed Jun 21, 2024
1 parent e707cb9 commit 7bf212e
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 42 deletions.
16 changes: 0 additions & 16 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ class Crawler extends EventEmitter {
private _proxyIndex = 0;

public options: CrawlerOptions;
public globalOnlyOptions: string[];
public seen: any;

constructor(options?: CrawlerOptions) {
Expand Down Expand Up @@ -52,15 +51,6 @@ class Crawler extends EventEmitter {
log.settings.minLevel = 7;
}

this.globalOnlyOptions = [
"maxConnections",
"rateLimit",
"priorityLevels",
"skipDuplicates",
"homogeneous",
"userAgents",
];

this._limiters = new Cluster({
maxConnections: this.options.maxConnections!,
rateLimit: this.options.rateLimit!,
Expand Down Expand Up @@ -294,9 +284,6 @@ class Crawler extends EventEmitter {
options = getValidOptions(options);
options.retries = options.retries ?? 0;
setDefaults(options, this.options);
this.globalOnlyOptions.forEach(globalOnlyOption => {
delete options[globalOnlyOption as keyof RequestOptions];
});
options.skipEventRequest = isBoolean(options.skipEventRequest) ? options.skipEventRequest : true;
delete options.preRequest;
return await this._execute(options);
Expand Down Expand Up @@ -334,9 +321,6 @@ class Crawler extends EventEmitter {
}
setDefaults(options, this.options);
options.headers = { ...this.options.headers, ...options.headers };
this.globalOnlyOptions.forEach(globalOnlyOption => {
delete (options as any)[globalOnlyOption];
});
if (!this.options.skipDuplicates) {
this._schedule(options as CrawlerOptions);
return;
Expand Down
66 changes: 41 additions & 25 deletions src/options.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,43 @@ import http2Wrapper from "http2-wrapper";
import { cleanObject, getType, isValidUrl } from "./lib/utils.js";
import { RequestConfig, RequestOptions } from "./types/crawler.js";

/**
 * Option keys that are only meaningful on the Crawler instance itself
 * (they configure the rate limiter / duplicate filter / UA rotation),
 * never on an individual request. They are stripped from the per-request
 * options before the request is dispatched (see alignOptions).
 */
export const globalOnlyOptions = [
    "maxConnections",
    "priorityLevels",
    "rateLimit",
    "skipDuplicates",
    "homogeneous",
    "userAgents",
    "silence",
];
/**
 * Option keys consumed by the crawler layer itself and therefore removed
 * from the options object before it is handed to the underlying HTTP
 * client (see the delete loop in alignOptions). Includes every
 * global-only option as well.
 */
export const crawlerOnlyOptions = [
    "rateLimiterId",
    "forceUTF8",
    "jQuery",
    "retryInterval",
    "priority",
    "proxy",
    "retries",
    "preRequest",
    "callback",
    "release",
    "isJson",
    "referer",
    "rejectUnauthorized",
    "userParams",
].concat(globalOnlyOptions);
/**
 * Legacy option names accepted for backward compatibility (presumably
 * carried over from the old `request`-style API — confirm against the
 * changelog). They are stripped before dispatch like crawlerOnlyOptions;
 * some are remapped first (e.g. incomingEncoding -> encoding in
 * alignOptions).
 */
export const deprecatedOptions = [
    "uri",
    "qs",
    "strictSSL",
    "incomingEncoding",
    "gzip",
    "jar",
    "jsonReviver",
    "jsonReplacer",
    "skipEventRequest",
];

export const getCharset = (headers: Record<string, unknown>): null | string => {
let charset = null;
const contentType = headers["content-type"] as string;
Expand Down Expand Up @@ -33,28 +70,6 @@ export const getValidOptions = (options: RequestConfig): RequestOptions => {
};

export const alignOptions = (options: RequestOptions): any => {
const crawlerOnlyOptions = [
"rateLimiterId",
"forceUTF8",
"incomingEncoding",
"jQuery",
"retryInterval",
"priority",
"proxy",
"retries",
"preRequest",
"callback",
"release",
"userAgents",
"isJson",
"referer",
"rejectUnauthorized",
"userParams",
"silence",
];
const deprecatedOptions = ["uri", "qs", "strictSSL", "gzip", "jar", "jsonReviver", "jsonReplacer", "skipEventRequest"].concat(
crawlerOnlyOptions
);
const gotOptions = {
...options,
url: options.url ?? options.uri,
Expand Down Expand Up @@ -102,10 +117,11 @@ export const alignOptions = (options: RequestOptions): any => {
* @deprecated The support of incomingEncoding will be removed in the next major version.
*/
if (options.encoding === undefined) options.encoding = options.incomingEncoding;
delete options["incomingEncoding"];
gotOptions.responseType = "buffer";
Object.keys(gotOptions).forEach(key => {
if (deprecatedOptions.includes(key)) {

const invalidOptions = crawlerOnlyOptions.concat(deprecatedOptions);
invalidOptions.forEach(key => {
if (key in gotOptions) {
delete gotOptions[key];
}
});
Expand Down
2 changes: 1 addition & 1 deletion test/examples.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ test.before(t => {
});
test.beforeEach(t => {
nock("http://nockhost")
.get(uri => uri.indexOf("status") >= 0)
.get(url => url.indexOf("status") >= 0)
.times(20)
.reply(200, "Yes");
t.context.crawler = new Crawler({
Expand Down
45 changes: 45 additions & 0 deletions test/userAgent.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import test from "ava";
import { testCb, testCbSync } from "./lib/avaTestCb.js";
import nock from "nock";
import Crawler from "../dist/index.js";

test.before(t => {
nock.cleanAll();
nock("http://nockhost").get(url => url.indexOf("status") >= 0).times(20).reply(200, "Yes");
t.context.calledAgents = [];
t.context.crawler = new Crawler({
silence: true,
jQuery: false,
userAgents: [
"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
"Googlebot/2.1 (+http://www.google.com/bot.html)",
"test/1.0",
"test/2.0"
],
callback: (error, res, done) => {
t.context.calledAgents.push(res.request.options.headers["user-agent"]);
done();
}
});
});

// Queues five requests and, once the crawler drains, asserts the four
// configured user agents were used round-robin (the fifth request wraps
// back to the first agent). t.end() is assumed to be supplied by the
// testCbSync wrapper from ./lib/avaTestCb.js — verify there, as plain
// AVA's `t` has no end().
testCbSync(test, "should rotate user agents if userAgents is set.", async t => {
    t.context.crawler.add([
        "http://nockhost/status1",
        "http://nockhost/status2",
        "http://nockhost/status3",
        "http://nockhost/status4",
        "http://nockhost/status1",
    ])
    t.context.crawler.on("drain", () => {
        t.deepEqual(t.context.calledAgents, [
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "Googlebot/2.1 (+http://www.google.com/bot.html)",
            "test/1.0",
            "test/2.0",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
        ]);
        t.end();
    });
});

0 comments on commit 7bf212e

Please sign in to comment.