From 7bf212e809db7023c2a741587fbba28e61339682 Mon Sep 17 00:00:00 2001 From: MiniAst Date: Fri, 21 Jun 2024 18:00:58 +0800 Subject: [PATCH] fix global userAgents --- src/crawler.ts | 16 ------------ src/options.ts | 66 +++++++++++++++++++++++++++++------------------ test/examples.js | 2 +- test/userAgent.js | 45 ++++++++++++++++++++++++++++++++ 4 files changed, 87 insertions(+), 42 deletions(-) create mode 100644 test/userAgent.js diff --git a/src/crawler.ts b/src/crawler.ts index f5bbd2e..46c9f7c 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -23,7 +23,6 @@ class Crawler extends EventEmitter { private _proxyIndex = 0; public options: CrawlerOptions; - public globalOnlyOptions: string[]; public seen: any; constructor(options?: CrawlerOptions) { @@ -52,15 +51,6 @@ class Crawler extends EventEmitter { log.settings.minLevel = 7; } - this.globalOnlyOptions = [ - "maxConnections", - "rateLimit", - "priorityLevels", - "skipDuplicates", - "homogeneous", - "userAgents", - ]; - this._limiters = new Cluster({ maxConnections: this.options.maxConnections!, rateLimit: this.options.rateLimit!, @@ -294,9 +284,6 @@ class Crawler extends EventEmitter { options = getValidOptions(options); options.retries = options.retries ?? 0; setDefaults(options, this.options); - this.globalOnlyOptions.forEach(globalOnlyOption => { - delete options[globalOnlyOption as keyof RequestOptions]; - }); options.skipEventRequest = isBoolean(options.skipEventRequest) ? options.skipEventRequest : true; delete options.preRequest; return await this._execute(options); @@ -334,9 +321,6 @@ class Crawler extends EventEmitter { } setDefaults(options, this.options); options.headers = { ...this.options.headers, ...options.headers }; - this.globalOnlyOptions.forEach(globalOnlyOption => { - delete (options as any)[globalOnlyOption]; - }); if (!this.options.skipDuplicates) { this._schedule(options as CrawlerOptions); return; diff --git a/src/options.ts b/src/options.ts index 0bbf804..dea0189 100644 --- a/src/options.ts +++ b/src/options.ts @@ -3,6 +3,43 @@ import http2Wrapper from "http2-wrapper"; import { cleanObject, getType, isValidUrl } from "./lib/utils.js"; import { RequestConfig, RequestOptions } from "./types/crawler.js"; +export const globalOnlyOptions = [ + "maxConnections", + "priorityLevels", + "rateLimit", + "skipDuplicates", + "homogeneous", + "userAgents", + "silence", +]; +export const crawlerOnlyOptions = [ + "rateLimiterId", + "forceUTF8", + "jQuery", + "retryInterval", + "priority", + "proxy", + "retries", + "preRequest", + "callback", + "release", + "isJson", + "referer", + "rejectUnauthorized", + "userParams", +].concat(globalOnlyOptions); +export const deprecatedOptions = [ + "uri", + "qs", + "strictSSL", + "incomingEncoding", + "gzip", + "jar", + "jsonReviver", + "jsonReplacer", + "skipEventRequest", +]; + export const getCharset = (headers: Record): null | string => { let charset = null; const contentType = headers["content-type"] as string; @@ -33,28 +70,6 @@ export const getValidOptions = (options: RequestConfig): RequestOptions => { }; export const alignOptions = (options: RequestOptions): any => { - const crawlerOnlyOptions = [ - "rateLimiterId", - "forceUTF8", - "incomingEncoding", - "jQuery", - "retryInterval", - "priority", - "proxy", - "retries", - "preRequest", - "callback", - "release", - "userAgents", - "isJson", - "referer", - "rejectUnauthorized", - "userParams", - "silence", - ]; - const deprecatedOptions = ["uri", "qs", "strictSSL", "gzip", "jar", "jsonReviver", "jsonReplacer", "skipEventRequest"].concat( - crawlerOnlyOptions - ); const gotOptions = { ...options, url: options.url ?? options.uri, @@ -102,10 +117,11 @@ export const alignOptions = (options: RequestOptions): any => { * @deprecated The support of incomingEncoding will be removed in the next major version. */ if (options.encoding === undefined) options.encoding = options.incomingEncoding; - delete options["incomingEncoding"]; gotOptions.responseType = "buffer"; - Object.keys(gotOptions).forEach(key => { - if (deprecatedOptions.includes(key)) { + + const invalidOptions = crawlerOnlyOptions.concat(deprecatedOptions); + invalidOptions.forEach(key => { + if (key in gotOptions) { delete gotOptions[key]; } }); diff --git a/test/examples.js b/test/examples.js index fc3171a..89d5fa6 100644 --- a/test/examples.js +++ b/test/examples.js @@ -9,7 +9,7 @@ test.before(t => { }); test.beforeEach(t => { nock("http://nockhost") - .get(uri => uri.indexOf("status") >= 0) + .get(url => url.indexOf("status") >= 0) .times(20) .reply(200, "Yes"); t.context.crawler = new Crawler({ diff --git a/test/userAgent.js b/test/userAgent.js new file mode 100644 index 0000000..ea9d408 --- /dev/null +++ b/test/userAgent.js @@ -0,0 +1,45 @@ +import test from "ava"; +import { testCb, testCbSync } from "./lib/avaTestCb.js"; +import nock from "nock"; +import Crawler from "../dist/index.js"; + +test.before(t => { + nock.cleanAll(); + nock("http://nockhost").get(url => url.indexOf("status") >= 0).times(20).reply(200, "Yes"); + t.context.calledAgents = []; + t.context.crawler = new Crawler({ + silence: true, + jQuery: false, + userAgents: [ + "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + "Googlebot/2.1 (+http://www.google.com/bot.html)", + "test/1.0", + "test/2.0" + ], + callback: (error, res, done) => { + t.context.calledAgents.push(res.request.options.headers["user-agent"]); + done(); + } + }); +}); + +testCbSync(test, "should rotate user agents if userAgents is set.", async t => { + t.context.crawler.add([ + "http://nockhost/status1", + "http://nockhost/status2", + "http://nockhost/status3", + "http://nockhost/status4", + "http://nockhost/status1", + ]) + t.context.crawler.on("drain", () => { + t.deepEqual(t.context.calledAgents, [ + "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + "Googlebot/2.1 (+http://www.google.com/bot.html)", + "test/1.0", + "test/2.0", + "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" + ]); + t.end(); + }); +}); +