Commit: milestone
Miniast committed Apr 30, 2024
1 parent 8276305 commit a9805c8
Showing 10 changed files with 157 additions and 318 deletions.
3 changes: 2 additions & 1 deletion package.json
@@ -18,7 +18,8 @@
"got": "^13.0.0",
"hpagent": "^1.2.0",
"iconv-lite": "^0.6.3",
"seenreq": "^3.0.0"
"seenreq": "^3.0.0",
"tslog": "^4.9.2"
},
"devDependencies": {
"@types/got": "^9.6.12",
26 changes: 10 additions & 16 deletions pnpm-lock.yaml

Some generated files are not rendered by default.

34 changes: 16 additions & 18 deletions src/crawler.ts
@@ -2,21 +2,19 @@ import { EventEmitter } from "events";
import { Cluster } from "./rateLimiter/index.js";
import { isFunction, setDefaults, flattenDeep } from "./lib/utils.js";
import { getValidOptions, alignOptions } from "./options.js";
import { logOptions } from "./logger.js";
import type { crawlerOptions, requestOptions } from "./types/crawler.js";
import { promisify } from "util";
import { load } from "cheerio";
import got from "got";
import seenreq from "seenreq";
import iconv from "iconv-lite";
import { Logger } from "tslog";

//@todo change log method
process.env.NODE_ENV = process.env.NODE_ENV ?? process.argv[2] ?? "debug";
process.env.NODE_ENV = process.env.NODE_ENV ?? process.argv[2] ?? "production";

if (process.env.NODE_ENV !== "debug") {
console.log = () => { };
console.error = () => { };
console.debug = () => { };
}
const log = process.env.NODE_ENV === "debug" ? new Logger(logOptions) : new Logger({ type: "hidden" });

class Crawler extends EventEmitter {
private _limiters: Cluster;
@@ -67,13 +65,13 @@ class Crawler extends EventEmitter {
this.seen
.initialize()
.then(() => {
console.log("seenreq initialized");
log.info("seenreq initialized");
})
.catch((err: any) => {
console.error(err);
log.error(err);
});
this.on("_release", () => {
console.debug(`Queue size: ${this.queueSize}`);
log.debug(`Queue size: ${this.queueSize}`);
if (this._limiters.empty) this.emit("drain");
});
}
@@ -123,8 +121,8 @@ class Crawler extends EventEmitter {
};

private _execute = async (options: crawlerOptions): Promise<void> => {
if (options.proxy) console.debug(`Using proxy: ${options.proxy}`);
else if (options.proxies) console.debug(`Using proxies: ${options.proxies}`);
if (options.proxy) log.debug(`Using proxy: ${options.proxy}`);
else if (options.proxies) log.debug(`Using proxies: ${options.proxies}`);

options.headers = options.headers ?? {};

@@ -146,7 +144,7 @@
try {
await promisify(options.preRequest as any)(options);
} catch (err) {
console.error(err);
log.error(err);
}
}

@@ -156,14 +154,14 @@
const response = await got(alignOptions({ ...options }));
return this._handler(null, options, response);
} catch (error) {
console.log("error:", error);
log.info("error:", error);
return this._handler(error, options);
}
};

private _handler = (error: any | null, options: requestOptions, response?: any): any => {
if (error) {
console.log(
log.info(
`Error: ${error} when fetching ${options.url} ${options.retries ? `(${options.retries} retries left)` : ""
}`
);
@@ -182,14 +180,14 @@
}

if (!response.body) response.body = "";
console.debug("Got " + (options.url || "html") + " (" + response.body.length + " bytes)...");
log.debug("Got " + (options.url || "html") + " (" + response.body.length + " bytes)...");
response.options = options;
let resError = null;
try {
if (options.forceUTF8) {
const charset = options.incomingEncoding || this._getCharset(response.headers, response.body);
response.charset = charset;
console.debug("Charset: " + charset);
log.debug("Charset: " + charset);
if (charset && charset !== "utf-8" && charset != "ascii") {
response.body = iconv.decode(response.body, charset);
response.body = response.body.toString();
@@ -206,7 +204,7 @@
try {
response.$ = load(response.body);
} catch (err) {
console.error(err);
log.error(err);
}
}
}
@@ -265,7 +263,7 @@
this._schedule(options as crawlerOptions);
}
})
.catch((err: any) => console.error(err));
.catch((err: any) => log.error(err));
});
};
/**
24 changes: 24 additions & 0 deletions src/logger.ts
@@ -0,0 +1,24 @@
export const logOptions = {
type: "pretty" as any,
name: "Crawler",
hideLogPositionForProduction: true,
prettyLogTemplate: "{{name}} {{logLevelName}} ",
prettyLogStyles: {
logLevelName: {
SILLY: ["bold", "white"],
TRACE: ["bold", "whiteBright"],
DEBUG: ["bold", "green"],
INFO: ["bold", "blue"],
WARN: ["bold", "yellow"],
ERROR: ["bold", "red"],
FATAL: ["bold", "redBright"],
},
name: ["bold", "blue"],
dateIsoStr: "white",
filePathWithLine: "white",
nameWithDelimiterPrefix: ["white", "bold"],
nameWithDelimiterSuffix: ["white", "bold"],
errorName: ["bold", "bgRedBright", "whiteBright"],
fileName: ["yellow"],
},
}
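
For reference, these logOptions feed the tslog Logger that src/crawler.ts now constructs in place of the old console calls: a pretty, named logger when NODE_ENV is "debug", and a hidden (silent) logger otherwise. A minimal standalone sketch of that pattern, with an illustrative message:

import { Logger } from "tslog";
import { logOptions } from "./logger.js";

// Pretty "Crawler" logger in debug mode; type "hidden" suppresses all output otherwise.
const log = process.env.NODE_ENV === "debug"
    ? new Logger(logOptions)
    : new Logger({ type: "hidden" });

log.info("seenreq initialized"); // printed as "Crawler INFO seenreq initialized" per prettyLogTemplate
log.debug("Queue size: 0");      // visible only in debug mode; the hidden logger emits nothing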
5 changes: 5 additions & 0 deletions src/options.ts
@@ -57,6 +57,11 @@ export const alignOptions = (options: any): any => {
stringifyJson: options.jsonReplacer,
};
gotOptions.agent = gotOptions.agent ?? (options.proxy ? defaultagent : undefined);

if(gotOptions.encoding === null){
gotOptions.responseType = gotOptions.responseType ?? "buffer";
delete gotOptions.encoding;
}

Object.keys(gotOptions).forEach(key => {
if (deprecatedOptions.includes(key)) {
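
The new branch above exists because got has no request-style "encoding: null" convention for returning raw bytes; with got, a Buffer body is requested via responseType: "buffer". A rough sketch of the mapping, using a hypothetical options object:

// Hypothetical input illustrating the branch added to alignOptions().
const gotOptions: Record<string, unknown> = { url: "https://example.com", encoding: null };

if (gotOptions.encoding === null) {
    // Ask got for a raw Buffer body instead of a decoded string, then drop the unsupported key.
    gotOptions.responseType = gotOptions.responseType ?? "buffer";
    delete gotOptions.encoding;
}
// gotOptions is now { url: "https://example.com", responseType: "buffer" }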
77 changes: 63 additions & 14 deletions src/types/crawler.d.ts → src/types/crawler.ts
@@ -14,7 +14,7 @@ type globalOnlyOptions = {
* Global option.
* @default 1000
* @description The default priority of the tasks. Can be only assigned at the beginning.
* @example 1000 means around 1000 milliseconds delay.
* @example 1000 means a delay of 1000 milliseconds between tasks after the first request.
*/
rateLimit: number;
/**
@@ -33,43 +33,92 @@
/**
* Global option.
* @default false
* @description If true, the crawler will rotate the user agent for each request. The "userAgent" option must be an array if this option is true.
* @description If true, the crawler will rotate the user agent for each request. The "userAgent" option must be an array if activated.
*/
rotateUA: boolean;
};

type requestOptions = {
forceUTF8?: boolean;
/**
* crawlerOption
* @default false
* @description If true, the crawler will use the cheerio library to parse the HTML content.
* @see cheerio.load()
* @example If injected successfully, the response object will have a "$" property, which can be used like jQuery.
*/
jQuery?: boolean;
/**
* @deprecated
* @description Use "encoding" instead.
* @description Please use "encoding" instead.
*/
incomingEncoding?: string | null;
/**
* @default "utf8"
* @description The encoding of the response body.
*/
encoding?: string | null;
retries?: number;
retryTimeout?: number;
timeout?: number;
priority?: number;
seenreq?: any;

uri?: string | function;
url?: string | function;
body?: string | Record<string, unknown>;
userAgent?: string;
headers?: Record<string, unknown>;
encoding?: string | null;
json?: boolean;
headers?: Record<string, unknown>;
gzip?: boolean;
method?: string;
skipEventRequest?: boolean;
html?: boolean;
proxies?: string[];
proxy?: string;
http2?: boolean;
debug?: boolean;
logger?: any;
body?: string | Record<string, unknown>;
userAgent?: string | string[];
headers?: Record<string, unknown>;

agent?: any;
/**
* @deprecated Please use "url" instead.
*/
uri?: string | Function;
url?: string | Function;
/**
* @deprecated Please use "searchParams" instead.
*/
qs?: Record<string, unknown>;
/**
* @description The query string of the URL.
*/
searchParams?: Record<string, unknown>;
/**
* @deprecated Please use "rejectUnauthorized" instead.
*/
strictSSL?: boolean;
/**
* @description If false, the crawler will ignore SSL certificate errors.
* @default true
*/
rejectUnauthorized?: boolean;
json?: boolean;
/**
* @deprecated Please use "decompress" instead.
*/
gzip?: boolean;
decompress?: boolean;
/**
* @deprecated Please use "cookieJar" instead.
*/
jar?: Object;
cookieJar?: Object;
/**
* @deprecated Please use "parseJson" instead.
*/
jsonReviver?: Function;
parseJson?: Function;
/**
* @deprecated Please use "stringifyJson" instead.
*/
jsonReplacer?: Function;
stringifyJson?: Function;

preRequest?: (options: requestOptions, done: (error: Error | null, options: requestOptions) => void) => void;
release?: () => void;
callback?: (error: any, response: unknown, done: unknown) => void;
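
The renamed type file pairs each deprecated request-era option with its got-style replacement. As an illustration only (not taken from this commit), a call using the newer names might look like the sketch below; the URL, query values, and callback body are placeholders:

import { crawler } from './dist/index.js';

crawler.add({
    url: 'https://example.com',       // preferred over the deprecated "uri"
    searchParams: { page: 1 },        // preferred over the deprecated "qs"
    rejectUnauthorized: true,         // preferred over the deprecated "strictSSL"
    decompress: true,                 // preferred over the deprecated "gzip"
    callback: (error: any, response: any, done: any) => {
        if (error) console.error(error);
        done(); // calling done() follows the usual node-crawler convention and is an assumption here
    },
});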
File renamed without changes.
1 change: 1 addition & 0 deletions test.js
@@ -2,6 +2,7 @@ import { crawler } from './dist/index.js';
crawler.add({
url: 'https://www.google.com',
method: 'GET',
incomingEncoding: 'utf8',
headers: {
'Content-Type': 'application/json'
},
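
Since incomingEncoding is flagged as deprecated in src/types/crawler.ts above, the same smoke test could also be written with the newer "encoding" option; a sketch, not part of this commit:

import { crawler } from './dist/index.js';

crawler.add({
    url: 'https://www.google.com',
    method: 'GET',
    encoding: 'utf8', // newer name for the deprecated incomingEncoding
    headers: {
        'Content-Type': 'application/json'
    },
});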