From 8a6877ac387ab5b88c4c953530efd4943625c0d2 Mon Sep 17 00:00:00 2001 From: MiniAst Date: Fri, 26 Apr 2024 18:04:22 +0800 Subject: [PATCH] step --- .vscode/launch.json | 33 +- package.json | 5 +- pnpm-lock.yaml | 610 ++++++++++++++++----------------- src/crawler.ts | 172 +++++++--- src/lib/utils.ts | 30 +- src/rateLimiter/cluster.ts | 38 +- src/rateLimiter/rateLimiter.ts | 6 +- src/test.ts | 5 - src/types/crawler.d.ts | 6 +- src/types/index.d.ts | 1 + test.ts | 4 + tsconfig.json | 8 +- 12 files changed, 522 insertions(+), 396 deletions(-) delete mode 100644 src/test.ts create mode 100644 src/types/index.d.ts create mode 100644 test.ts diff --git a/.vscode/launch.json b/.vscode/launch.json index ec1388d..9061469 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -2,15 +2,34 @@ "version": "0.2.0", "configurations": [ { - "name": "ts-node", + "name": "tsx", "type": "node", "request": "launch", - "runtimeExecutable": "ts-node", - "runtimeArgs": ["-r", "ts-node/register/transpile-only"], - "args": ["${relativeFile}"], - "cwd": "${workspaceRoot}", - "internalConsoleOptions": "openOnSessionStart", - "skipFiles": ["/**", "node_modules/**"] + + // Debug current file in VSCode + "program": "${file}", + + /* + Path to tsx binary + Assuming locally installed + */ + "runtimeExecutable": "${workspaceRoot}/node_modules/.bin/tsx", + + /* + Open terminal when debugging starts (Optional) + Useful to see console.logs + */ + "console": "integratedTerminal", + "internalConsoleOptions": "neverOpen", + + // Files to exclude from debugger (e.g. call stack) + "skipFiles": [ + // Node.js internal core modules + "/**", + + // Ignore all dependencies (optional) + "${workspaceFolder}/node_modules/**" + ] } ] } diff --git a/package.json b/package.json index f0d1ff1..9cfaba4 100644 --- a/package.json +++ b/package.json @@ -14,11 +14,12 @@ "author": "", "license": "MIT", "dependencies": { - "got": "^13.0.0" + "got": "^13.0.0", + "seenreq": "^3.0.0" }, "devDependencies": { "@types/got": "^9.6.12", "@types/node": "^20.10.6", - "ts-node": "^10.9.2" + "tsx": "^4.7.3" } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index fd4f3de..eb2bfe0 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -5,18 +5,12 @@ settings: excludeLinksFromLockfile: false dependencies: - cheerio: - specifier: 1.0.0-rc.12 - version: 1.0.0-rc.12 - dayjs: - specifier: ^1.11.10 - version: 1.11.10 got: specifier: ^13.0.0 version: 13.0.0 - winston: - specifier: ^3.11.0 - version: 3.11.0 + seenreq: + specifier: ^3.0.0 + version: 3.0.0 devDependencies: '@types/got': @@ -24,18 +18,16 @@ devDependencies: version: 9.6.12 '@types/node': specifier: ^20.10.6 - version: 20.10.6 + version: 20.12.7 ts-node: specifier: ^10.9.2 - version: 10.9.2(@types/node@20.10.6)(typescript@5.3.3) + version: 10.9.2(@types/node@20.12.7)(typescript@5.4.5) + tsx: + specifier: ^4.7.3 + version: 4.7.3 packages: - /@colors/colors@1.6.0: - resolution: {integrity: sha512-Ir+AOibqzrIsL6ajt3Rz3LskB7OiMVHqltZmspbW/TJuTVuyOMirVqAkjfY6JISiLHgyNqicAC8AyHHGzNd/dA==} - engines: {node: '>=0.1.90'} - dev: false - /@cspotcode/source-map-support@0.8.1: resolution: {integrity: sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==} engines: {node: '>=12'} @@ -43,16 +35,215 @@ packages: '@jridgewell/trace-mapping': 0.3.9 dev: true - /@dabh/diagnostics@2.0.3: - resolution: {integrity: sha512-hrlQOIi7hAfzsMqlGSFyVucrx38O+j6wiGOf//H2ecvIEqYN4ADBSS2iLMh5UFyDunCNniUIPk/q3riFv45xRA==} - dependencies: - colorspace: 1.1.4 - enabled: 2.0.0 - kuler: 2.0.0 - dev: false + /@esbuild/aix-ppc64@0.19.12: + resolution: {integrity: sha512-bmoCYyWdEL3wDQIVbcyzRyeKLgk2WtWLTWz1ZIAZF/EGbNOwSA6ew3PftJ1PqMiOOGu0OyFMzG53L0zqIpPeNA==} + engines: {node: '>=12'} + cpu: [ppc64] + os: [aix] + requiresBuild: true + dev: true + optional: true + + /@esbuild/android-arm64@0.19.12: + resolution: {integrity: sha512-P0UVNGIienjZv3f5zq0DP3Nt2IE/3plFzuaS96vihvD0Hd6H/q4WXUGpCxD/E8YrSXfNyRPbpTq+T8ZQioSuPA==} + engines: {node: '>=12'} + cpu: [arm64] + os: [android] + requiresBuild: true + dev: true + optional: true + + /@esbuild/android-arm@0.19.12: + resolution: {integrity: sha512-qg/Lj1mu3CdQlDEEiWrlC4eaPZ1KztwGJ9B6J+/6G+/4ewxJg7gqj8eVYWvao1bXrqGiW2rsBZFSX3q2lcW05w==} + engines: {node: '>=12'} + cpu: [arm] + os: [android] + requiresBuild: true + dev: true + optional: true + + /@esbuild/android-x64@0.19.12: + resolution: {integrity: sha512-3k7ZoUW6Q6YqhdhIaq/WZ7HwBpnFBlW905Fa4s4qWJyiNOgT1dOqDiVAQFwBH7gBRZr17gLrlFCRzF6jFh7Kew==} + engines: {node: '>=12'} + cpu: [x64] + os: [android] + requiresBuild: true + dev: true + optional: true + + /@esbuild/darwin-arm64@0.19.12: + resolution: {integrity: sha512-B6IeSgZgtEzGC42jsI+YYu9Z3HKRxp8ZT3cqhvliEHovq8HSX2YX8lNocDn79gCKJXOSaEot9MVYky7AKjCs8g==} + engines: {node: '>=12'} + cpu: [arm64] + os: [darwin] + requiresBuild: true + dev: true + optional: true + + /@esbuild/darwin-x64@0.19.12: + resolution: {integrity: sha512-hKoVkKzFiToTgn+41qGhsUJXFlIjxI/jSYeZf3ugemDYZldIXIxhvwN6erJGlX4t5h417iFuheZ7l+YVn05N3A==} + engines: {node: '>=12'} + cpu: [x64] + os: [darwin] + requiresBuild: true + dev: true + optional: true + + /@esbuild/freebsd-arm64@0.19.12: + resolution: {integrity: sha512-4aRvFIXmwAcDBw9AueDQ2YnGmz5L6obe5kmPT8Vd+/+x/JMVKCgdcRwH6APrbpNXsPz+K653Qg8HB/oXvXVukA==} + engines: {node: '>=12'} + cpu: [arm64] + os: [freebsd] + requiresBuild: true + dev: true + optional: true + + /@esbuild/freebsd-x64@0.19.12: + resolution: {integrity: sha512-EYoXZ4d8xtBoVN7CEwWY2IN4ho76xjYXqSXMNccFSx2lgqOG/1TBPW0yPx1bJZk94qu3tX0fycJeeQsKovA8gg==} + engines: {node: '>=12'} + cpu: [x64] + os: [freebsd] + requiresBuild: true + dev: true + optional: true + + /@esbuild/linux-arm64@0.19.12: + resolution: {integrity: sha512-EoTjyYyLuVPfdPLsGVVVC8a0p1BFFvtpQDB/YLEhaXyf/5bczaGeN15QkR+O4S5LeJ92Tqotve7i1jn35qwvdA==} + engines: {node: '>=12'} + cpu: [arm64] + os: [linux] + requiresBuild: true + dev: true + optional: true + + /@esbuild/linux-arm@0.19.12: + resolution: {integrity: sha512-J5jPms//KhSNv+LO1S1TX1UWp1ucM6N6XuL6ITdKWElCu8wXP72l9MM0zDTzzeikVyqFE6U8YAV9/tFyj0ti+w==} + engines: {node: '>=12'} + cpu: [arm] + os: [linux] + requiresBuild: true + dev: true + optional: true + + /@esbuild/linux-ia32@0.19.12: + resolution: {integrity: sha512-Thsa42rrP1+UIGaWz47uydHSBOgTUnwBwNq59khgIwktK6x60Hivfbux9iNR0eHCHzOLjLMLfUMLCypBkZXMHA==} + engines: {node: '>=12'} + cpu: [ia32] + os: [linux] + requiresBuild: true + dev: true + optional: true + + /@esbuild/linux-loong64@0.19.12: + resolution: {integrity: sha512-LiXdXA0s3IqRRjm6rV6XaWATScKAXjI4R4LoDlvO7+yQqFdlr1Bax62sRwkVvRIrwXxvtYEHHI4dm50jAXkuAA==} + engines: {node: '>=12'} + cpu: [loong64] + os: [linux] + requiresBuild: true + dev: true + optional: true + + /@esbuild/linux-mips64el@0.19.12: + resolution: {integrity: sha512-fEnAuj5VGTanfJ07ff0gOA6IPsvrVHLVb6Lyd1g2/ed67oU1eFzL0r9WL7ZzscD+/N6i3dWumGE1Un4f7Amf+w==} + engines: {node: '>=12'} + cpu: [mips64el] + os: [linux] + requiresBuild: true + dev: true + optional: true + + /@esbuild/linux-ppc64@0.19.12: + resolution: {integrity: sha512-nYJA2/QPimDQOh1rKWedNOe3Gfc8PabU7HT3iXWtNUbRzXS9+vgB0Fjaqr//XNbd82mCxHzik2qotuI89cfixg==} + engines: {node: '>=12'} + cpu: [ppc64] + os: [linux] + requiresBuild: true + dev: true + optional: true + + /@esbuild/linux-riscv64@0.19.12: + resolution: {integrity: sha512-2MueBrlPQCw5dVJJpQdUYgeqIzDQgw3QtiAHUC4RBz9FXPrskyyU3VI1hw7C0BSKB9OduwSJ79FTCqtGMWqJHg==} + engines: {node: '>=12'} + cpu: [riscv64] + os: [linux] + requiresBuild: true + dev: true + optional: true - /@jridgewell/resolve-uri@3.1.1: - resolution: {integrity: sha512-dSYZh7HhCDtCKm4QakX0xFpsRDqjjtZf/kjI/v3T3Nwt5r8/qz/M19F9ySyOqU94SXBmeG9ttTul+YnR4LOxFA==} + /@esbuild/linux-s390x@0.19.12: + resolution: {integrity: sha512-+Pil1Nv3Umes4m3AZKqA2anfhJiVmNCYkPchwFJNEJN5QxmTs1uzyy4TvmDrCRNT2ApwSari7ZIgrPeUx4UZDg==} + engines: {node: '>=12'} + cpu: [s390x] + os: [linux] + requiresBuild: true + dev: true + optional: true + + /@esbuild/linux-x64@0.19.12: + resolution: {integrity: sha512-B71g1QpxfwBvNrfyJdVDexenDIt1CiDN1TIXLbhOw0KhJzE78KIFGX6OJ9MrtC0oOqMWf+0xop4qEU8JrJTwCg==} + engines: {node: '>=12'} + cpu: [x64] + os: [linux] + requiresBuild: true + dev: true + optional: true + + /@esbuild/netbsd-x64@0.19.12: + resolution: {integrity: sha512-3ltjQ7n1owJgFbuC61Oj++XhtzmymoCihNFgT84UAmJnxJfm4sYCiSLTXZtE00VWYpPMYc+ZQmB6xbSdVh0JWA==} + engines: {node: '>=12'} + cpu: [x64] + os: [netbsd] + requiresBuild: true + dev: true + optional: true + + /@esbuild/openbsd-x64@0.19.12: + resolution: {integrity: sha512-RbrfTB9SWsr0kWmb9srfF+L933uMDdu9BIzdA7os2t0TXhCRjrQyCeOt6wVxr79CKD4c+p+YhCj31HBkYcXebw==} + engines: {node: '>=12'} + cpu: [x64] + os: [openbsd] + requiresBuild: true + dev: true + optional: true + + /@esbuild/sunos-x64@0.19.12: + resolution: {integrity: sha512-HKjJwRrW8uWtCQnQOz9qcU3mUZhTUQvi56Q8DPTLLB+DawoiQdjsYq+j+D3s9I8VFtDr+F9CjgXKKC4ss89IeA==} + engines: {node: '>=12'} + cpu: [x64] + os: [sunos] + requiresBuild: true + dev: true + optional: true + + /@esbuild/win32-arm64@0.19.12: + resolution: {integrity: sha512-URgtR1dJnmGvX864pn1B2YUYNzjmXkuJOIqG2HdU62MVS4EHpU2946OZoTMnRUHklGtJdJZ33QfzdjGACXhn1A==} + engines: {node: '>=12'} + cpu: [arm64] + os: [win32] + requiresBuild: true + dev: true + optional: true + + /@esbuild/win32-ia32@0.19.12: + resolution: {integrity: sha512-+ZOE6pUkMOJfmxmBZElNOx72NKpIa/HFOMGzu8fqzQJ5kgf6aTGrcJaFsNiVMH4JKpMipyK+7k0n2UXN7a8YKQ==} + engines: {node: '>=12'} + cpu: [ia32] + os: [win32] + requiresBuild: true + dev: true + optional: true + + /@esbuild/win32-x64@0.19.12: + resolution: {integrity: sha512-T1QyPSDCyMXaO3pzBkF96E8xMkiRYbUEZADd29SyPGabqxMViNoii+NcK7eWJAEoU6RZyEm5lVSIjTmcdoB9HA==} + engines: {node: '>=12'} + cpu: [x64] + os: [win32] + requiresBuild: true + dev: true + optional: true + + /@jridgewell/resolve-uri@3.1.2: + resolution: {integrity: sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==} engines: {node: '>=6.0.0'} dev: true @@ -63,7 +254,7 @@ packages: /@jridgewell/trace-mapping@0.3.9: resolution: {integrity: sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==} dependencies: - '@jridgewell/resolve-uri': 3.1.1 + '@jridgewell/resolve-uri': 3.1.2 '@jridgewell/sourcemap-codec': 1.4.15 dev: true @@ -79,8 +270,8 @@ packages: defer-to-connect: 2.0.1 dev: false - /@tsconfig/node10@1.0.9: - resolution: {integrity: sha512-jNsYVVxU8v5g43Erja32laIDHXeoNvFEpX33OK4d6hljo3jDhCBDhx5dhCCTMWUojscpAagGiRkBKxpdl9fxqA==} + /@tsconfig/node10@1.0.11: + resolution: {integrity: sha512-DcRjDCujK/kCk/cUe8Xz8ZSpm8mS3mNNpta+jGCA6USEDfktlNvm1+IuZ9eTcDbNk41BHwpHHeW+N1lKCz4zOw==} dev: true /@tsconfig/node12@1.0.11: @@ -98,7 +289,7 @@ packages: /@types/got@9.6.12: resolution: {integrity: sha512-X4pj/HGHbXVLqTpKjA2ahI4rV/nNBc9mGO2I/0CgAra+F2dKgMXnENv2SRpemScBzBAI4vMelIVYViQxlSE6xA==} dependencies: - '@types/node': 20.10.6 + '@types/node': 20.12.7 '@types/tough-cookie': 4.0.5 form-data: 2.5.1 dev: true @@ -107,8 +298,8 @@ packages: resolution: {integrity: sha512-1m0bIFVc7eJWyve9S0RnuRgcQqF/Xd5QsUZAZeQFr1Q3/p9JWoQQEqmVy+DPTNpGXwhgIetAoYF8JSc33q29QA==} dev: false - /@types/node@20.10.6: - resolution: {integrity: sha512-Vac8H+NlRNNlAmDfGUP7b5h/KA+AtWIzuXy0E6OyP8f1tCLYAtPvKRRDJjAPqhpCb0t6U2j7/xqAuLEebW2kiw==} + /@types/node@20.12.7: + resolution: {integrity: sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg==} dependencies: undici-types: 5.26.5 dev: true @@ -117,12 +308,8 @@ packages: resolution: {integrity: sha512-/Ad8+nIOV7Rl++6f1BdKxFSMgmoqEoYbHRpPcx3JEfv8VRsQe9Z4mCXeJBzxs7mbHY/XOZZuXlRNfhpVPbs6ZA==} dev: true - /@types/triple-beam@1.3.5: - resolution: {integrity: sha512-6WaYesThRMCl19iryMYP7/x2OVgCtbIVflDGFpWnb9irXI3UjYE4AzmYuiUKY1AJstGijoY+MgUszMgRxIYTYw==} - dev: false - - /acorn-walk@8.3.1: - resolution: {integrity: sha512-TgUZgYvqZprrl7YldZNoa9OciCAyZR+Ejm9eXzKCmjsF5IKp/wgQ7Z/ZpjpGTIUPwrHQIcYeI8qDh4PsEwxMbw==} + /acorn-walk@8.3.2: + resolution: {integrity: sha512-cjkyv4OtNCIeqhHrfS81QWXoCBPExR/J62oyEqepVw8WaQeSqpW2uhuLPh1m9eWhDuOo/jUXVTlifvesOWp/4A==} engines: {node: '>=0.4.0'} dev: true @@ -136,18 +323,10 @@ packages: resolution: {integrity: sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==} dev: true - /async@3.2.5: - resolution: {integrity: sha512-baNZyqaaLhyLVKm/DlvdW051MSgO6b8eVfIezl9E5PqWxFgzLm/wQntEW4zOytVburDEr0JlALEpdOFwvErLsg==} - dev: false - /asynckit@0.4.0: resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==} dev: true - /boolbase@1.0.0: - resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==} - dev: false - /cacheable-lookup@7.0.0: resolution: {integrity: sha512-+qJyx4xiKra8mZrcwhjMRMUhD5NR1R8esPkzIYxX96JiecFoxAXFuz/GpR3+ev4PE1WamHip78wV0vcmPQtp8w==} engines: {node: '>=14.16'} @@ -162,69 +341,10 @@ packages: http-cache-semantics: 4.1.1 keyv: 4.5.4 mimic-response: 4.0.0 - normalize-url: 8.0.0 + normalize-url: 8.0.1 responselike: 3.0.0 dev: false - /cheerio-select@2.1.0: - resolution: {integrity: sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==} - dependencies: - boolbase: 1.0.0 - css-select: 5.1.0 - css-what: 6.1.0 - domelementtype: 2.3.0 - domhandler: 5.0.3 - domutils: 3.1.0 - dev: false - - /cheerio@1.0.0-rc.12: - resolution: {integrity: sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q==} - engines: {node: '>= 6'} - dependencies: - cheerio-select: 2.1.0 - dom-serializer: 2.0.0 - domhandler: 5.0.3 - domutils: 3.1.0 - htmlparser2: 8.0.2 - parse5: 7.1.2 - parse5-htmlparser2-tree-adapter: 7.0.0 - dev: false - - /color-convert@1.9.3: - resolution: {integrity: sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==} - dependencies: - color-name: 1.1.3 - dev: false - - /color-name@1.1.3: - resolution: {integrity: sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw==} - dev: false - - /color-name@1.1.4: - resolution: {integrity: sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==} - dev: false - - /color-string@1.9.1: - resolution: {integrity: sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==} - dependencies: - color-name: 1.1.4 - simple-swizzle: 0.2.2 - dev: false - - /color@3.2.1: - resolution: {integrity: sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==} - dependencies: - color-convert: 1.9.3 - color-string: 1.9.1 - dev: false - - /colorspace@1.1.4: - resolution: {integrity: sha512-BgvKJiuVu1igBUF2kEjRCZXol6wiiGbY5ipL/oVPwm0BL9sIpMIzM8IK7vwuxIIzOXMV3Ey5w+vxhm0rR/TN8w==} - dependencies: - color: 3.2.1 - text-hex: 1.0.0 - dev: false - /combined-stream@1.0.8: resolution: {integrity: sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==} engines: {node: '>= 0.8'} @@ -236,25 +356,6 @@ packages: resolution: {integrity: sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==} dev: true - /css-select@5.1.0: - resolution: {integrity: sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==} - dependencies: - boolbase: 1.0.0 - css-what: 6.1.0 - domhandler: 5.0.3 - domutils: 3.1.0 - nth-check: 2.1.1 - dev: false - - /css-what@6.1.0: - resolution: {integrity: sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==} - engines: {node: '>= 6'} - dev: false - - /dayjs@1.11.10: - resolution: {integrity: sha512-vjAczensTgRcqDERK0SR2XMwsF/tSvnvlv6VcF2GIhg6Sx4yOIt/irsr1RDJsKiIyBzJDpCoXiWWq28MqH2cnQ==} - dev: false - /decompress-response@6.0.0: resolution: {integrity: sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==} engines: {node: '>=10'} @@ -277,49 +378,36 @@ packages: engines: {node: '>=0.3.1'} dev: true - /dom-serializer@2.0.0: - resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==} - dependencies: - domelementtype: 2.3.0 - domhandler: 5.0.3 - entities: 4.5.0 - dev: false - - /domelementtype@2.3.0: - resolution: {integrity: sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==} - dev: false - - /domhandler@5.0.3: - resolution: {integrity: sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==} - engines: {node: '>= 4'} - dependencies: - domelementtype: 2.3.0 - dev: false - - /domutils@3.1.0: - resolution: {integrity: sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==} - dependencies: - dom-serializer: 2.0.0 - domelementtype: 2.3.0 - domhandler: 5.0.3 - dev: false - - /enabled@2.0.0: - resolution: {integrity: sha512-AKrN98kuwOzMIdAizXGI86UFBoo26CL21UM763y1h/GMSJ4/OHU9k2YlsmBpyScFo/wbLzWQJBMCW4+IO3/+OQ==} - dev: false - - /entities@4.5.0: - resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==} - engines: {node: '>=0.12'} - dev: false - - /fecha@4.2.3: - resolution: {integrity: sha512-OP2IUU6HeYKJi3i0z4A19kHMQoLVs4Hc+DPqqxI2h/DPZHTm/vjsfC6P0b4jCMy14XizLBqvndQ+UilD7707Jw==} - dev: false - - /fn.name@1.1.0: - resolution: {integrity: sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==} - dev: false + /esbuild@0.19.12: + resolution: {integrity: sha512-aARqgq8roFBj054KvQr5f1sFu0D65G+miZRCuJyJ0G13Zwx7vRar5Zhn2tkQNzIXcBrNVsv/8stehpj+GAjgbg==} + engines: {node: '>=12'} + hasBin: true + requiresBuild: true + optionalDependencies: + '@esbuild/aix-ppc64': 0.19.12 + '@esbuild/android-arm': 0.19.12 + '@esbuild/android-arm64': 0.19.12 + '@esbuild/android-x64': 0.19.12 + '@esbuild/darwin-arm64': 0.19.12 + '@esbuild/darwin-x64': 0.19.12 + '@esbuild/freebsd-arm64': 0.19.12 + '@esbuild/freebsd-x64': 0.19.12 + '@esbuild/linux-arm': 0.19.12 + '@esbuild/linux-arm64': 0.19.12 + '@esbuild/linux-ia32': 0.19.12 + '@esbuild/linux-loong64': 0.19.12 + '@esbuild/linux-mips64el': 0.19.12 + '@esbuild/linux-ppc64': 0.19.12 + '@esbuild/linux-riscv64': 0.19.12 + '@esbuild/linux-s390x': 0.19.12 + '@esbuild/linux-x64': 0.19.12 + '@esbuild/netbsd-x64': 0.19.12 + '@esbuild/openbsd-x64': 0.19.12 + '@esbuild/sunos-x64': 0.19.12 + '@esbuild/win32-arm64': 0.19.12 + '@esbuild/win32-ia32': 0.19.12 + '@esbuild/win32-x64': 0.19.12 + dev: true /form-data-encoder@2.1.4: resolution: {integrity: sha512-yDYSgNMraqvnxiEXO4hi88+YZxaHC6QKzb5N84iRCTDeRO7ZALpir/lVmf/uXUhnwUr2O4HU8s/n6x+yNjQkHw==} @@ -335,11 +423,25 @@ packages: mime-types: 2.1.35 dev: true + /fsevents@2.3.3: + resolution: {integrity: sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==} + engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0} + os: [darwin] + requiresBuild: true + dev: true + optional: true + /get-stream@6.0.1: resolution: {integrity: sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg==} engines: {node: '>=10'} dev: false + /get-tsconfig@4.7.3: + resolution: {integrity: sha512-ZvkrzoUA0PQZM6fy6+/Hce561s+faD1rsNwhnO5FelNjyy7EMGJ3Rz1AQ8GYDWjhRs/7dBLOEJvhK8MiEJOAFg==} + dependencies: + resolve-pkg-maps: 1.0.0 + dev: true + /got@13.0.0: resolution: {integrity: sha512-XfBk1CxOOScDcMr9O1yKkNaQyy865NbYs+F7dr4H0LZMVgCj2Le59k6PqbNHoL5ToeaEQUYh6c6yMfVcc6SJxA==} engines: {node: '>=16'} @@ -357,15 +459,6 @@ packages: responselike: 3.0.0 dev: false - /htmlparser2@8.0.2: - resolution: {integrity: sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==} - dependencies: - domelementtype: 2.3.0 - domhandler: 5.0.3 - domutils: 3.1.0 - entities: 4.5.0 - dev: false - /http-cache-semantics@4.1.1: resolution: {integrity: sha512-er295DKPVsV82j5kw1Gjt+ADA/XYHsajl82cGNQG2eyoPkvgUhX+nDIyelzhIWbbsXP39EHcI6l5tYs2FYqYXQ==} dev: false @@ -378,19 +471,6 @@ packages: resolve-alpn: 1.2.1 dev: false - /inherits@2.0.4: - resolution: {integrity: sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==} - dev: false - - /is-arrayish@0.3.2: - resolution: {integrity: sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==} - dev: false - - /is-stream@2.0.1: - resolution: {integrity: sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==} - engines: {node: '>=8'} - dev: false - /json-buffer@3.0.1: resolution: {integrity: sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ==} dev: false @@ -401,22 +481,6 @@ packages: json-buffer: 3.0.1 dev: false - /kuler@2.0.0: - resolution: {integrity: sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A==} - dev: false - - /logform@2.6.0: - resolution: {integrity: sha512-1ulHeNPp6k/LD8H91o7VYFBng5i1BDE7HoKxVbZiGFidS1Rj65qcywLxX+pVfAPoQJEjRdvKcusKwOupHCVOVQ==} - engines: {node: '>= 12.0.0'} - dependencies: - '@colors/colors': 1.6.0 - '@types/triple-beam': 1.3.5 - fecha: 4.2.3 - ms: 2.1.3 - safe-stable-stringify: 2.4.3 - triple-beam: 1.4.1 - dev: false - /lowercase-keys@3.0.0: resolution: {integrity: sha512-ozCC6gdQ+glXOQsveKD0YsDy8DSQFjDTz4zyzEHNV5+JP5D62LmfDZ6o1cycFx9ouG940M5dE8C8CTewdj2YWQ==} engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} @@ -448,63 +512,34 @@ packages: engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} dev: false - /ms@2.1.3: - resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==} + /node-url-utils@0.4.0: + resolution: {integrity: sha512-cg9J2VQ0nNvrZR+edoDaAoInvGRgi2r/LCSwOwNCgVASBo0rQiDbSwa4s0MpEcpPf1p4qokC14BXMx5TPxSU5w==} + engines: {node: '>=0.2.x'} dev: false - /normalize-url@8.0.0: - resolution: {integrity: sha512-uVFpKhj5MheNBJRTiMZ9pE/7hD1QTeEvugSJW/OmLzAp78PB5O6adfMNTvmfKhXBkvCzC+rqifWcVYpGFwTjnw==} + /normalize-url@8.0.1: + resolution: {integrity: sha512-IO9QvjUMWxPQQhs60oOu10CRkWCiZzSUkzbXGGV9pviYl1fXYcvkzQ5jV9z8Y6un8ARoVRl4EtC6v6jNqbaJ/w==} engines: {node: '>=14.16'} dev: false - /nth-check@2.1.1: - resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==} - dependencies: - boolbase: 1.0.0 - dev: false - - /one-time@1.0.0: - resolution: {integrity: sha512-5DXOiRKwuSEcQ/l0kGCF6Q3jcADFv5tSmRaJck/OqkVFcOzutB134KRSfF0xDrL39MNnqxbHBbUUcjZIhTgb2g==} - dependencies: - fn.name: 1.1.0 - dev: false - /p-cancelable@3.0.0: resolution: {integrity: sha512-mlVgR3PGuzlo0MmTdk4cXqXWlwQDLnONTAg6sm62XkMJEiRxN3GL3SffkYvqwonbkJBcrI7Uvv5Zh9yjvn2iUw==} engines: {node: '>=12.20'} dev: false - /parse5-htmlparser2-tree-adapter@7.0.0: - resolution: {integrity: sha512-B77tOZrqqfUfnVcOrUvfdLbz4pu4RopLD/4vmu3HUPswwTA8OH0EMW9BlWR2B0RCoiZRAHEUu7IxeP1Pd1UU+g==} - dependencies: - domhandler: 5.0.3 - parse5: 7.1.2 - dev: false - - /parse5@7.1.2: - resolution: {integrity: sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==} - dependencies: - entities: 4.5.0 - dev: false - /quick-lru@5.1.1: resolution: {integrity: sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==} engines: {node: '>=10'} dev: false - /readable-stream@3.6.2: - resolution: {integrity: sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==} - engines: {node: '>= 6'} - dependencies: - inherits: 2.0.4 - string_decoder: 1.3.0 - util-deprecate: 1.0.2 - dev: false - /resolve-alpn@1.2.1: resolution: {integrity: sha512-0a1F4l73/ZFZOakJnQ3FvkJ2+gSTQWz/r2KE5OdDY0TxPm5h4GkqkWWfM47T7HsbnOtcJVEF4epCVy6u7Q3K+g==} dev: false + /resolve-pkg-maps@1.0.0: + resolution: {integrity: sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==} + dev: true + /responselike@3.0.0: resolution: {integrity: sha512-40yHxbNcl2+rzXvZuVkrYohathsSJlMTXKryG5y8uciHv1+xDLHQpgjG64JUO9nrEq2jGLH6IZ8BcZyw3wrweg==} engines: {node: '>=14.16'} @@ -512,41 +547,13 @@ packages: lowercase-keys: 3.0.0 dev: false - /safe-buffer@5.2.1: - resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==} - dev: false - - /safe-stable-stringify@2.4.3: - resolution: {integrity: sha512-e2bDA2WJT0wxseVd4lsDP4+3ONX6HpMXQa1ZhFQ7SU+GjvORCmShbCMltrtIDfkYhVHrOcPtj+KhmDBdPdZD1g==} - engines: {node: '>=10'} - dev: false - - /simple-swizzle@0.2.2: - resolution: {integrity: sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==} - dependencies: - is-arrayish: 0.3.2 - dev: false - - /stack-trace@0.0.10: - resolution: {integrity: sha512-KGzahc7puUKkzyMt+IqAep+TVNbKP+k2Lmwhub39m1AsTSkaDutx56aDCo+HLDzf/D26BIHTJWNiTG1KAJiQCg==} - dev: false - - /string_decoder@1.3.0: - resolution: {integrity: sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==} + /seenreq@3.0.0: + resolution: {integrity: sha512-wSe7hb83TKkyweL8Jq5a1xuStmqfwxiJn2SXjA/Wns42aUJjlWzPzj/jWaomOCRY5ZpIRkiyh/+5pNz/20363A==} dependencies: - safe-buffer: 5.2.1 - dev: false - - /text-hex@1.0.0: - resolution: {integrity: sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==} + node-url-utils: 0.4.0 dev: false - /triple-beam@1.4.1: - resolution: {integrity: sha512-aZbgViZrg1QNcG+LULa7nhZpJTZSLm/mXnHXnbAbjmN5aSa0y7V+wvv6+4WaBtpISJzThKy+PIPxc1Nq1EJ9mg==} - engines: {node: '>= 14.0.0'} - dev: false - - /ts-node@10.9.2(@types/node@20.10.6)(typescript@5.3.3): + /ts-node@10.9.2(@types/node@20.12.7)(typescript@5.4.5): resolution: {integrity: sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==} hasBin: true peerDependencies: @@ -561,24 +568,35 @@ packages: optional: true dependencies: '@cspotcode/source-map-support': 0.8.1 - '@tsconfig/node10': 1.0.9 + '@tsconfig/node10': 1.0.11 '@tsconfig/node12': 1.0.11 '@tsconfig/node14': 1.0.3 '@tsconfig/node16': 1.0.4 - '@types/node': 20.10.6 + '@types/node': 20.12.7 acorn: 8.11.3 - acorn-walk: 8.3.1 + acorn-walk: 8.3.2 arg: 4.1.3 create-require: 1.1.1 diff: 4.0.2 make-error: 1.3.6 - typescript: 5.3.3 + typescript: 5.4.5 v8-compile-cache-lib: 3.0.1 yn: 3.1.1 dev: true - /typescript@5.3.3: - resolution: {integrity: sha512-pXWcraxM0uxAS+tN0AG/BF2TyqmHO014Z070UsJ+pFvYuRSq8KH8DmWpnbXe0pEPDHXZV3FcAbJkijJ5oNEnWw==} + /tsx@4.7.3: + resolution: {integrity: sha512-+fQnMqIp/jxZEXLcj6WzYy9FhcS5/Dfk8y4AtzJ6ejKcKqmfTF8Gso/jtrzDggCF2zTU20gJa6n8XqPYwDAUYQ==} + engines: {node: '>=18.0.0'} + hasBin: true + dependencies: + esbuild: 0.19.12 + get-tsconfig: 4.7.3 + optionalDependencies: + fsevents: 2.3.3 + dev: true + + /typescript@5.4.5: + resolution: {integrity: sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==} engines: {node: '>=14.17'} hasBin: true dev: true @@ -587,40 +605,10 @@ packages: resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==} dev: true - /util-deprecate@1.0.2: - resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==} - dev: false - /v8-compile-cache-lib@3.0.1: resolution: {integrity: sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==} dev: true - /winston-transport@4.6.0: - resolution: {integrity: sha512-wbBA9PbPAHxKiygo7ub7BYRiKxms0tpfU2ljtWzb3SjRjv5yl6Ozuy/TkXf00HTAt+Uylo3gSkNwzc4ME0wiIg==} - engines: {node: '>= 12.0.0'} - dependencies: - logform: 2.6.0 - readable-stream: 3.6.2 - triple-beam: 1.4.1 - dev: false - - /winston@3.11.0: - resolution: {integrity: sha512-L3yR6/MzZAOl0DsysUXHVjOwv8mKZ71TrA/41EIduGpOOV5LQVodqN+QdQ6BS6PJ/RdIshZhq84P/fStEZkk7g==} - engines: {node: '>= 12.0.0'} - dependencies: - '@colors/colors': 1.6.0 - '@dabh/diagnostics': 2.0.3 - async: 3.2.5 - is-stream: 2.0.1 - logform: 2.6.0 - one-time: 1.0.0 - readable-stream: 3.6.2 - safe-stable-stringify: 2.4.3 - stack-trace: 0.0.10 - triple-beam: 1.4.1 - winston-transport: 4.6.0 - dev: false - /yn@3.1.1: resolution: {integrity: sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==} engines: {node: '>=6'} diff --git a/src/crawler.ts b/src/crawler.ts index e06fa9a..fb39dc5 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -1,30 +1,22 @@ import { EventEmitter } from "events"; -import got from "got"; import { RateLimiter, Cluster } from "./rateLimiter/index.js"; -import { getType, isValidUrl, setDefaults, flattenDeep } from "./lib/utils.js"; +import { getType, isValidUrl, isFunction, setDefaults, flattenDeep } from "./lib/utils.js"; import type { crawlerOptions, requestOptions } from "./types/crawler.js"; - -const normalizeContentType = (contentType: string) => { - //@todo -}; -const crawler = (options: crawlerOptions) => {}; -// 导入依赖库 +import { promisify } from "util"; +import got from "got"; +import seenreq from "seenreq"; // 定义 Crawler 类 -class Crawler extends EventEmitter{ +class Crawler extends EventEmitter { private _limiters: Cluster; public options: crawlerOptions; public globalOnlyOptions: string[]; - // private limiters: Bottleneck.Cluster; - // private http2Connections: Record; - // //private log: (level: string, message: string) => void; - // private seen: seenreq; + public seen: any; constructor(options: crawlerOptions) { super(); const defaultOptions: crawlerOptions = { - autoWindowClose: true, forceUTF8: true, gzip: true, incomingEncoding: null, @@ -105,7 +97,21 @@ class Crawler extends EventEmitter{ try { options = this._getValidOptions(options) as requestOptions; setDefaults(options, this.options); - options.headers = {...this.options.headers, ...options.headers}; + options.headers = { ...this.options.headers, ...options.headers }; + this.globalOnlyOptions.forEach(globalOnlyOption => { + delete (options as any)[globalOnlyOption]; + }); + if (!this.options.skipDuplicates) { + this._schedule(options); + } + this.seen + .exists(options, options.seenreq) + .then((rst: any) => { + if (!rst) { + this._schedule(options); + } + }) + .catch((err: any) => console.error(err)); } catch (err) { console.warn(err); } @@ -114,54 +120,126 @@ class Crawler extends EventEmitter{ private _schedule = async (options: requestOptions): Promise => { this.emit("schedule", options); - const limiter = this._limiters.createRateLimiter(options.rateLimit); - limiter.submit(options.priority, async () => { - try { - await this._execute(options); - } catch (err) { - console.warn(err); + this._limiters.getRateLimiter(options.rateLimit).submit(options.priority, (done, limiter) => { + options.release = () => { + done(); + this.emit("_release"); + }; + options.callback = options.callback || options.release; + + if (limiter) { + this.emit("limiterChange", options, limiter); + } + + if (options.html) { + this._onContent(null, options, { body: options.html, headers: { "content-type": "text/html" } }); + } else if (typeof options.uri === "function") { + options.uri(uri => { + options.uri = uri; + this._execute(options); + }); + } else { + this._execute(options); } }); - } + }; private _execute = async (options: Partial): Promise => { - options.retries = options.retries || this.options.retries || 0; - // delete all globalonly options - // this.globalOnlyOptions.forEach(globalOnlyOption => { - // delete options[globalOnlyOption]; - // }); - // logger.debug(`${options.method} ${options.uri}`); - if (!options.headers) { - options.headers = {}; + const reqOptions = { ...options }; + reqOptions.headers = reqOptions.headers ?? {}; + if (reqOptions.forceUTF8 || reqOptions.json) reqOptions.encoding = null; + if (reqOptions.rotateUA && Array.isArray(reqOptions.userAgent)) { + reqOptions.headers["user-agent"] = + reqOptions.userAgent[Math.floor(Math.random() * reqOptions.userAgent.length)]; + } else { + reqOptions.headers["user-agent"] = reqOptions.userAgent; } - if (options.forceUTF8) { - options.headers["Accept-Charset"] = "utf-8"; - options.encoding = null; + + if (reqOptions.referer) { + reqOptions.headers.referer = reqOptions.referer; } - if (options.json) { - options.encoding = null; + if (reqOptions.proxies) { + reqOptions.proxy = reqOptions.proxies[Math.floor(Math.random() * reqOptions.proxies.length)]; } - if (options.userAgent) { - if (this.options.rotateUA && Array.isArray(options.userAgent)) { - // If "rotateUA" is true, rotate User-Agent - options.headers["User-Agent"] = options.userAgent[0]; - options.userAgent.push(options.userAgent.shift()); - } else { - options.headers["User-Agent"] = options.userAgent; + if (isFunction(reqOptions.preRequest)) { + try { + await promisify(reqOptions.preRequest as any)(reqOptions); + } catch (err) { + console.error(err); } } - if (options.referer) { - options.headers["Referer"] = options.referer; + if (reqOptions.skipEventRequest !== true) { + this.emit("request", reqOptions); } - if (options.proxies && options.proxies.length) { - options.proxy = options.proxies[0]; + try { + const response = await got(reqOptions.uri, reqOptions); + this._handler((error = null), reqOptions, response); + } catch (error) { + console.log("error:", error); + this._handler(error, reqOptions); } }; + private _handler = (error: Error | null, options: requestOptions, response?: any): void => { + if (error) { + options.callback(error, { options: options }, options.release); + // switch (error.code) { + // case "NOHTTP2SUPPORT": + // //if the enviroment is not support http2 api, all request rely on http2 protocol + // //are aborted immediately no matter how many retry times left + // log( + // "error", + // "Error " + error + " when fetching " + (options.uri || options.url) + " skip all retry times" + // ); + // break; + // default: + // log( + // "error", + // "Error " + + // error + + // " when fetching " + + // (options.uri || options.url) + + // (options.retries ? " (" + options.retries + " retries left)" : "") + // ); + // if (options.retries) { + // setTimeout(function () { + // options.retries--; + // self._schedule(options); + // options.release(); + // }, options.retryTimeout); + // return; + // } + // break; + // } + return; + } + + if (!response.body) { + response.body = ""; + } + console.debug("Got " + (options.uri || "html") + " (" + response.body.length + " bytes)..."); + + try { + self._doEncoding(options, response); + } catch (e) { + return options.callback(e, { options: options }, options.release); + } + + response.options = options; + + if (options.method === "HEAD" || !options.jQuery) { + return options.callback(null, response, options.release); + } + const injectableTypes = ["html", "xhtml", "text/xml", "application/xml", "+xml"]; + if (!options.html && !typeis(contentType(response), injectableTypes)) { + log("warn", "response body is not HTML, skip injecting. Set jQuery to false to suppress this message"); + return options.callback(null, response, options.release); + } + + }; private get queueSize(): number { return 0; } - } export default Crawler; diff --git a/src/lib/utils.ts b/src/lib/utils.ts index 85e5bbf..5d3bc54 100644 --- a/src/lib/utils.ts +++ b/src/lib/utils.ts @@ -1,10 +1,18 @@ /** - * * @returns type of param, a lower case string */ export const getType = (value: unknown): string => Object.prototype.toString.call(value).slice(8, -1).toLocaleLowerCase(); export const isNumber = (value: unknown): boolean => getType(value) === "number" && !isNaN(value as number); +export const isFunction = (value: unknown): boolean => getType(value) === "function"; +/** + * @param target + * @param source + * @returns target with source's properties added if they don't exist in target + * @description + * This function is used to set default values for an object. + * Add properties from source to target if they don't exist in target. + */ export const setDefaults = (target: Record, source: Record) => { for (const key in source) { if (target[key] === undefined) { @@ -23,7 +31,14 @@ export const isValidUrl = (url: string): boolean => { }; export function flattenDeep(array: T[]): T[]; - +/** + * + * @param array + * @returns a flattened array + * @description + * Flattens an array of arrays recursively. + * + */ export function flattenDeep(array: any[]): any[] { const result: any[] = []; array.forEach(element => { @@ -35,3 +50,14 @@ export function flattenDeep(array: any[]): any[] { }); return result; } + + +export function pick(obj: T, keys: K[]): Pick { + const result: any = {}; + keys.forEach(key => { + if (key in obj) { + result[key] = obj[key]; + } + }); + return result; +} \ No newline at end of file diff --git a/src/rateLimiter/cluster.ts b/src/rateLimiter/cluster.ts index d3f37c6..d975a37 100644 --- a/src/rateLimiter/cluster.ts +++ b/src/rateLimiter/cluster.ts @@ -5,7 +5,7 @@ export type ClusterOptions = RateLimiterOptions & { }; class Cluster { - private _rateLimiters: Record; + private _rateLimiters: Record; private _homogeneous: boolean; private _interval: NodeJS.Timeout | null = null; @@ -23,8 +23,10 @@ class Cluster { this._homogeneous = homogeneous || false; this._rateLimiters = {}; } - - createRateLimiter(id: string = ""): RateLimiter | undefined { + /** + * Alternative to Old Cluster.prototype.key + */ + getRateLimiter(id: number = 0): RateLimiter { if (!this._rateLimiters[id]) { this._rateLimiters[id] = new RateLimiter({ "maxConnections": this.globalMaxConnections, @@ -36,15 +38,14 @@ class Cluster { this._rateLimiters[id].setId(id); return this._rateLimiters[id]; } else { - console.error("RateLimiter with id: " + id + " already exists"); - return void 0; + return this._rateLimiters[id]; } } - hasRateLimiter(id: string = ""): boolean { + hasRateLimiter(id: number = 0): boolean { return !!this._rateLimiters[id]; } - deleteRateLimiter(id: string = ""): boolean { + deleteRateLimiter(id: number = 0): boolean { return delete this._rateLimiters[id]; } @@ -67,14 +68,27 @@ class Cluster { } dequeue(): TaskWrapper | undefined { - Object.keys(this._rateLimiters).forEach(id => { + // for (const id in this._rateLimiters) { + // if (this._rateLimiters[id].waitingSize) { + // return { + // next: this._rateLimiters[id].dequeue(), + // rateLimiterId: id, + // }; + // } else { + // delete this._rateLimiters[id]; + // } + // } + Object.keys(this._rateLimiters).forEach(key => { + const id = Number(key); if (this._rateLimiters[id].waitingSize) { return { - next: this._rateLimiters[id].dequeue(), - rateLimiterId: id, + "next": this._rateLimiters[id].dequeue(), + "rateLimiterId": parseInt(id, 10), }; - } else delete this._rateLimiters[id]; - }); + } else { + this.deleteRateLimiter(parseInt(id, 10)); + } + } return void 0; } diff --git a/src/rateLimiter/rateLimiter.ts b/src/rateLimiter/rateLimiter.ts index 27049ed..769e919 100644 --- a/src/rateLimiter/rateLimiter.ts +++ b/src/rateLimiter/rateLimiter.ts @@ -7,7 +7,7 @@ export type Task = { export type TaskWrapper = { next: Task; - rateLimiterId: string | null; + rateLimiterId: number | null; } export type RateLimiterOptions = { @@ -22,7 +22,7 @@ class RateLimiter { private _waitingTasks: multiPriorityQueue; private _cluster?: Cluster; - public id?: string; + public id?: number; public maxConnections: number; public nextRequestTime: number; public rateLimit: number; @@ -55,7 +55,7 @@ class RateLimiter { return this.waitingSize > 0 || (this._cluster !== void 0 && this._cluster.hasWaitingTasks()); } - setId(id: string) { + setId(id: number) { this.id = id; } diff --git a/src/test.ts b/src/test.ts deleted file mode 100644 index 705f021..0000000 --- a/src/test.ts +++ /dev/null @@ -1,5 +0,0 @@ -const a = Object.getPrototypeOf(Object.prototype) -const b = { - "a": 1, -} -console.log(a) \ No newline at end of file diff --git a/src/types/crawler.d.ts b/src/types/crawler.d.ts index 4dd31d3..d16b915 100644 --- a/src/types/crawler.d.ts +++ b/src/types/crawler.d.ts @@ -1,6 +1,7 @@ declare global { var mainModule: string; } + type crawlerOptions = { headers: Record; autoWindowClose: boolean; @@ -26,8 +27,11 @@ type crawlerOptions = { debug?: boolean; logger?: any; seenreq?: any; -} +}; type requestOptions = crawlerOptions & { + preRequest: (options: requestOptions, done: (error: Error | null, options: requestOptions) => void) => void; + release: () => void; + callback?: (error: Error, response: unknown, done: unknown) => void; uri: string; userAgent: string; headers: Record; diff --git a/src/types/index.d.ts b/src/types/index.d.ts new file mode 100644 index 0000000..bd373a2 --- /dev/null +++ b/src/types/index.d.ts @@ -0,0 +1 @@ +declare module "seenreq"; \ No newline at end of file diff --git a/test.ts b/test.ts new file mode 100644 index 0000000..688b060 --- /dev/null +++ b/test.ts @@ -0,0 +1,4 @@ + +let a = Number(5) + +console.log(a) diff --git a/tsconfig.json b/tsconfig.json index 9e78492..1f12a2f 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -110,12 +110,8 @@ }, "include": [ "src/**/*" - ], +, "test.ts" ], "exclude": [ "src/seenreq" - ], - "ts-node": { - "esm": true, - "transpileOnly": true - } + ] }