Skip to content

Commit d3dd9e8

Browse files
committed
Replace jsdom with cheerio
1 parent 97f7a32 commit d3dd9e8

12 files changed

+254
-99
lines changed

.idea/codeStyleSettings.xml

+13
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/dictionaries/Thomas.xml

+3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/encodings.xml

+5
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/inspectionProfiles/Project_Default.xml

+23
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/inspectionProfiles/profiles_settings.xml

+7
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/misc.xml

+79
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/modules.xml

+9
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/node-scraper.iml

+9
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/scopes/scope_settings.xml

+5
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/vcs.xml

+7
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/scraper.js

+80-85
Original file line number | Diff line number | Diff line change
@@ -1,85 +1,80 @@
1-
var request = require('request');
2-
var jsdom = require('jsdom');
3-
var requestDefaults = {
4-
'uri': null
5-
, 'headers': {
6-
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
7-
}
8-
};
9-
var fetchDefaults = {
10-
'reqPerSec': 0
11-
};
12-
module.exports = function scrape(requestOptions, callback, fetchOptions) {
13-
if (!fetchOptions) {
14-
fetchOptions = {};
15-
}
16-
if (!callback) {
17-
callback = function(){};
18-
}
19-
Object.keys(fetchDefaults).forEach(function(key) {
20-
if (fetchOptions[key] === undefined) {
21-
fetchOptions[key] = fetchDefaults[key]
22-
}
23-
});
24-
25-
var fetches = [];
26-
var queue = [];
27-
28-
if (!Array.isArray(requestOptions)) {
29-
fetches.push(requestOptions);
30-
} else {
31-
fetches = requestOptions;
32-
}
33-
34-
fetches.forEach(function(requestOptions, index) {
35-
queue.push(function() {
36-
Object.keys(requestDefaults).forEach(function(key) {
37-
requestOptions[key] = requestOptions[key] || requestDefaults[key];
38-
});
39-
if (typeof requestOptions === 'string') {
40-
requestOptions = {
41-
'uri': requestOptions
42-
}
43-
}
44-
45-
if (!requestOptions['uri']) {
46-
callback(new Error('You must supply an uri.'), null, null);
47-
}
48-
49-
request(requestOptions, function (err, response, body) {
50-
body = body.replace(/<(\/?)script/g, '<$1nobreakage');
51-
setTimeout(runNextFetch, timeSpacing);
52-
if (err) {
53-
callback(err, null, null);
54-
}
55-
if (response && response.statusCode == 200) {
56-
var window = jsdom.jsdom().createWindow();
57-
jsdom.jQueryify(window, __dirname+'/../deps/jquery-1.6.1.min.js', function(win, $) {
58-
$('head').append($(body).find('head').html());
59-
$('body').append($(body).find('body').html());
60-
callback(null, $);
61-
});
62-
} else {
63-
callback(new Error('Request to '+requestOptions['uri']+' ended with status code: '+(typeof response !== 'undefined' ? response.statusCode : 'unknown')), null, null);
64-
}
65-
});
66-
})
67-
});
68-
69-
var concurrentConnections = !fetchOptions['reqPerSec'] ? queue.length : (Math.floor(fetchOptions['reqPerSec']) || 1);
70-
var timeSpacing = !fetchOptions['reqPerSec'] ? 0 : 1000/fetchOptions['reqPerSec'];
71-
72-
for (var i=0; i < concurrentConnections; i++) {
73-
runNextFetch(i);
74-
};
75-
76-
function runNextFetch(i) {
77-
if (!i) {
78-
i = 0;
79-
}
80-
if (queue[i]) {
81-
queue[i]();
82-
queue.shift();
83-
}
84-
}
85-
};
1+
var request = require('request');
2+
var cheerio = require('cheerio');
3+
4+
var requestDefaults = {
5+
'uri': null
6+
, 'headers': {
7+
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
8+
}
9+
};
10+
var fetchDefaults = {
11+
'reqPerSec': 0
12+
};
13+
module.exports = function scrape(requestOptions, callback, fetchOptions) {
14+
if (!fetchOptions) {
15+
fetchOptions = {};
16+
}
17+
if (!callback) {
18+
callback = function(){};
19+
}
20+
Object.keys(fetchDefaults).forEach(function(key) {
21+
if (fetchOptions[key] === undefined) {
22+
fetchOptions[key] = fetchDefaults[key]
23+
}
24+
});
25+
26+
var fetches = [];
27+
var queue = [];
28+
29+
if (!Array.isArray(requestOptions)) {
30+
fetches.push(requestOptions);
31+
} else {
32+
fetches = requestOptions;
33+
}
34+
35+
fetches.forEach(function(requestOptions, index) {
36+
queue.push(function() {
37+
Object.keys(requestDefaults).forEach(function(key) {
38+
requestOptions[key] = requestOptions[key] || requestDefaults[key];
39+
});
40+
if (typeof requestOptions === 'string') {
41+
requestOptions = {
42+
'uri': requestOptions
43+
}
44+
}
45+
46+
if (!requestOptions['uri']) {
47+
callback(new Error('You must supply an uri.'), null, null);
48+
}
49+
50+
request(requestOptions, function (err, response, body) {
51+
body = body.replace(/<(\/?)script/g, '<$1nobreakage');
52+
setTimeout(runNextFetch, timeSpacing);
53+
if (err) {
54+
callback(err, null, null);
55+
}else if (response && response.statusCode === 200) {
56+
callback(null, cheerio.load(body));
57+
} else {
58+
callback(new Error('Request to '+requestOptions['uri']+' ended with status code: '+(typeof response !== 'undefined' ? response.statusCode : 'unknown')), null, null);
59+
}
60+
});
61+
})
62+
});
63+
64+
var concurrentConnections = !fetchOptions['reqPerSec'] ? queue.length : (Math.floor(fetchOptions['reqPerSec']) || 1);
65+
var timeSpacing = !fetchOptions['reqPerSec'] ? 0 : 1000/fetchOptions['reqPerSec'];
66+
67+
for (var i=0; i < concurrentConnections; i++) {
68+
runNextFetch(i);
69+
};
70+
71+
function runNextFetch(i) {
72+
if (!i) {
73+
i = 0;
74+
}
75+
if (queue[i]) {
76+
queue[i]();
77+
queue.shift();
78+
}
79+
}
80+
};

package.json

+14-14
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,14 @@
1-
{
2-
"name" : "scraper",
3-
"description" : "Easier web scraping using jQuery.",
4-
"version" : "0.0.9",
5-
"author" : "Mathias Pettersson <[email protected]>",
6-
"engines" : ["node"],
7-
"directories" : { "lib" : "./lib" },
8-
"main" : "./lib/scraper",
9-
"repository" : { "type":"git", "url":"https://github.com/mape/node-scraper.git" },
10-
"dependencies" : {
11-
"request" : ">=0.10.0",
12-
"jsdom" : ">=0.1.20"
13-
}
14-
}
1+
{
2+
"name" : "scraper",
3+
"description" : "Easier web scraping using jQuery.",
4+
"version" : "0.0.9",
5+
"author" : "Mathias Pettersson <[email protected]>",
6+
"engines" : ["node"],
7+
"directories" : { "lib" : "./lib" },
8+
"main" : "./lib/scraper",
9+
"repository" : { "type":"git", "url":"https://github.com/mape/node-scraper.git" },
10+
"dependencies" : {
11+
"request" : ">=0.10.0",
12+
"cheerio":">=0.8.0"
13+
}
14+
}

0 commit comments

Comments
 (0)