Skip to content

Commit 6320b2f

Browse files
committed
Initial commit of node-scraper
0 parents  commit 6320b2f

File tree

7 files changed

+288
-0
lines changed

7 files changed

+288
-0
lines changed

LICENSE

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
Copyright (c) 2010 Mathias Pettersson, [email protected]
2+
3+
Permission is hereby granted, free of charge, to any person obtaining
4+
a copy of this software and associated documentation files (the
5+
"Software"), to deal in the Software without restriction, including
6+
without limitation the rights to use, copy, modify, merge, publish,
7+
distribute, sublicense, and/or sell copies of the Software, and to
8+
permit persons to whom the Software is furnished to do so, subject to
9+
the following conditions:
10+
11+
The above copyright notice and this permission notice shall be
12+
included in all copies or substantial portions of the Software.
13+
14+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

README.md

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# node-scraper
2+
3+
A little module that makes scraping websites a little easier. Uses node.js and jQuery.
4+
5+
## Installation
6+
7+
Via [npm](http://github.com/isaacs/npm):
8+
9+
$ npm install scraper
10+
11+
## Usage
12+
13+
### Simple
14+
The first argument is a URL string; the second is a callback that receives a jQuery object with the scraped page loaded as its "body".
15+
16+
var scraper = require('scraper');
17+
scraper('http://search.twitter.com/search?q=javascript', function(err, jQuery) {
18+
if (err) {throw err}
19+
20+
jQuery('.msg').each(function() {
21+
console.log(jQuery(this).text().trim()+'\n');
22+
});
23+
});
24+
### Advanced
25+
The first argument is an object containing settings for the "request" call used internally; the second is a callback that receives a jQuery object with the scraped page loaded as its "body".
26+
27+
var scraper = require('scraper');
28+
scraper({
29+
'uri': 'http://search.twitter.com/search?q=nodejs'
30+
, 'headers': {
31+
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
32+
}}
33+
, function(err, $) {
34+
if (err) {throw err}
35+
36+
$('.msg').each(function() {
37+
console.log($(this).text().trim()+'\n');
38+
});
39+
});

deps/jquery-1.4.2.min.js

+154
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

examples/advanced.js

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Advanced usage: pass a request-options object (URI plus custom headers)
// and print the trimmed text of every element matching `.msg`.
var scraper = require('scraper');

var requestOptions = {
    'uri': 'http://search.twitter.com/search?q=nodejs',
    'headers': {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
    }
};

// Callback for the scraper: rethrows any error, otherwise logs each match
// followed by a newline.
function printMessages(err, $) {
    if (err) {
        throw err;
    }

    $('.msg').each(function() {
        var text = $(this).text().trim();
        console.log(text + '\n');
    });
}

scraper(requestOptions, printMessages);

examples/simple.js

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// Simple usage: pass the URL as a plain string and print the trimmed text
// of every element matching `.msg`.
var scraper = require('scraper');

var searchUrl = 'http://search.twitter.com/search?q=javascript';

// Callback for the scraper: rethrows any error, otherwise logs each match
// followed by a newline.
function printMessages(err, $) {
    if (err) {
        throw err;
    }

    $('.msg').each(function() {
        var text = $(this).text().trim();
        console.log(text + '\n');
    });
}

scraper(searchUrl, printMessages);

lib/scraper.js

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
var request = require('request');
2+
var jsdom = require('jsdom');
3+
4+
var defaults = {
5+
'uri': null
6+
, 'headers': {
7+
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
8+
}
9+
};
10+
module.exports = function scrape(requestOptions, callback) {
11+
var settings = {};
12+
Object.keys(defaults).forEach(function(key) {
13+
settings[key] = requestOptions[key] || defaults[key];
14+
});
15+
16+
if (typeof requestOptions === 'string') {
17+
settings['uri'] = requestOptions;
18+
}
19+
20+
if (!settings['uri']) {
21+
callback(new Error('You must supply an uri.'), null, null);
22+
}
23+
24+
request(settings, function (err, response, body) {
25+
if (err) {
26+
callback(err, null, null);
27+
}
28+
if (response.statusCode == 200) {
29+
var window = jsdom.jsdom().createWindow();
30+
jsdom.jQueryify(window, '../deps/jquery-1.4.2.min.js' , function() {
31+
window.$('body').append(body);
32+
callback(null, window.$);
33+
});
34+
} else {
35+
callback(new Error('Request to '+settings['uri']+' ended with status code: '+response.statusCode), null, null);
36+
}
37+
});
38+
};

package.json

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"name" : "scraper",
3+
"description" : "Easier web scraping using jQuery.",
4+
"version" : "0.0.1",
5+
"author" : "Mathias Pettersson <[email protected]>",
6+
"engines" : ["node"],
7+
"directories" : { "lib" : "./lib" },
8+
"main" : "./lib/scraper",
9+
"repository" : { "type":"git", "url":"https://github.com/mape/node-scraper.git" },
10+
"dependencies" : {
11+
"request" : ">=0.10.0",
12+
"jsdom" : ">=0.1.20"
13+
}
14+
}

0 commit comments

Comments
 (0)