Skip to content

Commit

Permalink
now with tests for the fifa scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
sanx committed Jun 29, 2014
1 parent 252b457 commit a7438fc
Show file tree
Hide file tree
Showing 15 changed files with 2,117 additions and 105 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ client/build
bower_components
.module-cache
build
.DS_Store
3 changes: 3 additions & 0 deletions config/development.config.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
{
"sources": {
"fifa": {
"useCacheMatchesPage": true,
"useCacheMatchPages": true,
"useCachePdfs": false,
"domain": "www.fifa.com",
"all_matches_path": "/worldcup/matches/index.html"
}
Expand Down
5 changes: 3 additions & 2 deletions data_not_checked_in/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
fifa14-all-data.json
matches_index.html
*.pdf
*.html
*.json
41 changes: 14 additions & 27 deletions lib/matches_info_from_html.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
var _ = require('lodash');
var Q = require('q');
var jsdom = require('jsdom');
var util = require('util');
Expand All @@ -7,7 +8,7 @@ var jsdomEnv = Q.nbind(jsdom.env, jsdom);
var getMatchesInfo = function (html) {
return jsdomEnv(html, ['../bower_components/jquery/dist/jquery.js']).then(function (window) {
var $ = window.$,
ret = {stages: {}, matches: {}},
ret = {stages: {}, matches: []},
searchedMatchText,
matchStatus,
matchDivSelector = '.mu.fixture,.mu.result,.mu.live',
Expand Down Expand Up @@ -36,26 +37,26 @@ var getMatchesInfo = function (html) {
} else if (matchElem.is('.live')) {
matchStatus = 'live';
}
ret.matches[matchId] = {
ret.matches.push({
roundId: matchRoundId,
matchId: matchId,
matchStatus: matchStatus,
stadium: matchElem.find('.mu-i-stadium').text(),
city: matchElem.find('.mu-i-venue').text(),
matchNum: matchElem.find('.mu-i-matchnum').text().match(/Match\s+(\d+)\b/)[1],
matchNum: parseInt(matchElem.find('.mu-i-matchnum').text().match(/Match\s+(\d+)\b/)[1], 10),
matchUrl: matchUrl,
homeTeamName: matchElem.find('.t.home .t-nText').text(),
homeTeamShort: matchElem.find('.t.home .t-nTri').text(),
awayTeamName: matchElem.find('.t.away .t-nText').text(),
awayTeamShort: matchElem.find('.t.away .t-nTri').text(),
scoreStatus: matchElem.find('.s-status-abbr').text(),
homeTeamName: matchElem.find('.t.home .t-nText').text().toLowerCase(),
homeTeamShort: matchElem.find('.t.home .t-nTri').text().toLowerCase(),
awayTeamName: matchElem.find('.t.away .t-nText').text().toLowerCase(),
awayTeamShort: matchElem.find('.t.away .t-nTri').text().toLowerCase(),
scoreStatus: matchElem.find('.s-status-abbr').text().toLowerCase(),
homeTeamScore: scoreMatches && scoreMatches[1],
awayTeamScore: scoreMatches && scoreMatches[2]
};
});
});
//return Q.fcall(function () {
return ret;
//});
ret.matches = _(ret.matches).sortBy('matchNum').uniq('matchNum').value();
//ret.matches = [];
return ret;
});
};

Expand All @@ -66,7 +67,7 @@ var getMatchesInfo = function (html) {
var getMatchInfo = function (html) {
return jsdomEnv(html, ['../bower_components/jquery/dist/jquery.js']).then(function (window) {
var $ = window.$,
ret,
ret = {},
playerStatsFilter;

console.log('on getMatchInfo');
Expand Down Expand Up @@ -97,17 +98,3 @@ module.exports = {
getMatchesInfo: getMatchesInfo,
getMatchInfo: getMatchInfo
};
/*getMatchesInfo('./data_not_checked_in/matches_index.html')
.then(function (info) {
console.log(JSON.stringify(info, ' ', 4));
console.log(util.format("there are %d matches", Object.getOwnPropertyNames(info.matches).length));
})
.done();
*/

/*getMatchesInfo('./data_not_checked_in/matches_index.html')
.then(function (info) {
console.log(JSON.stringify(info, ' ', 4));
console.log(util.format("there are %d matches", Object.getOwnPropertyNames(info.matches).length));
})
.done();*/
26 changes: 20 additions & 6 deletions lib/players_data_from_pdf.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
var Q = require('q');
var shelljs = require('shelljs');
var fs = require('fs');


var getPlayersDataFromPdf = function (filename) {
var fsRead = Q.denodeify(fs.readFile);

var getRawPlayersDataFromPdf = function (filename) {
var deferred = Q.defer();

shelljs.exec(
Expand All @@ -19,10 +22,21 @@ var getPlayersDataFromPdf = function (filename) {
return deferred.promise;
};

//export.modules = getPlayersDataFromPdf;
/**
 * Build the players-data result object for a blob.
 *
 * This is currently a stub: nothing is parsed out of the blob. The blob is
 * stored untouched under `full`, `players` is always a fresh empty array,
 * and `herp` carries a fixed placeholder value.
 *
 * @param {*} blob - value to wrap (presumably the text extracted from the
 *     PDF -- confirm against the caller).
 * @returns {{full: *, herp: string, players: Array}} the result object.
 */
var parsePlayersDataBlob = function (blob) {
    var parsed = {};

    parsed.full = blob;
    parsed.herp = 'derp';
    parsed.players = [];

    return parsed;
};

/**
 * Get the players data for a PDF file.
 *
 * Calls getRawPlayersDataFromPdf(filename) and passes the value that promise
 * resolves with through parsePlayersDataBlob.
 *
 * @param {string} filename - forwarded unchanged to getRawPlayersDataFromPdf.
 * @returns {Promise} promise for whatever parsePlayersDataBlob returns.
 */
var getPlayersDataFromPdf = function (filename) {
    var rawDataPromise = getRawPlayersDataFromPdf(filename);

    // Keep the wrapper so parsePlayersDataBlob is looked up when the promise
    // resolves, not when the chain is built.
    var toPlayersData = function (rawData) {
        return parsePlayersDataBlob(rawData);
    };

    return rawDataPromise.then(toPlayersData);
};

module.exports = getPlayersDataFromPdf;

getPlayersDataFromPdf('/Users/germoad/soccr/data_not_checked_in/downloaded_match_18_cmr.pdf')
.then(function (text) {
console.log(text.substr(0, 1000) + '...');
/*getPlayersDataFromPdf('/Users/germoad/soccr/data_not_checked_in/downloaded_match_18_cmr.pdf')
.then(function (info) {
console.log(JSON.stringify(info, ' ', 4));
})
.done();
.done();*/
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"README": "TODO",
"devDependencies": {
"browserify": "^4.1.6",
"chai-as-promised": "^4.1.1",
"envify": "^1.2.1",
"grunt": "^0.4.5",
"grunt-bower-install-simple": "^0.9.2",
Expand All @@ -18,6 +19,7 @@
"jsdom": "^0.10.6",
"react-tools": "^0.10.0",
"request": "^2.36.0",
"rewire": "^2.0.1",
"shelljs": "^0.3.0"
},
"dependencies": {
Expand Down
Loading

0 comments on commit a7438fc

Please sign in to comment.