Skip to content

Commit

Permalink
bug fix and cleaning
Browse files Browse the repository at this point in the history
  • Loading branch information
volpino committed Mar 14, 2012
1 parent 387f267 commit ca9b7ef
Showing 1 changed file with 23 additions and 20 deletions.
43 changes: 23 additions & 20 deletions countries_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import logging
from sonet.timr import Timr
import pygeoip
from collections import Counter, OrderedDict
from collections import Counter
import datetime
from dateutil.rrule import rrule, MONTHLY
from django.utils.encoding import smart_str
Expand All @@ -32,23 +32,24 @@
class CountriesPageProcessor(HistoryPageProcessor):
output = None
data = None
per_page_data = {}
per_page_stats = None
exclude_countries = []
exclude_countries = None
gi = None
countries = set()
min_edits = None
min_anon = None
_skip = None
_country = None
_country_data = Counter()
_anon_edits = 0
_edits = 0

def __init__(self, **kwargs):
super(CountriesPageProcessor, self).__init__(**kwargs)
self.gi = pygeoip.GeoIP(kwargs["geoip"])
self.data = OrderedDict()
self.data = {}
self.exclude_countries = self.exclude_countries or []
self.per_page_data = {}
self.countries = set()
self._skip = None
self._country = None
self._country_data = Counter()
self._anon_edits = 0
self._edits = 0

def flush(self):
"""
Expand All @@ -64,7 +65,7 @@ def flush(self):
f = open(self.output, "w")
csv_writer = csv.DictWriter(f, ["date"] + list(self.countries))
csv_writer.writeheader()
for date in self.data:
for date in sorted(self.data):
to_write = Counter(date=date)
to_write.update(dict([(x, 0) for x in self.countries]))
to_write.update(self.data[date])
Expand Down Expand Up @@ -100,28 +101,29 @@ def process_revision(self, _):

first_date = None # 2001 date mismatch
mismatch = False
for date in self.data:
first_date = date
break
if first_date and first_date > current_date:
mismatch = True
logging.warn("Date mismatch! Fixing... - %s %s", first_date,
current_date)

if self.data and current_date not in self.data:
first_date = min(self.data)
if first_date > current_date:
mismatch = True
logging.warn("Date mismatch! Fixing... - %s %s", first_date,
current_date)

if not self.data or mismatch: # populate dict with all dates
start = self._date.date()
end = datetime.date.today()
for dt in rrule(MONTHLY, dtstart=start, until=end):
dt = dt.strftime("%Y/%m")
if not dt in self.data:
if dt in self.data:
break
else:
self.data[dt] = Counter()

self.data[current_date][self._country] += 1

if self.per_page_stats:
self._country_data[self._country] += 1

self._date = None
self._country = None

self._anon_edits += 1
Expand All @@ -146,6 +148,7 @@ def process_page(self, _):
self._anon_edits = 0
self._edits = 0
self._country_data = Counter()

self.skip = False

def process_title(self, elem):
Expand Down

0 comments on commit ca9b7ef

Please sign in to comment.