Skip to content

Commit

Permalink
bug fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
volpino committed Sep 29, 2011
1 parent 6fc8bba commit 1e17597
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 23 deletions.
62 changes: 40 additions & 22 deletions usercontributions.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
import time
from array import array
from datetime import datetime
import urllib
import simplejson

## PROJECT LIBS
import sonet.mediawiki as mwlib
Expand All @@ -42,7 +44,8 @@
ATTR_LEN = None

class UserContrib(object):
__slots__ = ['comments_length', 'namespace_count', 'data']
__slots__ = ['comments_length', 'namespace_count', 'data',
"current_time", "lang", "user"]

def __init__(self):
##self.namespace_count = np.zeros((attr_len,), dtype=np.int)
Expand All @@ -61,8 +64,9 @@ def __init__(self):
## we don't define namespace_count here but in inc_namespace() to save
## memory

def get_quartile():
n = (self.last_time - self.first_time).days
def get_quartile(self):
current = datetime.fromtimestamp(self.current_time)
n = (current - self.first_time).days
if n < 21:
return 0
elif n < 226:
Expand All @@ -83,14 +87,27 @@ def inc_normal(self):

def inc_namespace(self, idx):
    """Count one edit in namespace ``idx`` and return the counter array.

    ``namespace_count`` is a flat unsigned-int array holding four
    consecutive counters per namespace; the counter picked inside that
    group of four is the value returned by ``self.get_quartile()``.

    The array is allocated on first use rather than in ``__init__`` so
    that instances which never reach this method do not pay for it.
    """
    try:
        counters = self.namespace_count
    except AttributeError:
        ##TODO: maybe attr_len contains unneeded namespace? like key=0
        counters = self.namespace_count = array('I', [0]) * (ATTR_LEN * 4)
    # Four slots per namespace: the quartile offsets into that group.
    slot = self.get_quartile() + idx * 4
    counters[slot] += 1
    return counters

@property
def first_time(self):
    """Return the datetime of the user's first known contribution.

    The epoch of that contribution is cached in ``self.data[7]``. While
    the slot is still 0, the MediaWiki API of ``<self.lang>.wikipedia.org``
    is asked for the oldest contribution of ``self.user`` and the answer
    is written to ``self.data[7]`` before it is returned.

    Raises:
        LookupError: the API returned no contributions for ``self.user``.
    """
    if self.data[7] == 0:
        api_base = 'http://%s.wikipedia.org/w/api.php' % self.lang
        user = self.user
        # urllib.urlencode() applies str() to every value, which fails on
        # non-ASCII unicode names under Python 2; send UTF-8 bytes instead.
        # NOTE(review): assumes user names may arrive as unicode -- confirm.
        if isinstance(user, unicode):
            user = user.encode('utf-8')
        options = {
            'action': 'query',
            'list': 'usercontribs',
            'ucuser': user,
            'ucdir': 'newer',
            'uclimit': 1,
            # The API's default output is an HTML-wrapped debug format;
            # JSON has to be requested explicitly to be parseable below.
            'format': 'json',
        }
        url = api_base + '?' + urllib.urlencode(options)
        response = urllib.urlopen(url)
        try:
            result = simplejson.load(response)
        finally:
            # Release the connection even when the payload is not valid JSON.
            response.close()
        # "usercontribs" is a list of records (at most one, per uclimit),
        # so the timestamp has to be read from its first element.
        contribs = result["query"]["usercontribs"]
        if not contribs:
            raise LookupError(
                "no contributions found for user %r" % (self.user,))
        dtime = mwlib.ts2dt(contribs[0]["timestamp"])
        self.data[7] = int(time.mktime(dtime.timetuple()))
    return datetime.fromtimestamp(self.data[7])

@property
Expand All @@ -99,6 +116,7 @@ def last_time(self):

def time(self, time_):
epoch = int(time.mktime(time_.timetuple()))
self.current_time = epoch
if self.data[7] == 0 or self.data[7] > epoch:
self.data[7] = epoch
if self.data[8] == 0 or self.data[8] < epoch:
Expand Down Expand Up @@ -164,7 +182,7 @@ def inc_revert(self):


class ContribDict(dict):
def __init__(self, namespaces):
def __init__(self, namespaces, lang):
global ATTR_LEN
super(ContribDict, self).__init__()
self._namespaces = namespaces
Expand All @@ -182,6 +200,7 @@ def __init__(self, namespaces):

contributions, self.connection = get_contributions_table()
self.insert = contributions.insert()
self.lang = lang

#----------------------------------------------------------------------
def append(self, user, page_title, timestamp, comment, minor):
Expand All @@ -191,16 +210,6 @@ def append(self, user, page_title, timestamp, comment, minor):
contrib = UserContrib()
self[user] = contrib

## Namespace
a_title = page_title.split(':')
if len(a_title) == 1:
contrib.inc_normal()
else:
try:
contrib.inc_namespace(self._d_namespaces[a_title[0]])
except KeyError:
contrib.inc_normal()

year = int(timestamp[:4])
month = int(timestamp[5:7])
day = int(timestamp[8:10])
Expand All @@ -211,6 +220,19 @@ def append(self, user, page_title, timestamp, comment, minor):
timestamp = datetime(year, month, day, hour, minutes, seconds)
## Time
contrib.time(timestamp)
contrib.lang = self.lang
contrib.user = user
## Namespace
a_title = page_title.split(':')
if len(a_title) == 1:
contrib.inc_normal()
contrib.inc_namespace(0)
else:
try:
contrib.inc_namespace(self._d_namespaces[a_title[0]])
except KeyError:
contrib.inc_namespace(0)
contrib.inc_normal()

## Minor
if minor:
Expand Down Expand Up @@ -240,8 +262,6 @@ def save(self, lang):
iterator = self.iteritems()
step = 100000
for _ in xrange(0, len(self), step):
print "lol"
#print d.namespace_count.tolist()
data = [{'username': user,
'lang': lang,
'normal_edits': d.normal_count,
Expand All @@ -264,11 +284,10 @@ def save(self, lang):


def use_contrib_dict(receiver, namespaces, lang):
cd = ContribDict(namespaces)
cd = ContribDict(namespaces, lang)

while 1:
rev = receiver.recv()
print rev
try:
cd.append(*rev)
except TypeError:
Expand Down Expand Up @@ -315,7 +334,7 @@ def namespaces(self):
@namespaces.setter
def namespaces(self, namespaces):
    """Store ``namespaces`` and rebuild ``self.contribution`` for them.

    The contribution dictionary is created exactly once per assignment,
    and receives ``self.lang`` because ``ContribDict.__init__`` takes the
    language as its second argument.
    """
    self.__namespaces = namespaces
    # A single construction only: ContribDict(namespaces) without the
    # language no longer matches the constructor's signature.
    self.contribution = ContribDict(namespaces, self.lang)

@property
def welcome_pattern(self):
Expand All @@ -341,7 +360,6 @@ def welcome_pattern(self, value):

def process_title(self, elem):
self._title = elem.text
print self._title
self._id = None
self._username = None

Expand Down Expand Up @@ -493,7 +511,7 @@ def main():
tags='page,title,revision,timestamp,contributor,username,ip'+ \
',comment,id,minor')

namespaces = mwlib.get_namespaces(src)
namespaces = [(0, "Normal")]+mwlib.get_namespaces(src)

src.close()
logging.info("BEGIN PARSING")
Expand Down
8 changes: 7 additions & 1 deletion usercontributions_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,13 @@ def main():
else:
src = deflate(xml)

namespaces = [v for _, v in mwlib.get_namespaces(src)]
tmp = ["Normal"]+[v for _, (_, v) in enumerate(mwlib.get_namespaces(src))]
namespaces = []
# fix for quartiles
for ns in tmp:
for n in range(1, 5):
namespaces.append("%s_%d" % (ns, n))
print namespaces

fout = BZ2File(out, 'w')

Expand Down

0 comments on commit 1e17597

Please sign in to comment.