moving libraries into the sonet module
vad authored and Morail committed Aug 5, 2010
1 parent f399353 commit 0f7c944
Showing 12 changed files with 81 additions and 90 deletions.
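The change is mechanical: helper modules that previously sat at the repository root (tablr, timr, mwlib, sonetgraph) are now imported from the sonet package, with mwlib exposed as sonet.mediawiki and sonetgraph as sonet.graph. A minimal before/after sketch of the pattern, using the imports from the analysis.py hunk below:

## before: helpers imported as top-level modules
from tablr import Tablr
from timr import Timr
import mwlib
import sonetgraph as sg

## after: the same helpers namespaced under the sonet package
from sonet.tablr import Tablr
from sonet.timr import Timr
from sonet import mediawiki as mwlib, graph as sg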
73 changes: 36 additions & 37 deletions analysis.py
@@ -8,10 +8,9 @@
import igraph as ig

## PROJECT
-from tablr import Tablr
-from timr import Timr
-import mwlib
-import sonetgraph as sg
+from sonet.tablr import Tablr
+from sonet.timr import Timr
+from sonet import mediawiki as mwlib, graph as sg

## GLOBAL VARIABLES

@@ -76,7 +75,7 @@ def create_option_parser():
help="Write the adjacency matrix of the giant component to a file")
op.add_option('--users-role', action="store_true", dest="users_role",
help="Write a list users-role to a file")

return op


@@ -114,26 +113,26 @@ def main():
len(g.classes[group_name]))
else:
g.defineClass('all', {})

print " * lang: %s" % (lang,)
print " * date: %s" % (date,)

if options.details:
with Timr("details"):
print " * nodes number: %d" % (vn,)
print " * edges number: %d" % (en,)

nodes_with_outdegree = len(g.g.vs.select(_outdegree_ge=1))
nodes_with_indegree = len(g.g.vs.select(_indegree_ge=1))

print " * nodes with out edges number: %d (%6f%%)" % (
nodes_with_outdegree, 100.*nodes_with_outdegree/vn)
print " * nodes with in edges number: %d (%6f%%)" % (
nodes_with_indegree, 100.*nodes_with_indegree/vn)
print " * max weights on edges : %s" % top(g.g.es['weight'])

#print " * diameter : %6f" % g.g.diameter(weights='length')

#print " * average weight : %6f" % numpy.average(g.g.es['weight'])


@@ -142,9 +141,9 @@ def main():
for cls, vs in g.classes.iteritems():
if not len(vs) > 1:
continue

subgraph = vs.subgraph()

print " * %s : density : %.10f" % (cls, subgraph.density())
print " * %s : reciprocity : %.10f" % (cls,
subgraph.reciprocity())
@@ -154,14 +153,14 @@ def main():
with Timr('degree'):
g.g.vs['indegree'] = g.g.degree(type=ig.IN)
g.g.vs['outdegree'] = g.g.degree(type=ig.OUT)

for cls, vs in g.classes.iteritems():
if not vs:
continue

ind = numpy.array(vs['indegree'])
outd = numpy.array(vs['outdegree'])

print " * %s : mean IN degree (no weights): %f" % (
cls, numpy.average(ind))
print " * %s : mean OUT degree (no weights): %f" % (
@@ -170,7 +169,7 @@ def main():
top(ind))
print " * %s : max OUT degrees (no weights): %s" % (cls,
top(outd))

print " * %s : stddev IN degree (no weights): %f" % (
cls, numpy.sqrt(numpy.var(ind)))
print " * %s : stddev OUT degree (no weights): %f" % (
@@ -189,7 +188,7 @@ def main():
vc = g.g.clusters()
size_clusters = vc.sizes()
giant = vc.giant()

print " * length of 5 max clusters: %s" % top(size_clusters)
#print " * #node in 5 max clusters/#all nodes: %s" % top(
# [1.*cluster_len/vn for cluster_len in size_clusters])
@@ -202,7 +201,7 @@ def main():
gg.averageDistance(weight='length')
print " * average hops in the giant component: %f" % \
gg.averageDistance()

#print "Average distance 2: %f" % giant.average_path_length(True,
# False)

@@ -212,22 +211,22 @@ def main():
print " * efficiency: %f" % g.efficiency(weight='length')


if (options.plot or options.histogram or options.power_law or
options.centrality):
with Timr('set weighted indegree'):
g.set_weighted_degree()


if options.centrality:
timr.start('centrality')

print >> sys.stderr, "betweenness"
g.g.vs['bw'] = g.g.betweenness(weights='length', directed = True)
#g.g.vs['ev'] = g.g.evcent(weights='weight') # eigenvector centrality

print >> sys.stderr, "pagerank"
g.g.vs['pr'] = g.g.pagerank(weights='weight') # pagerank

print >> sys.stderr, "outdegree"
g.set_weighted_degree(type=ig.OUT)
#total_weights = sum(g.g.es['weight'])
@@ -236,25 +235,25 @@ def main():
for cls, vs in g.classes.iteritems():
if not vs:
continue

norm_betweenness = numpy.array(g.classes[cls]['bw'])/max_edges
print " * %s : average betweenness : %.10f" % (
cls, numpy.average(norm_betweenness))
print " * %s : stddev betweenness : %.10f" % (
cls, numpy.sqrt(numpy.var(norm_betweenness)))
print " * %s : max betweenness: %s" % (
cls, top(numpy.array(g.classes[cls]['bw'])/max_edges))

#print " * Average eigenvector centrality : %6f" % numpy.average(
# g.vs['ev'])

print " * %s : average pagerank : %.10f" % (
cls, numpy.average(g.classes[cls]['pr']))
print " * %s : stddev pagerank : %.10f" % (
cls, numpy.sqrt(numpy.var(g.classes[cls]['pr'])))
print " * %s : max pagerank: %s" % (
cls, top(g.classes[cls]['pr']))

wi = g.classes[cls]['weighted_indegree']
print " * %s : average IN degree centrality (weighted): %.10f" % (
cls, numpy.average(wi))
@@ -263,7 +262,7 @@ def main():
print " * %s : max IN degrees centrality (weighted): %s" % (
cls, top(wi))
del wi

wo = g.classes[cls]['weighted_outdegree']
print " * %s : average OUT degree centrality (weighted) : %.10f" %\
(cls, numpy.average(wo))
@@ -272,17 +271,17 @@ def main():
print " * %s : max OUT degrees centrality (weighted): %s" % (
cls, top(wo))
del wo

timr.stop('centrality')

if options.power_law:
with Timr('power law'):
for cls, vs in g.classes.iteritems():
if not vs:
continue

indegrees = vs['weighted_indegree']

try:
alpha_exp = ig.statistics.power_law_fit(indegrees, xmin=6)
print " * %s : alpha exp IN degree distribution : %10f " %\
@@ -346,7 +345,7 @@ def main():
if options.plot:
## TODO: evaluate if this can be done with
## http://bazaar.launchpad.net/~igraph/igraph/0.6-main/revision/2018
import math
bots = g.g.vs.select(bot=True)
bots['color'] = ('purple',)*len(bots)

@@ -371,7 +370,7 @@ def main():
in g.g.es]
g.g.es['width'] = weights

ig.plot(g.g, target=lang+"_weighted_edges.png", bbox=(0, 0, 4000,
2400),
layout='fr', vertex_label=' ')

@@ -382,25 +381,25 @@ def main():
#tablr.printHeader()
#tablr.printData()
tablr.saveInDjangoModel()


if options.adjacency:
giant = g.g.clusters().giant()
destAdj = "%s/%swiki-%s-adj.csv" % (os.path.split(fn)[0], lang, date)
destRec = "%s/%swiki-%s-rec.csv" % (os.path.split(fn)[0], lang, date)
sg.Graph(giant).writeAdjacencyMatrix(destAdj, 'username')
sg.Graph(giant).writeReciprocityMatrix('username', destRec)


if options.users_role:
l = g.getUserClass('username', ('anonymous', 'bot', 'bureaucrat',
'sysop'))

destUR = "%s/%swiki-%s-ur.csv" % (os.path.split(fn)[0], lang, date)
with open(destUR, 'w') as f:
for username, role in sorted(l):
print >> f, "%s,%s" % (username, role)

from random import shuffle
destCls = "%s/%swiki-%s-%%s.csv" % (os.path.split(fn)[0], lang, date)
for cls in ('anonymous', 'bot', 'bureaucrat', 'sysop', 'normal_user'):
@@ -412,7 +411,7 @@ def main():
("%s,http://vec.wikipedia.org/w/index.php?title="+\
"Discussion_utente:%s&action=history&offset="+\
"20100000000001") % (username, username)



if __name__ == '__main__':
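For readers skimming the hunks above: the substantive edits in analysis.py are the import lines; the rest is mostly whitespace cleanup. As a reference for the centrality code left untouched, here is a standalone toy sketch (not part of this commit) of the two igraph calls the script drives, assuming, as the code suggests, that 'length' is a distance-like edge attribute derived from 'weight' (that derivation lives in sonet.graph, not in this diff):

import igraph as ig

## toy directed graph standing in for the wiki talk network
g = ig.Graph(directed=True)
g.add_vertices(4)
g.add_edges([(0, 1), (1, 2), (2, 0), (2, 3)])
g.es['weight'] = [3, 1, 2, 1]
## assumption: heavier edges count as "shorter" for path-based measures
g.es['length'] = [1.0 / w for w in g.es['weight']]

## the two computations from the options.centrality branch above
g.vs['bw'] = g.betweenness(weights='length', directed=True)
g.vs['pr'] = g.pagerank(weights='weight')
print(g.vs['bw'])
print(g.vs['pr'])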
44 changes: 19 additions & 25 deletions countwords.py
@@ -14,18 +14,14 @@
##########################################################################

from bz2 import BZ2File
-import mwlib
-import os, sys
-import re
-from time import time
-from itertools import ifilter
+import sonet.mediawiki as mwlib
+import sys
from functools import partial
import cProfile as profile

## etree
from lxml import etree


## nltk
import nltk

@@ -46,27 +42,26 @@

### CHILD PROCESS
def get_freq_dist(q, done_q, fd=None):
global stopwords
dstpw = dict(zip(stopwords, [0]*len(stopwords)))
tokenizer = nltk.PunktWordTokenizer()

if not fd:
fd = nltk.FreqDist()

while 1:
s = q.get()

try:
tokens = tokenizer.tokenize(nltk.clean_html(s.encode('utf-8')
.lower()))
except AttributeError: ## end
done_q.put(fd.items())

return

text = nltk.Text(t for t in tokens if len(t) > 2 and t not in dstpw)
fd.update(text)


def get_freq_dist_wrapper(q, done_q, fd=None):
profile.runctx("get_freq_dist(q, done_q, fd)",
@@ -76,8 +71,8 @@ def get_freq_dist_wrapper(q, done_q, fd=None):
### MAIN PROCESS
def process_page(elem, queue=None):
user = None
-global count, it_stopwords
+global count

for child in elem:
if child.tag == tag['title'] and child.text:
a_title = child.text.split('/')[0].split(':')
@@ -100,35 +95,34 @@ def process_page(elem, queue=None):

try:
queue.put(rc.text)

count += 1

if not count % 500:
print >>sys.stderr, count
except:
print "Warning: exception with user %s" % (
user.encode('utf-8'),)
raise


def main():
import optparse

p = optparse.OptionParser(usage="usage: %prog [options] file")

-opts, files = p.parse_args()
+_, files = p.parse_args()

if not files:
p.error("Give me a file, please ;-)")
xml = files[0]

global templates
global lang_user_talk, lang_user, tag

src = BZ2File(xml)

tag = mwlib.getTags(src)

p = Process(target=get_freq_dist, args=(queue, done_queue))
p.start()

@@ -140,16 +134,16 @@ def main():
partial_process_page = partial(process_page, queue=queue)
mwlib.fast_iter(etree.iterparse(src, tag=tag['page']),
partial_process_page)

queue.put(0) ## this STOPS the process

print >>sys.stderr, "end of parsing"

fd = done_queue.get()
p.join()

print >>sys.stderr, "end of FreqDist"

for k, v in sorted(fd,cmp=lambda x,y: cmp(x[1], y[1]), reverse=True):
print v, k

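countwords.py keeps its two-process layout: the parent parses the dump with iterparse and feeds raw page text into a queue, while a child process accumulates an nltk FreqDist and hands the counts back on a second queue once it sees the stop sentinel. A condensed, self-contained sketch of that pattern (not the repository code; the real script uses PunktWordTokenizer, stopword filtering, and 0 as the sentinel):

from multiprocessing import Process, Queue
import nltk

def count_words(q, done_q):
    fd = nltk.FreqDist()
    while True:
        text = q.get()
        if text is None:  ## sentinel: stop and report
            done_q.put(list(fd.items()))
            return
        ## stands in for tokenization + stopword filtering
        fd.update(w for w in text.lower().split() if len(w) > 2)

if __name__ == '__main__':
    q, done_q = Queue(), Queue()
    child = Process(target=count_words, args=(q, done_q))
    child.start()
    for page in ("some talk page text", "more text from another page"):
        q.put(page)
    q.put(None)  ## this STOPS the child, as in countwords.py
    for word, freq in sorted(done_q.get(), key=lambda kv: kv[1], reverse=True):
        print("%d\t%s" % (freq, word))
    child.join()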
