moving libraries into the sonet module
vad authored and Morail committed Aug 5, 2010
1 parent f399353 commit 0f7c944
Showing 12 changed files with 81 additions and 90 deletions.
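The change is mechanical: helper modules that previously sat at the repository root (tablr, timr, mwlib, sonetgraph) are now imported from the sonet package, with mwlib exposed as sonet.mediawiki and sonetgraph as sonet.graph. A minimal before/after sketch of the pattern, using the imports from the analysis.py hunk below:

## before: helpers imported as top-level modules
from tablr import Tablr
from timr import Timr
import mwlib
import sonetgraph as sg

## after: the same helpers namespaced under the sonet package
from sonet.tablr import Tablr
from sonet.timr import Timr
from sonet import mediawiki as mwlib, graph as sg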
73 changes: 36 additions & 37 deletions analysis.py
@@ -8,10 +8,9 @@
import igraph as ig

## PROJECT
-from tablr import Tablr
-from timr import Timr
-import mwlib
-import sonetgraph as sg
+from sonet.tablr import Tablr
+from sonet.timr import Timr
+from sonet import mediawiki as mwlib, graph as sg

## GLOBAL VARIABLES

@@ -76,7 +75,7 @@ def create_option_parser():
help="Write the adjacency matrix of the giant component to a file")
op.add_option('--users-role', action="store_true", dest="users_role",
help="Write a list users-role to a file")

return op


@@ -114,26 +113,26 @@ def main():
len(g.classes[group_name]))
else:
g.defineClass('all', {})

print " * lang: %s" % (lang,)
print " * date: %s" % (date,)

if options.details:
with Timr("details"):
print " * nodes number: %d" % (vn,)
print " * edges number: %d" % (en,)

nodes_with_outdegree = len(g.g.vs.select(_outdegree_ge=1))
nodes_with_indegree = len(g.g.vs.select(_indegree_ge=1))

print " * nodes with out edges number: %d (%6f%%)" % (
nodes_with_outdegree, 100.*nodes_with_outdegree/vn)
print " * nodes with in edges number: %d (%6f%%)" % (
nodes_with_indegree, 100.*nodes_with_indegree/vn)
print " * max weights on edges : %s" % top(g.g.es['weight'])

#print " * diameter : %6f" % g.g.diameter(weights='length')

#print " * average weight : %6f" % numpy.average(g.g.es['weight'])


@@ -142,9 +141,9 @@ def main():
for cls, vs in g.classes.iteritems():
if not len(vs) > 1:
continue

subgraph = vs.subgraph()

print " * %s : density : %.10f" % (cls, subgraph.density())
print " * %s : reciprocity : %.10f" % (cls,
subgraph.reciprocity())
@@ -154,14 +153,14 @@ def main():
with Timr('degree'):
g.g.vs['indegree'] = g.g.degree(type=ig.IN)
g.g.vs['outdegree'] = g.g.degree(type=ig.OUT)

for cls, vs in g.classes.iteritems():
if not vs:
continue

ind = numpy.array(vs['indegree'])
outd = numpy.array(vs['outdegree'])

print " * %s : mean IN degree (no weights): %f" % (
cls, numpy.average(ind))
print " * %s : mean OUT degree (no weights): %f" % (
@@ -170,7 +169,7 @@ def main():
top(ind))
print " * %s : max OUT degrees (no weights): %s" % (cls,
top(outd))

print " * %s : stddev IN degree (no weights): %f" % (
cls, numpy.sqrt(numpy.var(ind)))
print " * %s : stddev OUT degree (no weights): %f" % (
@@ -189,7 +188,7 @@ def main():
vc = g.g.clusters()
size_clusters = vc.sizes()
giant = vc.giant()

print " * length of 5 max clusters: %s" % top(size_clusters)
#print " * #node in 5 max clusters/#all nodes: %s" % top(
# [1.*cluster_len/vn for cluster_len in size_clusters])
@@ -202,7 +201,7 @@ def main():
gg.averageDistance(weight='length')
print " * average hops in the giant component: %f" % \
gg.averageDistance()

#print "Average distance 2: %f" % giant.average_path_length(True,
# False)

@@ -212,22 +211,22 @@ def main():
print " * efficiency: %f" % g.efficiency(weight='length')


if (options.plot or options.histogram or options.power_law or
options.centrality):
with Timr('set weighted indegree'):
g.set_weighted_degree()


if options.centrality:
timr.start('centrality')

print >> sys.stderr, "betweenness"
g.g.vs['bw'] = g.g.betweenness(weights='length', directed = True)
#g.g.vs['ev'] = g.g.evcent(weights='weight') # eigenvector centrality

print >> sys.stderr, "pagerank"
g.g.vs['pr'] = g.g.pagerank(weights='weight') # pagerank

print >> sys.stderr, "outdegree"
g.set_weighted_degree(type=ig.OUT)
#total_weights = sum(g.g.es['weight'])
@@ -236,25 +235,25 @@ def main():
for cls, vs in g.classes.iteritems():
if not vs:
continue

norm_betweenness = numpy.array(g.classes[cls]['bw'])/max_edges
print " * %s : average betweenness : %.10f" % (
cls, numpy.average(norm_betweenness))
print " * %s : stddev betweenness : %.10f" % (
cls, numpy.sqrt(numpy.var(norm_betweenness)))
print " * %s : max betweenness: %s" % (
cls, top(numpy.array(g.classes[cls]['bw'])/max_edges))

#print " * Average eigenvector centrality : %6f" % numpy.average(
# g.vs['ev'])

print " * %s : average pagerank : %.10f" % (
cls, numpy.average(g.classes[cls]['pr']))
print " * %s : stddev pagerank : %.10f" % (
cls, numpy.sqrt(numpy.var(g.classes[cls]['pr'])))
print " * %s : max pagerank: %s" % (
cls, top(g.classes[cls]['pr']))

wi = g.classes[cls]['weighted_indegree']
print " * %s : average IN degree centrality (weighted): %.10f" % (
cls, numpy.average(wi))
@@ -263,7 +262,7 @@ def main():
print " * %s : max IN degrees centrality (weighted): %s" % (
cls, top(wi))
del wi

wo = g.classes[cls]['weighted_outdegree']
print " * %s : average OUT degree centrality (weighted) : %.10f" %\
(cls, numpy.average(wo))
@@ -272,17 +271,17 @@ def main():
print " * %s : max OUT degrees centrality (weighted): %s" % (
cls, top(wo))
del wo

timr.stop('centrality')

if options.power_law:
with Timr('power law'):
for cls, vs in g.classes.iteritems():
if not vs:
continue

indegrees = vs['weighted_indegree']

try:
alpha_exp = ig.statistics.power_law_fit(indegrees, xmin=6)
print " * %s : alpha exp IN degree distribution : %10f " %\
@@ -346,7 +345,7 @@ def main():
if options.plot:
## TODO: evaluate if this can be done with
## http://bazaar.launchpad.net/~igraph/igraph/0.6-main/revision/2018
import math
bots = g.g.vs.select(bot=True)
bots['color'] = ('purple',)*len(bots)

@@ -371,7 +370,7 @@ def main():
in g.g.es]
g.g.es['width'] = weights

ig.plot(g.g, target=lang+"_weighted_edges.png", bbox=(0, 0, 4000,
2400),
layout='fr', vertex_label=' ')

@@ -382,25 +381,25 @@ def main():
#tablr.printHeader()
#tablr.printData()
tablr.saveInDjangoModel()


if options.adjacency:
giant = g.g.clusters().giant()
destAdj = "%s/%swiki-%s-adj.csv" % (os.path.split(fn)[0], lang, date)
destRec = "%s/%swiki-%s-rec.csv" % (os.path.split(fn)[0], lang, date)
sg.Graph(giant).writeAdjacencyMatrix(destAdj, 'username')
sg.Graph(giant).writeReciprocityMatrix('username', destRec)


if options.users_role:
l = g.getUserClass('username', ('anonymous', 'bot', 'bureaucrat',
'sysop'))

destUR = "%s/%swiki-%s-ur.csv" % (os.path.split(fn)[0], lang, date)
with open(destUR, 'w') as f:
for username, role in sorted(l):
print >> f, "%s,%s" % (username, role)

from random import shuffle
destCls = "%s/%swiki-%s-%%s.csv" % (os.path.split(fn)[0], lang, date)
for cls in ('anonymous', 'bot', 'bureaucrat', 'sysop', 'normal_user'):
@@ -412,7 +411,7 @@ def main():
("%s,http://vec.wikipedia.org/w/index.php?title="+\
"Discussion_utente:%s&action=history&offset="+\
"20100000000001") % (username, username)



if __name__ == '__main__':
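For readers skimming the hunks above: the substantive edits in analysis.py are the import lines; the rest is mostly whitespace cleanup. As a reference for the centrality code left untouched, here is a standalone toy sketch (not part of this commit) of the two igraph calls the script drives, assuming, as the code suggests, that 'length' is a distance-like edge attribute derived from 'weight' (that derivation lives in sonet.graph, not in this diff):

import igraph as ig

## toy directed graph standing in for the wiki talk network
g = ig.Graph(directed=True)
g.add_vertices(4)
g.add_edges([(0, 1), (1, 2), (2, 0), (2, 3)])
g.es['weight'] = [3, 1, 2, 1]
## assumption: heavier edges count as "shorter" for path-based measures
g.es['length'] = [1.0 / w for w in g.es['weight']]

## the two computations from the options.centrality branch above
g.vs['bw'] = g.betweenness(weights='length', directed=True)
g.vs['pr'] = g.pagerank(weights='weight')
print(g.vs['bw'])
print(g.vs['pr'])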
44 changes: 19 additions & 25 deletions countwords.py
@@ -14,18 +14,14 @@
##########################################################################

from bz2 import BZ2File
-import mwlib
-import os, sys
-import re
-from time import time
-from itertools import ifilter
+import sonet.mediawiki as mwlib
+import sys
from functools import partial
import cProfile as profile

## etree
from lxml import etree


## nltk
import nltk

@@ -46,27 +42,26 @@

### CHILD PROCESS
def get_freq_dist(q, done_q, fd=None):
global stopwords
dstpw = dict(zip(stopwords, [0]*len(stopwords)))
tokenizer = nltk.PunktWordTokenizer()

if not fd:
fd = nltk.FreqDist()

while 1:
s = q.get()

try:
tokens = tokenizer.tokenize(nltk.clean_html(s.encode('utf-8')
.lower()))
except AttributeError: ## end
done_q.put(fd.items())

return

text = nltk.Text(t for t in tokens if len(t) > 2 and t not in dstpw)
fd.update(text)


def get_freq_dist_wrapper(q, done_q, fd=None):
profile.runctx("get_freq_dist(q, done_q, fd)",
@@ -76,8 +71,8 @@ def get_freq_dist_wrapper(q, done_q, fd=None):
### MAIN PROCESS
def process_page(elem, queue=None):
user = None
-global count, it_stopwords
+global count

for child in elem:
if child.tag == tag['title'] and child.text:
a_title = child.text.split('/')[0].split(':')
@@ -100,35 +95,34 @@ def process_page(elem, queue=None):

try:
queue.put(rc.text)

count += 1

if not count % 500:
print >>sys.stderr, count
except:
print "Warning: exception with user %s" % (
user.encode('utf-8'),)
raise


def main():
import optparse

p = optparse.OptionParser(usage="usage: %prog [options] file")

-opts, files = p.parse_args()
+_, files = p.parse_args()

if not files:
p.error("Give me a file, please ;-)")
xml = files[0]

global templates
global lang_user_talk, lang_user, tag

src = BZ2File(xml)

tag = mwlib.getTags(src)

p = Process(target=get_freq_dist, args=(queue, done_queue))
p.start()

@@ -140,16 +134,16 @@ def main():
partial_process_page = partial(process_page, queue=queue)
mwlib.fast_iter(etree.iterparse(src, tag=tag['page']),
partial_process_page)

queue.put(0) ## this STOPS the process

print >>sys.stderr, "end of parsing"

fd = done_queue.get()
p.join()

print >>sys.stderr, "end of FreqDist"

for k, v in sorted(fd,cmp=lambda x,y: cmp(x[1], y[1]), reverse=True):
print v, k

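countwords.py keeps its two-process layout: the parent parses the dump with iterparse and feeds raw page text into a queue, while a child process accumulates an nltk FreqDist and hands the counts back on a second queue once it sees the stop sentinel. A condensed, self-contained sketch of that pattern (not the repository code; the real script uses PunktWordTokenizer, stopword filtering, and 0 as the sentinel):

from multiprocessing import Process, Queue
import nltk

def count_words(q, done_q):
    fd = nltk.FreqDist()
    while True:
        text = q.get()
        if text is None:  ## sentinel: stop and report
            done_q.put(list(fd.items()))
            return
        ## stands in for tokenization + stopword filtering
        fd.update(w for w in text.lower().split() if len(w) > 2)

if __name__ == '__main__':
    q, done_q = Queue(), Queue()
    child = Process(target=count_words, args=(q, done_q))
    child.start()
    for page in ("some talk page text", "more text from another page"):
        q.put(page)
    q.put(None)  ## this STOPS the child, as in countwords.py
    for word, freq in sorted(done_q.get(), key=lambda kv: kv[1], reverse=True):
        print("%d\t%s" % (freq, word))
    child.join()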
