forked from emeryberger/CSrankings
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcount-zero-authors.py
139 lines (107 loc) · 5.11 KB
/
count-zero-authors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import subprocess

from csrankings import *
# import gzip
def _faculty_authors(node, facultydict):
    """Return the names of node's <author> children that are keys of facultydict.

    Names are whitespace-stripped before the lookup, and <author> children
    without text are ignored.  One entry is returned per matching child, in
    document order.
    """
    matched = []
    for child in node:
        if child.tag != 'author' or child.text is None:
            continue
        name = child.text.strip()
        if name in facultydict:
            matched.append(name)
    return matched


def parseDBLP(facultydict):
    """Count, per faculty member, the qualifying publications found in dblp.xml.

    A record qualifies when all of the following hold:
      * its tag is 'inproceedings' or 'article';
      * its <year> lies within [startyear, endyear];
      * at least one <author> (after stripping whitespace) is a key of
        facultydict;
      * its page count is not strictly between 1 and pageCountThreshold.

    The running total of qualifying records is printed every 100 records.

    ElementTree, startyear, endyear, pagecount, pageCountThreshold and
    generateLog are not defined in this function; presumably they come from
    `from csrankings import *` -- verify against that module.

    Args:
        facultydict: mapping whose keys are the faculty names to look for.

    Returns:
        (interestingauthors, authorscores, authorscoresAdjusted) or, when
        generateLog is true, the same tuple with authlogs appended.
        interestingauthors maps name -> number of qualifying records;
        authlogs maps name -> list of log strings.  authorscores and
        authorscoresAdjusted are never filled in here and are always empty.
    """
    count = 0
    authlogs = {}
    interestingauthors = {}
    # Never populated in this script; kept so the returned tuple keeps the
    # shape that callers unpack.
    authorscores = {}
    authorscoresAdjusted = {}
    # No venue is ever extracted here, so log lines carry an empty venue.
    confname = ""

    with open('dblp.xml', mode='r') as f:
        # To read a compressed dump instead: gzip.open('dblp.xml.gz').
        oldnode = None
        # NOTE(review): elements are inspected on both 'start' and 'end'
        # events and the previously seen element is cleared on every event.
        # Whether a record's children are already present at its 'start'
        # event presumably depends on parser buffering -- confirm before
        # restructuring this loop.
        for (event, node) in ElementTree.iterparse(f, events=['start', 'end']):
            if oldnode is not None:
                oldnode.clear()
            oldnode = node

            if node.tag != 'inproceedings' and node.tag != 'article':
                continue

            # Check that the publication year is in the specified range.
            year = -1
            inRange = False
            for child in node:
                if child.tag == 'year' and isinstance(child.text, str):
                    year = int(child.text)
                    inRange = startyear <= year <= endyear
                    break
            if not inRange:
                continue

            # Which faculty from our list are on this paper?
            facultyOnPaper = _faculty_authors(node, facultydict)
            if not facultyOnPaper:
                continue

            # Only skip papers with a very small page count, but above 1.
            # Why?  DBLP has real papers with incorrect page counts --
            # usually a truncated single page.  -1 means no pages were found
            # at all => some problem with journal entries in DBLP.
            pageCount = -1
            for child in node:
                if child.tag == 'pages':
                    pageCount = pagecount(child.text)
            if 1 < pageCount < pageCountThreshold:
                continue

            # If we got here, we have a winner.
            count += 1
            if count % 100 == 0:
                print(count)

            for authorName in facultyOnPaper:
                if generateLog:
                    logstring = authorName.encode('utf-8') + " ; " + confname + " " + str(year)
                    authlogs.setdefault(authorName, []).append(logstring)
                interestingauthors[authorName] = interestingauthors.get(authorName, 0) + 1

    if generateLog:
        return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)
    return (interestingauthors, authorscores, authorscoresAdjusted)
def csv2dict_str_str(fname):
    """Load the first two columns of a CSV file into a dict of text strings.

    Column 0 becomes the key and column 1 the value; both are stripped of
    surrounding whitespace and, when the csv module yields byte strings
    (Python 2), decoded as UTF-8.  Completely blank lines are skipped.  If
    a key occurs more than once, the last row wins.

    Args:
        fname: path of the CSV file to read.

    Returns:
        dict mapping the text in column 0 to the text in column 1.

    Raises:
        IndexError: a non-blank row has fewer than two fields.
    """
    def as_text(value):
        # Byte strings are decoded; text is passed through unchanged.
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    d = {}
    with open(fname, mode='r') as infile:
        for row in csv.reader(infile):
            if not row:
                # csv.reader yields [] for an empty line; nothing to record.
                continue
            d[as_text(row[0].strip())] = as_text(row[1].strip())
    return d
def sortdictionary(d):
    """Return d's (key, value) pairs as a list sorted by value, largest first.

    The sort is stable, so pairs with equal values keep the order in which
    the dictionary yields them.  Uses items() rather than the Python 2-only
    iteritems(), so it works on any mapping that provides items().
    """
    return sorted(d.items(), key=operator.itemgetter(1), reverse=True)
# Script body: load the faculty list, count each member's qualifying DBLP
# records, then, for every member with three or fewer, ask DBLP's author
# search for matching names and print each one next to the institution.
facultydict = csv2dict_str_str('faculty-affiliations.csv')

# parseDBLP returns a fourth element (per-author logs) only when generateLog
# is set.
if generateLog:
    (intauthors_gl, authscores_gl, authscoresAdjusted_gl, authlog_gl) = parseDBLP(facultydict)
else:
    (intauthors_gl, authscores_gl, authscoresAdjusted_gl) = parseDBLP(facultydict)

for k in facultydict:
    # Faculty absent from the counts are treated as having zero records.
    if intauthors_gl.get(k, 0) > 3:
        continue
    institution = facultydict[k]
    url = "http://dblp.uni-trier.de/search/author?xauthor=" + k.replace(' ', '%20')
    # The command is passed as an argument list, not a shell string, so
    # quotes, '&', '?' and similar characters in a name cannot alter it.
    argv = [
        "xmlstarlet", "sel", "-T", "--net", "-t",
        "-m", "//authors/author", "-v", ".", "-n",
        url.encode('utf8'),
    ]
    # Raises OSError if xmlstarlet is not installed.  The exit status is
    # ignored, as it was with os.popen.
    proc = subprocess.Popen(argv, stdout=subprocess.PIPE)
    output, _ = proc.communicate()
    for line in output.splitlines():
        print(line.rstrip() + " , " + institution.encode('utf8'))