# regenerate_data.py
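#
# Forked from emeryberger/CSrankings.
#
# Streams the gzipped DBLP XML dump, matches paper authors against the
# faculty-affiliations and DBLP-alias lists, and writes two output files:
# generated-author-info.csv (per-author publication counts by area and year)
# and articles.json (one record per counted paper).
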
import gzip
import xmltodict
import collections
import json
import csv
import re
import sys
import operator
# from typing import Dict

from csrankings import *

# Consider pubs in this range only.
startyear = 1970
endyear = 2269

totalPapers = 0  # for statistics reporting purposes only
authlogs = {}
interestingauthors = {}
authorscores = {}
authorscoresAdjusted = {}
coauthors = {}
papersWritten = {}
counter = 0
successes = 0
failures = 0
confdict = {}
aliasdict = {}


def do_it():
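    """Stream dblp.xml.gz through xmltodict, calling handle_article for each record."""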
    # gz = gzip.GzipFile('dblp-original.xml.gz')
    gz = gzip.GzipFile('dblp.xml.gz')
    xmltodict.parse(gz, item_depth=2, item_callback=handle_article)

def build_dicts():
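    """Build the venue-to-area, faculty-affiliation, and DBLP-alias lookup tables."""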
    global areadict
    global confdict
    global facultydict
    global aliasdict
    # Build a dictionary mapping conferences to areas,
    # e.g., confdict['CVPR'] = 'vision'.
    confdict = {}
    venues = []
    for k, v in areadict.items():
        for item in v:
            confdict[item] = k
            venues.append(item)
    facultydict = csv2dict_str_str('faculty-affiliations.csv')
    aliasdict = csv2dict_str_str('dblp-aliases.csv')
    # Count and report the total number of faculty in the database.
    totalFaculty = 0
    for name in facultydict:
        # Exclude aliases.
        if name in aliasdict:
            continue
        totalFaculty += 1
    print("Total faculty members currently in the database: " + str(totalFaculty))

def handle_article(_, article):
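    """xmltodict callback: accumulate counts for papers with at least one faculty author.

    Returns True so that xmltodict continues streaming the rest of the dump.
    """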
    global totalPapers
    global confdict
    global counter
    global successes
    global failures
    global interestingauthors
    global authorscores
    global authorscoresAdjusted
    global authlogs
    global facultydict
    global aliasdict
    global TOG_SIGGRAPH_Volume
    global TOG_SIGGRAPH_Asia_Volume
    global TVCG_Vis_Volume
    global TVCG_VR_Volume
    counter += 1
    try:
        if counter % 10000 == 0:
            print(str(counter) + " papers processed.")
        if 'author' in article:
            # Fix if there is just one author.
            if not isinstance(article['author'], list):
                article['author'] = [article['author']]
            authorList = article['author']
            authorsOnPaper = len(authorList)
            foundOneInDict = False
            for authorName in authorList:
                # Authors with nested markup arrive as dicts with a '#text' key.
                if isinstance(authorName, dict):
                    authorName = authorName["#text"]
                authorName = authorName.strip()
                if authorName in facultydict:
                    foundOneInDict = True
                    break
                if authorName in aliasdict:
                    if aliasdict[authorName] in facultydict:
                        foundOneInDict = True
                        break
            if not foundOneInDict:
                return True
        else:
            return True
        if 'booktitle' in article:
            confname = article['booktitle']
        elif 'journal' in article:
            confname = article['journal']
        else:
            return True
        volume = article.get('volume', "0")
        number = article.get('number', "0")
        url = article.get('url', "")
        year = int(article.get('year', "-1"))
        pages = ""
        if confname in confdict:
            areaname = confdict[confname]
            # Special handling for PACMPL.
            if confname == 'PACMPL':
                confname = article['number']
                if confname in confdict:
                    areaname = confdict[confname]
                else:
                    return True
            elif confname == 'ACM Trans. Graph.':
                if year in TOG_SIGGRAPH_Volume:
                    (vol, num) = TOG_SIGGRAPH_Volume[year]
                    if (volume == str(vol)) and (number == str(num)):
                        confname = 'SIGGRAPH'
                        areaname = confdict[confname]
                if year in TOG_SIGGRAPH_Asia_Volume:
                    (vol, num) = TOG_SIGGRAPH_Asia_Volume[year]
                    if (volume == str(vol)) and (number == str(num)):
                        confname = 'SIGGRAPH Asia'
                        areaname = confdict[confname]
            elif confname == 'IEEE Trans. Vis. Comput. Graph.':
                if year in TVCG_Vis_Volume:
                    (vol, num) = TVCG_Vis_Volume[year]
                    if (volume == str(vol)) and (number == str(num)):
                        areaname = 'vis'
                if year in TVCG_VR_Volume:
                    (vol, num) = TVCG_VR_Volume[year]
                    if (volume == str(vol)) and (number == str(num)):
                        confname = 'VR'
                        areaname = 'vr'
        else:
            return True
        if 'title' in article:
            title = article['title']
            if isinstance(title, dict):
                title = title["#text"]
        if 'pages' in article:
            pages = article['pages']
            pageCount = pagecount(pages)
            startPage = startpage(pages)
        else:
            pageCount = -1
            startPage = -1
        successes += 1
    except TypeError:
        raise
    except:
        print(sys.exc_info()[0])
        failures += 1
        raise
    if countPaper(confname, year, volume, number, pages, startPage, pageCount, url, title):
        totalPapers += 1
        for authorName in authorList:
            if isinstance(authorName, dict):
                authorName = authorName["#text"]
            # Resolve DBLP aliases to the canonical faculty name.
            realName = aliasdict.get(authorName, authorName)
            foundAuthor = None
            if realName in facultydict:
                foundAuthor = realName
            if foundAuthor is not None:
                log = {'name': foundAuthor,
                       'year': year,
                       'title': title,
                       'conf': confname,
                       'area': areaname,
                       'institution': facultydict[foundAuthor],
                       'numauthors': authorsOnPaper}
                if volume != "":
                    log['volume'] = volume
                if number != "":
                    log['number'] = number
                if startPage != "":
                    log['startPage'] = startPage
                if pageCount != "":
                    log['pageCount'] = pageCount
                tmplist = authlogs.get(foundAuthor, [])
                tmplist.append(log)
                authlogs[foundAuthor] = tmplist
                interestingauthors[foundAuthor] = interestingauthors.get(foundAuthor, 0) + 1
                authorscores[(foundAuthor, areaname, year)] = authorscores.get((foundAuthor, areaname, year), 0) + 1.0
                authorscoresAdjusted[(foundAuthor, areaname, year)] = authorscoresAdjusted.get((foundAuthor, areaname, year), 0) + 1.0 / authorsOnPaper
    return True

def dump_it():
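    """Write generated-author-info.csv and articles.json from the accumulated tallies."""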
    global authorscores
    global authorscoresAdjusted
    global authlogs
    global interestingauthors
    global facultydict
    with open('generated-author-info.csv', 'w') as f:
        f.write('"name","dept","area","count","adjustedcount","year"\n')
        authorscores = collections.OrderedDict(sorted(authorscores.items()))
        for ((authorName, area, year), count) in authorscores.items():
            countAdjusted = authorscoresAdjusted[(authorName, area, year)]
            f.write(authorName)
            f.write(',')
            f.write(facultydict[authorName])
            f.write(',')
            f.write(area)
            # f.write(',')
            # f.write(subarea)
            f.write(',')
            f.write(str(count))
            f.write(',')
            f.write(str(countAdjusted))
            f.write(',')
            f.write(str(year))
            f.write('\n')
    with open('articles.json', 'w') as f:
        z = []
        authlogs = collections.OrderedDict(sorted(authlogs.items()))
        for v, l in authlogs.items():
            if v in interestingauthors:
                for s in sorted(l, key=lambda x: x['name'] + str(x['year']) + x['conf'] + x['title']):
                    z.append(s)
        json.dump(z, f, indent=2)

def main():
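    """Build the lookup tables, process the DBLP dump, and write the output files."""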
    build_dicts()
    do_it()
    dump_it()
    print("Total papers counted = " + str(totalPapers))


if __name__ == "__main__":
    main()