survey-analyze.py
#!/usr/bin/env python3
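"""Analyze the SWIG survey results.

Generates graph and CSV data files showing operating system and target
language usage over time from the survey CSV exports.
"""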
import collections
import datetime
import os
import re

import matplotlib.pyplot as plt
import pandas as pd

def setup():
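    # Widen pandas console output so wide tables print without wrapping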
pd.set_option('display.width', 120)

def reorder_columns(df):
"""
Reorder the columns in a DataFrame from largest to smallest using the last row's numeric data.
Useful for modifying the legend order to match that of the last plotted values.
"""
last_row = df.iloc[[-1]]
latest_date = last_row.index[0] # index value
df = df.sort_values(by=latest_date, ascending=False, axis=1)
return df

def normalize_language(lang):
"""Process freeform language string into a known format"""
pattern = re.compile(r"\s+")
language = pattern.sub("", lang if isinstance(lang, str) else "").lower()
if len(language) > 0:
language = language[0].upper() + language[1:]
mappings = {"C#" : "CSharp",
"Golang" : "Go",
"Android" : "Java",
"Scala" : "Java",
"Javascript-node.js" : "Javascript",
"Node.js" : "Javascript",
"Objectivec" : "Objective-c"}
    return mappings.get(language, language)

def save_data(df, filename):
df.to_csv(filename)
print("Data saved: {}".format(filename))

def save_plot(p, start, end, name):
# filename = name + start + "To" + end + ".png"
filename = name + ".png"
p.get_figure().savefig(filename)
print("Plot saved: {}".format(filename))

def analyze_file(filename, quiet, one_per_ipaddress, yearly):
if not os.path.isfile(filename):
raise RuntimeError("Not a file: {}".format(filename))
if not quiet:
print("Reading {}".format(filename))
    # Skip malformed lines; some files have rows like: Expected 32 fields in line 452, saw 33
    # (on_bad_lines replaces the error_bad_lines/warn_bad_lines options removed in pandas 2.0)
    data = pd.read_csv(filename, on_bad_lines="skip" if quiet else "warn", encoding="latin1")
# Drop obvious duplicates as they have the same data including submission time down to 1 second
keep_obvious_duplicates = False
if not keep_obvious_duplicates:
size_before = len(data)
data = data.drop_duplicates()
duplicates_count = size_before - len(data)
if duplicates_count > 0 and not quiet:
print("Removed {} ({}%) obvious duplicates from {} ".format(duplicates_count, round(duplicates_count/size_before*100.0, 1), os.path.basename(filename)))
    # Filter out entries from the same IP address.
    # Off by default... note that in 2008-03 all entries from the 1st to the 18th are from one IP address!?
if one_per_ipaddress:
size_before = len(data)
data = data.drop_duplicates(subset="IPAddress")
duplicates_count = size_before - len(data)
        if duplicates_count > 0:
            print("Removed {} ({}%) IP address duplicates from {}".format(duplicates_count, round(duplicates_count / size_before * 100.0, 1), os.path.basename(filename)))
ops_headings = [
"BSD",
"HPUX",
"Linux",
"MacOSX",
"Solaris",
"Windows",
"OtherOS"]
misc_headings = [
"LogDate",
"IPAddress",
"SpareLang1",
"SpareLang2",
"SpareLang3",
"OtherLang",
"NamedLanguage",
"SpareOS1",
"SpareOS2",
"SpareOS3",
"OtherOS",
"NamedOS"]
lang_headings = [lang for lang in data.columns.values if lang not in ops_headings + misc_headings]
# Some data is corrupt as it contains unexpected values - just filter out these rows
acceptable_values = ["1", 1, 1.0]
for column_name in lang_headings + ops_headings:
size_before = len(data)
data = data[data[column_name].isin(acceptable_values) | data[column_name].isnull()]
invalid_count = size_before - len(data)
if invalid_count > 0 and not quiet:
print("Removed {} invalid rows due to corrupted data in the '{}' column".format(invalid_count, column_name))
        data[column_name] = pd.to_numeric(data[column_name])  # Corrupted data usually means the dtype was not float64
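    # Bucket each row into its month (or its year when --yearly is given)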
data["LogDate"] = pd.to_datetime(data["LogDate"])
data["Date"] = [datetime.datetime(t.year, 1 if yearly else t.month, 1) for t in data["LogDate"]]
operating_systems = data.filter(["Date"] + ops_headings)
    ops = operating_systems.groupby(["Date"]).sum().reset_index()
ops = ops.rename(columns = {"OtherOS" : "Other"})
languages = data.filter(["Date"] + lang_headings)
    langs = languages.groupby(["Date"]).sum().reset_index()
# Create a Counter dictionary of the named languages and the count of each
named_languages = collections.Counter([normalize_language(lang) for lang in data["NamedLanguage"]])
del named_languages[""]
# These are misunderstandings by users thinking C/C++ are target languages
del named_languages["C/c++"]
del named_languages["C"]
del named_languages["C++"]
# Add in new columns for the languages specified in the NamedLanguage column
minimum_count = -1
for lang, count in named_languages.items():
if count >= minimum_count: # filter out noise when minimum_count is set to be > 0
if lang not in langs:
langs[lang] = 0.0
langs[lang] = langs[lang] + float(count)
return (ops, langs)

def analyze_directory(directory, quiet, show_graphs, show_tables, begin_year, end_year, one_per_ipaddress, yearly):
drop_first_period = False
# Start/end year validation
if begin_year and begin_year < 2000:
raise RuntimeError("Invalid start year {}".format(begin_year))
if end_year and end_year < 2000:
raise RuntimeError("Invalid end year {}".format(end_year))
start_limit = None if not begin_year else datetime.datetime(begin_year, 1, 1)
end_limit = None if not end_year else datetime.datetime(end_year, 12, 31)
if start_limit and end_limit and start_limit >= end_limit:
raise RuntimeError("start year {} must be less than end year {}".format(begin_year, end_year))
files = sorted([os.path.join(directory, f) for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))])
operating_systems = None
languages = None
    for f in files:
(ops, langs) = analyze_file(f, quiet, one_per_ipaddress, yearly)
operating_systems = pd.concat([operating_systems, ops])
languages = pd.concat([languages, langs])
# Sum up duplicate rows - this is actually only really necessary for the yearly option
    operating_systems = operating_systems.groupby(["Date"]).sum().reset_index()
    languages = languages.groupby(["Date"]).sum().reset_index()
# Operating systems...
ops = operating_systems.set_index("Date")
if drop_first_period and len(ops) > 1:
ops = ops.drop(ops.index[0]) # Remove first month's data which doesn't start from 1st of the month
if start_limit:
ops = ops[ops.index >= start_limit]
if ops.empty:
raise RuntimeError("No data to show after limiting to start year {}".format(begin_year))
if end_limit:
ops = ops[ops.index <= end_limit]
if ops.empty:
raise RuntimeError("No data to show after limiting to end year {}".format(end_year))
if ops.empty:
raise RuntimeError("No data to show - something has gone horribly wrong!")
total_ops = pd.DataFrame(ops.sum())
total_ops = total_ops.rename(columns = {0 : "Count"})
ops_sum = total_ops["Count"].sum()
total_ops["Percent"] = total_ops["Count"].div(ops_sum).multiply(100.0).round(1)
total_ops = total_ops.sort_values(by="Count", ascending=False)
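    # Order the legend by the latest values, then convert each period's counts to percentages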
ops = reorder_columns(ops)
columns = ops.columns
ops_percent = ops[columns].div(ops[columns].sum(axis=1), axis=0).multiply(100.0)
if show_tables:
print("Operating Systems Count:\n{}".format(ops.to_string()))
print("Operating Systems Percent:\n{}".format(ops_percent.to_string()))
start_date = ops.index.min()
start = start_date.strftime("%Y-%m")
end_date = ops.index.max()
end = end_date.strftime("%Y-%m")
print("Operating system totals over entire period ({} to {}):\n{}".format(start, end, total_ops.to_string()))
# Target languages...
langs = languages.set_index("Date")
if drop_first_period and len(langs) > 1:
langs = langs.drop(langs.index[0]) # Remove first month's data which doesn't start from 1st of the month
if start_limit:
langs = langs[langs.index >= start_limit]
if end_limit:
langs = langs[langs.index <= end_limit]
# Get totals for each language and sort
total_langs = pd.DataFrame(langs.sum()).sort_values(by=0, ascending=False)
# Remove least popular languages, also removes a lot of junk entered in the free form NamedLanguage column
total_langs = total_langs[total_langs[0] > 50.0]
total_langs = total_langs.rename(columns = {0 : "Count"})
langs_sum = total_langs["Count"].sum()
total_langs["Percent"] = total_langs["Count"].div(langs_sum).multiply(100.0).round(2)
    keep_langs = list(total_langs.index)
langs = langs[keep_langs]
langs = reorder_columns(langs)
columns = langs.columns
langs_percent = langs[columns].div(langs[columns].sum(axis=1), axis=0).multiply(100.0)
if show_tables:
print("Target languages Count:\n{}".format(langs.to_string()))
print("Target languages Percent:\n{}".format(langs_percent.to_string()))
if start_date != langs.index.min():
raise RuntimeError("Unexpected error: start date for target language does not match start date for operating systems")
if end_date != langs.index.max():
raise RuntimeError("Unexpected error: end date for target language does not match end date for operating systems")
print("Target languages totals over entire period ({} to {}):\n{}".format(start, end, total_langs.to_string()))
save_data(ops, "SWIGSurveyOperatingSystemsCount.csv")
save_data(ops_percent, "SWIGSurveyOperatingSystemsPercent.csv")
save_data(langs, "SWIGSurveyTargetLanguagesCount.csv")
save_data(langs_percent, "SWIGSurveyTargetLanguagesPercent.csv")
# Graphing...
figure_size = (14, 9)
    p = ops.plot(title="SWIG Survey - Operating System Usage", ylim=(0, None), figsize=figure_size)
p.set_ylabel("Count")
p.legend(loc="upper right")
save_plot(p, start, end, "SWIGSurveyOperatingSystemsCount")
    p = ops_percent.plot(title="SWIG Survey - Operating System Usage", ylim=(0, None), figsize=figure_size)
p.set_ylabel("Percent")
p.legend(loc="upper right")
save_plot(p, start, end, "SWIGSurveyOperatingSystemsPercent")
group1_count = 10
group2_count = 20
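    # Plot the top group1_count languages, then the next group in separate charts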
    p = langs[langs.columns[:group1_count]].plot(title="SWIG Survey - Target Language Usage (Top {})".format(group1_count), ylim=(0, None), figsize=figure_size)
p.set_ylabel("Count")
p.legend(loc="upper right")
save_plot(p, start, end, "SWIGSurveyTargetLanguagesTopCount")
    p = langs_percent[langs_percent.columns[:group1_count]].plot(title="SWIG Survey - Target Language Usage (Top {})".format(group1_count), ylim=(0, None), figsize=figure_size)
p.set_ylabel("Percent")
p.legend(loc="upper right")
save_plot(p, start, end, "SWIGSurveyTargetLanguagesTopPercent")
    if len(langs.columns) > group1_count:  # only plot the "Next" group when more than group1_count languages exist
        p = langs[langs.columns[group1_count:group2_count]].plot(title="SWIG Survey - Target Language Usage (Next {})".format(group2_count-group1_count), ylim=(0, None), figsize=figure_size)
p.set_ylabel("Count")
p.legend(loc="upper right")
save_plot(p, start, end, "SWIGSurveyTargetLanguagesNextCount")
        p = langs_percent[langs_percent.columns[group1_count:group2_count]].plot(title="SWIG Survey - Target Language Usage (Next {})".format(group2_count-group1_count), ylim=(0, None), figsize=figure_size)
p.set_ylabel("Percent")
p.legend(loc="upper right")
save_plot(p, start, end, "SWIGSurveyTargetLanguagesNextPercent")
if show_graphs:
plt.show()

if __name__ == "__main__":
import argparse
default_directory = "swigsurvey"
parser = argparse.ArgumentParser(description="Analyze the SWIG survey results. Generate graph and data csv files showing operating system and target language usage.")
parser.add_argument("-b", "--begin-year", required=False, default=None, type=int, help="Limit analysis to begin at the given year (default is no limit)")
parser.add_argument("-d", "--directory", required=False, default=default_directory, help="Directory to analyze, default is '{}'".format(default_directory))
parser.add_argument("-e", "--end-year", required=False, default=None, type=int, help="Limit analysis to end at the given year (default is no limit)")
parser.add_argument("-g", "--show-graphs", required=False, default=False, action="store_true", help="Show graphs at end of run (default is not to show)")
parser.add_argument("-i", "--one-per-ipaddress", required=False, default=False, action="store_true", help="Only accept one entry for each IP address per month (default is to keep duplicates submitted by one IP address")
parser.add_argument("-t", "--show-tables", required=False, default=False, action="store_true", help="Show results tables (default is not to show)")
parser.add_argument("-v", "--verbose", required=False, default=False, action="store_true", help="Verbose mode")
parser.add_argument("-y", "--yearly", required=False, default=False, action="store_true", help="Group data and analyse per year (default is to do this per month)")
args = parser.parse_args()
setup()
analyze_directory(args.directory, not args.verbose, args.show_graphs, args.show_tables, args.begin_year, args.end_year, args.one_per_ipaddress, args.yearly)
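
# Example invocation (years are illustrative, assuming monthly survey CSV
# exports in the default "swigsurvey" directory):
#   ./survey-analyze.py --show-tables --show-graphs --begin-year 2008 --end-year 2020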