# Code for the "Inferring Keystrokes from Myo Armband Sensors" project
#
# Copyright (C) 2019-2021 Matthias Gazzari, Annemarie Mattmann
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
Print the key data.
"""
import csv
import pathlib
import json
import random
random.seed(233)
import numpy as np
import pandas as pd
from preprocess.encode import KEYMAP_DE, KEYMAP_US
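# keys considered for the statistics below: the keysyms of the German and US layouts, their
# printable replacement symbols and the corresponding raw keycodes (e.g. Caps_Lock is 66)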
CONSIDERED_KEYS = [keysyms[0] for keysyms in KEYMAP_DE.values()]
CONSIDERED_KEYS += ["ssharp", "plus", "numbersign", "less", "comma", "period", "minus", "Caps_Lock", "udiaeresis", "odiaeresis", "adiaeresis"]
CONSIDERED_KEYS += [keysyms[0] for keysyms in KEYMAP_US.values()]
CONSIDERED_KEYS += ["equal", "bracketleft", "bracketright", "semicolon", "apostrophe", "slash"]
CONSIDERED_KEYS += ["ß", "+", "#", "<", ",", ".", "-", "ü", "ö", "ä", "=", "[", "]", ";", "'", "/"] # add replacement keys just to be sure
CONSIDERED_KEYS += list(map(str, range(10,23))) + list(map(str, range(24,37))) + list(map(str, range(38,49))) + ["94"] + list(map(str, range(50,63))) + ["65"] + ["66"] # add keycodes (Caps_Lock is 66)
KEYSYM_TO_SYMBOL = {"ssharp": "ß", "plus": "+", "numbersign": "#", "less": "<", "comma": ",", "period": ".", "minus": "-", "udiaeresis": "ü", "odiaeresis": "ö", "adiaeresis": "ä", "equal": "=", "bracketleft": "[", "bracketright": "]", "semicolon": ";", "apostrophe": "'", "slash": "/"}

def run(files, verbose=0):
    key_index = 2 # set to 1 for key codes and to 2 for key symbols
    # prepare dictionaries
    data = {
        "participant": [],
        "recording": [],
        "style": [],
        "layout": [],
        "task type": [],
        "time [min]": [],
        "keys per minute": [],
        "total number of keys": [],
        "total number of keys no auto": [],
        "total number of modifiers": [],
        "total number of backspaces": [],
        "total uncorrected errors": [], # levenshtein
        "total number of overlaps": [],
        "total number of non-modifier overlaps": []
    }
    keylevel_data = {
        "participant": [],
        "recording": [],
        "style": [],
        "layout": [],
        "task type": [],
        "task": [],
        "keysym": [],
        "keycode": [],
        "total amount overlapped": []
    }
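    # "data" collects one summary row per task and recording, while "keylevel_data"
    # collects one row per completed (pressed and released) key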
    # get mistakes and backspace hits (and store the values in above dictionary)
    for task, filename in enumerate(files):
        # gather meta data
        with open(f"{str(filename)[:-7]}meta.json") as jsonfile:
            json_data = json.load(jsonfile)
        try:
            task_type = json_data["common"]["task_type"]
        except KeyError:
            task_type = "passwords"
        recording = json_data["id"]["collection"]
        participant = json_data["id"]["user"]
        kb_layout = json_data["keyboard"]["layout"]
        typing_style = " ".join(json_data["common"]["typing_style"].split("_")).capitalize()
        if typing_style != "Touch typing":
            typing_style = "Non-touch typing"
        levenshtein = json_data["common"].get("levenshtein_distance", 0)
        num_typed_keys = json_data["common"]["num_given_keys_with_auto_repeat"]
        num_auto_repeat_keys = json_data["common"]["num_auto_repeat_keys"]
        num_true_keys = json_data["common"].get("num_true_keys", num_typed_keys)
        # gather key data
        missing_return = False
        # check for missing return
        if any(string.startswith("SYNC_ERROR_B") for string in json_data["notes"]):
            missing_return = True
            if verbose:
                print(f"Handling missing second return at the start of: {filename}")
        # prepare variables
        num_backspaces = 0
        first = 0
        last = 0
        total_keys_within_task = 0
        full_key_presses = []
        full_key_presses_code = []
        amount_overlapped = []
        non_mod_amount_overlapped = []
        num_modifiers = 0
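        # each row of the *.key.csv file is read as (timestamp, keycode, keysym, press/release),
        # e.g. a hypothetical ("1.234", "36", "Return", "press"); keycode 36 is enter/return and
        # 50/62 are Shift_L/Shift_R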
        with open(filename, newline="") as csvfile:
            csvreader = csv.reader(csvfile)
            start_count = 0
            pressed_keys = []
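            # pressed_keys holds [key, overlap count] pairs for keys currently held down;
            # every new key press increments the count of all keys still held at that time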
            for i, row in enumerate(csvreader):
                # get backspaces
                if row[2] == "BackSpace" and row[3] == "press":
                    num_backspaces += 1
                # get modifiers
                if row[2] in ("Shift_L", "Shift_R") and row[3] == "press":
                    num_modifiers += 1
                # get overlapping keys
                keys = [t[0] for t in pressed_keys]
                if row[3] == "press" and row[key_index] not in keys:
                    # each key pressed is overlapped by all keys pressed before that are not released
                    for held_key in pressed_keys:
                        held_key[1] += 1
                    pressed_keys.append([row[key_index], 0])
                elif row[3] == "release" and row[key_index] in keys:
                    index = keys.index(row[key_index])
                    key = pressed_keys.pop(index)
                    full_key_presses.append(key[0]) # key pressed
                    assert key[0] == row[key_index]
                    full_key_presses_code.append(row[1])
                    amount_overlapped.append(key[1]) # number of keys overlapped
                    non_mod_amount_overlapped.append(0 if key[0] in ("Shift_L", "Shift_R", "50", "62") else key[1])
                # get timings for all tasks
                if start_count > 2:
                    if row[3] == "release": # only count releases to prevent counting auto-presses
                        last = float(row[0])
                        total_keys_within_task += 1
                    continue
                # if the task is the game task, skip all introduction texts, the number of which varied
                # between data recordings
                # (approximate start time is set to the first number pressed, as starting a ship by
                # pressing a number is the first "reasonable" action ingame)
                if task_type == "game":
                    if row[2].isdigit():
                        first = float(row[0])
                        start_count = 3
                    continue
                # if the task is not the game task, wait for two enter presses and skip everything
                # that comes before or in between; if the missing_return flag is set, a single
                # enter press suffices
                # this may be inaccurate if the first required (or typed) key is enter, or if
                # enter is pressed accidentally before the clap sync succeeded and is then
                # followed by one enter and one or more keys before the task starts with another
                # enter press; however, it is more accurate for other error patterns, such as too
                # many enter presses, which should occur more often
                if start_count == 2 or (start_count == 1 and missing_return):
                    if row[1] == "36": # reset if enter is pressed again
                        if verbose:
                            print(f"Additional enter presses detected in: {filename}")
                        start_count -= 1
                    else: # start
                        if i != 5 and verbose:
                            print(f"-- Start at key {i//2+1} instead of 3.")
                        first = float(row[0])
                        start_count = 3
                    continue
                # ignore everything before the first enter release
                if row[1] == "36" and row[3] == "release":
                    start_count += 1
                    continue
                # print warnings and information
                if row[1] not in ("36", "keycode"):
                    if start_count == 1 and verbose:
                        print(f"WARNING: Possibly missing second return at the start of: {filename}")
                        print(f" or misclick if the key value is low.")
                        #subprocess.Popen(f"sed -n 1,10p {filename}".split())
                    if start_count == 0 and verbose:
                        print(f"Extra characters found at the start of: {filename}")
                    # do not continue here
        # sum overlaps
        overlaps = sum(1 if el > 0 else 0 for el in amount_overlapped)
        non_mod_overlaps = sum(1 if el > 0 else 0 for el in non_mod_amount_overlapped)
        # store data
        data["recording"].append(recording)
        data["participant"].append(participant)
        data["task type"].append(task_type)
        data["style"].append(typing_style)
        data["layout"].append(kb_layout)
        data["time [min]"].append((last - first)/60)
        data["keys per minute"].append(total_keys_within_task/((last - first)/60))
        data["total number of keys"].append(num_typed_keys)
        data["total number of keys no auto"].append(num_typed_keys - num_auto_repeat_keys) # can be != len(full_key_presses) due to non-released keys
        data["total number of modifiers"].append(num_modifiers)
        data["total number of backspaces"].append(num_backspaces)
        data["total uncorrected errors"].append(levenshtein)
        data["total number of overlaps"].append(overlaps)
        data["total number of non-modifier overlaps"].append(non_mod_overlaps)
        # store overlapping
        keylevel_data["task type"].extend([task_type for el in full_key_presses])
        keylevel_data["task"].extend([task for el in full_key_presses])
        keylevel_data["keysym"].extend([el for el in full_key_presses])
        keylevel_data["keycode"].extend([el for el in full_key_presses_code])
        keylevel_data["total amount overlapped"].extend([el for el in amount_overlapped])
        keylevel_data["recording"].extend([recording for el in full_key_presses])
        keylevel_data["participant"].extend([participant for el in full_key_presses])
        keylevel_data["style"].extend([typing_style for el in full_key_presses])
        keylevel_data["layout"].extend([kb_layout for el in full_key_presses])
        #print(filename)
        #print(f"# keys typed: {num_typed_keys}")
        #print(f"# full key presses: {len(full_key_presses)}")
        #print(f"# keys typed no auto: {num_typed_keys-num_auto_repeat_keys}")
        #print(f"# overlaps: {overlaps}")
        #print(f"# non-shift overlaps: {non_mod_overlaps}")
        #print(f"amount overlapped: {sum(amount_overlapped)}")
    # create and sort dataframes to keep the plot ordering consistent
    df_data = pd.DataFrame(data)
    df_data = df_data.sort_values(by=["task type"]).reset_index(drop=True)
    df_keylevel_data = pd.DataFrame(keylevel_data)
    df_keylevel_data = df_keylevel_data.sort_values(by=["task type"]).reset_index(drop=True)
    # add more columns
    # add task filters
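    # e.g. a task type of "uniform 1" generalizes to "uniform" and "uniform disappearing 2" to
    # "uniform disappearing" (or to plain "uniform" as task category), while "text" and "pangram"
    # merge into "text-based" in the generalized task category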
    df_data["generalized task type"] = df_data["task type"]
    df_data["generalized task type"].replace({r"uniform \d": "uniform", r"uniform disappearing \d": "uniform disappearing"}, regex=True, inplace=True)
    df_data["task category"] = df_data["task type"]
    df_data["task category"].replace({r"uniform \d": "uniform", r"uniform disappearing \d": "uniform"}, regex=True, inplace=True)
    df_data["generalized task category"] = df_data["task type"]
    df_data["generalized task category"].replace({r"uniform \d": "uniform", r"uniform disappearing \d": "uniform", "text": "text-based", "pangram": "text-based"}, regex=True, inplace=True)
    # add relative values
    #df_data["number of modifiers"] = df_data["total number of modifiers"]/df_data["total number of keys"]
    df_data["number of backspaces"] = df_data["total number of backspaces"]/df_data["total number of keys"]
    df_data["number of overlaps"] = df_data["total number of overlaps"]/df_data["total number of keys no auto"]
    df_data["number of non-modifier overlaps"] = df_data["total number of non-modifier overlaps"]/df_data["total number of keys no auto"]
    # prepare totals
    df_keylevel_data["total key frequency"] = 1 # add number of occurrences column
    df_keylevel_data["total overlap frequency"] = 1.0
    df_keylevel_data.loc[df_keylevel_data["total amount overlapped"] == 0, "total overlap frequency"] = 0.0
    # add task filters
    df_keylevel_data["generalized task type"] = df_keylevel_data["task type"]
    df_keylevel_data["generalized task type"].replace({r"uniform \d": "uniform", r"uniform disappearing \d": "uniform disappearing"}, regex=True, inplace=True)
    df_keylevel_data["task category"] = df_keylevel_data["task type"]
    df_keylevel_data["task category"].replace({r"uniform \d": "uniform", r"uniform disappearing \d": "uniform"}, regex=True, inplace=True)
    df_keylevel_data["generalized task category"] = df_keylevel_data["task type"]
    df_keylevel_data["generalized task category"].replace({r"uniform \d": "uniform", r"uniform disappearing \d": "uniform", "text": "text-based", "pangram": "text-based"}, regex=True, inplace=True)
    #with pd.option_context("display.max_rows", None, "display.max_columns", None):
    #    print(df_data)
    return df_data, df_keylevel_data

def main(
    path: "path to a directory to load data from" = "train-data/",
    verbose: "verbosity level" = 0
):
    # get all key files
    files = pathlib.Path(path).glob("*.key.csv")
    # get the number of participants
    num_participants = len(list(pathlib.Path(path).glob("*t2.meta.json")))
    print(f"number of participants: {num_participants}")
    df_data, df_keylevel_data = run(files, verbose=verbose)
    print("")
    print(f"total hours of data taking: {df_data['time [min]'].sum()/60}")
    print(f"total time taken per recording:\n{df_data.groupby('recording')['time [min]'].agg(np.sum)}\n")
    print(f"mean time taken over all recordings:\n{df_data.groupby('recording')['time [min]'].agg(np.sum).mean()}\n")
    print(f"mean typing speed of recordings across all tasks:\n{df_data.groupby(['recording','participant'])['keys per minute'].agg(np.mean).sort_values()}\n")
    print(f"mean typing speed of recordings with a certain typing style across all tasks:\n{df_data.groupby('style')['keys per minute'].agg(np.mean)}\n")
    print(f"mean typing speed of recordings with a certain typing style across all TEXT tasks:\n{df_data.loc[df_data['task type'] == 'text'].groupby('style')['keys per minute'].agg(np.mean)}\n")
    print(f"mean typing speed of recordings with a certain typing style across all UNIFORM tasks:\n{df_data.loc[df_data['task type'].str.startswith('uniform')].groupby('style')['keys per minute'].agg(np.mean)}\n")
    print(f"mean typing speed of recordings per task type:\n{df_data.groupby('generalized task category')['keys per minute'].agg(np.mean)}\n")
    print(f"mean overlaps of recordings across all tasks:\n{df_data.groupby(['recording'])['number of non-modifier overlaps'].agg(np.mean)}\n")
    print(f"mean overlaps of all recordings:\n{np.mean(df_data['number of non-modifier overlaps'])}\n")
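
# Hypothetical stand-alone entry point: the original file defines no __main__ guard and is
# presumably invoked through the project's own tooling; this sketch only illustrates the
# expected arguments and assumes the "preprocess" import above resolves (e.g. when run from
# the repository root).
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Print the key data.")
    parser.add_argument("--path", default="train-data/", help="path to a directory to load data from")
    parser.add_argument("--verbose", type=int, default=0, help="verbosity level")
    args = parser.parse_args()
    main(path=args.path, verbose=args.verbose)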