-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmfv.py
938 lines (813 loc) · 34.1 KB
/
mfv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
"""
2015 - Mike Schladt
mfv.py - collection of functions to manipulate and perform analysis of malware feature vectors
"""
import argparse
import sys
import MySQLdb
import config as CONST
import numpy
import json
import plotly.plotly as py
from plotly.graph_objs import *
# Database connection settings, sourced from the local config module.
db_host = CONST.DB_HOST  # MySQL server host
db_user = CONST.DB_USER  # MySQL user name
db_pass = CONST.DB_PASS  # MySQL password
db_name = CONST.DB_NAME  # schema holding the vectors/tags/archetypes tables
class FeatureVector(object):
    """
    Stores the feature set and basic metadata related to a cuckoo analysis report.

    Attributes:
        md5       : md5 of the sample analyzed (may be None/"" for synthetic vectors)
        machine   : name of the machine completing the analysis
        task_id   : cuckoo task id of the analysis
        features  : dict of features {'feature_name': numeric_value}
        magnitude : euclidean norm of the feature values, computed at init
    """

    def __init__(self, md5, machine, task_id, features):
        self.md5 = md5
        self.machine = machine
        self.task_id = task_id
        self.features = features
        self.magnitude = self.get_magnitude()
        # sets family/filetype/source/report_date/label when md5 is present
        self.get_labels()

    def get_magnitude(self):
        """
        Return the magnitude of the vector represented by self.features.
        OUTPUT : magnitude : float
        """
        # .items() instead of py2-only .iteritems() keeps this py2/py3 compatible
        feature_array = [value for key, value in self.features.items()]
        return numpy.linalg.norm(numpy.array(feature_array))

    def get_distance(self, second_vector):
        """
        Return the euclidean distance between self and another FeatureVector.
        INPUT  : second_vector : FeatureVector (must contain every key in self.features)
        OUTPUT : distance : float
        """
        a = []
        b = []
        for key, value in self.features.items():
            a.append(value)
            b.append(second_vector.features[key])
        return numpy.linalg.norm(numpy.array(a) - numpy.array(b))

    def prune_zeros(self):
        """
        Remove features with a 0 value.
        NOTE: mutates self in place and returns self; callers rely on both
        behaviors, so this intentionally does NOT copy.
        OUTPUT : FeatureVector : self without 0-valued features
        """
        self.features = {key: value for key, value in self.features.items() if value != 0}
        return self

    def prune_features(self, feature_keys):
        """
        Keep only the features named in feature_keys; keys absent from the
        vector are added with value 0.
        NOTE: mutates self in place and returns self; callers rely on both
        behaviors, so this intentionally does NOT copy.
        INPUT  : feature_keys : list of feature keys
        OUTPUT : FeatureVector : self restricted to feature_keys
        """
        self.features = {key: self.features.get(key, 0) for key in feature_keys}
        return self

    def get_labels(self):
        """
        Look up tags for self.md5 in the database and set the family, filetype,
        source and report_date attributes plus a combined self.label string.
        No-op when md5 is falsy (e.g. synthetic mean/stddev vectors).
        """
        if self.md5:
            cmd = "SELECT type,value FROM tags WHERE md5=%s"
            data = [self.md5]
            # connect to db and execute command
            connection = MySQLdb.connect(db_host, db_user, db_pass, db_name)
            cursor = connection.cursor()
            cursor.execute(cmd, data)
            results = cursor.fetchall()
            connection.close()
            # load results into tag dictionary
            tags = {}
            for result in results:
                tags[result[0]] = result[1]
            # default each tag type when missing
            self.family = tags.get('family', "NO_FAM")
            self.filetype = tags.get('filetype', "NO_TYPE")
            self.source = tags.get('source', "NO_SRC")
            self.report_date = tags.get('report_date', "NO_DATE")
            # create label string
            try:
                self.label = "{0}_{1}_{2}_{3}".format(self.family, self.filetype, self.source, self.report_date)
            except Exception:
                self.label = "None"
def select_vectors(tags, machines, task_ids, hashes):
    """
    Return feature vectors matching the provided filters.
    INPUT  : tags     : list of (tag_type, tag_value) tuples, or None
    INPUT  : machines : list of machine names (partial names okay), or None
    INPUT  : task_ids : list of cuckoo task ids, or None
    INPUT  : hashes   : list of md5 hashes, or None
    OUTPUT : vectors  : list of FeatureVector objects, or False when no rows matched
    """
    data = []
    clauses = []  # WHERE sub-clauses, ANDed together below

    # tags filter: self-join the tags table once per tag pair
    if tags:
        sub_cmd = 'SELECT t1.md5 FROM tags t1 '
        for i in range(1, len(tags)):
            sub_cmd += 'INNER JOIN tags t{0} ON t{1}.md5=t{2}.md5 '.format(i + 1, i, i + 1)
        for i, (tag_type, tag_value) in enumerate(tags):
            keyword = 'WHERE' if i == 0 else 'AND'
            sub_cmd += '{0} t{1}.type=%s AND t{1}.value=%s '.format(keyword, i + 1)
            data.append(tag_type)
            data.append(tag_value)
        clauses.append("(md5 in ({0}))".format(sub_cmd))

    # machine filter (no .pop(0): the original mutated the caller's list)
    if machines:
        clauses.append("({0})".format("OR ".join("(machine LIKE %s) " for _ in machines)))
        data.extend('%' + machine + '%' for machine in machines)

    # task_id filter
    if task_ids:
        clauses.append("({0})".format("OR ".join("(task_id=%s) " for _ in task_ids)))
        data.extend(int(task_id) for task_id in task_ids)

    # hash filter
    if hashes:
        clauses.append("({0})".format("OR ".join("(md5=%s) " for _ in hashes)))
        data.extend(hashes)

    cmd = "SELECT * FROM vectors"
    if clauses:
        cmd = "{0} WHERE {1}".format(cmd, " AND ".join(clauses))

    # connect to db and execute command
    connection = MySQLdb.connect(db_host, db_user, db_pass, db_name)
    cursor = connection.cursor()
    cursor.execute(cmd, data)
    results = cursor.fetchall()
    # close before the empty-result early return: the original leaked the
    # connection when nothing matched
    connection.close()
    if not results:
        return False

    # Group rows into FeatureVectors. Rows of the same (md5, machine, task_id)
    # analysis are assumed adjacent in the result set — TODO confirm ordering
    # guarantee; the original made the same assumption.
    vectors = []
    md5 = results[0][0].strip()
    machine = results[0][1]
    task_id = results[0][2]
    current_info = (results[0][0], results[0][1], results[0][2])
    features = {}
    for i, row in enumerate(results):
        # add feature to dictionary
        features[row[3]] = row[4]
        if i + 1 == len(results):
            # last row: flush the vector under construction
            vectors.append(FeatureVector(md5, machine, task_id, features))
            break
        next_info = (results[i + 1][0], results[i + 1][1], results[i + 1][2])
        if next_info != current_info:
            vectors.append(FeatureVector(md5, machine, task_id, features))
            current_info = next_info
            # seed values for the next vector (md5 now stripped consistently;
            # the original stripped only the first row's md5)
            md5 = results[i + 1][0].strip()
            machine = results[i + 1][1]
            task_id = results[i + 1][2]
            features = {}
    return vectors
def get_args():
    """
    Retrieve and parse command line arguments.
    INPUT : NONE
    OUTPUT : argparse.Namespace : parsed arguments (with .subparser identifying the subcommand)
    """
    parser = argparse.ArgumentParser(description='Collection of helper functions to manipulate and perform maths on vectors')
    subparsers = parser.add_subparsers()

    def _add_filter_args(sub):
        # filter/output options shared by the print/mean/stddev subcommands
        # (the original duplicated these five definitions three times)
        sub.add_argument('-t', '--tag',
            help="Specify tag type/value pairs to filter on. Example : '-t family,dyre source,bit9'", nargs='+')
        sub.add_argument('-m', '--machine',
            help="Specify machine name to filter on (partial names okay). Example : '-m win7x64'", nargs='+')
        sub.add_argument('-c', '--cuckoo_task_id',
            help="Specify cuckoo task id filter on. Example : '-c 710'", nargs='+')
        sub.add_argument('--md5',
            help="Specify MD5 hash to filter on. Example : '--md5 84bb1c8c5957125029e4fbfa9ec63045'", nargs='+')
        sub.add_argument('--csv',
            help="Print in CSV format", action='store_true')

    subcommands = [
        ('print', 'Prints feature vectors. Filtered by provided tags'),
        ('mean', 'Prints vector representing the mean of all vectors matching input filters'),
        ('stddev', 'Prints vector representing the standard deviation of vectors matching input filters'),
    ]
    for name, help_text in subcommands:
        sub = subparsers.add_parser(name, help=help_text)
        _add_filter_args(sub)
        sub.set_defaults(subparser=name)

    auto_archetype_parser = subparsers.add_parser('auto_archetype', help='Autogenerates archetypes and updates the database')
    auto_archetype_parser.set_defaults(subparser='auto_archetype')
    return parser.parse_args()
def parse_select_args(args):
    """
    Parse the filter arguments shared by the select-style subcommands.
    INPUT : args : argparse.Namespace produced by get_args()
    OUTPUT : (tags, machines, task_ids, hashes) — each a list, or None when
             the corresponding option was not given; tags is a list of
             (tag_type, tag_value) tuples
    """
    # tag pairs arrive as "type,value" strings
    if args.tag:
        parts = (raw.split(',') for raw in args.tag)
        tags = [(p[0].strip(), p[1].strip()) for p in parts]
    else:
        tags = None
    # remaining filters are plain string lists, whitespace-trimmed
    machines = [m.strip() for m in args.machine] if args.machine else None
    task_ids = [t.strip() for t in args.cuckoo_task_id] if args.cuckoo_task_id else None
    hashes = [h.strip() for h in args.md5] if args.md5 else None
    return tags, machines, task_ids, hashes
def get_max_values_vector():
    """
    Return max values for each feature found in the database. Used for normalization.
    INPUT : None
    OUTPUT : max_values_vector : FeatureVector whose features hold the per-feature max values
    """
    # connect to db
    connection = MySQLdb.connect(db_host, db_user, db_pass, db_name)
    cursor = connection.cursor()
    cursor.execute("SELECT DISTINCT type FROM vectors")
    feature_names = cursor.fetchall()
    # find max value for each feature (one query per feature)
    print("Please wait while we calculate max values for normalizing function ...")
    features = {}
    for feature_name in feature_names:
        cmd = "SELECT value FROM vectors WHERE type=%s ORDER BY value DESC LIMIT 1"
        cursor.execute(cmd, [feature_name[0]])
        feature_value = cursor.fetchone()
        sys.stdout.write('.')  # progress indicator, one dot per feature
        features[feature_name[0]] = feature_value[0]
    sys.stdout.write('\n')
    connection.close()  # the original never closed this connection
    return FeatureVector(None, None, None, features)
def normalize_vectors(vectors, max_values_vector):
    """
    Normalize vectors to the per-feature maxima provided in max_values_vector.
    NOTE: mutates the FeatureVector objects in place and returns the same objects.
    INPUT : vectors : list of FeatureVectors
    INPUT : max_values_vector : single FeatureVector containing max values
    OUTPUT : normalized_vectors : list of FeatureVectors with normalized features
    """
    normalized_vectors = []
    for vector in vectors:
        normalized_features = {}
        # .items() instead of py2-only .iteritems() (same iteration behavior)
        for key, value in vector.features.items():
            if value != 0:
                normalized_features[key] = float(value) / max_values_vector.features[key]
            else:
                # zero stays zero; also avoids looking up a max for unused features
                normalized_features[key] = 0
        vector.features = normalized_features
        normalized_vectors.append(vector)
    return normalized_vectors
def get_archetype(subset_vectors, superset_vectors):
    """
    Return the "archetype" of subset_vectors: the family mean restricted to the
    most significant features, plus the matching stddev vector.
    INPUT : subset_vectors : list of FeatureVector objects (vector family)
    INPUT : superset_vectors : list of FeatureVector objects (entire vector population)
    OUTPUT : archetype_vector : FeatureVector containing most significant features
    OUTPUT : archetype_stddev : FeatureVector containing stddev of those features
    NOTE: prune_zeros() mutates the subset vectors in place.
    """
    # prune the subset vectors
    subset_vectors = [v.prune_zeros() for v in subset_vectors]
    # standard deviations for both subset and superset
    subset_stddev = stats_stddev(subset_vectors)
    superset_stddev = stats_stddev(superset_vectors)
    # per-feature difference between subset and superset standard deviation
    stddev_diffs = []
    for key, value in subset_stddev.features.items():
        # superset stddev of 0 means the feature is not significant
        if superset_stddev.features[key] != 0:
            stddev_diffs.append((value - superset_stddev.features[key], key))
    # keep the 30 smallest (most negative) differences — the features where the
    # family varies least relative to the whole population
    stddev_diffs = sorted(stddev_diffs)[:30]
    significant_keys = [key for _, key in stddev_diffs]
    # archetype mean restricted to the significant features
    archetype_vector = stats_mean(subset_vectors)
    archetype_vector.features = {key: archetype_vector.features[key] for key in significant_keys}
    # matching stddev vector restricted the same way
    archetype_stddev = stats_stddev(subset_vectors)
    archetype_stddev.features = {key: archetype_stddev.features[key] for key in significant_keys}
    return archetype_vector, archetype_stddev
def stats_mean(vectors):
    """
    Return a meta FeatureVector holding the per-feature mean over all input vectors.
    Keys are taken from the first vector; vectors missing one of those keys are
    treated as having 0 for it.
    INPUT : vectors : list of FeatureVector objects
    OUTPUT : mean_vector : FeatureVector representing the mean feature values
    """
    # keys come from the first vector, sorted for a stable column order
    keys = sorted(vectors[0].features)
    feature_arrays = []  # one row per vector, columns follow `keys`
    for vector in vectors:
        feature_array = []
        for key in keys:
            if key not in vector.features:
                # NOTE: fills the missing key on the input vector itself,
                # matching the original's in-place behavior
                vector.features[key] = 0
            feature_array.append(vector.features[key])
        feature_arrays.append(feature_array)
    # column-wise mean across vectors
    mean_feature_array = numpy.mean(numpy.array(feature_arrays), axis=0)
    mean_features = dict(zip(keys, mean_feature_array))
    return FeatureVector(None, None, None, mean_features)
def stats_stddev(vectors):
    """
    Return a meta FeatureVector holding the per-feature standard deviation over
    all input vectors. Keys are taken from the first vector; vectors missing one
    of those keys are treated as having 0 for it.
    INPUT : vectors : list of FeatureVector objects
    OUTPUT : stddev_vector : FeatureVector representing the standard deviations
    """
    # keys come from the first vector, sorted for a stable column order
    keys = sorted(vectors[0].features)
    feature_arrays = []  # one row per vector, columns follow `keys`
    for vector in vectors:
        feature_array = []
        for key in keys:
            if key not in vector.features:
                # NOTE: fills the missing key on the input vector itself,
                # matching the original's in-place behavior
                vector.features[key] = 0
            feature_array.append(vector.features[key])
        feature_arrays.append(feature_array)
    # column-wise (population) standard deviation across vectors
    stddev_feature_array = numpy.std(numpy.array(feature_arrays), axis=0)
    stddev_features = dict(zip(keys, stddev_feature_array))
    return FeatureVector(None, None, None, stddev_features)
def stats_summary(subset_vectors, superset_vectors):
    """
    Print a summary of distance statistics for the family (subset) relative to
    its archetype, and of the whole population relative to the same archetype.
    INPUT : subset_vectors : list of FeatureVector objects (vector family)
    INPUT : superset_vectors : list of FeatureVector objects (entire vector population)
    OUTPUT : NONE : prints results to stdout
    """
    # archetype = family mean restricted to significant features
    subset_mean_vector, archetype_stddev = get_archetype(subset_vectors, superset_vectors)
    # euclidean distance to the archetype for each family vector
    distances = [subset_mean_vector.get_distance(v) for v in subset_vectors]
    subset_mean = numpy.mean(distances)
    subset_stddev = numpy.std(distances)
    # task_ids of population vectors within 1, 2 & 3 standard deviations of
    # the family's mean distance
    one_sig = []
    two_sig = []
    three_sig = []
    distances = []
    for second_vector in superset_vectors:
        dist = subset_mean_vector.get_distance(second_vector)
        if dist < (subset_stddev + subset_mean):
            one_sig.append(second_vector.task_id)
        if dist < ((2 * subset_stddev) + subset_mean):
            two_sig.append(second_vector.task_id)
        if dist < ((3 * subset_stddev) + subset_mean):
            three_sig.append(second_vector.task_id)
        distances.append(dist)
    superset_mean = numpy.mean(distances)
    superset_stddev = numpy.std(distances)
    # single-arg print() calls parse identically under py2 and py3
    print("")
    print("Mean distance from family samples to family archetype : {0}".format(subset_mean))
    print("Standard deviation of distance for family samples to family archetype : {0}".format(subset_stddev))
    print("Mean distance from all samples to family archetype : {0}".format(superset_mean))
    print("Standard deviation of distance for all samples to family archetype : {0}".format(superset_stddev))
    print("Total number of vectors in family : {0}".format(len(subset_vectors)))
    print("Number of all vectors within one sigma of family mean : {0}".format(len(one_sig)))
    print("Number of all vectors two sigma of family mean : {0}".format(len(two_sig)))
    print("Number of all vectors within three sigma of family mean : {0}".format(len(three_sig)))
    print("")
def auto_archetype():
    """
    Auto-generate vector groupings, find the archetype for each group, and
    upsert the results into the archetypes table.
    NOTE: overwrites existing archetypes with the same label.
    INPUT : NONE
    OUTPUT : NONE
    """
    # max values used to normalize every vector set
    max_values_vector = get_max_values_vector()
    # one group per unique (family, filetype, source, report_date) tuple
    tag_groups = autogen_tag_groups()
    machines = ['win7x32', 'win7x64']
    # separate by machine
    for machine in machines:
        for tags in tag_groups:
            # group name auto generated by tags and machine
            group_name = "{0}_{1}_{2}_{3}_{4}".format(tags[0][1], tags[1][1], tags[2][1], tags[3][1], machine)
            print("Creating automated archetype for {0}".format(group_name))
            task_ids = None
            hashes = None
            # subset: vectors carrying the full tag tuple on this machine
            subset_vectors = select_vectors(tags, [machine], task_ids, hashes)
            if not subset_vectors:
                # select_vectors returns False when nothing matched; the
                # original crashed inside normalize_vectors in this case
                print("\t ...no vectors found for {0}, skipping".format(group_name))
                continue
            subset_vectors = normalize_vectors(subset_vectors, max_values_vector)
            # superset: keep only the filetype tag and machine
            superset_vectors = select_vectors([tags[1]], [machine], None, None)
            if not superset_vectors:
                print("\t ...no superset vectors for {0}, skipping".format(group_name))
                continue
            superset_vectors = normalize_vectors(superset_vectors, max_values_vector)
            # find archetype vector and its stddev companion
            archetype, archetype_stddev = get_archetype(subset_vectors, superset_vectors)
            # save to database
            label = group_name
            mean_features = json.dumps(archetype.features)
            stddev_features = json.dumps(archetype_stddev.features)
            max_value_features = json.dumps(max_values_vector.features)
            cmd = "INSERT INTO archetypes (label, mean_features, stddev_features, max_value_features) VALUES (%s,%s,%s,%s)"
            data = [label, mean_features, stddev_features, max_value_features]
            # connect and execute
            connection = MySQLdb.connect(db_host, db_user, db_pass, db_name)
            cursor = connection.cursor()
            try:
                cursor.execute(cmd, data)
                print("\t ...added {0}".format(label))
            except Exception as e:
                # fall back to UPDATE when the label already exists
                if 'Duplicate entry' in str(e):
                    cmd = "UPDATE archetypes SET label=%s, mean_features=%s, stddev_features=%s, max_value_features=%s WHERE label=%s"
                    data.append(label)
                    cursor.execute(cmd, data)
                    print("...updated {0}".format(label))
            connection.commit()
            connection.close()
def print_verbose(vectors):
    """
    Verbosely print each vector's metadata and features to stdout.
    INPUT : vectors : list of FeatureVector objects
    OUTPUT : None
    """
    for vector in vectors:
        # single-arg print() calls parse identically under py2 and py3
        print("---------------------------------")
        print("Vector Information: ")
        if vector.md5:
            print("MD5 : {0}".format(vector.md5))
        if vector.machine:
            print("Machine : {0}".format(vector.machine))
        if vector.task_id:
            print("Task ID : {0}".format(vector.task_id))
        print("Vector Magnitude : {0}".format(vector.magnitude))
        print("---------------------------------")
        print("")
        # .items() instead of py2-only .iteritems()
        for key, value in vector.features.items():
            print("\t{0:60} , {1:>20}".format(key, value))
        print("")
def print_csv(vectors):
    """
    Print vectors to stdout in CSV format.
    The column order is fixed by the first vector's feature keys so every row
    lines up with the header; the original iterated each vector's own dict,
    which could misalign columns when key order differed between vectors.
    Features a vector is missing print as 0 to keep columns aligned.
    INPUT : vectors : list of FeatureVector objects
    OUTPUT : None
    """
    entry = vectors[0]
    # metadata columns are emitted only when the first vector has all three
    include_meta = bool(entry.md5 and entry.machine and entry.task_id)
    keys = list(entry.features)  # fixed column order for header and all rows
    header_cells = (["md5", "machine", "task_id"] if include_meta else []) + ["{0}".format(k) for k in keys]
    print(",".join(header_cells).strip())
    for vector in vectors:
        cells = []
        if include_meta:
            cells += ["{0}".format(vector.md5), "{0}".format(vector.machine), "{0}".format(vector.task_id)]
        cells += ["{0}".format(vector.features.get(key, 0)) for key in keys]
        print(",".join(cells).strip())
def plotly_scatter(vectors, mean_vector, stddev_vector, feature_keys, plot_name, plot_title):
    """
    Use Plotly to create a scatter plot of vectors.
    Plotly credentials must be stored.
    INPUT : vectors : list of FeatureVectors
    INPUT : mean_vector (None allowed) : FeatureVector graphed as mean
    INPUT : stddev_vector (None allowed) : FeatureVector graphed as stddev
    INPUT : feature_keys (None allowed) : list : list of features to plot
    INPUT : plot_name : string : name for plotly chart
    INPUT : plot_title : string : title displayed on the chart
    OUTPUT : plot_url : string : url to the plotly chart
    """
    # restrict everything to feature_keys (prune_features mutates in place)
    if feature_keys:
        for vector in vectors:
            vector.prune_features(feature_keys)
        if mean_vector:
            # the original called the non-existent .prune() here (AttributeError)
            mean_vector = mean_vector.prune_features(feature_keys)
        if stddev_vector:
            stddev_vector = stddev_vector.prune_features(feature_keys)
    # calculate mean and stddev if not provided
    if not mean_vector:
        mean_vector = stats_mean(vectors)
    if not stddev_vector:
        stddev_vector = stats_stddev(vectors)
    # sort features by mean value; x holds feature names, y their mean values
    feature_tuples = sorted((value, key) for key, value in mean_vector.features.items())
    x = [key for value, key in feature_tuples]
    y = [value for value, key in feature_tuples]
    y_stddev = [stddev_vector.features[key] for value, key in feature_tuples]
    mean_trace = Scatter(
        x=x,
        y=y,
        mode='lines+markers',
        name="MEAN",
        line=Line(
            width=3
        )
    )
    # add first trace to data
    data = Data([mean_trace])
    # dashed bands at +/- 2 standard deviations around the mean
    a = numpy.array(y)
    b = numpy.array(y_stddev)
    plus2sig = Scatter(
        x=x,
        y=a + (2 * b),
        mode='lines',  # the original's 'line' is not a valid plotly mode
        name='+2 STDDEV',
        line=Line(
            dash='dash'
        )
    )
    data.append(plus2sig)
    minus2sig = Scatter(
        x=x,
        y=a - (2 * b),
        mode='lines',
        name='-2 STDDEV',
        line=Line(
            dash='dash'
        )
    )
    data.append(minus2sig)
    # one marker trace per vector, in the same feature order as the mean trace
    if vectors:
        for vector in vectors:
            vx = [key for value, key in feature_tuples]
            vy = [vector.features[key] for value, key in feature_tuples]
            vector_trace = Scatter(
                x=vx,
                y=vy,
                mode='markers',
                name="ID: {0} {1}".format(vector.task_id, vector.label),
            )
            data.append(vector_trace)
    layout = Layout(
        title=plot_title,
        margin=Margin(
            l=80,
            r=80,
            b=160,
            t=80,
        )
    )
    fig = Figure(data=data, layout=layout)
    plot_url = py.plot(fig, filename=plot_name, auto_open=False)
    return plot_url
def autogen_tag_groups():
    """
    Automate the generation of vector groupings based on unique tag tuples.
    Walks family -> filetype -> source -> report_date, collecting each distinct
    combination present in the tags table.
    INPUT : NONE
    OUTPUT : tag_groups : list of list of tuples (i.e. list of tags)
    """
    # connect to db
    connection = MySQLdb.connect(db_host, db_user, db_pass, db_name)
    cursor = connection.cursor()
    # find distinct family types
    cmd = "SELECT DISTINCT value FROM tags WHERE type='family'"
    cursor.execute(cmd)
    family_results = cursor.fetchall()
    # container for tag groups
    tag_groups = []
    for family_result in family_results:
        # find distinct (family, filetype) pairs
        cmd = ("SELECT DISTINCT t1.value,t2.value FROM tags t1 INNER JOIN tags t2 ON t1.md5=t2.md5 "
               "WHERE t1.type='family' AND t1.value=%s AND t2.type='filetype'")
        data = [family_result[0]]
        cursor.execute(cmd, data)
        filetype_results = cursor.fetchall()
        for filetype_result in filetype_results:
            # find distinct (family, filetype, source) tuples
            cmd = ("SELECT DISTINCT t1.value,t2.value,t3.value "
                   "FROM tags t1 INNER JOIN tags t2 ON t1.md5=t2.md5 INNER JOIN tags t3 ON t2.md5=t3.md5 "
                   "WHERE t1.type='family' AND t1.value=%s AND t2.type='filetype' AND t2.value=%s AND t3.type='source'")
            data = [family_result[0], filetype_result[1]]
            cursor.execute(cmd, data)
            source_results = cursor.fetchall()
            for source_result in source_results:
                # find distinct (family, filetype, source, report_date) tuples
                cmd = ("SELECT DISTINCT t1.value,t2.value,t3.value,t4.value "
                       "FROM tags t1 INNER JOIN tags t2 ON t1.md5=t2.md5 "
                       "INNER JOIN tags t3 ON t2.md5=t3.md5 INNER JOIN tags t4 ON t3.md5=t4.md5 "
                       "WHERE t1.type='family' AND t1.value=%s AND t2.type='filetype' "
                       "AND t2.value=%s AND t3.type='source' AND t3.value=%s AND t4.type='report_date'")
                data = [family_result[0], filetype_result[1], source_result[2]]
                cursor.execute(cmd, data)
                report_date_results = cursor.fetchall()
                for result in report_date_results:
                    tags = [('family', result[0]), ('filetype', result[1]), ('source', result[2]), ('report_date', result[3])]
                    tag_groups.append(tags)
    # the original left an unused sub_cmd string and never closed the connection
    connection.close()
    return tag_groups
if __name__ == '__main__':
    # get input args
    args = get_args()
    # print subcommand: dump vectors matching the filters
    if args.subparser == 'print':
        tags, machines, task_ids, hashes = parse_select_args(args)
        vectors = select_vectors(tags, machines, task_ids, hashes)
        if vectors:
            if args.csv:
                print_csv(vectors)
            else:
                print_verbose(vectors)
        else:
            # single-arg print() calls parse identically under py2 and py3
            print("No results for provided filters")
    # mean subcommand: print the mean vector of the selection
    elif args.subparser == 'mean':
        tags, machines, task_ids, hashes = parse_select_args(args)
        vectors = select_vectors(tags, machines, task_ids, hashes)
        if vectors:
            mean_vector = stats_mean(vectors)
            if args.csv:
                print_csv([mean_vector])
            else:
                print_verbose([mean_vector])
        else:
            print("No results for provided filters")
    # stddev subcommand: print the stddev vector of the selection
    elif args.subparser == 'stddev':
        tags, machines, task_ids, hashes = parse_select_args(args)
        vectors = select_vectors(tags, machines, task_ids, hashes)
        if vectors:
            stddev_vector = stats_stddev(vectors)
            if args.csv:
                print_csv([stddev_vector])
            else:
                print_verbose([stddev_vector])
        else:
            print("No results for provided filters")
    # auto_archetype subcommand (the original comment mislabeled this 'stddev')
    elif args.subparser == 'auto_archetype':
        auto_archetype()