forked from pthomas1/MLBlog
-
Notifications
You must be signed in to change notification settings - Fork 0
/
probability.py
475 lines (379 loc) · 18.2 KB
/
probability.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
# The MIT License (MIT)
# Copyright (c) 2015 Thoughtly
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.
#
#
#
# argparse is a standard Python mechanism for handling commandline
# args while avoiding a bunch of boilerplate code.
import argparse
# This is a module that provides a bunch of simple methods that make
# accessing the filesystem simpler.
from utils import fs, charting, log
# Python logging allows us to log formatted log messages at different
# log levels.
import logging
# numpy is just used for some simple array helpers
import numpy
# need to generate pseudo random numbers
import random
def main():
# Build the commandline parser and return entered args. This also
# setups up any non-ML/NLP config needed by the script (such as logging)
args = configure_command_line_arguments()
random.seed()
num_trials = int(args["numTrials"])
# execute the coin flip test
if args["coinFlip"]:
generate_coin_flip_distribution_offset(num_trials, float(args["coinFlipMultiplier"]))
# execute the dice roll test
if args["diceRoll"]:
generate_die_roll_sum_distribution(num_trials, int(args["numDice"]))
# generate a uniform distribution
if args["uniformDistribution"]:
generate_uniformly_distributed_pdf(num_trials)
# generate a guassian distribution
if args["gaussianDistribution"]:
generate_gaussian_distributed_pdf(num_trials, float(args["mean"]), float(args["standardDeviation"]))
# generate a poisson distribution
if args["poissonDistribution"]:
generate_poisson_distributed_pdf(num_trials, int(args["lambda"]))
if args["jars"]:
marbles_and_jars(num_trials);
###############################################################################
#
# Generate and plot a Poisson Distribution based on the lambda passed in.
# The distribution is generated by the numpy Poisson random number generator,
# generating number_of_samples samples.
#
###############################################################################
def generate_poisson_distributed_pdf(number_of_samples, lam):
# generate number_of_samples random numbers with a Poisson dist
values = numpy.random.poisson(lam, number_of_samples).tolist()
# plot the distribution
charting.plot_distribution("poisson_distribution.png",
"Poisson Distribution - " + str(number_of_samples) + ", lambda = " + str(lam),
"Likelihoods",
bucket_size=1,
data=values,
show_bucket_values=True,
color='#59799e',
normalize=True);
###############################################################################
#
# Generate and plot a Gaussian Distribution based on the mean and standard
# deviation passed in. The distribution is generated by the numpy gauss
# random number generator, generating number_of_samples samples.
#
###############################################################################
def generate_gaussian_distributed_pdf(number_of_samples, mean, std_dev):
values = []
# generate gaussian distribution with mean and standard deviation
for i in range(0, number_of_samples):
values.extend([random.gauss(mean, std_dev)])
# chart the output
charting.plot_distribution("gaussian_distribution.png",
"Gaussian Distribution (" + str(number_of_samples) + ", mean = " + str(mean) + ", stnd dev = " + str(std_dev) + ")",
"Likelihoods",
num_buckets=10+int(std_dev*10),
data=values,
show_bucket_values=True,
color='#59799e',
normalize=True);
###############################################################################
#
# Generate and plot a Uniform Distribution from zero to one. The distribution
# is generated by the python random number generator.
#
###############################################################################
def generate_uniformly_distributed_pdf(number_of_samples):
values = []
# generate uniformly distributed random values
for i in range(0, number_of_samples):
values.extend([random.random()])
# plot the distribution
charting.plot_distribution("uniform_distribution.png",
"Uniform Distribution (" + str(number_of_samples) + ")",
"Likelihoods",
num_buckets=10,
data=values,
show_bucket_values=True,
color='#59799e',
normalize=True);
###############################################################################
#
# This method generates a number of series of coin flips. Each series
# generates number_of_flips flips and plots how far from fair the series of
# flips ended up. You pass in the max number of flips in a given trial.
# The flip_count_multiplier is the step size multiplier from one trial to the
# next. Larger flip_count_multiplier results in fewer trials.
#
###############################################################################
def generate_coin_flip_distribution_offset(max_number_of_flips, flip_count_multiplier=1.1):
flip_counts = []
head_percentages = []
number_of_flips = 2
while number_of_flips < max_number_of_flips:
logging.info("Generating " + str(number_of_flips) + " coin flips")
# Flip the coin over and over and report back the number of heads
# so we can then determine the ratio of heads
number_of_heads = flip_a_coin(number_of_flips)
ratio_of_heads = float(number_of_heads) / number_of_flips
flip_counts.extend([number_of_flips])
# Whatever number we get, unless it was exactly .5, it was off from the ideal. Record
# that offset from the expected so we can plot it.
error_from_expected = abs(.5 - ratio_of_heads)
head_percentages.extend([error_from_expected])
# It would take forever to walk from 1 to a million, but it's not too bad if
# we multiply the number of coin flip trials each time instead of adding.
number_of_flips = int(number_of_flips * flip_count_multiplier) + 1
# output a text variation of the generated percentages
logging.debug(str(flip_counts))
logging.debug(str(head_percentages))
# we don't have room to display all number labels, so eliminate all but 8
x_label_step_size = len(flip_counts) / 8
for i in range(0, len(flip_counts)):
if i % x_label_step_size:
flip_counts[i] = ""
# now generate a plot
charting.bar_chart("coin_flip.png", [head_percentages],
"Heads Flips - Offset from Ideal (" + str(max_number_of_flips) + ")",
flip_counts,
"Offset from .5 - Larger is Worse",
None,
['#59799e'],
0,
0,
False,
.5,
"none")
###############################################################################
#
# Execute N die rolls and output an array of values.
#
###############################################################################
def generate_die_roll_sum_distribution(number_of_rolls, number_of_dice):
values = []
die_or_dice = "Die"
if number_of_dice > 1: die_or_dice = "Dice"
logging.info("Rolling " + str(number_of_dice) + " " + die_or_dice + " " + str(number_of_rolls) + " times");
# execute number_of_rolls rolls of number_of_dice dice and accumulate the sum of the values
for i in range(0, number_of_rolls):
sum = 0
for d in range(0, number_of_dice):
sum += random.randint(1,6)
values.extend([sum])
# plot the output
charting.plot_distribution("dice_rolls.png",
"Roll Distribution - " + str(number_of_dice) + " " + die_or_dice + ", " + str(number_of_rolls) + " rolls",
"Sum of Values",
bucket_size=1,
data=values,
show_bucket_values=True,
color='#59799e',
normalize=True)
###############################################################################
#
# Execute N coin flips and output the number of heads and tails
#
###############################################################################
def flip_a_coin(number_of_flips):
number_of_heads = 0
for i in range(0, number_of_flips):
# Return a 0 or a 1. We'll call 1's heads
number_of_heads += random.randint(0, 1)
return number_of_heads
###############################################################################
#
# This function is responsible for running a simulation of draws from a number
# of jars filled with colored marbles. The user can configure the simulation
# in marbles.csv. Each row is a jar. Columns represent the marbles in each
# of the jars. Two are provided to match the tutorial text, but an arbitrary
# number can be simulated.
#
###############################################################################
def marbles_and_jars(num_trials):
# read in the csv file of jars
rows = fs.read_csv("marbles.csv")
logging.debug("Read rows: " + str(rows))
jars = {}
headers = []
marble_picks = {}
# go through the rows and build a dictionary of jar_name => array of marble colors
for index, row in enumerate(rows):
# first row is just header data
if index == 0:
headers = row
else:
# go through each of the headers (these are columns)
for column_index, header in enumerate(headers):
# if the first column than it's the name of the jar - initialize the array to empty (no marbles)
if column_index == 0:
jars[row[0]] = []
else:
# each other column represents a number of marbles, the name of the marble is in the header
marble_color = header
# initialize the counters for picking marbles for the given color
marble_picks[marble_color] = 0
# set blank cells to 0, otherwise add the value in the cell
if len(row[column_index]) == 0:
num_marbles = 0
else:
num_marbles = int(row[column_index])
# expand an array of colors, 1 element for each num_marbles
jars[row[0]] += [marble_color] * num_marbles
logging.info("Jars: " + str(jars))
for i in range(0, num_trials):
# pick a random jar from all of the jars w/out taking the marbles into consideration
jar_names = jars.keys()
jar_name = jar_names[random.randint(0, len(jar_names) - 1)]
# now draw a single marble from all the marbles given that we selected a jar
marbles = jars[jar_name];
marble = marbles[random.randint(0, len(marbles) - 1)]
marble_picks[marble] += 1
logging.info("Marble picks : " + str(marble_picks))
# prepare the data for plotting
keys = []
data = []
for key, value in marble_picks.iteritems():
column_name = key + " (" + str(value) + ")"
keys.extend([column_name])
data.extend([value/float(num_trials)])
description_list = []
for jar_name, jar_marbles in jars.iteritems():
description_list.append(jar_name + "(" + str(len(jar_marbles)) + ")")
description = ", ".join(description_list)
# plot the data
charting.bar_chart("marbles.png",
[data],
"Marbles in Jars (" + str(num_trials) + ") - " + description,
keys,
"Probabilities",
None,
['#59799e'])
###############################################################################
#
# Build the commandline parser for the script and return a map of the entered
# options. In addition, setup logging based on the user's entered log level.
# Specific options are documented inline.
#
###############################################################################
def configure_command_line_arguments():
# Initialize the commandline argument parser.
parser = argparse.ArgumentParser(description='Play with probabilities')
# Configure the log level parser. Verbose shows some logs, veryVerbose
# shows more
logging_group = parser.add_mutually_exclusive_group(required=False)
logging_group.add_argument("-v",
"--verbose",
help="Set the log level verbose.",
action='store_true',
required=False)
logging_group.add_argument("-vv",
"--veryVerbose",
help="Set the log level verbose.",
action='store_true',
required=False)
# Run the coin flip simulation and plot the output
parser.add_argument('-cf',
'--coinFlip',
help="Generate the coin flip offset chart",
required=False,
action='store_true')
# configure how big of a multiplier to use when stepping the coin flipper
parser.add_argument('-cfm',
'--coinFlipMultiplier',
help="Each iteration of the coin flip test runs more times than the previous, where current = cfm*previous.",
required=False,
default=1.2)
# Run the dice roll simulation and plot the output
parser.add_argument('-d',
'--diceRoll',
help="Generate the dice roll distribution",
required=False,
action='store_true')
# configure the number of dice to use in the dice roller
parser.add_argument('-nd',
'--numDice',
help="How many dice to use in the dice roll distribution",
required=False,
default=1)
# configure the number of trials to use for any of the simulations
parser.add_argument('-nt',
'--numTrials',
help="How many trials to use for generating the distribution",
required=False,
default=1000000)
# generate a uniform distribution
parser.add_argument('-ud',
'--uniformDistribution',
help="Generate a Uniform distribution",
required=False,
action='store_true')
# generate a gaussian distribution
parser.add_argument('-gd',
'--gaussianDistribution',
help="Generate a Gaussian distribution",
required=False,
action='store_true')
# set the mean for a gaussian distribution
parser.add_argument('-m',
'--mean',
help="Set the mean for the Gaussian distribution",
required=False,
default=0)
# set the standard deviation for the gaussian distribution
parser.add_argument('-sd',
'--standardDeviation',
help="Set the standard deviation for the Gaussian distribution",
required=False,
default=1)
# generate a poisson distribution
parser.add_argument('-pd',
'--poissonDistribution',
help="Generate a Poisson distribution",
required=False,
action='store_true')
# set lambda for the poisson distribution
parser.add_argument('-l',
'--lambda',
help="Set lambda (the expected arrival rate) for the Poisson distribution",
required=False,
default=3)
# run marble/jar simulation
parser.add_argument('-j',
'--jars',
help="Calculate probability distribution of marble choices.",
required=False,
action='store_true')
# Parse the passed commandline args and turn them into a dictionary.
args = vars(parser.parse_args())
# Configure the log level based on passed in args to be one of DEBUG, INFO, WARN, ERROR, CRITICAL
log.set_log_level_from_args(args)
return args
###############################################################################
#
# This is a pythonism. Rather than putting code directly at the "root"
# level of the file we instead provide a main method that is called
# whenever this python script is run directly.
#
###############################################################################
if __name__ == "__main__":
main()