"""This file contains code used in "Think Stats",
by Allen B. Downey, available from greenteapress.com
Copyright 2014 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
from __future__ import print_function, division
import thinkstats2
import thinkplot
import math
import random
import numpy as np
from scipy import stats
from estimation import RMSE, MeanError
"""This file contains a solution to exercises in Think Stats:
Exercise 8.1
In this chapter we used $\xbar$ and median to estimate $\mu$, and
found that $\xbar$ yields lower MSE.
Also, we used $S^2$ and $S_{n-1}^2$ to estimate $\sigma$, and found that
$S^2$ is biased and $S_{n-1}^2$ unbiased.
Run similar experiments to see if $\xbar$ and median are biased estimates
of $\mu$.
Also check whether $S^2$ or $S_{n-1}^2$ yields a lower MSE.
My conclusions:
1) xbar and median yield lower mean error as m increases, so neither
one is obviously biased, as far as we can tell from the experiment.
2) The biased estimator of variance yields lower RMSE than the unbiased
estimator, by about 10%. And the difference holds up as m increases.
Exercise 8.2
Suppose you draw a sample with size $n=10$ from a population
with an exponential disrtribution with $\lambda=2$. Simulate
this experiment 1000 times and plot the sampling distribution of
the estimate $\lamhat$. Compute the standard error of the estimate
and the 90\% confidence interval.
Repeat the experiment with a few different values of $n$ and make
a plot of standard error versus $n$.
1) With sample size 10:
standard error 0.896717911545
confidence interval (1.2901330772324622, 3.8692334892427911)
2) As sample size increases, standard error and the width of
the CI decrease:
10 0.90 (1.3, 3.9)
100 0.21 (1.7, 2.4)
1000 0.06 (1.9, 2.1)
All three confidence intervals contain the actual value, 2.
Exercise 8.3
In games like hockey and soccer, the time between goals is
roughly exponential. So you could estimate a team's goal-scoring rate
by observing the number of goals they score in a game. This
estimation process is a little different from sampling the time
between goals, so let's see how it works.
Write a function that takes a goal-scoring rate, {\tt lam}, in goals
per game, and simulates a game by generating the time between goals
until the total time exceeds 1 game, then returns the number of goals
scored.
Write another function that simulates many games, stores the
estimates of {\tt lam}, then computes their mean error and RMSE.
Is this way of making an estimate biased? Plot the sampling
distribution of the estimates and the 90\% confidence interval. What
is the standard error? What happens to sampling error for increasing
values of {\tt lam}?
My conclusions:
1) RMSE for this way of estimating lambda is 1.4
2) The mean error is small and decreases with m, so this estimator
appears to be unbiased.
One note: If the time between goals is exponential, the distribution
of goals scored in a game is Poisson.
See https://en.wikipedia.org/wiki/Poisson_distribution
"""


def Estimate1(n=7, m=100000):
    """Mean error for xbar and median as estimators of population mean.

    n: sample size
    m: number of iterations
    """
    mu = 0
    sigma = 1

    means = []
    medians = []
    for _ in range(m):
        xs = [random.gauss(mu, sigma) for i in range(n)]
        xbar = np.mean(xs)
        median = np.median(xs)
        means.append(xbar)
        medians.append(median)

    print('Experiment 1')
    print('mean error xbar', MeanError(means, mu))
    print('mean error median', MeanError(medians, mu))
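

# Optional check, not part of the original solution and not called from
# main(): for Gaussian samples both xbar and the sample median are unbiased,
# but xbar has lower RMSE. Exactly, SE(xbar) = sigma / sqrt(n); asymptotically
# SE(median) is larger by a factor of sqrt(pi/2) ~= 1.25. This hypothetical
# helper, EstimateRMSE, is a sketch that compares simulated RMSEs to those
# analytic values (the median formula is only approximate for small n).
def EstimateRMSE(n=7, m=100000):
    """Compares RMSE of xbar and median to analytic standard errors."""
    mu = 0
    sigma = 1

    means = []
    medians = []
    for _ in range(m):
        xs = [random.gauss(mu, sigma) for _ in range(n)]
        means.append(np.mean(xs))
        medians.append(np.median(xs))

    print('RMSE xbar', RMSE(means, mu),
          'analytic', sigma / math.sqrt(n))
    print('RMSE median', RMSE(medians, mu),
          'analytic (approx)', math.sqrt(math.pi / 2) * sigma / math.sqrt(n))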


def Estimate2(n=7, m=100000):
    """RMSE for biased and unbiased estimators of population variance.

    n: sample size
    m: number of iterations
    """
    mu = 0
    sigma = 1

    estimates1 = []
    estimates2 = []
    for _ in range(m):
        xs = [random.gauss(mu, sigma) for i in range(n)]
        biased = np.var(xs)
        unbiased = np.var(xs, ddof=1)
        estimates1.append(biased)
        estimates2.append(unbiased)

    print('Experiment 2')
    print('RMSE biased', RMSE(estimates1, sigma**2))
    print('RMSE unbiased', RMSE(estimates2, sigma**2))
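

# Why the biased estimator wins on RMSE: for Gaussian samples the unbiased
# estimator S^2_{n-1} has MSE 2*sigma^4/(n-1), while the biased estimator
# S^2_n trades a small bias (-sigma^2/n) for lower variance, giving
# MSE (2n-1)*sigma^4/n^2, which is smaller. This hypothetical helper, not
# part of the original solution and not called from main(), is a sketch that
# prints the analytic RMSEs for comparison with Estimate2's simulated ones.
def AnalyticVarianceRMSE(n=7, sigma=1):
    """Prints analytic RMSE of the biased and unbiased variance estimators.

    Assumes Gaussian data; the formulas follow from
    Var(S^2_{n-1}) = 2 sigma^4 / (n-1).
    """
    mse_unbiased = 2 * sigma**4 / (n - 1)
    mse_biased = (2 * n - 1) * sigma**4 / n**2
    print('analytic RMSE biased', math.sqrt(mse_biased))
    print('analytic RMSE unbiased', math.sqrt(mse_unbiased))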


def SimulateSample(lam=2, n=10, m=1000):
    """Sampling distribution of L as an estimator of exponential parameter.

    lam: parameter of an exponential distribution
    n: sample size
    m: number of iterations
    """
    def VertLine(x, y=1):
        thinkplot.Plot([x, x], [0, y], color='0.8', linewidth=3)

    estimates = []
    for j in range(m):
        xs = np.random.exponential(1/lam, n)
        lamhat = 1/np.mean(xs)
        estimates.append(lamhat)

    stderr = RMSE(estimates, lam)
    print('standard error', stderr)

    cdf = thinkstats2.Cdf(estimates)
    ci = cdf.Percentile(5), cdf.Percentile(95)
    print('confidence interval', ci)
    VertLine(ci[0])
    VertLine(ci[1])

    # plot the CDF
    thinkplot.Cdf(cdf)
    thinkplot.Save(root='estimation2',
                   xlabel='estimate',
                   ylabel='CDF',
                   title='Sampling distribution')

    return stderr
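

# Exercise 8.2 also asks for a plot of standard error versus sample size,
# but main() only prints the values. This hypothetical helper is one sketch
# of how that plot could be made; the function name, the default sample
# sizes, and the output filename 'estimation_stderr' are assumptions, not
# part of the original solution, and it is not called from main().
def PlotStandardErrors(lam=2, ns=(10, 100, 1000), m=1000):
    """Plots standard error of the estimated lambda versus sample size."""
    stderrs = []
    for n in ns:
        # repeat the estimation m times for each sample size
        estimates = [1 / np.mean(np.random.exponential(1/lam, n))
                     for _ in range(m)]
        stderrs.append(RMSE(estimates, lam))

    thinkplot.Plot(ns, stderrs)
    thinkplot.Save(root='estimation_stderr',
                   xlabel='sample size',
                   ylabel='standard error',
                   xscale='log')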


def SimulateGame(lam):
    """Simulates a game and returns the estimated goal-scoring rate.

    lam: actual goal scoring rate in goals per game
    """
    goals = 0
    t = 0
    while True:
        time_between_goals = random.expovariate(lam)
        t += time_between_goals
        if t > 1:
            break
        goals += 1

    # estimated goal-scoring rate is the actual number of goals scored
    L = goals
    return L
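

# The docstring notes that if the time between goals is exponential with
# rate lam, the number of goals per game is Poisson(lam). This hypothetical
# helper is a sketch (not part of the original solution, not called from
# main()) that checks the claim by comparing the empirical PMF of
# SimulateGame against scipy's Poisson PMF.
def CompareToPoisson(lam=2, m=100000):
    """Compares the distribution of simulated goals to a Poisson PMF."""
    pmf = thinkstats2.Pmf([SimulateGame(lam) for _ in range(m)])
    for k in range(10):
        # empirical probability vs. analytic Poisson probability
        print(k, pmf.Prob(k), stats.poisson.pmf(k, lam))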


def Estimate4(lam=2, m=1000000):
    """Mean error and RMSE for goals per game as an estimator of lam.

    lam: actual goal-scoring rate in goals per game
    m: number of games to simulate
    """
    estimates = []
    for i in range(m):
        L = SimulateGame(lam)
        estimates.append(L)

    print('Experiment 4')
    print('rmse L', RMSE(estimates, lam))
    print('mean error L', MeanError(estimates, lam))

    pmf = thinkstats2.Pmf(estimates)
    thinkplot.Hist(pmf)
    thinkplot.Show()


def main():
    thinkstats2.RandomSeed(17)

    Estimate1()
    Estimate2()

    print('Experiment 3')
    for n in [10, 100, 1000]:
        stderr = SimulateSample(n=n)
        print(n, stderr)

    Estimate4()


if __name__ == '__main__':
    main()