@@ -13,13 +13,25 @@ def anomalyDetection_example():
13
13
plt = display_2d_data (X , 'bx' )
14
14
plt .title ("origin data" )
15
15
plt .show ()
16
+ '''多元高斯分布函数,并可视化拟合的边界'''
17
+ mu ,sigma2 = estimateGaussian (X ) # 参数估计(求均值和方差)
18
+ #print mu,sigma2
19
+ p = multivariateGaussian (X ,mu ,sigma2 ) # 多元高斯分布函数
20
+ #print p
21
+ visualizeFit (X ,mu ,sigma2 ) # 显示图像
16
22
17
- mu ,sigma2 = estimateGaussian (X )
18
- print mu ,sigma2
19
- p = multivariateGaussian (X ,mu ,sigma2 )
20
- print p
23
+ '''选择异常点(在交叉验证CV上训练得到最好的epsilon)'''
24
+ Xval = data ['Xval' ]
25
+ yval = data ['yval' ]
26
+ pval = multivariateGaussian (Xval , mu , sigma2 ) # 计算CV上的概率密度值
27
+ epsilon ,F1 = selectThreshold (yval ,pval ) # 选择最优的epsilon临界值
28
+ print u'在CV上得到的最好的epsilon是:%e' % epsilon
29
+ print u'对应的F1Score值为:%f' % F1
30
+ outliers = np .where (p < epsilon ) # 找到小于临界值的异常点,并作图
31
+ plt .plot (X [outliers ,0 ],X [outliers ,1 ],'o' ,markeredgecolor = 'r' ,markerfacecolor = 'w' ,markersize = 10. )
32
+ plt = display_2d_data (X , 'bx' )
33
+ plt .show ()
21
34
22
- visualizeFit (X ,mu ,sigma2 )
23
35
24
36
25
37
@@ -44,16 +56,49 @@ def multivariateGaussian(X,mu,Sigma2):
44
56
k = len (mu )
45
57
if (Sigma2 .shape [0 ]> 1 ):
46
58
Sigma2 = np .diag (Sigma2 )
47
-
59
+ '''多元高斯分布函数'''
48
60
X = X - mu
49
61
argu = (2 * np .pi )** (- k / 2 )* np .linalg .det (Sigma2 )** (- 0.5 )
50
62
p = argu * np .exp (- 0.5 * np .sum (np .dot (X ,np .linalg .inv (Sigma2 ))* X ,axis = 1 )) # axis表示每行
51
63
return p
52
64
53
65
# 可视化边界
54
66
def visualizeFit(X, mu, sigma2):
    """Plot the data points and the contour lines of the fitted Gaussian.

    The density returned by multivariateGaussian is evaluated on a regular
    grid covering [0, 36) in both coordinates with a step of 0.5, and
    contour lines are drawn at the levels 10**-20, 10**-17, ..., 10**-2 on
    top of a scatter plot of X. The figure is shown before returning.

    Args:
        X: 2-D array; columns 0 and 1 are used as the plot coordinates.
        mu: mean parameter, passed through to multivariateGaussian.
        sigma2: variance parameter, passed through to multivariateGaussian.
    """
    axis_ticks = np.arange(0, 36, 0.5)
    X1, X2 = np.meshgrid(axis_ticks, axis_ticks)

    # Flatten the grid to one (x1, x2) row per point so the density can be
    # evaluated in a single call, then restore the grid shape for contour().
    grid_points = np.hstack((X1.reshape(-1, 1), X2.reshape(-1, 1)))
    Z = multivariateGaussian(grid_points, mu, sigma2).reshape(X1.shape)

    plt.plot(X[:, 0], X[:, 1], 'bx')

    # Skip the contour lines when any density value is infinite.
    if not np.isinf(Z).any():
        # contour() takes the plural keywords `colors` / `linewidths`; the
        # singular spellings are not contour properties and have no effect.
        plt.contour(X1, X2, Z, 10. ** np.arange(-20, 0, 3),
                    colors='black', linewidths=.5)

    plt.show()
80
+
81
+ # 选择最优的epsilon,即:使F1Score最大
82
def selectThreshold(yval, pval):
    """Pick the threshold epsilon that maximises the F1 score.

    An example is flagged as an anomaly when its density is strictly below
    epsilon. 1000 evenly spaced candidate thresholds between min(pval) and
    max(pval) are tried and the one with the highest F1 score is kept.

    Args:
        yval: ground-truth labels (1 = anomaly, 0 = normal). Any shape with
            one entry per example is accepted; it is flattened so that a
            column vector does not broadcast against a 1-D pval.
        pval: density value of each example, same number of entries as yval.

    Returns:
        A tuple (bestEpsilon, bestF1). Both are 0.0 when pval has no usable
        range (all values equal) or no candidate threshold yields a true
        positive.
    """
    yval = np.ravel(yval)
    pval = np.ravel(pval)

    bestEpsilon = 0.
    bestF1 = 0.

    lowest, highest = np.min(pval), np.max(pval)
    step = (highest - lowest) / 1000.
    if not step > 0:
        # np.arange cannot build a candidate range with a zero/NaN step.
        return bestEpsilon, bestF1

    is_anomaly = (yval == 1)
    is_normal = (yval == 0)

    for epsilon in np.arange(lowest, highest, step):
        flagged = pval < epsilon
        tp = float(np.sum(flagged & is_anomaly))
        if tp == 0:
            # F1 is 0 (or the undefined 0/0) here, so it can never beat the
            # current best; skipping also avoids division-by-zero warnings.
            continue
        fp = float(np.sum(flagged & is_normal))
        # False negatives are true anomalies that were NOT flagged.
        fn = float(np.sum(~flagged & is_anomaly))

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        F1 = (2 * precision * recall) / (precision + recall)
        if F1 > bestF1:
            bestF1 = F1
            bestEpsilon = epsilon

    return bestEpsilon, bestF1
101
+
57
102
58
103
59
104
if __name__ == '__main__' :
0 commit comments