Skip to content

Commit

Permalink
add a beautiful Heap in ./tool,add KNN in /KNN,and the test code in /…
Browse files Browse the repository at this point in the history
…test
  • Loading branch information
justdark committed Nov 13, 2013
1 parent 1433081 commit 3346387
Showing 18 changed files with 324 additions and 17 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -16,10 +16,12 @@ Code Files

`./dml/DT` -Decision Tree , CART algorithm

`./dml/ClUSTER` -some cluster algorithm,inculde kmeans\kmedoids\spectralCluster(todo)
`./dml/CLUSTER` - some clustering algorithms, including kmeans \ kmedoids \ spectralCluster

`./dml/ADAB` -the adaboost algorithm

`./dml/KNN` - the k-Nearest Neighbor algorithm (implemented with a kd-tree and BBF search)

`./dml/tool` -include some basic tools for computing

`./test/` -include some test code for DML
11 changes: 2 additions & 9 deletions TODO.md
Original file line number Diff line number Diff line change
@@ -1,18 +1,11 @@
RECENT TODO List:
==

1.spectual cluster
--
learning

2.Decision Tree
1.Decision Tree
--
add Pruning

3.k-nearest
--
kd tree is required of course

4.Naive Bayesian Model
2.Naive Bayesian Model
--
to do
18 changes: 13 additions & 5 deletions dml/CLUSTER/spectralCluster.py
Original file line number Diff line number Diff line change
@@ -7,7 +7,14 @@ def EuclidDistance(x,y):
return np.sqrt(np.sum(((np.array(x)-np.array(y))**2)))

class SCC:
def __init__(self,X,K,dist=EuclidDistance):
def __init__(self,X,K,dist=EuclidDistance,ftype="Normalized"):
'''
X is a M*N matrix contain M case of train data
K is the number of cluster you want to get
dist is a function that to make the matrix
ftype support "Normalized" or "Ratio"
two different way to calculate Laplacian
'''
self.X=X
self.K=K
self.dist=dist
@@ -16,14 +23,15 @@ def __init__(self,X,K,dist=EuclidDistance):
self.W=self.distmat(X,X)
self.D=np.diag(self.W.sum(axis=0))
self.L=self.D-self.W
self.D[self.D==0]=1
self.L=self.D**(-0.5)*self.L*self.D**(-0.5)

self.ftype=ftype
if ftype=="Normalized":
self.D[self.D==0]=1
self.L=self.D**(-0.5)*self.L*self.D**(-0.5)
pass
def train(self,maxiter=100,threshold=0.1):
v,self.T=eig(self.L)
#print v
self.km=KMEANSC(self.T[:,:self.K],self.K)
self.km=KMEANSC(self.T[:,1:self.K],self.K)
self.km.train(maxiter,threshold)
self.labels=self.km.labels
def distmat(self,X,Y):
Binary file modified dml/CLUSTER/spectralCluster.pyc
Binary file not shown.
12 changes: 12 additions & 0 deletions dml/KNN/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""
"""

import numpy as np
import scipy as sp
import pylab as py
from .knn import KNNC
from .kd import KDTree

__all__ = ['KNNC','KDTree'
]
Binary file added dml/KNN/__init__.pyc
Binary file not shown.
110 changes: 110 additions & 0 deletions dml/KNN/kd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from __future__ import division
import numpy as np
import scipy as sp
from operator import itemgetter
from scipy.spatial.distance import euclidean
from dml.tool import Heap
class KDNode:
    '''
    One node of the kd-tree.

    x            the point stored at this node
    y            the label attached to that point
    l            index of the coordinate this node splits on
    F            parent node (None until linked by the tree builder)
    Lc / Rc      left / right child (None when absent)
    distsToNode  initialised to None; not assigned anywhere else in this file
    '''
    def __init__(self, x, y, l):
        # Payload supplied by the caller.
        self.x, self.y, self.l = x, y, l
        # Structural links and cache start out empty.
        self.F = self.Lc = self.Rc = self.distsToNode = None

class KDTree:
    '''
    kd-tree over the columns of X with a priority-queue ("best bin first")
    k-nearest-neighbour search.

    X    is a N*M matrix: N coordinates per point, M points (one per column)
    y    holds one label per column of X; zeros are used when omitted
    dist is a callable dist(u, v) returning the distance between two points
    '''
    def __init__(self, X, y=None, dist=euclidean):
        X = np.asarray(X)
        if y is None:
            # No labels supplied: give every point the same placeholder label
            # so that the tree can still be built and searched.
            y = np.zeros(X.shape[1])
        self.X = X
        self.k = X.shape[0]  # N, the number of coordinates per point
        self.y = y
        self.dist = dist
        self.P = self.maketree(X, y, 0)
        if self.P is not None:
            self.P.F = None  # the root has no parent

    def maketree(self, data, y, deep):
        '''
        Recursively build the subtree for the points in `data` (N*m) with
        labels `y` (length m). `deep` is the depth of this subtree and selects
        the split coordinate as deep % N.

        Returns the root KDNode of the subtree, or None when `data` is empty.
        '''
        data = np.asarray(data)
        if data.size == 0:
            return None
        if y is None:
            y = np.zeros(data.shape[1])
        y = np.asarray(y)

        case = data.shape[1]
        p = case // 2  # position of the median once sorted
        l = deep % self.k

        # Stable sort of the columns on coordinate l. Labels are permuted
        # alongside instead of being stacked into the matrix, so they keep
        # their own dtype.
        order = np.argsort(data[l, :], kind='mergesort')
        data = data[:, order]
        y = y[order]

        keys = data[l, :]
        v = keys[p]
        rP = KDNode(data[:, p].copy(), y[p], l)
        if case > 1:
            # Left subtree: strictly smaller than the pivot value.
            # Right subtree: everything else except the pivot itself.
            left = keys < v
            right = keys >= v
            right[p] = False
            rP.Lc = self.maketree(data[:, left], y[left], deep + 1)
            if rP.Lc is not None:
                rP.Lc.F = rP
            rP.Rc = self.maketree(data[:, right], y[right], deep + 1)
            if rP.Rc is not None:
                rP.Rc.F = rP
        return rP

    def search_knn(self, P, x, k, maxiter=200):
        '''
        Collect the k nodes nearest to x in the subtree rooted at P.

        Returns a Heap bounded to k entries whose root (heap[1]) is the
        farthest of the neighbours kept; entries live in heap[1..counter].
        An empty Heap is returned when P is None or k is not positive.

        maxiter is accepted for interface compatibility; it is not enforced
        by this implementation.
        '''
        # Flatten so that column vectors such as [[2],[2]] work with distance
        # functions that require 1-D input.
        x = np.ravel(np.asarray(x))

        def pf_compare(a, b):
            # Pending subtrees: closest root first.
            return self.dist(x, a.x) < self.dist(x, b.x)

        def ans_compare(a, b):
            # Answers: farthest on top, so it is the one replaced.
            return self.dist(x, a.x) > self.dist(x, b.x)

        pf_seq = Heap(compare=pf_compare)  # prior sequence
        ans = Heap(k, compare=ans_compare)  # ans sequence
        if P is None or (k is not None and k <= 0):
            return ans

        pf_seq.insert(P)
        while pf_seq.counter > 0:
            t = pf_seq.heap[1]
            pf_seq.delete(1)
            now = t.F
            if ans.counter == k and now is not None:
                # t lies on the far side of its parent's splitting plane.
                # Project x onto that plane; if even the plane is farther
                # than the current worst answer, the subtree cannot help.
                # A float copy avoids truncating the coordinate when x has
                # an integer dtype.
                q = x.astype(float)
                q[now.l] = now.x[now.l]
                if self.dist(q, x) > self.dist(ans.heap[1].x, x):
                    continue
            self.to_leaf(t, x, pf_seq, ans)
        return ans

    def to_leaf(self, P, x, pf_seq, ans):
        '''
        Walk from P down to a leaf along the side of each split that x falls
        on. Every visited node is offered to `ans`; the sibling subtree that
        is not followed is queued in `pf_seq`.

        Returns (last visited node, pf_seq, ans); the node is None when P is
        None.
        '''
        tp = P
        if tp is None:
            return tp, pf_seq, ans
        while True:
            ans.insert(tp)
            if tp.x[tp.l] > x[tp.l]:
                near, far = tp.Lc, tp.Rc
            else:
                near, far = tp.Rc, tp.Lc
            if far is not None:
                pf_seq.insert(far)
            if near is None:
                return tp, pf_seq, ans
            tp = near





Binary file added dml/KNN/kd.pyc
Binary file not shown.
66 changes: 66 additions & 0 deletions dml/KNN/knn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#coding:utf-8
import numpy as np
import scipy as sp
from scipy.spatial.distance import cdist
from scipy.spatial.distance import euclidean
from dml.KNN.kd import KDTree

#import pylab as py
class KNNC:
    '''
    k-nearest-neighbour classifier that answers queries through a KDTree.
    '''
    def __init__(self, X, K, labels=None, dist=euclidean):
        '''
        X is a N*M matrix where M is the number of cases (one per column)
        K is the default number of neighbours
        labels holds one label per case and is used by predict/pred;
            zeros are used when it is omitted
        dist is a callable dist(u, v) returning the distance between two
            points, e.g. the functions in scipy.spatial.distance
            (euclidean, cityblock, chebyshev, ...). Metric names given as
            strings are not accepted because dist is called directly.
        '''
        self.X = np.array(X)
        if labels is None:
            # One placeholder label per case, so neighbour lookup still works.
            labels = np.zeros(self.X.shape[1])
        self.labels = np.array(labels)
        self.K = K
        self.dist = dist
        self.KDTrees = KDTree(self.X, self.labels, self.dist)

    def predict(self, x, k=None):
        '''
        Return the majority label among the k nearest neighbours of x.
        k defaults to the K given at construction. Ties go to the label that
        reaches the winning count first; 0 is returned when no neighbour is
        found.
        '''
        if k is None:
            k = self.K
        ans = self.KDTrees.search_knn(self.KDTrees.P, x, k)
        votes = {}
        best_count = 0
        best_label = 0
        # heap[0] is a placeholder; neighbours occupy heap[1..counter].
        for node in ans.heap[1:ans.counter + 1]:
            label = node.y
            votes[label] = votes.get(label, 0) + 1
            if votes[label] > best_count:
                best_count = votes[label]
                best_label = label
        return best_label

    def for_point(self, test_x, k=None):
        '''
        Return the coordinates of the k nearest neighbours of test_x as a
        list. k defaults to the K given at construction.
        '''
        if k is None:
            k = self.K
        ans = self.KDTrees.search_knn(self.KDTrees.P, np.array(test_x), k)
        return [node.x for node in ans.heap[1:ans.counter + 1]]

    def pred(self, test_x, k=None):
        '''
        test_x is a N*TM matrix,and indicate TM test case
        you can redecide the k
        Returns a list with one predicted label per column of test_x.
        '''
        if k is None:
            k = self.K
        test_case = np.array(test_x)
        return [self.predict(test_case[:, i], k)
                for i in range(test_case.shape[1])]
Binary file added dml/KNN/knn.pyc
Binary file not shown.
2 changes: 1 addition & 1 deletion dml/__init__.py
Original file line number Diff line number Diff line change
@@ -7,5 +7,5 @@
import pylab as py


__all__ = ['LR','NN','CLUSTER','ADAB','DT','tool'
__all__ = ['LR','NN','CLUSTER','ADAB','DT','KNN','tool'
]
Binary file modified dml/__init__.pyc
Binary file not shown.
4 changes: 3 additions & 1 deletion dml/tool/__init__.py
Original file line number Diff line number Diff line change
@@ -9,6 +9,7 @@
from .sign import sign
from .pca import pca,projectData,recoverData
from .displayData import displayData
from .heap import Heap
__all__ = ['sigmoid',
'normalize',
'disnormalize',
@@ -17,5 +18,6 @@
'pca',
'projectData',
'recoverData',
'displayData'
'displayData',
'Heap'
]
Binary file modified dml/tool/__init__.pyc
Binary file not shown.
84 changes: 84 additions & 0 deletions dml/tool/heap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from __future__ import division
import numpy as np
import scipy as sp
def heap_judge(a,b):
    '''Default ordering: True when a must sit above b, which gives a max heap.'''
    return a>b

class Heap:
    '''
    Binary heap stored in a list, with the root at index 1.

    self.heap[0] is a placeholder; live elements are heap[1..counter].
    '''
    def __init__(self,K=None,compare=heap_judge):
        '''
        'K' is the parameter to restrict the length of Heap
            !!! when K is confirmed,the Min heap contain Max K elements
                while Max heap contain Min K elements
        'compare' is the compare function which return a BOOL when pass two variable
            compare(a,b) is True when a must sit above b
            default is Max heap
        '''
        self.K=K
        self.compare=compare
        self.heap=['#']
        self.counter=0

    def insert(self,a):
        '''
        Add a. When the heap already holds K elements, a replaces the root
        unless compare(a, root) is True; with K <= 0 nothing is stored.
        '''
        if self.K is None or self.counter<self.K:
            self.heap.append(a)
            self.counter+=1
            self.up(self.counter)
        elif self.K>0 and not self.compare(a,self.heap[1]):
            self.heap[1]=a
            self.down(1)
        return

    def up(self,index):
        '''Move the element at index towards the root while it beats its parent.'''
        heap=self.heap
        while index>1:
            parent=index//2
            if not self.compare(heap[index],heap[parent]):
                break
            heap[index],heap[parent]=heap[parent],heap[index]
            index=parent
        return

    def down(self,index):
        '''Move the element at index towards the leaves until it beats its best child.'''
        heap=self.heap
        while 2*index<=self.counter:
            tar_index=2*index
            # Pick whichever child would sit on top of the other.
            if tar_index<self.counter and not self.compare(heap[tar_index],heap[tar_index+1]):
                tar_index+=1
            if self.compare(heap[index],heap[tar_index]):
                break
            heap[index],heap[tar_index]=heap[tar_index],heap[index]
            index=tar_index
        return

    def delete(self,index):
        '''
        Remove the element stored at position index (1 is the root).
        Raises IndexError when index is outside 1..counter.
        '''
        if not 1<=index<=self.counter:
            raise IndexError("heap index out of range")
        last=self.heap.pop()
        self.counter-=1
        if index<=self.counter:
            self.heap[index]=last
            # The former last element can violate the order in either
            # direction when index is not the root, so restore both ways.
            self.up(index)
            self.down(index)

    def delete_ele(self,a):
        '''
        Remove the first element equal to a.
        Returns the position it occupied, or None when a is not in the heap.
        '''
        try:
            # Start at 1 so the placeholder at index 0 can never match.
            t=self.heap.index(a,1)
        except ValueError:
            t=None
        if t is not None:
            self.delete(t)
        return t
Binary file added dml/tool/heap.pyc
Binary file not shown.
9 changes: 9 additions & 0 deletions test/knn/knn_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from __future__ import division
import numpy as np
import scipy as sp
from dml.KNN.kd import KDTree
from dml.KNN import KNNC
# Six 2-D training points, one per column, each with its own label.
X = np.array([[2, 5, 9, 4, 8, 7],
              [3, 4, 6, 7, 1, 2]])
y = np.array([2, 5, 9, 4, 8, 7])
knn = KNNC(X, 1, y)
# Look up the single nearest neighbour of the point (2, 2).
# The query is passed as a flat vector because current SciPy distance
# functions reject 2-D input; print(...) with one argument behaves the same
# on Python 2 and Python 3.
print(knn.for_point([2, 2]))
Loading

0 comments on commit 3346387

Please sign in to comment.