add a beautiful Heap in ./tool,add KNN in /KNN,and the test code in /…

…test
justdark · Nov 13, 2013 · 3346387 · 3346387
1 parent 1433081
commit 3346387
Showing 18 changed files with 324 additions and 17 deletions.
diff --git a/README.md b/README.md
@@ -16,10 +16,12 @@ Code Files
 
 `./dml/DT` -Decision Tree , CART algorithm
 
-`./dml/ClUSTER` -some cluster algorithm,inculde kmeans\kmedoids\spectralCluster(todo)
+`./dml/CLUSTER` -some cluster algorithms, including kmeans \ kmedoids \ spectralCluster
 
 `./dml/ADAB` -the adaboost algorithm
 
+`./dml/KNN` -the k-Nearest Neighbor algorithm (kd-tree with BBF search)
+
 `./dml/tool` -include some basic tools for computing
 
 `./test/` -include some test code for DML

diff --git a/TODO.md b/TODO.md
@@ -1,18 +1,11 @@
 RECENT TODO List:
 ==
 
-1.spectual cluster
---
- learning
 
-2.Decision Tree
+1.Decision Tree
 --
  add Pruning
 
-3.k-nearest
---
- kd tree is required of course
-
-4.Naive Bayesian Model
+2.Naive Bayesian Model
 --
 to do
diff --git a/dml/CLUSTER/spectralCluster.py b/dml/CLUSTER/spectralCluster.py
@@ -7,7 +7,14 @@ def EuclidDistance(x,y):
 	return np.sqrt(np.sum(((np.array(x)-np.array(y))**2)))
 
 class SCC:
-	def __init__(self,X,K,dist=EuclidDistance):
+	def __init__(self,X,K,dist=EuclidDistance,ftype="Normalized"):
+		'''
+			X is an M*N matrix containing M cases of training data
+			K is the number of clusters you want to get
+			dist is the function used to make the matrix
+			ftype supports "Normalized" or "Ratio",
+			      two different ways to calculate the Laplacian
+		'''
 		self.X=X
 		self.K=K
 		self.dist=dist
@@ -16,14 +23,15 @@ def __init__(self,X,K,dist=EuclidDistance):
 		self.W=self.distmat(X,X)
 		self.D=np.diag(self.W.sum(axis=0))
 		self.L=self.D-self.W
-		self.D[self.D==0]=1
-		self.L=self.D**(-0.5)*self.L*self.D**(-0.5)
-
+		self.ftype=ftype
+		if ftype=="Normalized":
+			self.D[self.D==0]=1
+			self.L=self.D**(-0.5)*self.L*self.D**(-0.5)
 		pass
 	def train(self,maxiter=100,threshold=0.1):
 		v,self.T=eig(self.L)
 		#print v
-		self.km=KMEANSC(self.T[:,:self.K],self.K)
+		self.km=KMEANSC(self.T[:,1:self.K],self.K)
 		self.km.train(maxiter,threshold)
 		self.labels=self.km.labels
 	def distmat(self,X,Y):

diff --git a/dml/CLUSTER/spectralCluster.pyc b/dml/CLUSTER/spectralCluster.pyc
diff --git a/dml/KNN/__init__.py b/dml/KNN/__init__.py
@@ -0,0 +1,12 @@
+"""
+
+"""
+
+import numpy as np
+import scipy as sp
+import pylab as py
+from .knn import KNNC
+from .kd import KDTree
+
+__all__ = ['KNNC','KDTree'
+]
diff --git a/dml/KNN/__init__.pyc b/dml/KNN/__init__.pyc
diff --git a/dml/KNN/kd.py b/dml/KNN/kd.py
@@ -0,0 +1,110 @@
+from __future__ import division
+import numpy as np
+import scipy as sp
+from operator import itemgetter
+from scipy.spatial.distance import euclidean
+from dml.tool import Heap
+class KDNode:
+	def __init__(self,x,y,l):
+		self.x=x
+		self.y=y
+		self.l=l
+		self.F=None
+		self.Lc=None
+		self.Rc=None
+		self.distsToNode=None
+
+class KDTree:
+	def __init__(self,X,y=None,dist=euclidean):
+		self.X=X
+		self.k=X.shape[0] #N
+		self.y=y
+		self.dist=dist
+		self.P=self.maketree(X,y,0)
+		self.P.F=None
+	def maketree(self,data,y,deep):
+		if data.size==0:
+			return None
+		lenght = data.shape[0]
+		case = data.shape[1]
+		p=int((case)/2)
+		l = (deep%self.k)
+		#print data
+		data=np.vstack((data,y))
+		data=np.array(sorted(data.transpose(),key=itemgetter(l))).transpose()
+		#print data
+		y=data[lenght,:]
+		data=data[:lenght,:]
+
+		v=data[l,p]
+		rP=KDNode(data[:,p],y[p],l)
+		#print data[:,p],y[p],l
+		if case>1:
+			ldata=data[:,data[l,:]<v]
+			ly=y[data[l,:]<v]
+			data[l,p]=v-1
+			rdata=data[:,data[l,:]>=v]
+			ry=y[data[l,:]>=v]
+			data[l,p]=v
+			rP.Lc=self.maketree(ldata,ly,deep+1)
+			if rP.Lc!=None:
+				rP.Lc.F=rP
+			rP.Rc=self.maketree(rdata,ry,deep+1)
+			if rP.Rc!=None:
+				rP.Rc.F=rP
+		return rP
+
+	def search_knn(self,P,x,k,maxiter=200):
+		def pf_compare(a,b):
+			return self.dist(x,a.x)<self.dist(x,b.x)
+		def ans_compare(a,b):
+			return self.dist(x,a.x)>self.dist(x,b.x)
+		pf_seq=Heap(compare=pf_compare)
+		pf_seq.insert(P)    #prior sequence
+		ans=Heap(k,compare=ans_compare)  #ans sequence
+		while pf_seq.counter>0:
+			t=pf_seq.heap[1]
+			pf_seq.delete(1)
+			flag=True
+			if ans.counter==k:
+				now=t.F
+				#print ans.heap[1].x,'========'
+				if now != None:
+					q=x.copy()
+					q[now.l]=now.x[now.l]
+					length=self.dist(q,x)
+					if length>self.dist(ans.heap[1].x,x):
+						flag=False
+					else:
+						flag=True
+				else:
+					flag=True
+			if flag:
+				tp,pf_seq,ans=self.to_leaf(t,x,pf_seq,ans)
+			#print "============="
+			#ans.insert(tp)
+		return ans
+
+
+	def to_leaf(self,P,x,pf_seq,ans):
+		tp=P
+		if tp!=None:
+			ans.insert(tp)
+			if tp.x[tp.l]>x[tp.l]:
+				if tp.Rc!=None:
+					pf_seq.insert(tp.Rc)
+				if tp.Lc==None:
+					return tp,pf_seq,ans
+				else:
+					return self.to_leaf(tp.Lc,x,pf_seq,ans)
+			if tp.Lc!=None:
+				pf_seq.insert(tp.Lc)
+			if tp.Rc==None:
+					return tp,pf_seq,ans
+			else:
+					return self.to_leaf(tp.Rc,x,pf_seq,ans)
+
+
+
+
+
diff --git a/dml/KNN/kd.pyc b/dml/KNN/kd.pyc
diff --git a/dml/KNN/knn.py b/dml/KNN/knn.py
@@ -0,0 +1,66 @@
+#coding:utf-8 
+import numpy as np
+import scipy as sp
+from scipy.spatial.distance import cdist
+from scipy.spatial.distance import euclidean
+from dml.KNN.kd import KDTree
+
+#import pylab as py
+class KNNC:
+	"""docstring for KNNC"""
+	def __init__(self,X,K,labels=None,dist=euclidean):
+		'''
+			X is an N*M matrix where M is the number of cases
+			labels are the per-case labels used for prediction.
+			dist is the distance function used to measure similarity,
+
+			The distance function can be ‘braycurtis’, ‘canberra’, 
+			‘chebyshev’, ‘cityblock’, ‘correlation’, ‘cosine’, 
+			‘dice’, ‘euclidean’, ‘hamming’, ‘jaccard’, ‘kulsinski’, 
+			‘mahalanobis’, 
+
+		'''
+		self.X = np.array(X)
+		if labels==None:
+			np.zeros((1,self.X.shape[1]))
+		self.labels = np.array(labels)
+		self.K = K
+		self.dist = dist
+		self.KDTrees=KDTree(X,labels,self.dist)
+
+	def predict(self,x,k):
+		ans=self.KDTrees.search_knn(self.KDTrees.P,x,k)
+		dc={}
+		maxx=0
+		y=0
+		for i in range(ans.counter+1):
+			if i==0:
+				continue
+			dc.setdefault(ans.heap[i].y,0)
+			dc[ans.heap[i].y]+=1
+			if dc[ans.heap[i].y]>maxx:
+				maxx=dc[ans.heap[i].y]
+				y=ans.heap[i].y
+		return y
+	def for_point(self,test_x,k=None):
+		if k==None:
+			k=self.K
+		ans=self.KDTrees.search_knn(self.KDTrees.P,np.array(test_x),k)
+		result=[]
+		for i in range(ans.counter+1):
+			if i==0:
+				continue
+			result.append(ans.heap[i].x)
+		return result
+	def pred(self,test_x,k=None):
+		'''
+			test_x is an N*TM matrix holding TM test cases, one per column
+			you can override k here (it defaults to self.K)
+		'''
+		if k==None:
+			k=self.K
+		test_case=np.array(test_x)
+		y=[]
+		for i in range(test_case.shape[1]):
+			y.append(self.predict(test_case[:,i].transpose(),k))
+		return y
diff --git a/dml/KNN/knn.pyc b/dml/KNN/knn.pyc
diff --git a/dml/__init__.py b/dml/__init__.py
@@ -7,5 +7,5 @@
 import pylab as py
 
 
-__all__ = ['LR','NN','CLUSTER','ADAB','DT','tool'
+__all__ = ['LR','NN','CLUSTER','ADAB','DT','KNN','tool'
 ]
diff --git a/dml/__init__.pyc b/dml/__init__.pyc
diff --git a/dml/tool/__init__.py b/dml/tool/__init__.py
@@ -9,6 +9,7 @@
 from .sign import sign
 from .pca import pca,projectData,recoverData
 from .displayData import  displayData
+from .heap import Heap
 __all__ = ['sigmoid',
 'normalize',
 'disnormalize',
@@ -17,5 +18,6 @@
 'pca',
 'projectData',
 'recoverData',
-'displayData'
+'displayData',
+'Heap'
 ]
diff --git a/dml/tool/__init__.pyc b/dml/tool/__init__.pyc
diff --git a/dml/tool/heap.py b/dml/tool/heap.py
@@ -0,0 +1,84 @@
+from __future__ import division
+import numpy as np
+import scipy as sp
+def heap_judge(a,b):
+	return a>b
+
+class Heap:
+	def __init__(self,K=None,compare=heap_judge):
+		'''
+			'K' 		is the parameter that restricts the length of the Heap
+						!!! when K is set, a Min heap keeps the largest K elements
+						    while a Max heap keeps the smallest K elements
+			'compare' 	is the comparison function, which returns a bool when passed two values
+						the default gives a Max heap
+		'''
+		self.K=K
+		self.compare=compare
+		self.heap=['#']
+		self.counter=0
+	def insert(self,a):
+		#print self.heap
+		#if self.K!=None:
+		#	print a.x,'==='
+		if self.K==None:
+			self.heap.append(a)
+			self.counter+=1
+			self.up(self.counter)
+		else:
+			if self.counter<self.K:
+				self.heap.append(a)
+				self.counter+=1
+				self.up(self.counter)
+			else:
+				if (not self.compare(a,self.heap[1])):
+					self.heap[1]=a
+					self.down(1)
+		return
+	def up(self,index):
+		if (index==1):
+			return
+		'''
+		print index
+		for t in range(index+1):
+			if t==0:
+				continue
+			print self.heap[t].x
+		print 
+		'''
+		if self.compare(self.heap[index],self.heap[int(index/2)]):
+			#fit the condition
+			self.heap[index],self.heap[int(index/2)]=self.heap[int(index/2)],self.heap[index]
+			self.up(int(index/2))
+		return
+	def down(self,index):
+		if 2*index>self.counter:
+			return
+		tar_index=0
+		if 2*index<self.counter:
+			if self.compare(self.heap[index*2],self.heap[index*2+1]):
+				tar_index=index*2
+			else:
+				tar_index=index*2+1
+		else:
+			tar_index=index*2
+		if not self.compare(self.heap[index],self.heap[tar_index]):
+			self.heap[index],self.heap[tar_index]=self.heap[tar_index],self.heap[index]
+			self.down(tar_index)
+		return
+
+	def delete(self,index):
+		self.heap[index],self.heap[self.counter]=self.heap[self.counter],self.heap[index]
+		self.heap.pop()
+		self.counter-=1
+		self.down(index)
+		pass
+
+	def delete_ele(self,a):
+		try:
+			t=self.heap.index(a)
+		except ValueError:
+			t=None
+		if t!=None:
+			self.delete(t)
+		return t
diff --git a/dml/tool/heap.pyc b/dml/tool/heap.pyc
diff --git a/test/knn/knn_test.py b/test/knn/knn_test.py
@@ -0,0 +1,9 @@
+from __future__ import division
+import numpy as np
+import scipy as sp
+from dml.KNN.kd import KDTree
+from dml.KNN  import KNNC
+X=np.array([[2,5,9,4,8,7],[3,4,6,7,1,2]])
+y=np.array([2,5,9,4,8,7])
+knn=KNNC(X,1,y)
+print knn.for_point([[2],[2]])