Skip to content

Commit 9fb7fe1

Browse files
authored
Add files via upload
1 parent 938c333 commit 9fb7fe1

File tree

1 file changed

+202
-0
lines changed

1 file changed

+202
-0
lines changed

2016_SISBID_Clustering_Lab.R

+202
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
#############################################################
2+
#2016 SISBID Module 5 - Unsupervised Learning
3+
#Genevera I. Allen & Yufeng Liu
4+
#Clustering Lab
5+
############################################################
6+
7+
############
8+
#Data set - Author Data. This data set consists of word counts from chapters written by four British authors.
9+
#This lab will put together concepts from both dimension reduction and clustering.
10+
#There are ultimately 3 goals to this lab:
11+
#1) Correctly cluster author texts in an unsupervised manner.
12+
#2) Determine which words are responsible for correctly separating the author texts.
13+
#3) Visualize the author texts, words and the results of your analysis
14+
#############
15+
16+
17+
#############
18+
#Problem 1 - Visualization
19+
#############
20+
#Problem 1a - We wish to plot the author texts as well as the words via a 2D scatterplot. Which method would be best to use? Why?
21+
22+
#Problem 1b - Apply PCA to visualize the author texts. Explain the results.
23+
24+
#Problem 1c - Apply MDS to visualize the author texts. Interpret the results.
25+
26+
#Problem 1d - Can you use MDS to help determine which distance is appropriate for this data? Which one is best and why?
27+
28+
#Problem 1e - Apply MDS with your chosen distance to visualize the words. Interpret the results.
29+
30+
##########
31+
#Problem 2 - K-means
32+
##########
33+
#Problem 2a - Apply K-means with K=4 to this data.
34+
35+
#Problem 2b - How well does K-means do at separating the authors?
36+
37+
#Problem 2c - Is K-means an appropriate clustering algorithm for this data? Why or why not?
38+
39+
#############
40+
#Problem 3 - Hierarchical Clustering
41+
#############
42+
#Problem 3a - Apply hierarchical clustering to this data set.
43+
44+
#Problem 3b - Which distance is best to use? Why?
45+
46+
#Problem 3c - Which linkage is best to use? Why?
47+
48+
#Problem 3d - Do any linkages perform particularly poorly? Explain this result.
49+
50+
#Problem 3e - Visualize your hierarchical clustering results.
51+
52+
###########
53+
#Problem 4 - Biclustering
54+
###########
55+
56+
#Problem 4a - Apply the cluster heatmap method to visualize this data. Which distance and linkage functions did you use?
57+
58+
#Problem 4b - Interpret the cluster heatmap. Which words are important for distinguishing author texts?
59+
60+
61+
###########
62+
#Problem 5 - NMF
63+
###########
64+
65+
#Problem 5a - Apply NMF with K = 4 and use W to assign cluster labels to each observation.
66+
67+
#Problem 5b - How well does NMF perform? Interpret and explain this result.
68+
69+
#Problem 5c - Can you use the NMF to determine which words are important for distinguishing author texts? How? What did you find?
70+
71+
#############
72+
#Problem 6 - Wrap-up
73+
############
74+
#Problem 6a - Overall, which method is the best at clustering the author texts? Why is this the case?
75+
76+
#Problem 6b - Which words are key for distinguishing the author texts? How did you determine these?
77+
78+
#Problem 6c - Overall, which is the best method for providing a visual summary of the data?
79+
80+
#######################
81+
82+
83+
###############################################################
84+
###############################################################
85+
#R scripts to help out with the Clustering Lab
86+
#Don't peek at this if you want to practice coding on your own!!
87+
##################################################################
88+
89+
90+
#######################
91+
#author data

#load the lab workspace; the rest of the script expects it to provide `author`
load("UnsupL_SISBID_2016.RData")

#understand the data a bit
dim(author)
colnames(author)
unique(rownames(author))
#the row names are used as the reference author labels when judging clusterings
TrueAuth <- as.factor(rownames(author))

#histograms of the counts of a few individual words on a 2x2 grid
#par() returns the settings it replaces, so keep them and put them back after
#the histograms; otherwise every later plot is squeezed into a quarter panel
oldpar <- par(mfrow = c(2, 2))
hist(author[, colnames(author) == "the"], breaks = 25)
hist(author[, colnames(author) == "a"], breaks = 25)
hist(author[, colnames(author) == "and"], breaks = 25)
hist(author[, colnames(author) == "things"], breaks = 25)
par(oldpar)

#take out bookID
X <- author[, 1:69]
109+
110+
111+
#############
112+
#Visualizing data - how can we visualize the texts and the words in 2 dimensions?
113+
114+
#trying PCA
#SVD is taken of X as-is (no centering or scaling is applied beforehand);
#projecting X onto the right singular vectors gives the coordinates in Z
sv <- svd(X)
V <- sv$v
Z <- X %*% V
#set up empty axes, then print each text's row name colored by true author
plot(Z[, 1], Z[, 2], type = "n")
text(Z[, 1], Z[, 2],
     labels = rownames(X),
     col = as.numeric(TrueAuth),
     cex = 0.5)
#why doesn't this work well?
121+
122+
########
123+
#trying MDS (classical)
#can you use MDS to decide which distance is best to understand this data?

#visualizing author texts
#Canberra distances between the rows of X, embedded in 2 dimensions
Dmat <- dist(X, method = "canberra")
mdsres <- cmdscale(Dmat, k = 2)
#empty axes first, then the row names as plotting symbols, colored by author
plot(mdsres[, 1], mdsres[, 2], type = "n")
text(mdsres[, 1], mdsres[, 2],
     labels = rownames(X),
     col = as.numeric(TrueAuth),
     cex = 0.5)
131+
132+
#visualizing words
#transpose X so that distances are taken between columns (words), not rows
Dmat <- dist(t(X), method = "canberra")
mdsresW <- cmdscale(Dmat, k = 2)
#each word is drawn as its own column name at its 2D MDS position
plot(mdsresW[, 1], mdsresW[, 2], type = "n")
text(mdsresW[, 1], mdsresW[, 2], labels = colnames(X))
137+
138+
##############
139+
#K-means
K <- 4
#kmeans() picks its starting centers at random, so fix the RNG seed to make
#the cluster labels, the table and the plot below reproducible across runs
set.seed(2016)
#nstart > 1 runs several random starts and keeps the best one (smallest total
#within-cluster sum of squares) rather than trusting a single random start
km <- kmeans(X, centers = K, nstart = 25)
#cross-tabulate the cluster assignments against the true authors
table(km$cluster, TrueAuth)

#reuse the 2D MDS coordinates in mdsres, now coloring texts by K-means cluster
plot(mdsres[, 1], mdsres[, 2], type = "n")
text(mdsres[, 1], mdsres[, 2],
     labels = rownames(X),
     col = km$cluster,
     cex = 0.5)
146+
147+
###############
148+
#hierarchical clustering
#which distance is appropriate? Why?

#complete-linkage tree built on Canberra distances between texts
Dmat <- dist(X, method = "canberra")
com.hc <- hclust(Dmat, method = "complete")
#cut the tree into 4 groups and compare them with the true authors
res.com <- cutree(com.hc, 4)
table(res.com, TrueAuth)

#dendrogram with shrunken leaf labels
plot(com.hc, cex = 0.5)
157+
158+
#which linkage is best? Why?

#same Canberra distances, this time with "ward.D" linkage
#note: com.hc and res.com are overwritten here and now hold the Ward results
Dmat <- dist(X, method = "canberra")
com.hc <- hclust(Dmat, method = "ward.D")
#cut the tree into 4 groups and compare them with the true authors
res.com <- cutree(com.hc, 4)
table(res.com, TrueAuth)

#dendrogram with shrunken leaf labels
plot(com.hc, cex = 0.5)
166+
167+
#do any perform terribly? Why?

#visualize hierarchical clustering results using MDS
table(res.com, TrueAuth)
#texts at their 2D MDS positions (mdsres), colored by the cutree() labels
plot(mdsres[, 1], mdsres[, 2], type = "n")
text(mdsres[, 1], mdsres[, 2],
     labels = rownames(X),
     col = res.com,
     cex = 0.5)
173+
174+
175+
#############
176+
#cluster heatmap

#distance and clustering functions shared by both heatmaps below:
#Canberra distance and "ward.D" linkage
canberra_dist <- function(x) dist(x, method = "canberra")
ward_hclust <- function(x) hclust(x, method = "ward.D")

#heatmap of X as-is
heatmap(X, distfun = canberra_dist, hclustfun = ward_hclust)

#heatmap of X after passing it through scale()
heatmap(scale(X), distfun = canberra_dist, hclustfun = ward_hclust)
181+
182+
#############
183+
#NMF
library("NMF")
K <- 4
#nmf() starts from a random initialization by default; a fixed numeric seed
#makes the factorization, the cluster table and the heatmaps reproducible
nmffit <- nmf(X, rank = K, seed = 123456)
#extract the two factor matrices from the fit
W <- basis(nmffit)
H <- coef(nmffit)

#label each row (text) by the column of W in which it has its largest value
cmap <- apply(W, 1, which.max)
table(cmap, TrueAuth)

#basis and coefficient heatmaps side by side
par(mfrow = c(1, 2))
basismap(nmffit, annRow = rownames(X), scale = "col", legend = FALSE)
coefmap(nmffit, annCol = colnames(X), scale = "col", legend = FALSE)

#back to a single panel and redraw the coefficient map on its own
par(mfrow = c(1, 1))
coefmap(nmffit, annCol = colnames(X), scale = "col", legend = FALSE)
199+
200+
#which words are most important for distinguishing authors?
201+
202+
########################################################

0 commit comments

Comments
 (0)