|
| 1 | +############################################################# |
| 2 | +#2016 SISBID Module 5 - Unsupervised Learning |
| 3 | +#Genevera I. Allen & Yufeng Liu |
| 4 | +#Clustering Lab |
| 5 | +############################################################ |
| 6 | + |
| 7 | +############ |
| 8 | +#Data set - Author Data. This data set consists of word counts from chapters written by four British authors. |
| 9 | +#This lab will put together concepts from both dimension reduction and clustering. |
| 10 | +#There are ultimately 3 goals to this lab: |
| 11 | +#1) Correctly cluster author texts in an unsupervised manner. |
| 12 | +#2) Determine which words are responsible for correctly separating the author texts. |
| 13 | +#3) Visualize the author texts, words and the results of your analysis |
| 14 | +############# |
| 15 | + |
| 16 | + |
| 17 | +############# |
| 18 | +#Problem 1 - Visualization |
| 19 | +############# |
| 20 | +#Problem 1a - We wish to plot the author texts as well as the words via a 2D scatterplot. Which method would be best to use? Why? |
| 21 | + |
| 22 | +#Problem 1b - Apply PCA to visualize the author texts. Explain the results. |
| 23 | + |
| 24 | +#Problem 1c - Apply MDS to visualize the author texts. Interpret the results. |
| 25 | + |
| 26 | +#Problem 1d - Can you use MDS to help determine which distance is appropriate for this data? Which one is best and why? |
| 27 | + |
| 28 | +#Problem 1e - Apply MDS with your chosen distance to visualize the words. Interpret the results. |
| 29 | + |
| 30 | +########## |
| 31 | +#Problem 2 - K-means |
| 32 | +########## |
| 33 | +#Problem 2a - Apply K-means with K=4 to this data. |
| 34 | + |
#Problem 2b - How well does K-means do at separating the authors?
| 36 | + |
#Problem 2c - Is K-means an appropriate clustering algorithm for this data? Why or why not?
| 38 | + |
| 39 | +############# |
| 40 | +#Problem 3 - Hierarchical Clustering |
| 41 | +############# |
| 42 | +#Problem 3a - Apply hierarchical clustering to this data set. |
| 43 | + |
| 44 | +#Problem 3b - Which distance is best to use? Why? |
| 45 | + |
| 46 | +#Problem 3c - Which linkage is best to use? Why? |
| 47 | + |
| 48 | +#Problem 3d - Do any linkages perform particularly poorly? Explain this result. |
| 49 | + |
| 50 | +#Problem 3e - Visualize your hierarchical clustering results. |
| 51 | + |
| 52 | +########### |
| 53 | +#Problem 4 - Biclustering |
| 54 | +########### |
| 55 | + |
| 56 | +#Problem 4a - Apply the cluster heatmap method to visualize this data. Which distance and linkage functions did you use? |
| 57 | + |
| 58 | +#Problem 4b - Interpret the cluster heatmap. Which words are important for distinguishing author texts? |
| 59 | + |
| 60 | + |
| 61 | +########### |
| 62 | +#Problem 5 - NMF |
| 63 | +########### |
| 64 | + |
| 65 | +#Problem 5a - Apply NMF with K = 4 and use W to assign cluster labels to each observation. |
| 66 | + |
| 67 | +#Problem 5b - How well does NMF perform? Interpret and explain this result. |
| 68 | + |
| 69 | +#Problem 5c - Can you use the NMF to determine which words are important for distinguishing author texts? How? What did you find? |
| 70 | + |
| 71 | +############# |
| 72 | +#Problem 6 - Wrap-up |
| 73 | +############ |
| 74 | +#Problem 6a - Overall, which method is the best at clustering the author texts? Why is this the case? |
| 75 | + |
| 76 | +#Problem 6b - Which words are key for distinguishing the author texts? How did you determine these? |
| 77 | + |
| 78 | +#Problem 6c - Overall, which is the best method for providing a visual summary of the data? |
| 79 | + |
| 80 | +####################### |
| 81 | + |
| 82 | + |
| 83 | +############################################################### |
| 84 | +############################################################### |
| 85 | +#R scripts to help out with the Clustering Lab |
| 86 | +#Don't peek at this if you want to practice coding on your own!! |
| 87 | +################################################################## |
| 88 | + |
| 89 | + |
#######################
# Author data

# Load the lab data (provides the `author` word-count matrix:
# chapters as rows, word counts as columns, author names as row names).
load("UnsupL_SISBID_2016.RData")

# Get a feel for the data: dimensions, the word columns, and the
# set of author labels stored in the row names.
dim(author)
colnames(author)
unique(rownames(author))
TrueAuth <- as.factor(rownames(author))

# Word-count distributions for a few common and one rare word.
# The counts are heavily right-skewed -- worth remembering when
# choosing a distance measure later.
par(mfrow = c(2, 2))
hist(author[, colnames(author) == "the"], breaks = 25)
hist(author[, colnames(author) == "a"], breaks = 25)
hist(author[, colnames(author) == "and"], breaks = 25)
hist(author[, colnames(author) == "things"], breaks = 25)

# Drop the BookID column; keep only the first 69 (word-count) columns.
X <- author[, 1:69]
| 109 | + |
| 110 | + |
#############
# Visualizing the data: how to display texts and words in 2 dimensions?

# "PCA" via the SVD of the raw matrix.
# NOTE(review): X is not centered or scaled here, so this is not a true
# PCA -- presumably that is why the plot separates authors poorly,
# which appears to be the point of the exercise. Kept as-is.
sv <- svd(X)
V <- sv$v
Z <- X %*% V
plot(Z[, 1], Z[, 2], type = "n")
text(Z[, 1], Z[, 2], rownames(X), col = as.numeric(TrueAuth), cex = .5)
# Why doesn't this work well?

########
# Classical (metric) MDS.
# Can you use MDS to decide which distance is best for this data?

# Visualizing the author texts (rows of X).
Dmat <- dist(X, method = "canberra")
mdsres <- cmdscale(Dmat, k = 2)
plot(mdsres[, 1], mdsres[, 2], type = "n")
text(mdsres[, 1], mdsres[, 2], rownames(X), col = as.numeric(TrueAuth), cex = .5)

# Visualizing the words: transpose X so words become the rows.
Dmat <- dist(t(X), method = "canberra")
mdsresW <- cmdscale(Dmat, k = 2)
plot(mdsresW[, 1], mdsresW[, 2], type = "n")
text(mdsresW[, 1], mdsresW[, 2], colnames(X))
| 137 | + |
##############
# K-means clustering
# kmeans() starts from random centers, so fix the RNG seed for
# reproducibility, and use several restarts (nstart) to avoid the
# poor local optima a single random start can land in.
set.seed(1)
K <- 4
km <- kmeans(X, centers = K, nstart = 20)
table(km$cluster, TrueAuth)  # cross-tabulate clusters vs. true authors

# Show the K-means labels on the MDS map of the texts.
plot(mdsres[, 1], mdsres[, 2], type = "n")
text(mdsres[, 1], mdsres[, 2], rownames(X), col = km$cluster, cex = .5)
| 146 | + |
###############
# Hierarchical clustering
# Which distance is appropriate here? Why?

# Canberra distance between texts; compute it once and reuse it for
# both linkages (the original recomputed the identical matrix twice).
Dmat <- dist(X, method = "canberra")

# Complete linkage, cut into 4 clusters.
com.hc <- hclust(Dmat, method = "complete")
res.com <- cutree(com.hc, 4)
table(res.com, TrueAuth)

plot(com.hc, cex = .5)

# Which linkage is best? Why?

# Ward linkage on the same distance matrix.
com.hc <- hclust(Dmat, method = "ward.D")
res.com <- cutree(com.hc, 4)
table(res.com, TrueAuth)

plot(com.hc, cex = .5)

# Do any linkages perform terribly? Why?

# Visualize the (Ward) hierarchical clustering results using MDS.
table(res.com, TrueAuth)
plot(mdsres[, 1], mdsres[, 2], type = "n")
text(mdsres[, 1], mdsres[, 2], rownames(X), col = res.com, cex = .5)
| 173 | + |
| 174 | + |
#############
# Cluster heatmap: Canberra distance with Ward linkage on rows and columns.

canberra_dist <- function(m) dist(m, method = "canberra")
ward_linkage <- function(d) hclust(d, method = "ward.D")

# Raw word counts.
heatmap(X, distfun = canberra_dist, hclustfun = ward_linkage)

# Column-standardized counts, so every word is on a comparable scale.
heatmap(scale(X), distfun = canberra_dist, hclustfun = ward_linkage)
| 181 | + |
#############
# NMF (non-negative matrix factorization)
library(NMF)

# Rank-4 factorization X ~ W %*% H.
# NOTE: nmf() uses a random initialization by default, so results vary
# from run to run; pass a numeric seed for reproducibility.
K <- 4
nmffit <- nmf(X, rank = K, seed = 123)
W <- basis(nmffit)  # texts x factors
H <- coef(nmffit)   # factors x words

# Assign each text to the factor carrying its largest weight in W.
cmap <- apply(W, 1, which.max)
table(cmap, TrueAuth)

# Heatmaps of the two factor matrices.
par(mfrow = c(1, 2))
basismap(nmffit, annRow = rownames(X), scale = "col", legend = FALSE)
coefmap(nmffit, annCol = colnames(X), scale = "col", legend = FALSE)

par(mfrow = c(1, 1))
coefmap(nmffit, annCol = colnames(X), scale = "col", legend = FALSE)

# Which words are most important for distinguishing authors?
| 201 | + |
| 202 | +######################################################## |
# End of clustering lab script.