# ############################# # CLUSTER ANALYSIS # ############################# # SET THE WORKING DIRECTORY setwd("C:/Courses/STA5703/2009-Fall/R/") # WE CONSIDER THE NCI GENE EXPRESSION DATA GIVEN IN HASTIE ET AL. (2009) data(swiss) dim(swiss) # ============================= # PLOT THE HEATMAP FOR THE DATA # ============================= ?heatmap require(graphics) # Since we would like to explore observation association, Then R <- cor(t(swiss)) heatmap(R, col = rainbow(100, start=0, end=1/6), scale="column", Rowv=NA, Colv = NA, margin=c(6,6), labCol=rownames(swiss), labRow=rownames(swiss), xlab = "county", ylab= "county", main = "Heatmap of the Correlation Matrix") heatmap(R, col = rainbow(100, start=0, end=1/6), scale="column", Rowv=NULL, # THE ONLY DIFFERENCE WITH THE ABOVE HEATMAP Colv = NA, margin=c(6,6), labCol=rownames(swiss), labRow=rownames(swiss), xlab = "county", ylab= "county", main = "Heatmap of the Correlation Matrix") # =================================== # PREPARATION FOR CLUSTER ANALYSIS # =================================== # YOU MUST EITHER REMOVE OR IMPUTE MISSING VALUES FIRST swiss <- na.omit(swiss) # OPTION I: Rescale (standardize) variables for comparability. mydata <- scale(swiss, center = TRUE, scale = TRUE) cov(mydata) # OPTION II: Tranform data into Range[0,1] # mydata <- apply(swiss, 2, FUN=function(x){(x-min(x))/(max(x)-min(x))}) # ============== # GAP STATISTICS # ============== # The GAP Statistic of Tishirani et al. (2000) # To install R pckage SAGx source("http://bioconductor.org/biocLite.R") biocLite("SAGx") library(SAGx) ?gap library("MASS") data(swiss) n.clust <- 2:4 gap.stat <- gap.se <- rep(0, length(n.clust)) for (i in 1:length(n.clust)) { k <- n.clust[i] cl <- myclus(data = swiss, k = k) gap <- gap(swiss,cl$cluster, B=50) gap.stat[i] <- gap[1] gap.se[i] <- gap[2] } cbind(n.clust, gap.stat, gap.se) # A PLOT of THE GAP STAT par(mfrow=c(1,1), mar=rep(6, 4)) plot(x=c(min(n.clust), max(n.clust)), y=c(min(gap.stat-gap.se), max(gap.stat+gap.se)), type="n", xlab="num of cluster", ylab="gap") points(n.clust, gap.stat, type="p", xlab="num of cluster", ylab="gap", pch=19) lines(n.clust, gap.stat, lty=2, col="blue", lwd=2) for(i in 1:length(n.clust)){ arrows(x0=n.clust[i], y0=gap.stat[i], x1=n.clust[i], y1=gap.stat[i]-gap.se[i], lwd=3, col="brown") arrows(x0=n.clust[i], y0=gap.stat[i], x1=n.clust[i], y1=gap.stat[i]+gap.se[i], lwd=3, col="brown") } # EXPERIMENT WITH ANOTHER LIBRARY library(popgen) ## NB. The value of gap.n is set unrealistically low in this example so ## that the examples run efficiently at compile time. We suggest ## gap.n be set to 100 (the default) for reliable results. X <- simMD(100, 3, 100, p = NULL, c.vec1 = c(0.1, 0.2, 0.3), c.vec2 = 1, ac = 2, beta = 1) res <- nps(X - 1, dmetric = "manhattan", method = "mcquitty", gap.n = 2) nps.plot(res, k.max = 2) # -------------------- # K-MEANS PARTITIONING # -------------------- # SCREE PLOT for Determining number of clusters # --------------------------------------------- wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var)) for (i in 2:15) wss[i] <- sum(kmeans(mydata, centers=i)$withinss) plot(1:15, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares") # K-Means Cluster Analysis fit <- kmeans(mydata, 3) # 3 cluster solution # get cluster means aggregate(mydata,by=list(fit$cluster),FUN=mean) # append cluster assignment to each observation dat1 <- data.frame(mydata, fit$cluster) # NOTE: A robust version of K-means based on mediods can be invoked by # using pam( ) instead of kmeans( ). The function pamk( ) in the fpc # package is a wrapper for pam that also prints the suggested number # of clusters based on optimum average silhouette width. # --------------------------- # HIERARCHICAL AGGLOMERATIVE # -------------------------- # Ward Hierarchical Clustering d <- dist(mydata, method = "euclidean") # distance matrix fit <- hclust(d, method="ward") # THIS WILL TAKE LONG, ABOUT 10-15 MINUTES plot(fit) # display dendogram groups <- cutree(fit, k=5) # cut tree into 5 clusters # draw dendogram with red borders around the 5 clusters rect.hclust(fit, k=5, border="red") # The pvclust( ) function in the pvclust package provides p-values for # hierarchical clustering based on multiscale bootstrap resampling. # Clusters that are highly supported by the data will have large p values. # Interpretation details are provided Suzuki. Be aware that pvclust clusters # columns, not rows. Transpose your data before using. # Ward Hierarchical Clustering with Bootstrapped p values library(pvclust) fit <- pvclust(mydata, method.hclust="ward", method.dist="euclidean") plot(fit) # dendogram with p values # add rectangles around groups highly supported by the data pvrect(fit, alpha=.95) # --------------------------------------- # PLOTTING CLUSTERING SOLUTIONS WITH MDS # --------------------------------------- # K-Means Clustering with 3 clusters fit <- kmeans(mydata, 3) # Cluster Plot against 1st 2 principal components # vary parameters for most readable graph library(cluster) clusplot(mydata, fit$cluster, color=TRUE, shade=TRUE, labels=2, lines=0) # ----------------------- # MODEL BASED CLUSTERING # ----------------------- # NOTE: Model based approaches assume a variety of data models and apply # maximum likelihood estimation and Bayes criteria to identify the most # likely model and number of clusters. Specifically, the Mclust( ) function # in the mclust package selects the optimal model according to BIC for EM # initialized by hierarchical clustering for parameterized Gaussian mixture # models. (phew!). One chooses the model and number of clusters with the # largest BIC. See help(mclustModelNames) to details on the model chosen # as best. library(mclust) fit <- Mclust(mydata) par(mfrow=c(1,1), mar=rep(5,4)) plot(fit, mydata) # plot results print(fit) # display the best model names(fit) fit$G #