####################### # BAGGING ####################### library(ipred) help(package="ipred") ?bagging # ================================== # Classification: Breast Cancer data # ================================== data(BreastCancer) # Test set error bagging (nbagg = 50): 3.7% (Breiman, 1998, Table 5) mod <- bagging(Class ~ Cl.thickness + Cell.size + Cell.shape + Marg.adhesion + Epith.c.size + Bare.nuclei + Bl.cromatin + Normal.nucleoli + Mitoses, data=BreastCancer, nbagg=50, coob=TRUE) print(mod) summary(mod) # show all the details #=============================================== # REGRESSION -- 1987 BASEBALL HITTER SALARY DATA #=============================================== baseball <- read.table("http://pegasus.cc.ucf.edu/~xsu/CLASS/STA4164/bb87.dat", header = F, col.names=c("id", "name", "bat86", "hit86", "hr86", "run86", "rb86", "wlk86", "yrs", "batcr", "hitcr", "hrcr", "runcr", "rbcr", "wlkcr", "leag86", "div86", "team86", "pos86", "puto86", "asst86", "err86", "salary", "leag87", "team87", "logsalary")) dim(baseball) # 263 26 fit <- bagging(logsalary~.-id - name - salary, data=baseball, nbagg=50, coob=TRUE); fit pred <- predict(fit, newdata=baseball) # COMPUTER R-SQUARED n <- nrow(baseball) sst <- (n-1)*var(baseball$logsalary) sse <- sum((baseball$logsalary-pred)^2); R2 <- (sst-sse)/sst cbind(sst, sse, ssr=sst-sse, R.squared=R2) # PLOT OF PREDICTED VS. OBSERVED plot(baseball$logsalary, pred, ylab="predicted", xlab="observed", main="Bagging Prediction for Baseball 1987") abline(a=0, b=1, col="red") ####################### # RANDOM FOREST ####################### library(randomForest) help(package="randomForest") # ================ # REGRESSION # ================ # SEARCH THE BEST mtry PARAMETER, NUMBER OF VARIABLES RANDOMLY SAMPLED AT EACH SPLIT bb.rf <- tuneRF(baseball[ , -c(1:2, 19, 23, 26)], baseball[,26], ntreeTry=50, stepFactor=2, improve=0.05, trace=TRUE, plot=TRUE, dobest=FALSE) # the best mtry is found to be 4 # RANDOM FOREST bb.rf <-randomForest(logsalary ~ bat86 + hit86 + hr86 + run86 + + rb86 + wlk86 + yrs + batcr + hitcr + hrcr + runcr + rbcr + + wlkcr + puto86 + asst86 + err86 + + div86 + team86 + leag86 + leag87 + team87, data=baseball, mtry=4, ntree=50, keep.forest=TRUE, importance=TRUE) print(bb.rf) # TWO FUNCTIONS: grow AND combine bb.rf2 <- grow(bb.rf, how.many=100) ?combine result <- combine(bb.rf, bb.rf2); print(result) # LOOK AT TREE SIZES AT ALL RUNS treesize(bb.rf, terminal=T) # GET THE TREE AT PARTICULAR RUN getTree(bb.rf, k=4, labelVar=TRUE) # LOOK AT VARIABLES ACTUALLY USED IN THE FORESTS varUsed(bb.rf, by.tree=FALSE, count=TRUE) varUsed(bb.rf, by.tree=FALSE, count=FALSE) varUsed(bb.rf, by.tree=TRUE, count=FALSE) varUsed(bb.rf, by.tree=TRUE, count=TRUE) # Plot the error rates or MSE of a randomForest object # INSPECT WHEN THE PERFORMANCE GETS STABLE ?plot.randomForest plot(bb.rf, main="MSE vs. # of boostrap Samples") # PLOT OF VARIABLE IMPORTANCE importance(bb.rf) varImpPlot(bb.rf, main="Variable Importance for Baseball 1987") # Partial Plot: Graphical Depiction of the Marginal Effec of a Variable partialPlot(bb.rf, pred.data=baseball, x.var=runcr, rug=TRUE) # ===================== ## Classification # ===================== data(iris) set.seed(71) iris.rf <- randomForest(Species ~ ., data=iris, importance=TRUE, proximity=TRUE) print(iris.rf) ## Look at variable importance: round(importance(iris.rf), 2) plot(iris.rf) varImpPlot(iris.rf) ?varImpPlot # OUTLIER MEASURES (FOR CLASSIFICATION ONLY) set.seed(1) iris.rf <- randomForest(iris[,-5], iris[,5], proximity=TRUE) plot(outlier(iris.rf), type="h", col=c("red", "green", "blue")[as.numeric(iris$Species)]) ## Do MDS on 1 - proximity: iris.mds <- cmdscale(1 - iris.rf$proximity, eig=TRUE) op <- par(pty="s") pairs(cbind(iris[,1:4], iris.mds$points), cex=0.6, gap=0, col=c("red", "green", "blue")[as.numeric(iris$Species)], main="Iris Data: Predictors and MDS of Proximity Based on RandomForest") par(op) print(iris.mds$GOF) # ------------------------------------------------------------------------------- # AN INTERESTING EMPIRICAL STUDY OF RELIEF AND RANDOMFOREST - VARIABLE IMPORTANCE # ------------------------------------------------------------------------------- rdat <- function(n=100, beta0=0, beta1=0, beta2=0){ x1 <- sample(x=c(0,1), size=n, replace=T) x2 <- sample(x=1:10, size=n, replace=T)/10 x3 <- sample(x=1:10, size=n, replace=T)/10 x4 <- sample(x=1:100, size=n, replace=T)/100 eta <- beta0 + beta1*x1 + beta2*x3 p <- exp(eta)/(1+exp(eta)) y <- rbinom(n, 1, prob=0.3)+1; y1 <- eta + rnorm(n) data.frame(x1, x2, x3, x4, y, y1) } nrun <- 200 out.RF <- out.RELIEF <- NULL for (i in 1:nrun){ print(paste("===============", i, "================", sep="")) dat <- rdat(n=1000) #========= # RELIEF # ======== rel <- as.data.frame(relief(data=dat[, -6], nosample=20, threshold=-100.0, vnom=1:2)) # rel <- as.data.frame(RELIEF.CON(data=dat, 40, -100.0)) out.RELIEF <- rbind(out.RELIEF, rel$weight) #=============== # RANDOM FORESTS #=============== fit.rf <-randomForest(factor(dat$y) ~ factor(dat$x1) + factor(dat$x2) + dat$x3 + dat$x4, mtry=2, ntree=2000, keep.forest=F, importance=TRUE) imp <- importance(fit.rf); # print(imp) # varImpPlot(fit.rf) out.RF <- rbind(out.RF, imp[,4]) } # VI.relief <- apply(out.RELIEF, 2, FUN=mean); VI.relief # VI.rf <- apply(out.RF, 2, FUN=mean); VI.rf out.RELIEF; out.RF VI.1000 <- data.frame(input=rep(1:4, rep(nrun, 4)), relief=as.vector(out.RELIEF), RF= as.vector(out.RF)) save(VI.100, VI.1000, file="sim-results-VI.Rdata") load(file="sim-results-VI.Rdata") postscript(file="fig5-5.eps", horizontal=F, width = 100.0, height = 95.0) par(bg="white", mfrow=c(2, 2), mar=c(4, 4, 4, 4)) boxplot(relief~input, data=VI.100, range = 1.5, width = rep(2,4), varwidth = T, notch = T, outline = F, names=paste("X", 1:4, sep=""), plot = TRUE, col = "orange", pars = list(boxwex = 0.2, staplewex = 0.5, outwex = 0.5), horizontal = FALSE, add = FALSE, at = NULL, main="(a) RELIEF n=100") boxplot(RF~input, data=VI.100, range = 1.5, width = rep(2,4), varwidth = T, notch = T, outline = TRUE, names=paste("X", 1:4, sep=""), plot = TRUE, col = "orange", pars = list(boxwex = 0.2, staplewex = 0.5, outwex = 0.5), horizontal = FALSE, add = FALSE, at = NULL, main="(b) RF n=100") boxplot(relief~input, data=VI.1000, range = 1.5, width = rep(2,4), varwidth = T, notch = T, outline = F, names=paste("X", 1:4, sep=""), plot = TRUE, col = "orange", pars = list(boxwex = 0.2, staplewex = 0.5, outwex = 0.5), horizontal = FALSE, add = FALSE, at = NULL, main="(a) RELIEF n=1000") boxplot(RF~input, data=VI.1000, range = 1.5, width = rep(2,4), varwidth = T, notch = T, outline = TRUE, names=paste("X", 1:4, sep=""), plot = TRUE, col = "orange", pars = list(boxwex = 0.2, staplewex = 0.5, outwex = 0.5), horizontal = FALSE, add = FALSE, at = NULL, main="(b) RF n=1000") dev.off() # ------------------------- # The `unsupervised' case: # ------------------------- set.seed(17) iris.urf <- randomForest(iris[, -5]) # Multi-dimensional Scaling Plot MDSplot(iris.urf, iris$Species) #