library(ada) # SET UP THE TOPTIONS FOR RPART # --------------------------------- ##default trees default=rpart.control() ##stumps stump <- rpart.control(cp=-1,maxdepth=1,minsplit=0) ##4-split trees for real and discrete boosting four <- rpart.control(cp=-1,maxdepth=2,minsplit=0) # ################################ # EXAMPLE 1: GENERATED DATA # ################################ n <- 500; p <- 10 f <- function(x,a,b,d) return( a*(x-b)^2+d ) x1 <- runif(n/2,0,4) y1 <- f(x1,-1,2,1.7)+runif(n/2,-1,1) x2 <- runif(n/2,2,6) y2 <- f(x2,1,4,-1.7)+runif(n/2,-1,1) y <- c(rep(1,n/2),rep(2,n/2)) dat <- data.frame(y=y,x1=c(x1,x2),x2=c(y1,y2), matrix(rnorm(n*(p-2)),ncol=(p-2))) names(dat)<-c("y",paste("x",1:p,sep="")) dat plot(dat$x1,dat$x2,pch=c(1:2)[y], col=c(1,8)[y], xlab=names(dat)[2],ylab=names(dat)[3]) indtrain<-sample(1:n,300,replace=FALSE) train<-dat[indtrain,]; dim(train) test<-dat[setdiff(1:n,indtrain),]; dim(test) # =========================================== # DISCRETE ADABOOST under exponential loss # =========================================== default=rpart.control() gdis<-ada(y~.,data=train,iter=50,loss="e",type="discrete", control=default); gdis # To add the testing data set to the model, gdis<-addtest(x=gdis, test.x=test[,-1], test.y=test[,1]) names(gdis) plot(gdis,FALSE,TRUE) summary(gdis,n.iter=36) pred<-predict(gdis,train[,-1]) table(pred) # Variable Importance varplot(gdis) vip <- varplot(gdis,plot.it=FALSE,type="scores") round(vip,4) # Pairwise Plot pairs(gdis,train[,-1],maxvar=5) pairs(gdis,train[,-1],var=1:2, test.x=test[,-1], test.y=test[,1],test.only=TRUE) # ROC and C-Statistic # -------------------- # PREDICTION pred<-predict(gdis, newdata=test[,-1],type="prob"); pred yhat <- pred[,2] names(pred) # ROC Curve and AUC library(verification) y <- test[,1]-1 a.ROC <- roc.area(obs=y, pred=yhat)$A; a.ROC mod.glm <- verify(obs=y, pred=yhat, bins = T) roc.plot(mod.glm, plot.thres = .5) # lines(lowess(x=1-specificity, y=sensitivity, f = 1/10), lwd=2, lty=1, col="red") text(x=0.6, y=0.3, paste("Area under ROC = ", a.ROC, sep="")) # ========================================================= # Real AdaBoost ensemble with the \epsilon-boosting # modification, 4-split trees, and 1000 iterations. # ========================================================= greal<-ada(y~.,data=train,iter=1000,type="real",nu=0.001, bag.frac=1,model.coef=FALSE, control=rpart.control(maxdepth=2,cp=-1,minsplit=0)); greal pred<-predict(greal, train[,-1],type="prob"); pred vip <- varplot(greal,plot.it=FALSE,type="scores") round(vip,4) # =============================================================== # Gentle AdaBoost ensemble with 100 iterations, tree depth of 8, # v = 0.1 (regularization), and the ensemble is shifted towards # bagging using the bag.shift=TRUE argument # =============================================================== ggen<-ada(y~.,data=train,test.x=test[,-1],test.y=test[,1],iter=100, type="gentle",nu=0.1,bag.shift=TRUE, control=rpart.control(cp=-1,maxdepth=8)); ggen # ============================================================== # L2-Boost (boosting with logistic loss) with gentle boost # ============================================================== glog<-ada(y~.,data=train,test.x=test[,-1],test.y=test[,1],iter=50, loss="l",type="gentle"); glog glog <- update(glog,train[,-1],train[,1],test[,-1],test[,1],n.iter=50) # ------------------------- # Stageweight convergence # ------------------------- greal<-ada(y~.,data=train,test.x=test[,-1],test.y=test[,1],iter=50, type="real",verbose=TRUE) ################ # EXAMPLE 2 ################ n <- 12000; p <- 10 set.seed(100) x <- matrix(rnorm(n*p),ncol=p) y <- as.factor(c(-1,1)[as.numeric(apply(x^2,1,sum)>9.34)+1]) indtrain <- sample(1:n,6000,FALSE) train <- data.frame(y=y[indtrain],x[indtrain,]) indtest <- setdiff(1:n,indtrain) test <- data.frame(y=y[indtest],x[indtest,]) control <- rpart.control(cp=-1, minsplit=0, xval=0, maxdepth=1) gdis <- ada(y~., data=train, iter=400, bag.frac=1, nu=1, control=control, test.x=test[,-1],test.y=test[,1]) gdis plot(gdis,TRUE,TRUE) summary(gdis) summary(gdis,n.iter=1) summary(gdis,n.iter=398) ?summary.ada #################### # Solubility data #################### data(soldat) n <- nrow(soldat) set.seed(100) ind <- sample(1:n) trainval <- ceiling(n*.5) testval <- ceiling(n*.3) train <- soldat[ind[1:trainval],] test <- soldat[ind[(trainval+1):(trainval+testval)],] valid <- soldat[ind[(trainval+testval+1):n],] control <- rpart.control(cp=-1, maxdepth=14, maxcompete=1,xval=0) gen1 <- ada(y~.,data=train,test.x=test[,-73],test.y=test[,73], type="gentle",control=control,iter=70) gen1 <- addtest(x=gen1, test.x=valid[,-73], test.y=valid[,73]) summary(gen1) plot(gen1,kappa=TRUE, test=TRUE) varplot(gen1) # MAKE 20 RUNS AND SUMMARIZE VARIABLE IMPORTANCE INFORMATION vars<-rep(0,72) t1<-proc.time() for(i in 1:20){ rm(gen1) gen1 <- ada(y~.,data=train,test.x=test[,-73],test.y=test[,73], type="gentle",control=control,iter=70) vec1<-varplot(gen1,plot.it=FALSE,type="scores",max.var.show=72) vars<-vars+ as.numeric(vec1[order(names(vec1))])/20 cat("i=",i," time=",(proc.time()-t1)/60,"\n") } a1<-sort(names(vec1)) a2<-order(vars,decreasing=TRUE) dotchart(vars[a2][30:1],a1[a2][30:1],main="Average Variable Imp.") ############################ # AN EXAMPLE ON MULTI-CLASS ############################# n<-1200; p<-10; K<-3 set.seed(100) x<-matrix(rnorm(n*p),ncol=p) indtrain <- sample(1:n,200,FALSE) indtest <- setdiff(1:n,indtrain) val <- qchisq(c(.33,.66),10) su <- apply(x^2,1,sum) Iy <- cbind(as.numeric(su<=val[1]), as.numeric(val[1]val[2])) y <- apply(Iy,1,which.max) test <- data.frame(y=y[indtest],x[indtest,]) Fs<-list() for(i in 1:K) Fs[[i]]<-ada(y~.,data=data.frame(y=Iy[indtrain,i],x[indtrain,]), iter=250, test.x=test, test.y=Iy[indtest,i])$model$F[[2]] preds<- sapply(1:1000,function(i)which.max(c(Fs[[1]][i],Fs[[2]][i],Fs[[3]][i]))) tab<-table(y[indtest],preds) for(i in 1:K){ cat("In class error rate for class ",i,": ", round(1-tab[i,i]/sum(tab[i,]),3),"\n") } 1-sum(diag(tab))/length(indtest) # COMAPRED TO RANDOM FOREST RESULTS library(randomForest) train<-data.frame(y=as.factor(y[indtrain]),x[indtrain,]) set.seed(100) grf<-randomForest(y~.,train) tab3<-table(y[indtest],predict(grf,test)) for(i in 1:K){ cat("In class error rate for class ",i,": ", round(1-tab3[i,i]/sum(tab[i,]),3),"\n") } 1-sum(diag(tab3))/length(indtest) # ######################################### # THE 1987 BASEBALL HITTER SALARY DATA # ######################################### dat <- read.table("http://pegasus.cc.ucf.edu/~xsu/CLASS/STA6704/bb87.dat", header = F, col.names=c("id", "name", "bat86", "hit86", "hr86", "run86", "rb86", "wlk86", "yrs", "batcr", "hitcr", "hrcr", "runcr", "rbcr", "wlkcr", "leag86", "div86", "team86", "pos86", "puto86", "asst86", "err86", "salary", "leag87", "team87", "logsalary")) dim(dat) # 263 26 dimnames(dat) #