##################################### # DECISION TREES EXAMPLE - V-Fold CV ##################################### library(tree); help(package="tree") ?tree icu <- read.table(file="http://pegasus.cc.ucf.edu/~xsu/CLASS/STA5703/icu.txt", sep = "\t", header =T); icu colnames(icu) # DATA PREPARATION icu$LOC <- ifelse(icu$LOC==0, 0, 1); dat <- icu tre1 <- tree(factor(STA)~ . -ID, data=dat, control=tree.control(nobs=nrow(dat), mincut=5, minsize=10, mindev=0.0001)) tre1; summary(tre1) par(mfrow=c(1,1), mar=c(4, 6, 4, 6)) plot(tre1); text(tre1) # COST-COMPLEXITY PRUNING - JUST FOR ILLUSTRATION PURPOSE prn <- prune.tree(tre1); plot(prn) # V-FOLD CV tree.cv <- cv.tree(tre1, FUN = prune.tree, K=10); tree.cv plot(tree.cv) # PLOT OF CROSS-VALIDATED DEVIANCE, AIC, AND BIC size.cv <- tree.cv$size dev.cv <- tree.cv$dev aic.cv <- dev.cv + 2*size.cv bic.cv <- dev.cv + log(nrow(dat))*size.cv range(aic.cv); range(bic.cv); range(dev.cv); range(size.cv) # postscript("C:/Research/RANDOM-THOUGHTS/Dave/fig1.eps", horizontal=F) m1 <- min(dev.cv, aic.cv, bic.cv) m2 <- max(dev.cv, aic.cv, bic.cv) par(bg="lightblue", mar=rep(5,4), mfrow=c(1,1)) plot(x=c(1, 21), y= c(m1, m2), type="n", xlab="tree size", ylab="cv deviance", main="", ) lines(size.cv, dev.cv, lty=1, lwd=2, col="purple") lines(size.cv, aic.cv, lty=1, lwd=2, col="brown") lines(size.cv, bic.cv, lty=1, lwd=2, col="red") text(x=19, y=bic.cv[1], "BIC") text(x=19, y=aic.cv[1], "AIC") text(x=19.5, y=dev.cv[1], "deviance") # dev.off() # OBTAIN THE BEST-SIZED TREE MODEL tree.best <- prune.tree(tre1, best=5) tree.best; summary(tree.best) par(bg="white", mar=rep(5,4), mfrow=c(1,1)) # plot(tre1); title("(a) initial tree"); text(tre1) plot(tree.best); # title(main="(b) final tree"); text(tree.best) # PREDICTION pred <- predict.tree(tree.best, newdata = dat, type="vector") yhat <- pred[,2] names(pred) # ROC Curve and AUC library(verification) y <- icu$STA a.ROC <- roc.area(obs=y, pred=yhat)$A; a.ROC mod.glm <- verify(obs=y, pred=yhat, bins = FALSE) roc.plot(mod.glm, plot.thres = NULL) # lines(lowess(x=1-specificity, y=sensitivity, f = 1/10), lwd=2, lty=1, col="red") text(x=0.6, y=0.3, "Area under ROC = 0.8496") ######################### # REGRESSION TREE EXAMPLE ######################### # THE 1987 BASEBALL HITTER SALARY DATA dat <- read.table("bb87.dat", header = F, col.names=c("id", "name", "bat86", "hit86", "hr86", "run86", "rb86", "wlk86", "yrs", "batcr", "hitcr", "hrcr", "runcr", "rbcr", "wlkcr", "leag86", "div86", "team86", "pos86", "puto86", "asst86", "err86", "salary", "leag87", "team87", "logsalary")) dim(dat) # 263 26 dimnames(dat) # ========================= # THE TEST SAMPLE METHOD # ========================= # PARTITION THE DATA set.seed(123) id.tmp <- sample(1:nrow(dat), size=trunc(nrow(dat)*2/3), replace = F) learning <- dat[id.tmp,] test <- dat[-id.tmp,] c(nrow(learning), nrow(test)) # THE LARGE INITIAL TREE tre1 <- tree(logsalary~ . -salary-id-name, data=learning, control=tree.control(nobs=nrow(dat), mincut=5, minsize=10, mindev=0.001)) tre1; summary(tre1) prn <- prune.tree(tre1, k=NULL, best=NULL, newdata=test, method="deviance"); prn par(mar=c(8, 4, 8, 4)) plot(prn, main="deviance based on test sample") size <- prn$size dev <- nrow(test)*log(prn$dev) aic <- dev + 2 * size bic <- dev + log(nrow(test))*size; range(c(dev, aic, bic)) par(mfrow=c(1, 1), mar=rep(4, 4)) plot(x=c(1, max(size)), y= c(274, 395), type="n", xlab="tree size", ylab="", main="tree size selection via test sample") lines(size, dev, lty=3, col="black") lines(size, aic, lty=2, col="blue") lines(size, bic, lty=1, col="blue") legend(x=4, y=375, legend=c("bic", "aic", "deviance"), lty=1:3) points(c(3,3), c(267, bic[22]), type="l", col="red") bsize <- prn$size[bic==min(bic)]; bsize btre1 <- prune.tree(tre1, k=prn$k[bic==min(bic)]); print(btre1) par(mfrow=c(1,1), mar=c(4, 6, 4, 6)) plot(btre1); text(btre1) # ------------------------------------------------ # TREE SIZE SELECTION VIA V-fold Cross Validation # ------------------------------------------------ # The Initial Large Tree BASED ON THE ENTIRE DATA tre1 <- tree(logsalary~ . -salary-id-name, data=dat, control=tree.control(nobs=nrow(dat), mincut=5, minsize=10, mindev=0.001)) tre1; summary(tre1) par(mfrow=c(1,1), mar=c(4, 6, 4, 6)) plot(tre1); text(tre1) # COST-COMPLEXITY PRUNING - JUST FOR ILLUSTRATION PURPOSE prn <- prune.tree(tre1); plot(prn) # V-FOLD CV tree.cv <- cv.tree(tre1, FUN = prune.tree, K=10); tree.cv plot(tree.cv) # PLOT OF CROSS-VALIDATED DEVIANCE, AIC, AND BIC size.cv <- tree.cv$size dev.cv <- nrow(dat)*log(tree.cv$dev) aic.cv <- dev.cv + 2*size.cv bic.cv <- dev.cv + log(nrow(dat))*size.cv range(aic.cv); range(bic.cv); range(dev.cv); range(size.cv) # postscript("C:/Research/RANDOM-THOUGHTS/Dave/fig1.eps", horizontal=F) par(mfrow=c(1, 1), mar=rep(4, 4)) plot(x=c(1, 36), y= c(1070, 1410), type="n", xlab="tree size", ylab="", main="tree size selection via 10-fold cross-validation") lines(size.cv, dev.cv, lty=3) lines(size.cv, aic.cv, lty=2) lines(size.cv, bic.cv, lty=1) arrows(x0=27, y0=bic.cv[6], x1=30, y1=bic.cv[6], length=.1, angle = 20) text(x=26.4, y=bic.cv[6], "BIC") arrows(x0=27, y0=aic.cv[6], x1=30, y1=aic.cv[6], length=.1, angle = 20) text(x=26.4, y=aic.cv[6], "AIC") arrows(x0=15, y0=dev.cv[9], x1=13, y1=dev.cv[9], length=.1, angle = 20) text(x=16.4, y=dev.cv[9]+3, "deviance") points(c(4,4), c(930, bic.cv[32]), type="l", col="red") # dev.off() # OBTAIN THE BEST-SIZED TREE MODEL tree.best <- prune.tree(tre1, best=4) tree.best; summary(tree.best) par(mfrow=c(1,1), mar=c(4, 6, 4, 6)) plot(tree.best); text(tree.best) # PREDICTION pred <- predict.tree(tree.best, newdata = dat, type="where") names(pred) # NOTHING BUT A LINEAR MODEL WITH ONE CATEGORICAL VARIABLE fit <- lm(dat$logsalary~factor(pred)) summary(fit)