setwd(dir="C:/Research/Conference/DMAA-2009/Program/") ############################################################ # Multiple Logistic Regression: the ICU Data ############################################################ icu <- read.table(file="icu.txt", sep = "\t", header =T); icu dim(icu) colnames(icu) # DATA PREPARATION table(icu$STA, icu$LOC) icu$LOC <- ifelse(icu$LOC==0, 0, 1); table(icu$STA, icu$CAN) table(icu$STA, icu$TYP) # ----------------------------------------------- # FITTING THE LOGISTIC MODEL - VARIABLE SELECTION # ----------------------------------------------- fit.full <- glm(STA ~ AGE + SYS + LOC + TYP + factor(RACE) + SER + CAN + CRN + INF + + CPR + SYS + HRA + PRE + TYP + FRA + PO2 + PCO + BIC + CRE + LOC, family=binomial, data=icu) # Stepwise Variable Selection library(MASS) fit <- stepAIC(fit.full, direction = "both", k=log(nrow(icu))) summary(fit) fit0 <- glm(STA ~ AGE + CAN + LOC + TYP, family=binomial, data=icu) summary(fit0) # ------------------------------------------------------------ # Variable Selection via Group-LASSO Shinkage or L1 Penality # ------------------------------------------------------------ library(grplasso) fit <- grplasso(STA ~ . - ID, data = icu, model = LogReg(), lambda = 12, center = TRUE, standardize = TRUE) fit$coef table(icu$STA, icu$CAN) # Interaction fit.int <- glm(STA ~ (AGE + CAN + SYS + LOC + TYP)^2, family=binomial, data=icu) fit <- stepAIC(fit.int, direction = "both", k=log(nrow(icu))) summary(fit) fit <- grplasso(STA ~ (AGE + SYS + LOC + TYP)^2, data = icu, model = LogReg(), lambda = (400:1)/10, center = TRUE, standardize = TRUE) fit$coef plot(fit) fit1 <- glm(STA ~ AGE + SYS + LOC + TYP + AGE:TYP, family=binomial, data=icu) summary(fit1) # ---------------- # THE FINAL MODEL # ---------------- fit1 <- glm(STA ~ AGE + SYS + LOC, family=binomial, data=icu) summary(fit1) sum1 <- summary(fit1, dispersion=1) sum1 sum1$cov.unscaled # CONFIDENCE INTERVAL FOR BETA'S library(MASS) ci <- confint(fit1, level = 0.95) ci <- confint(fit1, parm=c("AGE"), level = 0.95) exp(ci*10) # --------- # ROC CURVE # --------- # A DETAILED LOOK AT HOW THE ROC CURVE IS MADE y <- icu$STA yhat <- fit1$fitted.values n <- length(y) # CLASSIFICATION TABLE AT 0.5 table(sign(yhat>= 0.5), sign(y>=0.5)) # ROC cutoff <- sort(unique(yhat)) # 0:100/100 sensitivity <- specificity <- accuracy <- rep(1, length(cutoff)) for (k in 1:length(cutoff)) { i <- cutoff[k] n11 <- sum((yhat >= i)*y) n10 <- sum((yhat >= i)*(1-y)) n01 <- sum((yhat < i)*y) n00 <- sum((yhat < i)*(1-y)) # print(paste("THE CLASSIFICATION TABLE FOR CUTOFF = ", i, sep="")) # print(matrix(c(n11, n01, n10, n00), 2, 2)) sensitivity[k] <- n11/(n11+n01) # TRUE POSITIVIES specificity[k] <- n00/(n00+n10) # TRUE NEGATIVES accuracy[k] <- (n00+n11)/n } # ROC CURVE: sensitivity (TRUE ALARM) vs. 1-specificity (FALSE ALARM) plot(x=1-specificity, y=sensitivity, main="ROC Curve", type="l") abline(a=0, b=1, col="blue", lty=2) lines(lowess(x=1-specificity, y=sensitivity, f = 1/10), lwd=2, lty=1, col="red") text(x=0.6, y=0.3, "Area under ROC = 0.7977") # CHOOSING BEST CUTOFF POINT par(mfrow = c(1, 2), mar=rep(4,4)) plot(cutoff, accuracy, type="l", lty=2, col="black", main="(a)", xlab="cutoff points", ylab="prediction accuracy", lwd=2) b.cut <- (cutoff[accuracy==max(accuracy)])[2] segments(x0=b.cut, y0=.175, x1=b.cut, y1=max(accuracy), lty=3, col="brown") text(x=b.cut+.2, y=.5, labels="cutoff=0.358") arrows(b.cut, .495, b.cut+.055, .495, col="black", length=.05) plot(x=cutoff, y=sensitivity, type="l", lty=1, col="blue", main="(b)", xlab="cutoff points", ylab="", lwd=2) lines(x=cutoff, y=specificity, type="l", lty=1, col="red", lwd=2) legend(x=0.6, y=0.9, legend=c("sensi", "spec"), col=c("blue", "red"), lty=c(1,1)) b.cut <- cutoff[sensitivity==specificity] segments(x0=b.cut, y0=-0.01, x1=b.cut, y1=sensitivity[sensitivity==specificity], lty=3, col="brown") text(x=b.cut+.2, y=.2, labels="cutoff=0.1786") arrows(b.cut, .19, b.cut+.045, .19, col="black", length=.05) # Compute the C-Statistic library(verification) pred <- yhat a.ROC <- roc.area(obs=y, pred=pred)$A print(a.ROC) # Use the "verification" Package to plot Comparative ROC curve mod.glm <- verify(obs=y, pred=yhat, bins = FALSE) roc.plot(mod.glm, plot.thres = NULL) lines(lowess(x=1-specificity, y=sensitivity, f = 1/10), lwd=2, lty=1, col="red") text(x=0.6, y=0.3, "Area under ROC = 0.7977") # YET ANOTHER PACKAGE THAT I DONOT LIKE VERY MUCH library(ROCR) help(package=ROCR) ?performance pred <- prediction(predictions=yhat, labels=y) perf <- performance(pred, measure = "sens", x.measure = "spec") plot(perf, col=rainbow(10)) tmp <- attributes(unclass(perf)) cutoff <- tmp$alpha.values x.plot <- unlist(tmp$x.values) y.plot <- unlist(tmp$y.values) perf <- performance(pred, "acc") plot(perf, avg= "vertical", spread.estimate="boxplot", show.spread.at= seq(0.1, 0.9, by=0.05), col="red") # -------------------------------------- # LIFT; PERCENTAGE OF CAPTURED RESPONSE # -------------------------------------- y <- icu$STA yhat <- fit1$fitted.values pred <- sort(yhat, decreasing =T) obs <- y[rank(yhat)] n <- length(y) ngrp <- 10 thresholds <- quantile(pred, probs=(0:ngrp)/ngrp) thresholds[1] <- thresholds[1] -0.001; grp <- cut(pred, breaks=thresholds, labels=1:ngrp) # dat <- data.frame(id = 1:n, obs=obs, pred=pred, grp=grp); dat tmp0 <- split(obs, f=grp) n.group <- as.vector(unlist(lapply(split(rep(1,n), f=grp), FUN=sum))) n.resp.group <- as.vector(unlist(lapply(tmp0, FUN=sum))) pct.cap.resp <- n.resp.group/sum(y) pct.resp <- n.resp.group/n.group lift <- n.resp.group/(sum(y)/ngrp) cumulate <- function(x) { y <- x for (i in 1:length(x)){ y[i] <- sum(x[1:i]) } y } n.group.cum <- cumulate(n.group) n.resp.group.cum <- cumulate(n.resp.group) pct.cap.resp.cum <- n.resp.group.cum /sum(y) pct.resp.cum <- n.resp.group.cum/n.group.cum lift.cum <- n.resp.group.cum/ cumulate(rep((sum(y)/ngrp), ngrp)) results.lift <- data.frame(decile=1:ngrp, n.resp.group, pct.cap.resp, pct.resp, lift, n.resp.group.cum, pct.cap.resp.cum, pct.resp.cum, lift.cum) results.lift dat <- results.lift par(mfrow = c(3, 2), mar=rep(4,4)) ylabel <- c("% captured deaths", "% deaths", "lift") for (i in 3:5){ plot(x=dat[,1], dat[,i], xlab="", ylab=ylabel[i-2], type="l", lty=1, lwd=2, col="red") if (i==3) title("noncumulative") if (i==5) abline(h=1, lty=2, lwd=1, col="black") plot(x=dat[,1], dat[,i+4], xlab="", ylab=ylabel[i-2], type="l", lty=1, lwd=2, col="blue") if (i==3) title("cumulative") # if (i==5) abline(h=1, lty=2, lwd=2, col="black") } results.lift.present <- data.frame(decile=1:ngrp, n.resp.group, pct.of.cap.resp = paste(n.resp.group, "/", sum(y), "=", pct.cap.resp*100, "%", sep=""), pct.of.resp = paste(n.resp.group, "/", n.group, "=", pct.resp*100, "%", sep=""), lift=paste(n.resp.group, "/", sum(y)/ngrp, "=", lift, sep=""), n.resp.group.cum, pct.of.cap.resp.cum = paste(n.resp.group.cum, "/", sum(y), "=", pct.cap.resp.cum*100, "%", sep=""), pct.of.resp.cum = paste(n.resp.group.cum, "/", n.group.cum, "=", pct.resp.cum*100, "%", sep=""), lift.cum=paste(n.resp.group.cum, "/", cumulate(rep((sum(y)/ngrp), ngrp)), "=", lift.cum, sep="")) results.lift.present write.csv(results.lift.present, file="lift.csv", row.names = FALSE)