################################# # EXAMPLE On Logistic Regression ################################# library(rpart); library(MASS) ?kyphosis # THE DATA fit1 <- glm(Kyphosis ~ Age + Number + Start, family=binomial, data=kyphosis) summary(fit1, dispersion=1, correlation=T) names(fit1) names(summary(fit1)) fit1$deviance # --------------------------------- # CONFIDENCE INTERVAL FOR BETA'S # --------------------------------- library(MASS) ci <- confint(fit1, parm=c("Age", "Number"), level = 0.95) exp(ci*30) # ------------------------------------------------- # 95% Confidence Interval for a Linear Combination # ------------------------------------------------- V <- summary(fit1)$cov.unscaled; V # V1 <- summary(fit1)$cov.scaled; V1 # Useful when the Dispersion is not 1 xp <- c(0, 0, 1, 1) est <- t(xp)%*% fit1$coef; est se <- sqrt(t(xp)%*% V %*% xp) ci <- c(est - qnorm(0.975)*se, est + qnorm(0.975)*se); ci exp(ci) # ------------------------------------------------- # Likelihood Ratio Test for Comparing Nested Models # ------------------------------------------------- fit3 <- glm(Kyphosis ~ Start, family=binomial, data=kyphosis) summary(fit3) anova(fit1, fit3, test="Chisq") # ---------------------------- # Fitted Values And Prediction # ---------------------------- fit1$fitted.values # Prediction pred <- predict(fit1, newdata=data.frame(Age=100, Number=3, Start=8), type="response", se.fit=T) names(pred) pred1 <- predict(fit1, newdata=data.frame(Age=100, Number=3, Start=8), type="link", se.fit=T) # predict(fit1, newdata=data.frame(Age=100, Number=3, Start=8), type="terms", se.fit=T) # -------------------- # Interaction Model # -------------------- fit1 <- glm(Kyphosis ~ (Age + Number + Start)^2, family=binomial, data=kyphosis) summary(fit1, dispersion=1, correlation=T) fit1 <- glm(Kyphosis ~ Age * Number + Start, family=binomial, data=kyphosis) summary(fit1, dispersion=1, correlation=T) fit1 <- glm(Kyphosis ~ Age + Number + Start + Age:Number, family=binomial, data=kyphosis) summary(fit1, dispersion=1, correlation=T) # Test of Interaction anova(fit1, fit3, test="Chisq") # ----------------- # Categorical Data # ----------------- kyphosis$gender <- c(rep("female", 40), rep("male", 41)) kyphosis$age.grp <- cut(x=kyphosis$Age, breaks=3) fit.1 <- glm(Kyphosis ~ factor(age.grp), family=binomial, data=kyphosis, x=T) fit.1$x # --------------- # MODEL DELECTION # --------------- fit1 <- glm(Kyphosis ~ (Age + Number + Start)^2, family=binomial, data=kyphosis) summary(fit1, dispersion=1, correlation=T) n <- nrow(kyphosis) fit2 <- step(fit1, scope=~1, trace = 10, direction = c("both"), k = 3) summary(fit2) AIC(fit1, k = 2) # AIC AIC(fit1, k = log(n) # BIC # ----------------- # MODEL ASSESSMENT # ----------------- names(fit2) names(summary(fit2)) R2 <- (fit2$null.deviance - fit2$deviance)/fit2$null.deviance R2 resid.d <- residuals(fit2, type = c("deviance")) resid.p <- residuals(fit2, type = c("pearson")) e <- rstandard(fit2) r.jack <- rstudent(fit2) dat <- as.data.frame(cbind(id=1:nrow(kyphosis),obs.y =kyphosis$Kyphosis, pred.prob=fit2$fitted.values, resid.d=resid.d, resid.p=resid.p, e=e, r.jack=r.jack)) dat dat$id[abs(r.jack) > 2] dat$id[abs(resid.d) > 2] # OTHER INFLUENTIAL MEASURES inf <- influence.measures(fit2) inf # ========================= # ADDITIVE LOGISTIC MODEL # ========================= library(gam) ?gam data(kyphosis) dim(kyphosis) # 81 4 fit <- gam(Kyphosis ~ s(Age,4) + Number, family = binomial, data=kyphosis, trace=TRUE) summary(fit) names(fit) # CHECK OUT PLOT.GAM # plot.gam(x, residuals, rugplot, se, scale, ask = FALSE,terms,...) # preplot.gam(object, newdata, terms,...) # PLOT OF THE FUNCTIONAL FORM par(mfrow=c(1,2)) plot(fit, se=TRUE) # REPROCDUCE THE PLOT BY YOURSELF WITH CONFIDENCE BANDS fit$smooth; fit$smooth.frame fit$var; fit$additive.predictors alpha <- 0.05 lb <- fit$smooth - qnorm(1-alpha/2)*sqrt(fit$var) ub <- fit$smooth + qnorm(1-alpha/2)*sqrt(fit$var) range(lb); range(ub) plot(x=c(0, 200), y=c(-13, 3.2), xlab="age", ylab="s(age)", type="n") points(kyphosis$Age, fit$smooth, pch=1) points(kyphosis$Age, lb, pch=4) points(kyphosis$Age, ub, pch=4) # ================================================== # MODEL ASSESSMENT: THE CLASSIFICATION TABLE AND ROC # ================================================== # A DETAILED LOOK AT HOW THE ROC CURVE IS MADE y <- ifelse(kyphosis$Kyphosis=="absent", 0, 1) yhat <- fit$fitted.values cutoff <- sort(unique(yhat)) # 0:100/100 sensitivity <- specificity <- rep(1, length(cutoff)) for (k in 1:length(cutoff)) { i <- cutoff[k] n11 <- sum((yhat > i)*y) n10 <- sum((yhat > i)*(1-y)) n01 <- sum((yhat <= i)*y) n00 <- sum((yhat <= i)*(1-y)) print(paste("THE CLASSIFICATION TABLE FOR CUTOFF = ", i, sep="")) print(matrix(c(n11, n01, n10, n00), 2, 2)) sensitivity[k] <- n11/(n11+n01) # TRUE POSITIVIES specificity[k] <- n00/(n00+n10) # TRUE NEGATIVES } # ROC CURVE: sensitivity (TRUE ALARM) vs. 1-specificity (FALSE ALARM) plot(x=1-specificity, y=sensitivity, main="ROC Curve", type="l") abline(a=0, b=1, col="red", lty=2) # Compute the C-Statistic library(verification) pred <- yhat a.ROC <- roc.area(obs=y, pred=pred)$A print(a.ROC) # Use Package to plot Comparative ROC curve mod.glm <- verify(obs=y, pred=yhat, bins = FALSE) mod.logit <- verify(obs=y, pred=fit3$fitted.values, bins = FALSE) roc.plot(mod.glm, plot.thres = NULL) lines.roc(mod.logit, col = 2, lwd = 2) leg.txt <- c("GAM", "Logit") legend( 0.6, 0.4, leg.txt, col = c(1,2), lwd = 2) #