###################################### # 1987 BASEBALL SALARY EXAMPLE ###################################### # READ THE DATA INTO R setwd("C:/Courses/STA4164/2007-Fall/") rm(list=ls(all=TRUE)) baseball <- read.table("bb87.dat", header = F, col.names=c("id", "name", "bat86", "hit86", "hr86", "run86", "rb86", "wlk86", "yrs", "batcr", "hitcr", "hrcr", "runcr", "rbcr", "wlkcr", "leag86", "div86", "team86", "pos86", "puto86", "asst86", "err86", "salary", "leag87", "team87", "logsalary")) # An Alternatively Way If your computer has internet connection. baseball <- read.table("http://pegasus.cc.ucf.edu/~xsu/CLASS/STA4164/bb87.dat", header = F, col.names=c("id", "name", "bat86", "hit86", "hr86", "run86", "rb86", "wlk86", "yrs", "batcr", "hitcr", "hrcr", "runcr", "rbcr", "wlkcr", "leag86", "div86", "team86", "pos86", "puto86", "asst86", "err86", "salary", "leag87", "team87", "logsalary")) dim(baseball) attach(baseball, pos=2) # DEFINE THE FOUR PREDICTORS x1 <- runcr/yrs x2 <- sqrt(run86) x3 <- pmin(pmax(yrs - 2, 0), 5) x4 <- pmax(yrs - 7, 0) # SOME EXPLORATION DATA ANALYSIS (EDA) # ===================================== # HISTOGRAM AND QQ PLOT OF SALARY & LOGSALARY par(mfrow=c(2,2),mar=c(4, 4, 4, 4)) hist(salary, xlab="salary", main="Histogram of Salary") qqnorm(salary, main="Q-Q Plot of Salary") qqline(salary) hist(logsalary, xlab="log-alary", main="Histogram of Log(Salary)") qqnorm(logsalary, main="Q-Q Plot of Log(Salary)") qqline(logsalary) # CORRELATION MATRIX bb <- data.frame(logsalary, x1, x2, x3, x4) cor(bb) # PAIRED SCATTERPLOTS pairs(bb) # ============================================ # FIT Hoaglin and Velleman's (HV, 1995) Model # ============================================ fit <- lm(logsalary ~ x1 + x2 + x3 + x4); # GET THE TABLE OF PARAMETER ESTIMATES summary(fit) # GET THE ANOVA TABLE anova.lm(fit, ssType = 1) # AN ILLUSTRATING EXAMPLE OF THE F TEST FOR GENERAL LINEAR HYPOTHESES # H0: BETA1 = BETA2 IN THE HV MODEL # ==================================================================== x1.0 <- x1 + x2 fit1 <- lm(logsalary ~ x1.0 + x3 + x4) anova(fit1, fit, test = "F") # PREDICTION # ============ new <- data.frame(x1 = 45, x2 = 7, x3 = 3, x4 = 2) # 95% CI FOR ESTIMATING MEAN RESPONSE predict(fit, newdata = new, se.fit = TRUE, interval = "confidence", level = 0.95) # 95% PI FOR PREDICTING INDIVIDUAL RESPONSE predict(fit, newdata = new, se.fit = TRUE, interval = "prediction", level = 0.95) ########################################## # MODEL DIAGNOSTICS # ########################################## # THE DEFAULT PLOT FROM R FUNCTION plot.lm # NOTE THAT PLOT OF COOK'S DISTANCE IS AVAILABLE par(mfrow=c(2,2), mar=rep(5,4)) plot.lm(fit) # ===================== # RESIDUAL ANALYSIS # ===================== # GET FOUR TYPES OF RESIDUALS names(fit) X <- cbind(1, x1, x2, x3, x4); X H <- X%*% solve(t(X)%*%X)%*% t(X); H h <- diag(H); h MS <- anova(fit)$"Mean Sq" MSE <- MS[length(MS)]; MSE DF <- anova(fit)$"Df" df.r <- DF[length(DF)] e <- fit$residuals z <- e/sqrt(MSE) r <- e/(sqrt(MSE*(1-h))) r.jack <- r*sqrt((df.r - 1)/(df.r - r^2)) options(digits=4) cbind(y=logsalary, x1, x2, x3, x4, yhat=fit$fitted.values,h=h, e=e, z=z, r=r, r.jack =r.jack, r.jack1= rstudent(fit)) # THE STUDENTIZED JACKKNIFE RESIDUALS CAN BE OBTAINED DIRECTLY r.jack <- rstudent(fit) par(mfrow=c(1,2),mar=c(8,4,8,4)) # SOME UNIVARIATE EXPLORATION OF RESIDUALS # ========================================== # The fisrt plot: Histogram hist(r.jack, xlab="Jackknife Residual", main="Histogram of Jackknife Residuals") # The second plot: Q-Q plot qqnorm(r.jack) qqline(r.jack) # A NORMALITY TEST - LARGE P-VALUE WOULD JUSTIFY NORMALITY shapiro.test(r.jack) # SOME BIVARIATE EXPLORATION OF RESIDUALS # ========================================== par(mfrow=c(3,1),mar=c(4, 10, 4, 10)) # PLOT OF RESIDUAL VERSUS EACH CONTINUOUS PREDICTOR plot(x1, r.jack, xlab="x1", ylab="Jackknife Residual") abline(h=0) # compute the degrees of freedom for the reference t distribution n <- dim(baseball)[1] p <- 4 df <- (n-1)-(p+1) abline(h=qt(.975, df), lty=2) abline(h=-qt(.975, df), lty=2) plot(x2, r.jack, xlab="x2", ylab="Jackknife Residual") abline(h=0) abline(h=qt(.975, df), lty=2) abline(h=-qt(.975, df), lty=2) # PLOT OF RESIDUAL VERSUS PREDICTED VALUE plot(fit$fitted.values, r.jack, xlab="Predicted", ylab="Jackknife Residual") abline(h=0) abline(h=qt(.975, df), lty=2) abline(h=-qt(.975, df), lty=2) # ============================================================ # CHECK LINEARITY - PARTIAL RESIDUAL OR PARTIAL REGRESSION PLOT # ============================================================ par(mfrow=c(1, 3), oma=c(4, 4, 4, 4)) # residual vs hitcr plot r <- resid(fit) plot(hitcr, r, ylab="residuals", xlab="hits in during career") # Partial Residual Plot for hitcr fit1 <- lm(logsalary ~ hitcr + x1 + x2 + x3 + x4) pr.hitcr <- resid(fit1) + fit1$coef[2]*hitcr plot(hitcr, pr.hitcr, ylab="partial residuals", xlab="hits in during career") abline(lsfit(hitcr, pr.hitcr)) # Partial Regression Plot for X1 pr.hitcr <- residuals(lm(x1 ~ x2 + x3 + x4)) r.1 <- resid(lm(logsalary ~ x2 + x3 + x4)) plot(pr.hitcr, r.1, ylab="r(logsalary|X)", xlab="r(x1|X)") # ============================ # OUTLIER DETECTION # ============================ n <- nrow(baseball) p <- 4 infl <- influence.measures(fit); infl.mat <- as.data.frame(infl$infmat) cook.d <- infl.mat$cook.d plot(x=1:n, y=cook.d, xlab="ID") # abline(h=qf(.95, df1=p+1, df2=n-(p+1)), lty=2) # EXTRACT INFLUETIAL POINTS baseball[cook.d > 0.05, ] # HIGH COOK'S DISTANCE summary(influence.measures(fit)) # =============================== # CHECK ON MULTICOLINEARITY # =============================== temp <- rep(1, dim(baseball)[1]) # create a vector of all 1's. VIF0 <- 1 / (1 - summary(lm(temp ~ x1 + x2 + x3 + x4 -1))$r.squared) VIF1 <- 1 / (1 - summary(lm(x1 ~ x2 + x3 + x4))$r.squared) VIF2 <- 1 / (1 - summary(lm(x2 ~ x1 + x3 + x4))$r.squared) VIF3 <- 1 / (1 - summary(lm(x3 ~ x1 + x2 + x4))$r.squared) VIF4 <- 1 / (1 - summary(lm(x4 ~ x1 + x2 + x3))$r.squared) VIF <- c(VIF0, VIF1, VIF2, VIF3, VIF4) ; VIF mean(VIF)