# ============================= # FIRST READ THE DATA INTO R # ============================= data <- data.frame(SAL = c(10455, 9680, 7300, 9388, 12496, 11812, 9224,11725, 11320, 12000, 12500, 13310, 12105, 6200, 11522, 8000, 12548, 7700, 10028, 13176, 13255, 13004, 8000, 8224, 10750, 11669, 12322, 11002, 10666, 10839), CGPA = c(2.58, 2.31, 2.47, 2.52, 3.22, 3.37, 2.43, 3.08, 2.78, 2.98, 3.55, 3.64, 3.72, 2.24, 2.7, 2.3, 2.83, 2.37, 2.52, 3.22, 3.55, 3.55, 2.47, 2.47, 2.78, 2.78, 2.98, 2.58, 2.58, 2.58)) # ALTERNATIVE WAYS: YOU MAY READ DATA EITHER FROM A LOCAL FILE OR FROM THE WEB. data <- data.frame(read.table(file="http://www-stat.stanford.edu/~olshen/hrp262spring01/spring01Datasets/EX0508.DAT", header=T, col.names=c("SAL", "CGPA"))) data; summary(data) # ============================== # EXPLORTARY DATA ANALYSIS # ============================== # The above commands line contains options for graph page. # 1x1 array of figures # on the page, with 4 lines of margin around each par(mfrow=c(1,1),mar=rep(4,4)) plot(data$CGPA, data$SAL, ylab="SAL", xlab="CGPA", main="Scatter Plot of the Data" ) # PEARSON'S CORRELATION COEFFICIENT r <- cor(data$CGPA, data$SAL); r # ============================= # FIT THE SIMPLE LINEAR MODEL # ============================= fit <- lm(SAL ~ CGPA, data=data); # TABLE OF PARAMETER ESTIMATES summary(fit) # ANOVA TABLE anova(fit) # ADD THE FITTED LINE TO THE SCATTER PLOT abline(a=435.9, b=3630.6) # ALTERNATIVELY abline(fit) # ================= # PREDICTION # ================= # 95% CI FOR ESTIMATING MEAN SAL OF ALL INDIVIDUALS WITH CGPA=2.5 est.ci <- predict( fit, newdata=data.frame(CGPA=2.5), se.fit=TRUE, interval="confidence", level=0.95) est.ci # 95% PI FOR ESTIMATING THE SAL VALUE OF AN INDIVIDUALS WITH CGPA=2.5 pred.pi <- predict( fit, newdata=data.frame(CGPA=2.5), se.fit=TRUE, interval="prediction", level=0.95) pred.pi # ================================== # CONFIDENCE AND PREDICTION BANDS # ================================== # FIRST GENERATE A SERIES OF X VALUES range(data$CGPA) new <- data.frame(CGPA= seq(from=2.24, to=3.72, length=100)); new # COMPUTE THE CONFIDENCE BAND CI95 <- predict( fit, newdata=new, se.fit=TRUE, interval="confidence", level=0.95) names(CI95); CI95$fit # COMPUTE THE PREDICTION BAND PI95 <- predict( fit, newdata=new, se.fit=TRUE, interval="prediction", level=0.95) PI95$fit; range(PI95$fit) # 6157.853 16430.056 # PLOT par(mar=rep(4,4)) # First create a frame for the plot, which is big enough to contain the confidence bands plot(c(2, 4.5), c(5500, 16500), type="n", ylab="SAL", xlab="CGPA", main="Linear Fit with Confidence/Prediction Bands" ) # Add the scatter plot of the data points(data$CGPA, data$SAL, pty=2) # Add the fitted line abline(fit) lines(new$CGPA, CI95$fit[,2], lty=2, col="blue") # The lower CI bond lines(new$CGPA, CI95$fit[,3], lty=2, col="blue") # The upper CI bond lines(new$CGPA, PI95$fit[,2], lty=3, col="red") # The lower PI bond lines(new$CGPA, PI95$fit[,3], lty=3, col="red") # The upper PI bond legend(3.2,8000,c("fitted line", "confidence bands", "prediction bands"), lty=1:3)