# ============================= # FIRST READ THE DATA INTO R # ============================= data <- data.frame(SAL = c(10455, 9680, 7300, 9388, 12496, 11812, 9224,11725, 11320, 12000, 12500, 13310, 12105, 6200, 11522, 8000, 12548, 7700, 10028, 13176, 13255, 13004, 8000, 8224, 10750, 11669, 12322, 11002, 10666, 10839), CGPA = c(2.58, 2.31, 2.47, 2.52, 3.22, 3.37, 2.43, 3.08, 2.78, 2.98, 3.55, 3.64, 3.72, 2.24, 2.7, 2.3, 2.83, 2.37, 2.52, 3.22, 3.55, 3.55, 2.47, 2.47, 2.78, 2.78, 2.98, 2.58, 2.58, 2.58)) # ALTERNATIVE WAYS: YOU MAY READ DATA EITHER FROM A LOCAL FILE OR FROM THE WEB. data <- data.frame(read.table(file="http://www-stat.stanford.edu/~olshen/hrp262spring01/spring01Datasets/EX0508.DAT", header=T, col.names=c("SAL", "CGPA"))) data; summary(data) x <- data$CGPA; y <- data$SAL # ============================== # EXPLORTARY DATA ANALYSIS # ============================== # The above commands line contains options for graph page. # 1x1 array of figures # on the page, with 4 lines of margin around each par(mfrow=c(1,1),mar=rep(4,4)) plot(x=x, y=y, ylab="SAL", xlab="CGPA", main="Scatter Plot of the Data" ) # PEARSON'S CORRELATION COEFFICIENT r <- cor(x, y); r # ============================= # FIT THE SIMPLE LINEAR MODEL # ============================= fit <- lm(y ~ x); # TABLE OF PARAMETER ESTIMATES summary(fit) # ANOVA TABLE anova(fit) # ADD THE FITTED LINE TO THE SCATTER PLOT abline(fit) # ================= # PREDICTION # ================= # 95% CI FOR ESTIMATING MEAN SAL OF ALL INDIVIDUALS WITH CGPA=2.5 est.ci <- predict( fit, newdata=data.frame(x=2.5), se.fit=TRUE, interval="confidence", level=0.95) est.ci # 95% PI FOR ESTIMATING THE SAL VALUE OF AN INDIVIDUALS WITH CGPA=2.5 pred.pi <- predict( fit, newdata=data.frame(x=2.5), se.fit=TRUE, interval="prediction", level=0.95) pred.pi # ================================== # CONFIDENCE AND PREDICTION BANDS # ================================== # FIRST GENERATE A NUMBER OF X VALUES FROM THE RANGE OF THE CURRENT DATA new <- data.frame(x= seq(from=min(x), to=max(x), length=100)); new # COMPUTE THE CONFIDENCE BAND CI95 <- predict( fit, newdata=new, se.fit=TRUE, interval="confidence", level=0.95) names(CI95); CI95$fit # COMPUTE THE PREDICTION BAND PI95 <- predict( fit, newdata=new, se.fit=TRUE, interval="prediction", level=0.95) PI95$fit; range(PI95$fit) # PLOT par(mar=rep(4,4)) # First create a frame for the plot, which is big enough to contain the confidence bands plot(c(min(x), max(x)), c(min(PI95$fit), max(PI95$fit)), type="n", ylab="SAL", xlab="CGPA", main="Linear Fit with Confidence/Prediction Bands" ) # Add the scatter plot of the data points(x, y, pty=2) # Add the fitted line abline(fit) lines(new$x, CI95$fit[,2], lty=2, col="blue") # The lower CI bond lines(new$x, CI95$fit[,3], lty=2, col="blue") # The upper CI bond lines(new$x, PI95$fit[,2], lty=3, col="red") # The lower PI bond lines(new$x, PI95$fit[,3], lty=3, col="red") # The upper PI bond legend(3.2,8000,c("fitted line", "confidence bands", "prediction bands"), lty=1:3)