################################# # CHAPTER 3 TWO-SAMPLE INFERENCE ################################# # SET UP THE WORKING DIRECTORY setwd("C:/Courses/STA4502/2009-Fall/R/") # THIS REMOVES ALL THE OBJECTS IN THE CURRENT WORK DIRECTORY rm(list=ls(all=TRUE)) # ========================================= # I, COMPARE LOCATION BETWEEN TWO SAMPLES # ========================================= grp1 <- scan() 1.40 1.23 1.03 .98 1.34 1.36 1.15 1.27 grp2 <- scan() 1.16 .99 1.04 1.02 1.09 1.12 .76 .88 # Some Preliminary Summary Info n1 <- length(grp1); n2 <- length(grp2); n <- n1+n2 mean1 <- mean(grp1); mean2 <- mean(grp2) sd1 <- sd(grp1); sd2 <- sd(grp2) dmean <- mean1-mean2; dmean s.p <- sqrt(((n1-1)*sd1^2 + (n2-1)*sd2^2)/(n1+n2-2)); s.p # ----------------------- # I.1 Wilcoxon Rank Sum Test # ----------------------- # Exact p-value wilcox.test(x=grp1, y=grp2, alternative = "two.sided", exact=T) # Asymptotic p-value with continuity correction wilcox.test(x=grp1, y=grp2, alternative = "two.sided", exact=F, correct = TRUE) # --------------------- # I.2: Permutation Test # --------------------- library(DAAG) # First compute the number of possible permuted samples comb <- function(n,k) factorial(n)/(factorial(k)*factorial(n-k)) n.permuted.samples <- comb(n=n1+n2, k=n1); n.permuted.samples twot.permutation(x1=grp1, x2=grp2, nsim=n.permuted.samples, plotit=T) # ---------------------------------------- # I.3: A 95% Bootstrap CI for Mean Difference # ---------------------------------------- # Parametric 95% t CI, Assuming equal variances # ----------------------------------------------- t.test(x=grp1, y = grp2, alternative = c("two.sided"), mu = 0, var.equal = T, conf.level = 0.95) # Nonparametric Bootstrap 95% CI - APPROACH I # --------------------------------------------- library(bootstrap); help(package="bootstrap") B <- 2000 boot1 <- bootstrap(x=grp1, theta=mean, nboot=B) boot2 <- bootstrap(x=grp2, theta=mean, nboot=B) diff.b <- boot1$thetastar - boot2$thetastar hist(diff.b, nclass=20) options(digits=4) quantile(diff.b, probs = c(.01, 0.025, .05, .1, .2, .33, .50, .66, .75, .95, .975, .99)) # A Look into the Details of the Calculation for (b in 1:3){ grp1.b <- sample(grp1, size=n1, replace=T) grp2.b <- sample(grp2, size=n2, replace=T) mean1.b <- mean(grp1.b); mean2.b <- mean(grp2.b) diff.b <- mean1.b - mean2.b print(paste("$$$$$$$$$$$$ b=", b, " $$$$$$$$$$$", sep="")) print(cbind(trt=c(rep("new", n1), rep("standard", n2)), c(grp1.b, grp2.b))) print(cbind(b=b, mean1.b=mean1.b, mean2.b=mean2.b, diff.b=diff.b)) } # Parametric Bootstrap 95% CI - APPROACH II # ------------------------------------------- e1 <- grp1 - mean(grp1) e2 <- grp2 - mean(grp2) e <- c(e1, e2) options(digits=7) B <- 100 bootstrap.t <- rep(0, B) for (b in 1:B){ e.b <- sample(e, size=n, replace=T) e1.b <- e.b[1:n1]; e2.b <- e.b[(n1+1):n] mean1.b <- mean(e1.b); mean2.b <- mean(e2.b) s1.b <- sd(e1.b); s2.b <- sd(e2.b) sp.b <- sqrt(((n1-1)*var(e1.b) + (n2-1)*var(e2.b))/(n1+n2-2)); t.b <- as.numeric(t.test(x=e1.b, y=e2.b, alternative ="two.sided", var.equal = T)$statistic) # print(paste("$$$$$$$$$$$$ b=", b, " $$$$$$$$$$$", sep="")) print(e.b) print(cbind(b=b, mean1.b=mean1.b, mean2.b=mean2.b, s1.b=s1.b, s2.b=s2.b, sp.b=sp.b, t.b=t.b)) bootstrap.t[b] <- t.b } hist(bootstrap.t, nclass=20) quantile(bootstrap.t, probs = c(0.025, .05, .50, .95, .975)) # Using the Package "bootstrap" # ----------------------------- library(bootstrap); # help(package="bootstrap") ttest <- function(x0, n1, n2) { print(x0) twot <- t.test(x=x0[1:n1], y=x0[(n1+1):(n1+n2)], var.equal = T) return(twot$statistic) } B <- 5000 boot1 <- bootstrap(x=e, theta=ttest, nboot=B, n1=n1, n2=n2) boot.t <- boot1$thetastar hist(boot.t, nclass=20, main="Histogram of Boostrapped t", xlab="t") quantile(boot.t, probs = c(0.025, .05, .50, .95, .975)) t.L <- quantile(boot.t, probs=0.025); t.L t.U <- quantile(boot.t, probs=0.975); t.U cat(paste("The 95% Bootstrap Confidence Intervals are: (", dmean - t.L*s.p*sqrt(1/n1+1/n2), ", ", dmean + t.L*s.p*sqrt(1/n1+1/n2), ") \n", sep="")) # ===================== # II, Compare Variance # ===================== present <- c(10.8, 11.1, 10.4, 10.1, 11.3) new <- c(10.8, 10.5, 11.0, 10.9, 10.8, 10.7, 10.8) n1 <- length(present); n2 <- length(new) n <- n1 + n2 s1.sq <- var(present); s2.sq <- var(new) # PARAMETRIC F TEST F.stat <- max(s1.sq, s2.sq)/min(s1.sq, s2.sq) pvalue.F <- 1-pf(F.stat, n1-1, n2-1) print(cbind(s1.sq, s2.sq, F.stat, F.95th=qf(.95, n1-1, n2-1), pvalue.F)) library(lawstat) levene.test(c(present, new), c(rep(0,n1), rep(1, n2))) # ------------------------ # II.1: Siegel-Tukey Test # ------------------------ # FIND Siegel-Tukey TEST STATISTIC USING THE FOLLOWING WORKSHEET pooled <-c(present, new) trt <- c(rep("present", n1), rep("new", n2)) cbind(pooled, trt) cbind(sort(pooled), trt[rank(pooled)]) T1 <- 20.5; T2 <- 57.5 Z1 <- (T1 - n1*(n+1)/2 - .5)/sqrt(n1*n2*(n+1)/12) Z2 <- (T1 - n1*(n+1)/2 + .5)/sqrt(n1*n2*(n+1)/12) p.value <- 2*min(pnorm(Z1, lower.tail =F), pnorm(Z2)); p.value # ----------------------- # II.2: Permutation Test # ----------------------- library(DAAG) n.permut <- comb(n=n1+n2, k=n1); n.permut pmt.pvalue <- twot.permutation(x1=dev1 * n1/(n1-1), x2=dev2*n2/(n2-1), nsim=n.permut, plotit=F) pmt.pvalue # Detailed look into the computation x <- c(present, new) trt <- c(rep(0,n1), rep(1, n2)) dev1 <- (present-mean(present))^2 dev2 <- (new-mean(new))^2 sq.dev <- c(dev1, dev2) sq.dev1 <- c(dev1*n1/(n1-1), dev2*n2/(n2-1)) perm1 <- sample(sq.dev1, size=n, replace=F) perm2 <- sample(sq.dev1, size=n, replace=F) rk <- rank(sq.dev1) options(digits=4) cbind(trt, x, sq.dev, sq.dev1, perm1, perm2) x1 <- sq.dev1; x1 <- perm2 mean(x1[1:n1]); mean(x1[(n1+1):n]); mean(x1[1:n1])- mean(x1[(n1+1):n]); # --------------------------- # BOOTSTRAP - Nonparametric # --------------------------- library(bootstrap); B <- 5000 boot1 <- bootstrap(x=present, theta=var, nboot=B) boot2 <- bootstrap(x=new, theta=var, nboot=B) diff.b <- boot1$thetastar - boot2$thetastar hist(diff.b, nclass=20, main="Histogram of Diff in Variances", xlab="diff in var") quantile(diff.b, probs = c(0.025, .05, .50, .95, .975)) # TAKE A LOOK AT THE DETAILED CALCULATION B <- 10 for (b in 1:B){ grp1.b <- sample(present, size=n1, replace=T) grp2.b <- sample(new, size=n2, replace=T) var1.b <- var(grp1.b); var2.b <- var(grp2.b) diff.b <- var1.b - var2.b print(paste("$$$$$$$$$$$$ b=", b, " $$$$$$$$$$$", sep="")) print(cbind(trt, c(grp1.b, grp2.b))) print(cbind(b=b, var1.b=var1.b, var2.b=var2.b, diff.b=diff.b)) } # ==================== # III, COMPARE CDF's # ==================== # ----------------------------------------- # III.1: Kolmogorov-Smirnov Two-Sample Test # ----------------------------------------- x <- c(7.6, 8.4, 8.6, 8.7, 9.3, 9.9, 10.1, 10.6, 11.2) y <- c(5.2, 5.7, 5.9, 6.5, 6.8, 8.2, 9.1, 9.8, 10.8, 11.3, 11.5, 12.3, 12.5, 13.4, 14.6) ks <- ks.test(x=x, y=y, exact=T); ks # EXACT ks.test(x=x, y=y, exact=F) # ASYMPTOTIC # SHOW THE DETAILS OF HOW TO COMPUTE THE ASYMPTOTIC P-VALUE n1 <- length(x); n2 <- length(y) T <- ks$statistic t <- (n1*n2)/(n1+n2)*T^2; t pvalue <- 0 for (i in 1:10) { p <- (-1)^(i-1)*exp(-2*i^2*t) print(p) pvalue <- pvalue + p } pvalue <- 2*pvalue pvalue # ----------------------------------------- # III.2: Cramer-von Mises Two-Sample Test # ----------------------------------------- library(CvM2SL2Test) cvm <- cvmts.test(x, y); # Note that this corresponds to a two-sided hypothesis pval <- cvmts.pval(cvm, m=n1, n=n2) cat(paste("The Cramer-von Mises Two-Sample Test Stat is ", cvm, "\n with p-value ", pval, sep=""), "\n") # =========================== # IV, MATCHED-PAIRS DESIGN # =========================== before <- c(.68, .64, .58, .72, .60, .65, .61, .88, .84) after <- c(.73, .62, .68, .72, .59, .72, .74, .97, .87) # REMOVE 0 DIFFERENCES d <- before-after before <- before[d!=0]; after <- after[d!=0] # ------------------------------ # Wilcoxon Signed Rank Sum Test # ------------------------------ wilcox.test(x=before, y=after, paired=T, exact=T, alternative="less") # Exact wilcox.test(x=before, y=after, paired=T, exact=F, correct=T, alternative="less") # Asymptotic # ----------------------- # Sign Test - Median Test # ----------------------- T <- sum(d > 0) n.prime <- length(before) pbinom(T, size=n.prime, prob=.5, lower.tail=T)