####################### # RELIEF ####################### library(dprep) help(package="dprep") # ======================== # CLASSIFICATION EXAMPLE # ======================== data(my.iris) relief(my.iris,nosample=150,threshold=0.0) # ============================================ # FEATURE SELECTION FOR THE REGRESSION CASE # ============================================ quasar <- read.table( file = "http://pegasus.cc.ucf.edu/~xsu/CLASS/STA4164/quasar.txt", header = F, col.names=c("id", "x1", "x2", "x3", "x4", "x5", "y")) quasar$y <- log(quasar$y) quasar # RANDOM FORESTS # --------------- library(randomForest) tune.rf <- tuneRF(quasar[ , -c(1, 7)], quasar[,7], ntreeTry=50, stepFactor=2, improve=0.05, trace=TRUE, plot=TRUE, dobest=FALSE) fit.rf <-randomForest(y ~ x1 + x2 + x3 + x4 + x5, data=quasar, mtry=3, ntree=5000, keep.forest=TRUE, importance=TRUE) print(fit.rf) # Plot the error rates or MSE of a randomForest object # TO INSPECT WHEN THE PERFORMANCE GETS STABLE plot(fit.rf, main="MSE vs. # of boostrap Samples") # PLOT OF VARIABLE IMPORTANCE imp <- importance(fit.rf) VI <- imp[,1]; VI predictor <- 1:length(VI) # RELIEF # ========= y <- sort(unique(quasar[,7])) normalize <- function(x) (x-mean(x))/sd(x) X <- quasar[, -c(1, 7)] X.norm <- as.data.frame(apply(X, MARGIN=2, FUN=normalize)) out <- NULL for (i in 1:200){ print(paste("===============", i, "================", sep="")) breaks=c(min(y)-0.1, sort(sample(y[-c(1, length(y))], size=1, replace=F)), max(y)+0.1) print(breaks) z <- cut(quasar[,7], breaks=breaks, labels = F) dat <- as.data.frame(cbind(X.norm, z)) rel <- as.data.frame(RELIEF.CON(data=dat, 10, -100.0)) out <- rbind(out, rel$weight) } out VI.relief <- apply(out, 2, FUN=mean) VI.relief # PLOT IMPROTANCE TOGETHER # postscript(file="fig5-2.eps", horizontal=F, width = 100.0, height = 95.0) par(bg="white", mfrow = c(2, 1), mar=c(4, 6, 4, 6)) plot(c(0.5,(length(VI)+0.5)), range(VI), type="n", yaxt="s", xaxt="n", ann=T, xlab="predictor", ylab="% Increase in MSE", main="(a) Random Forests") for (j in 1:length(VI)) polygon(rep(c((predictor)[j]-.2, (predictor)[j] +0.2), rep(2,2)), c(0, (VI)[j], (VI)[j], 0), col="lightgray") axis(1, at=predictor, labels = paste("x", predictor, sep=""), tick =F) plot(c(0.5,(length(VI.relief)+0.5)), c(min(VI.relief)-0.02, max(VI.relief)), type="n", yaxt="s", xaxt="n", ann=T, xlab="predictor", ylab="relevance", main="(b) RELIEF") for (j in 1:length(VI.relief)) polygon(rep(c((predictor)[j]-.2, (predictor)[j] +0.2), rep(2,2)), c(0, (VI.relief)[j], (VI.relief)[j], 0), col="lightgray") axis(1, at=predictor, labels = paste("x", predictor, sep=""), tick =F) # dev.off() # ----------------------------- # SOME SIMULATION BASED RESULTS # ----------------------------- rdat <- function(n=100, beta0=-2, beta1=-2, beta2=5){ x1 <- rnorm(n, mean=2) x2 <- rnorm(n, mean=2) x3 <- rnorm(n, mean=2) x4 <- rnorm(n, mean=2) # x5 <- sample(x=1:3, size=n, replace=T) # x6 <- sample(x=1:3, size=n, replace=T) mu <- beta0 + beta1*x1 + beta2*x3 y <- mu + rnorm(n); data.frame(x1, x2, x3, x4, y) } rdat1 <- function(n=100, beta0=2, beta1=0, beta2=0){ x1 <- sample(x=c(0,1), size=n, replace=T) x2 <- sample(x=1:10, size=n, replace=T) x3 <- sample(x=1:50, size=n, replace=T) x4 <- sample(x=1:100, size=n, replace=T) mu <- beta0 + beta1*x1 + beta2*x3 y <- mu + rnorm(n); data.frame(x1, x2, x3, x4, y) } nrun <- 100 out <- NULL for (j in 1:nrun) { dat <- rdat1(n=400) y <- sort(unique(dat$y)) out0 <- NULL for (i in 1:100){ print(paste("===============", i, "================", sep="")) dat0 <- dat breaks=c(min(y)-0.1, sort(sample(y[-c(1, length(y))], size=1, replace=F)), max(y)+0.1) print(breaks) z <- cut(dat$y, breaks=breaks, labels = F) dat0$y <- z rel <- as.data.frame(RELIEF.CON(data=dat0, 20, -100.0)) out0 <- rbind(out0, rel$weight) } VI.relief <- apply(out, 2, FUN=mean) VI.relief out <- rbind(out, VI.relief) } out # --------------------------------------------------------- # AN INTERESTING EMPIRICAL STUDY OF RELIEF AND RANDOMFOREST # --------------------------------------------------------- rdat <- function(n=100, beta0=0, beta1=0, beta2=0){ x1 <- sample(x=c(0,1), size=n, replace=T) x2 <- sample(x=1:10, size=n, replace=T)/10 x3 <- sample(x=1:10, size=n, replace=T)/10 x4 <- sample(x=1:100, size=n, replace=T)/100 eta <- beta0 + beta1*x1 + beta2*x3 p <- exp(eta)/(1+exp(eta)) y <- rbinom(n, 1, prob=0.3)+1; y1 <- eta + rnorm(n) data.frame(x1, x2, x3, x4, y, y1) } nrun <- 200 out.RF <- out.RELIEF <- NULL for (i in 1:nrun){ print(paste("===============", i, "================", sep="")) dat <- rdat(n=1000) #========= # RELIEF # ======== rel <- as.data.frame(relief(data=dat[, -6], nosample=20, threshold=-100.0, vnom=1:2)) # rel <- as.data.frame(RELIEF.CON(data=dat, 40, -100.0)) out.RELIEF <- rbind(out.RELIEF, rel$weight) #=============== # RANDOM FORESTS #=============== fit.rf <-randomForest(factor(dat$y) ~ factor(dat$x1) + factor(dat$x2) + dat$x3 + dat$x4, mtry=2, ntree=2000, keep.forest=F, importance=TRUE) imp <- importance(fit.rf); # print(imp) # varImpPlot(fit.rf) out.RF <- rbind(out.RF, imp[,4]) } # VI.relief <- apply(out.RELIEF, 2, FUN=mean); VI.relief # VI.rf <- apply(out.RF, 2, FUN=mean); VI.rf out.RELIEF; out.RF VI.1000 <- data.frame(input=rep(1:4, rep(nrun, 4)), relief=as.vector(out.RELIEF), RF= as.vector(out.RF)) save(VI.100, VI.1000, file="sim-results-VI.Rdata") load(file="sim-results-VI.Rdata") # postscript(file="fig5-5.eps", horizontal=F, width = 100.0, height = 95.0) par(bg="white", mfrow=c(2, 2), mar=c(4, 4, 4, 4)) boxplot(relief~input, data=VI.100, range = 1.5, width = rep(2,4), varwidth = T, notch = T, outline = F, names=paste("X", 1:4, sep=""), plot = TRUE, col = "orange", pars = list(boxwex = 0.2, staplewex = 0.5, outwex = 0.5), horizontal = FALSE, add = FALSE, at = NULL, main="(a) RELIEF n=100") boxplot(RF~input, data=VI.100, range = 1.5, width = rep(2,4), varwidth = T, notch = T, outline = TRUE, names=paste("X", 1:4, sep=""), plot = TRUE, col = "orange", pars = list(boxwex = 0.2, staplewex = 0.5, outwex = 0.5), horizontal = FALSE, add = FALSE, at = NULL, main="(b) RF n=100") boxplot(relief~input, data=VI.1000, range = 1.5, width = rep(2,4), varwidth = T, notch = T, outline = F, names=paste("X", 1:4, sep=""), plot = TRUE, col = "orange", pars = list(boxwex = 0.2, staplewex = 0.5, outwex = 0.5), horizontal = FALSE, add = FALSE, at = NULL, main="(a) RELIEF n=1000") boxplot(RF~input, data=VI.1000, range = 1.5, width = rep(2,4), varwidth = T, notch = T, outline = TRUE, names=paste("X", 1:4, sep=""), plot = TRUE, col = "orange", pars = list(boxwex = 0.2, staplewex = 0.5, outwex = 0.5), horizontal = FALSE, add = FALSE, at = NULL, main="(b) RF n=1000") dev.off() ####################### RELIEF.CON <- function (data, nosample, threshold) { data <- as.matrix(data) p = dim(data)[2] f = p - 1 acum <- rep(0, f) features <- seq(f) ngroups = length(unique(data[, p])) pesototal = rep(0, f) inst <- length(data[, 1]) priors <- tabulate(data[, p])/inst dh <- rep(0, f) for (j in 1:f) { dh[j] <- diff(range(data[, j])) } for (repet in 1:10) { nearhit <- matrix(0, nosample, f) pesos <- rep(0, f) tempo <- matrix(0, ngroups, f) for (i in 1:nosample) { indices <- sample(inst, 1, replace = TRUE) muestra <- data[indices, ] datatemp <- data[-indices, ] data1 = split.data.frame(datatemp[, 1:f], datatemp[, p]) indg <- muestra[p] nearhit[i, ] <- near1(muestra[-p], data1[[indg]]) for (kk in 1:ngroups) { if (kk != indg) { # print(kk) ##################################### nearmiss <- near1(muestra[-p], data1[[kk]]) tempo[kk, ] <- (muestra[-p] - as.vector(nearmiss)) } for (ii in 1:f) { tempo[kk, ii] <- (1/nosample) * (tempo[kk, ii]/dh[ii])^2 } } pesomiss <- rep(0, f) for (jj in 1:f) { for (kk in 1:ngroups) { if (kk != indg) { pesomiss[jj] <- pesomiss[jj] + priors[kk] * tempo[kk, jj] } } pesomiss[jj] <- pesomiss[jj]/(1 - priors[indg]) } for (j in 1:f) { diff <- -(1/nosample) * ((muestra[j] - nearhit[i, j])/dh[j])^2 + pesomiss[j] pesos[j] <- pesos[j] + diff } } o1 <- order(-pesos) o2 <- pesos[o1] o3 <- o1[o2 > threshold] pesototal = pesototal + pesos acum[o3] = acum[o3] + 1 } pesotota = as.matrix(pesototal) of1 <- order(-pesotota) of2 <- pesotota[of1]/10 acum = as.matrix(acum) tabla = cbind(1:f, acum, pesotota/10) colnames(tabla) = c("feature", "frequency", "weight") # tabla = tabla[order(-tabla[, 3]), ] # cat("Features appearing in at least half of repetitions ordered by their average relevance weight: \n") print(tabla) # print(tabla[tabla[, 2] >= 5, ]) # plot(of2, ylab = "weights") # text(1:f, of2, tabla[, 1], cex = 0.7, pos = 4) relevant1 = which(acum >= 5) relevant2 = which(pesotota/10 > threshold) relevant = c(relevant1, relevant2) relevant = relevant[duplicated(relevant)] # cat("selected features", "\n") relevant = tabla[1:length(relevant), 1] return(tabla) } reliefcat <- function(data, nosample, threshold, vnom) { data <- as.matrix(data) p = dim(data)[2] f = p - 1 acum <- rep(0, f) features <- seq(f) ngroups = length(unique(data[, p])) pesototal = rep(0, f) inst <- length(data[, 1]) priors <- tabulate(data[, p])/inst dh <- rep(0, f) dh[vnom] = 1 for (j in features[-vnom]) { dh[j] <- diff(range(data[, j])) } for (repet in 1:10) { nearhit <- matrix(0, nosample, f) pesos <- rep(0, f) tempo <- matrix(0, ngroups, f) for (i in 1:nosample) { indices <- sample(inst, 1, replace = TRUE) muestra <- data[indices, ] datatemp <- data[-indices, ] data1 = split.data.frame(datatemp[, 1:f], datatemp[, p]) indg <- muestra[p] nearhit[i, ] <- near2(muestra[-p], data1[[indg]], vnom) for (kk in 1:ngroups) { if (kk != indg) { nearmiss <- near2(muestra[-p], data1[[kk]], vnom) tempo[kk, -vnom] <- (muestra[-c(vnom, p)] - as.vector(nearmiss[-vnom])) for (jj in vnom) { if (muestra[jj] != nearmiss[jj]) tempo[kk, jj] = 1 } } for (ii in 1:f) { tempo[kk, ii] <- (tempo[kk, ii]/dh[ii])^2 } } pesomiss <- rep(0, f) for (jj in 1:f) { for (kk in 1:ngroups) { if (kk != indg) { pesomiss[jj] <- pesomiss[jj] + priors[kk] * tempo[kk, jj]/nosample } } pesomiss[jj] <- pesomiss[jj]/(1 - priors[indg]) } for (j in features[-vnom]) { diff = -(1/nosample) * ((muestra[j] - nearhit[i, j])/dh[j])^2 + pesomiss[j] pesos[j] = pesos[j] + diff } for (j in features[vnom]) { if (muestra[j] != nearhit[i, j]) { diff = -(1/nosample) + pesomiss[j] pesos[j] <- pesos[j] + diff } else { pesos[j] = pesos[j] + pesomiss[j] } } } o1 <- order(-pesos) o2 <- pesos[o1] o3 <- o1[o2 > threshold] pesototal = pesototal + pesos acum[o3] = acum[o3] + 1 } pesotota = as.matrix(pesototal) of1 <- order(-pesotota) of2 <- pesotota[of1]/10 acum = as.matrix(acum) tabla = cbind(1:f, acum, pesotota/10) colnames(tabla) = c("feature", "frequency", "weight") # tabla = tabla[order(-tabla[, 3]), ] # cat("Features appearing in at least half of repetitions ordered by their average relevance weight: \n") # print(tabla[tabla[, 2] >= 5, ]) # plot(of2, ylab = "weights") # text(1:f, of2, tabla[, 1], pos = 4) relevant1 = which(acum >= 5) relevant2 = which(pesotota/10 > threshold) relevant = c(relevant1, relevant2) relevant = relevant[duplicated(relevant)] # cat("selected features", "\n") relevant = tabla[1:length(relevant), 1] return(tabla) }