#### watch out use of "..." inside Boot function #### the "<<-" in the parallelized stuff? The logic is: #### a) we could pass the arguments explicitly as part of #### clusterApplyLB but for those objects that means #### repeatedly passing to the same slave node #### (e.g., 200 bootstrap samples, and only 4 nodes) #### a very large object. #### b) using the "<<-" results in important speed savings #### at least in some tests I did. #### c) just in case, right before the "<<-" I delete all objects #### with same name; #### d) the user is unlikely to store objects of his/her own #### creation in the slave nodes; thus, one of the biggest #### problems of "<<-" disappears. #### However, just in case we: #### e.1. Use an unlikely name #### e.2. Verify, when the cluster is initialized, that there is no #### object in any node with that name, and refuse to run unless #### the problem is fixed. ##### in bootSimplifyRF maybe allow allBootRuns = NULL so that object is small. ####usingCluster <- TRUE ####if(usingCluster) { #### library(snow) #### library(Rmpi) #### clusterNumberNodes <- 1 #### typeCluster <- "MPI" #### TheCluster <- makeCluster(clusterNumberNodes, #### type = typeCluster) #### clusterSetupSPRNG(TheCluster, seed = as.numeric(Sys.time())) #### clusterEvalQ(TheCluster, library(randomForest)) ####} require(randomForest) basicClusterInit <- function(clusterNumberNodes = 1, nameCluster = "TheCluster", typeCluster = "MPI") { ## Initialize a cluster ## Note that Rmpi already checks if another cluster is up and running, ## so we do not need to do that. if(!(typeCluster %in% c("MPI", "PVM"))) stop("typeCluster needs to be PVM or MPI") library(snow) if(typeCluster == "MPI") { library(Rmpi) } if(typeCluster == "PVM") { print("Make sure you have rpvm installed. We have only checked with Rmpi. Then do library(rpvm)") } if(length(find(nameCluster))) stop("\nThere is another object called ", nameCluster,".\n", "This could mean that a cluster with that name already exists;\n", " in this case, please use the existing cluster \n", " ---you do not need to initialize the cluster, \n", " just pass its name as the parameter for 'nameCluster'---\n", " or stop that cluster and initialize a new one. \n", "It could also mean that there is\n", " already an object with this name; either remove the object\n", " or use another name for the cluster.\n") assign(nameCluster, makeCluster(clusterNumberNodes, type = typeCluster), env = .GlobalEnv) sprng.seed <- round(2^32 * runif(1)) print(paste("Using as seed for SPRNG", sprng.seed)) clusterSetupSPRNG(eval(parse(text = nameCluster)), seed = sprng.seed) clusterEvalQ(eval(parse(text = nameCluster)), library(randomForest)) clusterEvalQ(eval(parse(text = nameCluster)), library(varSelRF)) } varSelRF <- function(xdata, Class, c.sd = 1, mtryFactor = 1, ntree = 5000, ntreeIterat = 2000, vars.drop.num = NULL, vars.drop.frac = 0.2, whole.range = TRUE, recompute.var.imp = FALSE, verbose = FALSE, returnFirstForest = TRUE, fitted.rf = NULL) { if(!is.factor(Class)) stop("Class should be a factor") if( (is.null(vars.drop.num) & is.null(vars.drop.frac)) | (!is.null(vars.drop.num) & !is.null(vars.drop.frac))) stop("One (and only one) of vars.drop.frac and vars.drop.num must be NULL and the other set") max.num.steps <- dim(xdata)[2] num.subjects <- dim(xdata)[1] if(is.null(colnames(xdata))) colnames(xdata) <- paste("v", 1:dim(xdata)[2], sep ="") ##oversize the vectors; will prune later. n.vars <- vars <- OOB.rf <- OOB.sd <- rep(NA, max.num.steps) oobError <- function(rf) { ## should not be exported in the namespace. ## The out of bag error ooo <- rf$confusion[, -dim(rf$confusion)[2]] s.ooo <- sum(ooo) diag(ooo) <- 0 sum(ooo)/s.ooo } if(!is.null(fitted.rf)) { if(ncol(fitted.rf$importance) < 2) stop("The fitted rf was not fitted with importance = TRUE") n.ntree <- fitted.rf$ntree mtry <- fitted.rf$mtry n.mtryFactor <- mtry/sqrt(ncol(xdata)) if((n.ntree != ntree) | (n.mtryFactor != mtryFactor)) warning("Using as ntree and mtry the parameters obtained from fitted.rf", immediate.= TRUE) ntree <- n.ntree mtryFactor <- n.mtryFactor rm(n.ntree, n.mtryFactor) rf <- fitted.rf } else { mtry <- floor(sqrt(ncol(xdata)) * mtryFactor) rf <- randomForest(x = xdata, y = Class, ntree = ntree, mtry = mtry, importance = TRUE, keep.forest = FALSE) } if(returnFirstForest) FirstForest <- rf else FirstForest <- NULL m.iterated.ob.error <- m.initial.ob.error <- oobError(rf) sd.iterated.ob.error <- sd.initial.ob.error <- sqrt(m.iterated.ob.error * (1 - m.iterated.ob.error) * (1/num.subjects)) if(verbose) { print(paste("Initial OOB error: mean = ", round(m.initial.ob.error, 4), "; sd = ", round(sd.initial.ob.error, 4), sep = "")) } importances <- importance(rf, type = 1, scale = FALSE) selected.vars <- order(importances, decreasing = TRUE) ordered.importances <- importances[selected.vars] initialImportances <- importances initialOrderedImportances <- ordered.importances j <- 1 n.vars[j] <- dim(xdata)[2] vars[j] <- paste(colnames(xdata), collapse = " + ") OOB.rf[j] <- m.iterated.ob.error OOB.sd[j] <- sd.iterated.ob.error var.simplify <- TRUE while(var.simplify) { print("gc inside loop of varSelRF") print(gc()) ## last.rf <- rf; why do we do this? last.rf <- NULL last.vars <- selected.vars previous.m.error <- m.iterated.ob.error previous.sd.error <- sd.iterated.ob.error if(recompute.var.imp & (j > 1)) { importances <- importance(rf, type = 1, scale = FALSE) tmp.order <- order(importances, decreasing = TRUE) selected.vars <- selected.vars[tmp.order] ordered.importances <- importances[tmp.order] } num.vars <- length(selected.vars) if(is.null(vars.drop.num)) vars.drop <- round(num.vars * vars.drop.frac) else vars.drop <- vars.drop.num if(num.vars >= (vars.drop + 2)) { selected.vars <- selected.vars[1: (num.vars - vars.drop)] ordered.importances <- ordered.importances[1: (num.vars - vars.drop)] } else { selected.vars <- selected.vars[1:2] ordered.importances <- ordered.importances[1:2] } ## couldn't we eliminate the following? if((length(selected.vars) < 2) | (any(selected.vars < 1))) { var.simplify <- FALSE break } if(length(selected.vars) <= 2) var.simplify <- FALSE mtry <- floor(sqrt(length(selected.vars)) * mtryFactor) if(mtry > length(selected.vars)) mtry <- length(selected.vars) if(recompute.var.imp) rf <- randomForest(x = xdata[, selected.vars], y = Class, importance= TRUE, ntree = ntree, mtry = mtry, keep.forest = FALSE) else rf <- randomForest(x = xdata[, selected.vars], y = Class, importance= FALSE, ntree = ntreeIterat, mtry = mtry, keep.forest = FALSE) m.iterated.ob.error <- oobError(rf) sd.iterated.ob.error <- sqrt(m.iterated.ob.error * (1 - m.iterated.ob.error) * (1/num.subjects)) if(verbose) { print(paste("..... iteration ", j, "; OOB error: mean = ", round(m.iterated.ob.error, 4), "; sd = ", round(sd.iterated.ob.error, 4), sep = "")) } j <- j + 1 n.vars[j] <- length(selected.vars) vars[j] <- paste(colnames(xdata)[selected.vars], collapse = " + ") OOB.rf[j] <- m.iterated.ob.error OOB.sd[j] <- sd.iterated.ob.error if(!whole.range & ( (m.iterated.ob.error > (m.initial.ob.error + c.sd*sd.initial.ob.error)) | (m.iterated.ob.error > (previous.m.error + c.sd*previous.sd.error))) ) var.simplify <- FALSE } if (!whole.range) { if(!is.null(colnames(xdata))) selected.vars <- sort(colnames(xdata)[last.vars]) else selected.vars <- last.vars out <- list(selec.history = data.frame( Number.Variables = n.vars, Vars.in.Forest = vars, OOB = OOB.rf, sd.OOB = OOB.sd)[1:j,], rf.model = last.rf, selected.vars = selected.vars, selected.model = paste(selected.vars, collapse = " + "), best.model.nvars = length(selected.vars), initialImportances = initialImportances, initialOrderedImportances = initialOrderedImportances, ntree = ntree, mtry = mtry, firstForest = FirstForest) class(out) <- "varSelRF" return(out) } else { ## Prune the too long vectors created at begin. ## not needed above, because we select the 1:j rows ## of the return matrix selec.history. n.vars <- n.vars[1:j] vars <- vars[1:j] OOB.rf<- OOB.rf[1:j] OOB.sd <- OOB.sd[1:j] min.oob.ci <- min(OOB.rf) + c.sd * OOB.sd[which.min(OOB.rf)] best.pos <- which(OOB.rf <= min.oob.ci)[which.min(n.vars[which(OOB.rf <= min.oob.ci)])] selected.vars <- sort(unlist(strsplit(vars[best.pos], " + ", fixed = TRUE))) out <- list(selec.history = data.frame( Number.Variables = n.vars, Vars.in.Forest = vars, OOB = OOB.rf, sd.OOB = OOB.sd), rf.model = NA, selected.vars = selected.vars, selected.model = paste(selected.vars, collapse = " + "), best.model.nvars = n.vars[best.pos], initialImportances = initialImportances, initialOrderedImportances = initialOrderedImportances, ntree = ntree, ntreeIterat = ntreeIterat, mtryFactor = mtryFactor, ##mtry = mtry, firstForest = FirstForest) class(out) <- "varSelRF" return(out) } } print.varSelRF <- function(x, ...) { cat("\nBackwards elimination on random forest; ") cat(paste("ntree = ", x$ntree,"; mtryFactor = ", x$mtryFactor, "\n"), sep ="") cat("\n Selected variables:\n") print(x$selected.vars) cat("\n Number of selected variables:", x$best.model.nvars, "\n\n") } #summary.varSelRF <- print.varSelRF plot.varSelRF <- function(x, nvar = NULL, ...) { if(dev.interactive()) { op <- par(ask = TRUE, las = 1) } else { op <- par(las = 1) } on.exit(par(op)) if(is.null(nvar)) nvar <- min(30, length(x$initialOrderedImportances)) dotchart(rev(x$initialOrderedImportances[1:nvar]), main = "Initial importances", xlab = "Importances (unscaled)") ylim <- c(0, max(0.50, x$selec.history$OOB)) plot(x$selec.history$Number.Variables, x$selec.history$OOB, type = "b", xlab = "Number of variables used", ylab = "OOB error", log = "x", ylim = ylim, ...) ## if(max(x$selec.history$Number.Variables) > 300) ## axis(1, at = c(1, 2, 3, 5, 8, 15, 25, 50, 75, 150, 200, 300), ## labels = c(1, 2, 3, 5, 8, 15, 25, 50, 75, 150, 200, 300)) lines(x$selec.history$Number.Variables, x$selec.history$OOB + 2 * x$selec.history$sd.OOB, lty = 2) lines(x$selec.history$Number.Variables, x$selec.history$OOB - 2 * x$selec.history$sd.OOB, lty = 2) } ## We could also write a varSelRFCV zz: move al TODO varSelRFBoot <- function(xdata, Class, c.sd = 1, mtryFactor = 1, ntree = 5000, ntreeIterat = 2000, vars.drop.frac = 0.2, bootnumber = 200, whole.range = TRUE, recompute.var.imp = FALSE, usingCluster = TRUE, TheCluster = NULL, srf = NULL, ...) { ## beware there is a lot of data copying... pass by reference, or ## minimize copying or something. if(usingCluster & is.null(TheCluster)) { basicClusterInit() warning("Initializing a one-slave MPI cluster called \"TheCluster\"\n", "For more control, pass the name for the cluster, \n", "after initializing with basicClusterInit.\n", "It is much, much better if you explicitly set the parameters\n", "you want for your cluster (name, type, and number of nodes)\n", immediate.= TRUE) } if(usingCluster & !is.null(TheCluster)) { cat("\n Using as cluster ", deparse(substitute(TheCluster))) if(!length(find(TheCluster))) stop("\nOoops, but", TheCluster, "does not yet exist.\n", "Please create it using basicClusterInit.") } if(usingCluster) { test.presence1 <- clusterEvalQ(TheCluster, length(find("xdataTheCluster"))) test.presence2 <- clusterEvalQ(TheCluster, length(find("ClassTheCluster"))) if(any(test.presence1)) stop("At least one slave node has an object called xdataTheCluster") if(any(test.presence2)) stop("At least one slave node has an object called ClassTheCluster") ## clusterExport(TheCluster, c("varSelRF")) } if(is.null(colnames(xdata))) colnames(xdata) <- paste("v", 1:dim(xdata)[2], sep ="") if(!is.null(srf)) { ## we are passing a simplified rf object. if(class(srf) != "varSelRF") stop("srf must be the results of a run of varSelRF") n.ntree <- srf$ntree n.ntreeIterat <- srf$ntreeIterat n.mtryFactor <- srf$mtryFactor if((n.ntree != ntree) | (n.mtryFactor != mtryFactor) | (n.ntreeIterat != ntreeIterat)) warning("Using as ntree and mtryFactor the parameters obtained from srf", immediate.= TRUE) ntree <- n.ntree mtryFactor <- n.mtryFactor rm(n.ntree, n.mtryFactor) all.data.run <- srf } else { ## we are simplifying the random forest all.data.run <- varSelRF(Class = Class, xdata = xdata, c.sd = c.sd, mtryFactor = mtryFactor, ntree = ntree, ntreeIterat = ntreeIterat, vars.drop.frac = vars.drop.frac, whole.range = whole.range, recompute.var.imp = recompute.var.imp) ### ...) } columns.data <- which(colnames(xdata) %in% all.data.run$selected.vars) all.data.rf.mtry <- floor(mtryFactor * sqrt(length(columns.data))) if(all.data.rf.mtry > length(columns.data)) all.data.rf.mtry <- length(columns.data) all.data.rf.predict <- randomForest(y = Class, x = xdata[, columns.data], ntree = all.data.run$ntree, mtry = all.data.rf.mtry, xtest = xdata[, columns.data], ytest = Class, keep.forest = FALSE) full.pred <- all.data.rf.predict$test$predicted all.data.selected.vars <- all.data.run$selected.vars ## all.data.selected.model <- all.data.run$selected.model all.data.best.model.nvars <- all.data.run$best.model.nvars N <- length(Class) solution.sizes <- rep(NA, bootnumber) overlap.with.full <- rep(NA, bootnumber) vars.in.solutions <- vector() ## solutions <- rep(NA, bootnumber) bootTrainTest <- function(dummy, c.sd, mtryFactor, ntree, ntreeIterat, whole.range, recompute.var.imp, ...) { N <- length(ClassTheCluster) sample.again <- TRUE while(sample.again) { bootsample <- unlist(tapply(1:N, ClassTheCluster, function(x) sample(x, size = length(x), replace = TRUE))) ## sure, this isn't the fastest, but will do for now. nobootsample <- setdiff(1:N, bootsample) if(!length(nobootsample)) sample.again <- TRUE else sample.again <- FALSE } ## this is an ugly hack to prevent nobootsamples ## of size 0. train.data <- xdataTheCluster[bootsample, , drop = FALSE] test.data <- xdataTheCluster[nobootsample, , drop = FALSE] train.class <- ClassTheCluster[bootsample] test.class <- ClassTheCluster[nobootsample] boot.run <- varSelRF(Class = train.class, xdata = train.data, c.sd = c.sd, mtryFactor = mtryFactor, ntree = ntree, ntreeIterat = ntreeIterat, whole.range = whole.range, recompute.var.imp = recompute.var.imp, vars.drop.frac = vars.drop.frac) ### ...) output.cl <- list() output.cl$best.model.nvars <- boot.run$best.model.nvars output.cl$selected.model <- boot.run$selected.model output.cl$selected.vars <- boot.run$selected.vars output.cl$nobootsample <- nobootsample output.cl$bootsample <- bootsample output.cl$initialImportances <- boot.run$initialImportances output.cl$initialOrderedImportances <- boot.run$initialOrderedImportances output.cl$selec.history <- boot.run$selec.history boot.col.data <- which(colnames(xdataTheCluster) %in% boot.run$selected.vars) run.test.mtry <- floor(mtryFactor * sqrt(length(boot.col.data))) if(run.test.mtry > length(boot.col.data)) run.test.mtry <- length(boot.col.data) boot.run.test <- randomForest(y = train.class, x = train.data[, boot.col.data, drop = FALSE], ntree = boot.run$ntree, mtry = run.test.mtry, keep.forest = FALSE, xtest = test.data[, boot.col.data, drop = FALSE])$test ## ytest = test.class)$test output.cl$class.pred.array <- boot.run.test$predicted output.cl$prob.pred.array <- boot.run.test$votes rm(boot.run.test) gc() return(output.cl) } ## bootTrainTest; this function is defined to be used in the cluster. ## we use it too if not in the cluster. There is a bit too much ## copying, because we define a pair of new objects ## xdataTheCluster and ClassTheCluster that need not be defined ## if we just copied the code. But that is ugly and hard to follow, ## and the non-cluster is slow for other reasons. if(usingCluster) { print("gc inside varSelRFBoot papply") print(gc()) clusterEvalQ(TheCluster, rm(list = c("xdataTheCluster", "ClassTheCluster"))) xdataTheCluster <<- xdata ClassTheCluster <<- Class clusterExport(TheCluster, c("xdataTheCluster", "ClassTheCluster")) cat("\n Running bootstrap iterations using cluster (can take a while)\n") boot.runs <- clusterApplyLB(TheCluster, 1:bootnumber, bootTrainTest, c.sd = c.sd, mtryFactor = mtryFactor, ntree = ntree, ntreeIterat = ntreeIterat, whole.range = whole.range, recompute.var.imp = recompute.var.imp) ## clean up before leaving clusterEvalQ(TheCluster, rm(list = c("xdataTheCluster", "ClassTheCluster"))) } else { ## Not using Cluster boot.runs <- list() xdataTheCluster <- xdata ClassTheCluster <- Class cat("\n Running bootstrap iterations") for(nboot in 1:bootnumber) { cat(".") boot.runs[[nboot]] <- bootTrainTest(nboot, c.sd = c.sd, mtryFactor = mtryFactor, ntree = ntree, ntreeIterat = ntreeIterat, whole.range = whole.range, recompute.var.imp = recompute.var.imp) } cat("\n") } solutions <- unlist(lapply(boot.runs, function(z) { paste(sort(z$selected.vars), collapse = " + ")})) vars.in.solutions <- unlist(lapply(boot.runs, function(z) z$selected.vars)) solution.sizes <- unlist(lapply(boot.runs, function(z) z$best.model.nvars)) overlap.with.full <- unlist(lapply(boot.runs, function(x) { length(intersect(x$selected.vars, all.data.selected.vars))/ sqrt(all.data.best.model.nvars * x$best.model.nvars)})) prob.pred.array <- array(NA, dim = c(N, nlevels(Class), bootnumber), dimnames = list(1:N, levels(Class), paste("BootstrapReplication.", 1:bootnumber, sep = ""))) ## to store class predictions as data frame class.pred.array <- data.frame(Class) for(nb in 1:bootnumber) { nobootsample <- boot.runs[[nb]]$nobootsample class.pred.array[[nb]] <- factor(NA, levels = levels(Class)) class.pred.array[nobootsample, nb] <- boot.runs[[nb]]$class.pred.array prob.pred.array[nobootsample, , nb] <- boot.runs[[nb]]$prob.pred.array } names(class.pred.array) <- paste("BootstrapReplication.", 1:bootnumber, sep = "") ############## The .632+ estimate of prediction error ################## ## one: \hat{Err}^{(1)} in Efron & Tibshirani, 1997, p. 550, ## or leave-one-out bootstrap error. ## resubst: \bar{err}, the apparent error rate, resubstitution rate, ## or "in sample" error rate. ## full.pred: the "in sample" prediction from full model; only used ## to obtain gamma. ## gamma: gamma in p. 552 ## r: R in p. 552 (bounding in [0, 1]). ## err632: the .632 error ## errprime: the one prime; \hat{Err}^{(1)}' ## err: the .632+ error ## This is how I find one one <- mean(apply(cbind(class.pred.array, Class), 1, function(x) {mean(x[-(bootnumber + 1)] != x[bootnumber + 1], na.rm = TRUE)}), na.rm = TRUE) ## this is equivalent to what Torsten Hothorn does in ipred resubst <- mean(full.pred != Class) ## The following code I take directly from the function ## bootest.factor, by Torsten Hothorn, in package ipred. err632 <- 0.368 * resubst + 0.632 * one gamma <- sum(outer(as.numeric(Class), as.numeric(full.pred), function(x, y) ifelse(x == y, 0, 1)))/(length(Class)^2) r <- (one - resubst)/(gamma - resubst) r <- ifelse(one > resubst & gamma > resubst, r, 0) if((r > 1) | (r < 0)) { ## just debugging; eliminar más adelante print(paste("r outside of 0, 1 bounds: one", one, "resubst", resubst, "gamma", gamma)) if(r > 1) { r <- 1 print("setting r to 1") } else if(r < 0) { r <- 0 print("setting r to 0") } } errprime <- min(one, gamma) err <- err632 + (errprime - resubst) * (0.368 * 0.632 * r)/(1 - 0.368 * r) cat("\n .632+ prediction error ", round(err, 4), "\n") out <- list(number.of.bootsamples = bootnumber, bootstrap.pred.error = err, resubstitution.error = resubst, leave.one.out.bootstrap.error = one, all.data.randomForest = all.data.rf.predict, all.data.vars = all.data.selected.vars, all.data.run = all.data.run, class.predictions = class.pred.array, prob.predictions = prob.pred.array, number.of.vars = solution.sizes, overlap = overlap.with.full, all.vars.in.solutions = vars.in.solutions, all.solutions = solutions, class = Class, allBootRuns = boot.runs) class(out) <- "varSelRFBoot" return(out) } print.varSelRFBoot <- function(x, ...) { cat("\n\n Variable selection with random forest \n") cat(" ------------------------------\n") ## cat("\n\n randomForest summary \n") ## print(object$all.data.randomForest) cat("\n Variables used \n") print(x$all.data.vars) cat("\n \n Number of variables used: ", length(x$all.data.vars), "\n") cat("\n\n Bootstrap results\n") cat(" ------------------\n") cat("\n Bootstrap (.632+) estimate of prediction error: \n", " (using", x$number.of.bootsamples, "bootstrap iterations): \n", x$bootstrap.pred.error) cat("\n\n Number of vars in bootstrapped forests: \n") print(summary(x$number.of.vars)) } ## zz: pass gene names and subject names?? ## look at examples from ~/Proyectos/Signatures/Symposum/boot.pamr.knn.dlda.R ## o similar bajo Gema Moreno ## Or not, because I think I am ussing column names ## but look at that code anyway for improvements #### in summaryBoot: a plot of error rate vs. #### number of variables #### requires saving el "all.data.run" en el Boot, which should ALWAYS be done!! summary.varSelRFBoot <- function(object, return.model.freqs = FALSE, return.class.probs = TRUE, return.var.freqs.b.models = TRUE, ...) { cat("\n\n Variable selection using all data \n") cat(" ------------------------------\n") ### cat("\n\n randomForest summary \n") ### print(object$all.data.randomForest) cat("\n \n variables used \n") print(object$all.data.vars) cat("\n \n Number of variables used: ", length(object$all.data.vars), "\n") cat("\n\n Bootstrap results\n") cat(" ------------------\n") cat("\n\n Bootstrap (.632+) estimate of prediction error:", object$bootstrap.pred.error, " (using", object$number.of.bootsamples, "bootstrap iterations).\n") cat("\n\n Resubstitution error: ", object$resubstitution.error, "\n") cat("\n\n Leave-one-out bootstrap error: ", object$leave.one.out.bootstrap.error, "\n") nk <- as.vector(table(object$class)) cat("\n\n Error rate at random: ", 1 - (max(nk)/sum(nk)), "\n") cat("\n\n Number of vars in bootstrapped forests: \n") print(summary(object$number.of.vars)) cat cat("\n") cat("\n Overlapp of bootstrapped forests with forest from all data\n") print(summary(object$overlap)) if(return.var.freqs.b.models) { cat("\n\n Variable freqs. in bootstrapped models \n") print(sort(table(object$all.vars.in.solutions), decreasing = TRUE)/object$number.of.bootsamples) } in.all.data <- which(names(table(object$all.vars.in.solutions)) %in% object$all.data.vars) cat("\n\n Variable freqs. of variables in forest from all data, and summary \n") print(table(object$all.vars.in.solutions)[in.all.data]/object$number.of.bootsamples) cat("\n") print(summary(table(object$all.vars.in.solutions)[in.all.data]/object$number.of.bootsamples)) if(return.model.freqs) { tmp.table <- sort(table(object$all.solutions), decreasing = TRUE)/object$number.of.bootsamples n.tmp.table <- names(tmp.table) dim(tmp.table) <- c(dim(tmp.table), 1) rownames(tmp.table) <- n.tmp.table colnames(tmp.table) <- "Freq." cat("\n\n Solutions frequencies in bootstrapped models \n") print(tmp.table) } if(return.class.probs) cat("\n\n Mean class membership probabilities from out of bag samples\n") if(return.class.probs) { mean.class.probs <- apply(object$prob.predictions, c(1, 2), function(x) mean(x, na.rm = TRUE)) colnames(mean.class.probs) <- levels(object$class) } if(return.class.probs) return(mean.class.probs) } plot.varSelRFBoot <- function(x, oobProb = TRUE, oobProbBoxPlot = FALSE, ErrorNum = TRUE, subject.names = NULL, class.to.plot = NULL, ...) { if(oobProb | oobProbBoxPlot) { mean.class.probs <- apply(x$prob.predictions, c(1, 2), function(x) mean(x, na.rm = TRUE)) colnames(mean.class.probs) <- levels(x$class) rainbow.col <- rainbow(nlevels(x$class)) if(dev.interactive()) { op <- par(ask = TRUE, las = 1) } else { op <- par(las = 1) } on.exit(par(op)) #### for(i in 1:ncol(mean.class.probs)) { if(is.null(class.to.plot)) class.to.plot <- 1:ncol(mean.class.probs) for(i in class.to.plot) { if(oobProbBoxPlot) { boxplot(data.frame(t(x$prob.predictions[ , i, ])), xlab = "Samples", ylab = "Out of bag probability of membership", main = paste("Class", colnames(mean.class.probs)[i]), type = "p", col = rainbow.col[as.numeric(x$class)], pch = 19, ylim = c(0, 1.3), axes = FALSE) } else { ### dotchart(mean.class.probs[, i], ### xlab = "Samples", ### ylab = "(Average) Out of bag probability of membership", ### main = paste("Class", colnames(mean.class.probs)[i]), ### type = "p", ### col = rainbow.col[as.numeric(x$class)], ### pch = 19, ylim = c(0, 1.2), ### axes = FALSE) plot(mean.class.probs[, i], xlab = "Samples", ylab = "(Average) Out of bag probability of membership", main = paste("Class", colnames(mean.class.probs)[i]), type = "p", col = rainbow.col[as.numeric(x$class)], pch = 19, ylim = c(0, 1.3), axes = FALSE) } if(!is.null(subject.names)) text(mean.class.probs[ , i], labels = subject.names, pos = 2, cex = 0.8) box() ##axis(1) par(las = 2) axis(1, at = seq(1:dim(mean.class.probs)[1]), labels = seq(1:dim(mean.class.probs)[1])) segments(x0 = seq(1:dim(mean.class.probs)[1]), y0 = 0, x1 = seq(1:dim(mean.class.probs)[1]), y1 = 1, lty = 2, col = "grey") axis(2, at = seq(from = 0, to = 1, by = 0.2)) legend(y = c(1.29, 1.15), x = c(1, dim(mean.class.probs)[1]), legend = paste("Class", levels(x$class)), col = rainbow.col, pch = 19, bty = "n" ) abline( h = 1.05, lty = 1) abline(h = seq(from = 0, to = 1, length = 1 + nlevels(x$class))[-c(1, nlevels(x$class) + 1)], lty = 2) } } if(ErrorNum) { par(las = 2) all.data.errors <- x$all.data.run$selec.history$OOB ngenes <- x$all.data.run$selec.history$Number.Variables maxplot <- max( c(unlist(lapply(x$allBootRuns, function(x) max(x$selec.history$OOB))), all.data.errors)) minplot <- min( c(unlist(lapply(x$allBootRuns, function(x) min(x$selec.history$OOB))), all.data.errors)) minplot <- minplot * (1 - 0.1) maxplot <- maxplot * (1 + 0.2) plot(ngenes, all.data.errors, type = "l", axes = TRUE, xlab = "Number of variables", ylab = "OOB Error rate", ylim = c(minplot, maxplot), lty = 1, log = "x", col = "red", lwd = 2, main = "OOB Error rate vs. Number of variables in predictor", xlim = c(2, max(ngenes)*1.1)) ### plot(num.points.plot:1, ### x$allBootRuns[[1]]$other$trained.pam.cv$error, ### type = "l", axes = FALSE, xlab = "Number of genes", ### ylab = "CV Error rate", ylim = c(minplot, maxplot), lty = 2, ### main = "CV Error rate vs. Number of genes in predictor.") legend(x = 10, y = maxplot, legend = c("Bootstrap samples", "Original sample"), lty = c(2, 1), lwd = c(1, 3), col = c("Black", "Red"), bty = "n") ## box() ## axis(2) ## axis(1) if(max(ngenes) > 300) axis(1, at = c(2, 3, 5, 8, 15, 20, 25, 35, 50, 75, 150, 200, 300), labels = c(2, 3, 5, 8, 15, 10, 25, 35, 50, 75, 150, 200, 300)) for(nb in 1:x$number.of.bootsamples) lines(x$allBootRuns[[nb]]$selec.history$Number.Variables, x$allBootRuns[[nb]]$selec.history$OOB, lty = 2) lines(ngenes, all.data.errors, col = "red", lwd = 4) } } randomVarImpsRF <- function(xdata, Class, forest, numrandom = 100, whichImp = "impsUnscaled", usingCluster = TRUE, TheCluster = NULL, ...) { if(!all(whichImp %in% c("impsScaled", "impsUnscaled", "impsGini"))) stop("whichImp contains a non-valid option; should be one or more \n", "of impsScaled, impsUnscaled, impsGini") if(usingCluster & is.null(TheCluster)) { basicClusterInit() warning("Initializing a one-slave MPI cluster called \"TheCluster\"\n", "For more control, pass the name for the cluster, \n", "after initializing with basicClusterInit.\n", "It is much, much better if you explicitly set the parameters\n", "you want for your cluster (name, type, and number of nodes)\n", immediate. = TRUE) } if(usingCluster & !is.null(TheCluster)) { print(paste("Using as cluster ", deparse(substitute(TheCluster)))) if(!length(find(TheCluster))) stop("\nOoops, but", TheCluster, "does not yet exist.\n", "Please create it using basicClusterInit.") } if(usingCluster) { test.presence1 <- clusterEvalQ(TheCluster, length(find("xdataTheCluster"))) test.presence2 <- clusterEvalQ(TheCluster, length(find("ClassTheCluster"))) if(any(test.presence1)) stop("At least one slave node has an object called xdataTheCluster") if(any(test.presence2)) stop("At least one slave node has an object called ClassTheCluster") ### clusterExport(TheCluster, ) } ontree <- forest$ntree omtry <- forest$mtry nodesize <- 1 if(usingCluster) { iRF.cluster <- function(dummy, ontree, omtry, nodesize, ...) { rf <- randomForest(x = xdataTheCluster, y = sample(ClassTheCluster), ntree = ontree, mtry = omtry, nodesize = nodesize, importance = TRUE, keep.forest = FALSE, ...) ## If we specify the importance measure, only that var is evaluated. ## if we say "ALL", all three. impsScaled <- NULL impsUnscaled <- NULL impsGini <- NULL if("impsUnscaled" %in% whichImp) impsUnscaled <- importance(rf, type = 1, scale = FALSE) if("impsScaled" %in% whichImp) impsScaled <- importance(rf, type = 1, scale = TRUE) if("impsGini" %in% whichImp) impsGini <- rf$importance[, ncol(rf$importance)] return(list(impsScaled, impsUnscaled, impsGini)) } clusterEvalQ(TheCluster, rm(list = c("xdataTheCluster", "ClassTheCluster"))) xdataTheCluster <<- xdata ClassTheCluster <<- Class clusterExport(TheCluster, c("xdataTheCluster", "ClassTheCluster")) cat("\n Obtaining random importances using cluster (might take a while)\n") outCl <- clusterApplyLB(TheCluster, 1:numrandom, iRF.cluster, ontree = ontree, omtry = omtry, nodesize = nodesize) clusterEvalQ(TheCluster, rm(list = c("xdataTheCluster", "ClassTheCluster"))) } else { outCl <- list() cat("\n Obtaining random importances ") for(nriter in 1:numrandom) { cat(".") rf <- randomForest(x = xdata, y = sample(Class), ntree = ontree, mtry = omtry, nodesize = nodesize, importance = TRUE, keep.forest = FALSE, ...) impsScaled <- NULL impsUnscaled <- NULL impsGini <- NULL if("impsUnscaled" %in% whichImp) impsUnscaled <- importance(rf, type = 1, scale = FALSE) if("impsScaled" %in% whichImp) impsScaled <- importance(rf, type = 1, scale = TRUE) if("impsGini" %in% whichImp) impsGini <- rf$importance[, ncol(rf$importance)] outCl[[nriter]] <- list(impsScaled, impsUnscaled, impsGini) } cat("\n") } ## randomVarImps <- list() if("impsScaled" %in% whichImp) { randomVarImps$impsScaled <- matrix(unlist(lapply(outCl, function(x) x[[1]])), ncol = length(outCl)) colnames(randomVarImps$impsScaled) <- 1:numrandom rownames(randomVarImps$impsScaled) <- rownames(forest$importance) } if("impsUnscaled" %in% whichImp){ randomVarImps$impsUnscaled <- matrix(unlist(lapply(outCl, function(x) x[[2]])), ncol = length(outCl)) colnames(randomVarImps$impsUnscaled) <- 1:numrandom rownames(randomVarImps$impsUnscaled) <- rownames(forest$importance) } if("impsGini" %in% whichImp) { randomVarImps$impsGini <- matrix(unlist(lapply(outCl, function(x) x[[3]])), ncol = length(outCl)) colnames(randomVarImps$impsGini) <- 1:numrandom rownames(randomVarImps$impsGini) <- rownames(forest$importance) } class(randomVarImps) <- c(class(randomVarImps), "randomVarImpsRF") return(randomVarImps) ## Hopefully, we are returning a list with only the components selected.zz } randomVarImpsRFplot <- function(randomImportances, forest, whichImp = "impsUnscaled", nvars = NULL, show.var.names = FALSE, vars.highlight = NULL, main = NULL, screeRandom = TRUE, lwdBlack = 1.5, lwdRed = 2, lwdLightblue = 1, cexPoint = 1, overlayTrue = FALSE, xlab = NULL, ylab = NULL, ...) { if(ncol(forest$importance) < 2) stop("The fitted rf", deparse(substitute(forest)), "was not fitted with importance = TRUE") randomImportances <- switch(whichImp, "impsUnscaled" = randomImportances$impsUnscaled, "impsScaled" = randomImportances$impsScaled, "impsGini" = randomImportances$impsGini, ) originalForestImportance <- switch(whichImp, "impsUnscaled" = importance(forest, type = 1, scale = FALSE), "impsScaled" = importance(forest, type = 1, scale = TRUE), "impsGini" = forest$importance[, ncol(forest$importance)] ) if(is.null(originalForestImportance)) stop("\n Not valid 'whichImp' \n") if(is.null(xlab)) xlab <- "(Ordered) Variable" if(is.null(ylab)) ylab <- switch(whichImp, "impsUnscaled" = "Importance (unscaled)", "impsScaled" = "Importance (scaled)", "impsGini" = "Importance (Gini)", ) nvars <- min(nvars, dim(randomImportances)[1]) ylim <- range(originalForestImportance, randomImportances) plottingOrder <- order(originalForestImportance, decreasing = TRUE)[1:nvars] orderedOriginalImps <- originalForestImportance[plottingOrder] plot(orderedOriginalImps, type = "n", axes = FALSE, xlab = xlab, ylab = ylab, main = main, ylim = ylim, ...) abline(h = 0, lty = 2, col = "blue") axis(2) box() if(show.var.names) { axis(1, labels = names(orderedOriginalImps), at = 1:length(orderedOriginalImps)) } else { axis(1) } if(!overlayTrue) ###points(x = 1:nvars, orderedOriginalImps, lwd = lwdBlack, ### col = "black", type = "b", cex = cexPoint) lines(x = 1:nvars, orderedOriginalImps, lwd = lwdBlack, col = "black", type = "b", cex = cexPoint) if(!is.null(vars.highlight)) { if(length(vars.highlight) > nvars) { warning("Not all vars. to highlight will be shown; increase nvars\n") cat("\n Not all vars. to highlight will be shown; increase nvars\n") } pos.selected <- which(names(orderedOriginalImps) %in% vars.highlight) if(!length(pos.selected)){ warning("No selected vars. among those to show\n") cat("\nNo selected vars. among those to show\n") } else segments(pos.selected, 0, pos.selected, orderedOriginalImps[pos.selected], col = "blue", lwd = 2) if(length(pos.selected) < length(vars.highlight)) { warning("Not shown ", length(vars.highlight) - length(pos.selected), " of the 'to-highlight' variables\n") cat("Not shown ", length(vars.highlight) - length(pos.selected), " of the 'to-highlight' variables\n") } } column.of.mean <- dim(randomImportances)[2] + 1 if(screeRandom) { randomImportances <- apply(randomImportances, 2, sort, decreasing = TRUE) randomImportances <- randomImportances[1:nvars, ] randomImportances <- cbind(randomImportances, apply(randomImportances, 1, mean)) } else { randomImportances <- randomImportances[plottingOrder, ] randomImportances <- cbind(randomImportances, apply(randomImportances, 1, mean)) } matlines(randomImportances[, -column.of.mean], col = "lightblue", lwd = lwdLightblue, lty = 1) lines(x = 1:nvars, y = randomImportances[, column.of.mean], lwd = lwdRed, col = "red") if(overlayTrue) { points(x = 1:nvars, orderedOriginalImps, lwd = lwdBlack, col = "black", type = "b", cex = cexPoint) lines(x = 1:nvars, orderedOriginalImps, lwd = lwdBlack, col = "black", type = "l") } } varSelImpSpecRF <- function(forest, xdata = NULL, Class = NULL, randomImps = NULL, threshold = 0.10, numrandom = 20, whichImp = "impsUnscaled", usingCluster = TRUE, TheCluster = NULL, ...) { if((is.null(xdata) | is.null(Class)) & is.null(randomImps)) stop("You must specify a randomVarImpsRF object OR", "valid covariates and class objects.\n") if((!is.null(xdata) & !is.null(Class)) & !is.null(randomImps)) warning("Using only the randomVarImpsRF object.", "Covariates and class objects discarded.\n", immediate. = TRUE) if(length(whichImp) > 1) stop("You can only use one importance measure") originalImps <- switch(whichImp, "impsUnscaled" = importance(forest, type = 1, scale = FALSE), "impsScaled" = importance(forest, type = 1, scale = TRUE), "impsGini" = forest$importance[, ncol(forest$importance)] ) if(is.null(originalImps)) stop("\n Not valid 'whichImp' \n") originalImpsOrder <- order(originalImps, decreasing = TRUE) if(is.null(randomImps)) { randomImps <- switch(whichImp, "impsUnscaled" = randomVarImpsRF(xdata, Class, forest, numrandom = numrandom, whichImp = "impsUnscaled", usingCluster = usingCluster, TheCluster = TheCluster, ...)$impsUnscaled, "impsUnscaled" = randomVarImpsRF(xdata, Class, forest, numrandom = numrandom, whichImp = "impsScaled", usingCluster = usingCluster, TheCluster = TheCluster, ...)$impsScaled, "impsUnscaled" = randomVarImpsRF(xdata, Class, forest, numrandom = numrandom, whichImp = "impsGini", usingCluster = usingCluster, TheCluster = TheCluster, ...)$impsGini, ) } else { elemento <- match(whichImp, names(randomImps)) if(is.na(elemento)) stop("The requested importance was not calculated\n", "for the randomImps", deparse(substitute(randomImps)),"\n", "object.\n") cat("\n Using the randomVarImpsRF", deparse(substitute(randomImps)), "object. xdata, Class, numrandom ignored.\n") randomImps <- randomImps[[elemento]] } randomImps <- apply(randomImps, 2, sort, decreasing = TRUE) thresholds <- apply(randomImps, 1, function(x) quantile(x, probs = 1 - threshold, type = 8)) largest.value <- which(originalImps[originalImpsOrder] <= thresholds)[1] if(is.na(largest.value)) { selected.vars <- originalImpsOrder warning("All variables selected; could signal a problem") } else if(largest.value == 1) { selected.vars <- NA warning("No variables selected; could signal a problem") } else selected.vars <- originalImpsOrder[1:(largest.value - 1)] return(selected.vars) } selProbPlot <- function(object, k = c(20, 100), color = TRUE, legend = FALSE, xlegend = 68, ylegend = 0.93, cexlegend = 1.4, main = NULL, xlab = "Rank of gene", ylab = "Selection probability", pch = 19, ...) { ## selection probability plots, such as in Pepe ## et al. 2003 (ROC paper). if(class(object) != "varSelRFBoot") stop("This function only works with objects created\n", "with the varSelRFBoot function.\n") nboot <- object$number.of.bootsamples if(nboot < 100) warning("You only used ", nboot, " bootstrap samples. Might be too few.", immediate. = TRUE) original.imps <- object$all.data.run$initialImportances original.ranks <- rank(-original.imps) boot.ranks <- lapply(object$allBootRuns, function(x) {rank(-x$initialImportances)}) boot.ranks <- matrix(unlist(boot.ranks), ncol = nboot) k1 <- apply(boot.ranks, 1, function(z) {sum(z <= k[1])/nboot}) k2 <- apply(boot.ranks, 1, function(z) {sum(z <= k[2])/nboot}) if(color) { plot(original.ranks[original.ranks <= k[2]], k2[original.ranks <= k[2]], xlim = c(1, k[2]), col = "red", pch = pch, xlab = xlab, ylab = ylab, main = main, ylim = c(0, 1), ...) points(original.ranks[original.ranks <= k[1]], k1[original.ranks <= k[1]], col = "blue", pch = pch) if(legend) legend(x = xlegend, y = ylegend, legend = c(paste("Top", k[2]), paste("Top", k[1])), col = c("red", "blue"), pch = pch, cex = cexlegend) } else { plot(original.ranks[original.ranks <= k[2]], k2[original.ranks <= k[2]], xlim = c(1, k[2]), pch = pch, xlab = "Rank of gene", ylab = "Selection probability", main = main, ylim = c(0, 1), ...) points(original.ranks[original.ranks <= k[1]], k1[original.ranks <= k[1]], pch = 21) if(legend) legend(x = xlegend, y = ylegend, legend = c(paste("Top", k[2]), paste("Top", k[1])), pch = c(19, 21), cex = (cexlegend + 0.2)) } } ################################################################### ################################################################### ########## ############## ########## Custom code used in paper ############## ########## (unlikely to be of interest ############## ########## for anybody else) ############## ########## ############## ################################################################### ################################################################### figureSummary.varSelRFBoot <- function(object) { ## to create data for figures of paper with randomdata errorrate <- object$bootstrap.pred.error nvused <- length(object$all.data.vars) return(c(errorrate, nvused)) } figureSummary2.varSelRFBoot <- function(object) { ## to create data for figures of paper with randomdata errorrate <- object$bootstrap.pred.error nvused <- length(object$all.data.vars) median.nvars <- median (object$number.of.vars) iq1.nvars <- quantile(object$number.of.vars, p = 0.25) iq3.nvars <- quantile(object$number.of.vars, p = 0.75) string1 <- paste(formatC(round(nvused, 0), width = 6)," (", round(iq1.nvars, 0),", ", round(median.nvars, 0), ", ", round(iq3.nvars, 0),"), ", formatC(round(errorrate, 3), width = 4)," \\\\ ", sep = "") string1 } tableSummary.varSelRFBoot <- function(object, name){ ### to create data ready for the LaTeX tables of the paper neatname <- switch(name, "gl.boot" = "Leukemia ", "vv.boot" = "Breast 2 cl.", "vv3.boot" ="Breast 3 cl.", "nci.boot" ="NCI 60 ", "ra.boot" = "Adenocar. ", "brain.boot" = "Brain ", "colon.boot" = "Colon ", "lymphoma.boot" = "Lymphoma ", "prostate.boot" = "Prostate ", "srbct.boot" = "Srbct ") errorrate <- object$bootstrap.pred.error nvused <- length(object$all.data.vars) median.nvars <- median (object$number.of.vars) iq1.nvars <- quantile(object$number.of.vars, p = 0.25) iq3.nvars <- quantile(object$number.of.vars, p = 0.75) in.all.data <- which(names(table(object$all.vars.in.solutions)) %in% object$all.data.vars) tmp1 <- table(object$all.vars.in.solutions)[in.all.data]/object$number.of.bootsamples median.freq <- median(tmp1) if(nvused == 2) { iq1.freq <- min(tmp1) iq3.freq <- max(tmp1) } else { iq1.freq <- quantile(tmp1, p = 0.25) iq3.freq <- quantile(tmp1, p = 0.75) } if(nvused > 2) string1 <- paste(neatname, "& ", formatC(round(errorrate, 3), width = 4)," & ", formatC(round(nvused, 0), width = 4)," & ", formatC(round(median.nvars, 0), width = 4)," (", round(iq1.nvars, 0),", ", round(iq3.nvars, 0),") & ", formatC(round(median.freq, 2), width = 4)," (", round(iq1.freq, 2),", ", round(iq3.freq, 2),")\\\\", sep = "") else string1 <- paste(neatname, "& ", formatC(round(errorrate, 3), width = 4)," & ", formatC(round(nvused, 0), width = 4)," & ", formatC(round(median.nvars, 0), width = 4)," (", round(iq1.nvars, 0),", ", round(iq3.nvars, 0),") & ", formatC(round(median.freq, 2), width = 4)," (", round(iq1.freq, 2),", ", round(iq3.freq, 2),")\\footnotemark[1]\\\\", sep = "") cat(string1, "\n") } ################################################################### ################################################################### ########## ############## ########## Miscellaneous code ############## ########## ############## ################################################################### ################################################################### rVI <- function(xdata, ydata, forest, numrandom = 40, whichImp = "impsUnscaled", ...) { ## from randomVarImpsRF, but simplified to use only ## one tytpe of importance ontree <- forest$ntree omtry <- forest$mtry if(usingCluster) { iRF.cluster <- function(dummy, ontree, omtry, ...) { rf <- randomForest(x = xdataTheCluster, y = sample(ClassTheCluster), ntree = ontree, mtry = omtry, importance = TRUE, keep.forest = FALSE, ...) imps <- switch(whichImp, "impsUnscaled" = importance(rf, type = 1, scale = FALSE), "impsScaled" = importance(rf, type = 1, scale = TRUE), "impsGini" = rf$importance[, ncol(rf$importance)] ) if(is.null(imps)) stop("\n Not valid 'whichImp' \n") return(imps) } clusterEvalQ(TheCluster, rm(list = c("xdataTheCluster", "ClassTheCluster"))) xdataTheCluster <<- xdata ClassTheCluster <<- ydata clusterExport(TheCluster, c("xdataTheCluster", "ClassTheCluster")) outCl <- clusterApplyLB(TheCluster, 1:numrandom, iRF.cluster, ontree = ontree, omtry = omtry) } else { outCl <- list() for(nriter in 1:numrandom) { rf <- randomForest(x = xdata, y = sample(ydata), ntree = ontree, mtry = omtry, importance = TRUE, keep.forest = FALSE, ...) imps <- switch(whichImp, "impsUnscaled" = importance(rf, type = 1, scale = FALSE), "impsScaled" = importance(rf, type = 1, scale = TRUE), "impsGini" = rf$importance[, ncol(rf$importance)] ) if(is.null(imps)) stop("\n Not valid 'whichImp' \n") outCl[[nriter]] <- imps } } randomVarImps <- matrix(unlist(outCl), ncol = numrandom) rownames(randomVarImps) <- rownames(forest$importance) colnames(randomVarImps) <- 1:numrandom class(randomVarImps) <- c(class(randomVarImps), "rVI") return(randomVarImps) } ##### old stuff ####nr <- 10; nc <- 20; x <- matrix(rnorm(2* nr*nc), ncol = nc) ####colnames(x) <- paste("v", 1:nc, sep ="") ####Class <- factor(c(rep("A", nr), rep("B", nr))) ####xdata <- x ####library(randomForest) ####rf1 <- randomForest(x = x, y = Class, importance = TRUE, #### keep.forest = FALSE, ntree = 2000) ####usingCluster <- TRUE ####if(usingCluster) { #### library(snow) #### library(Rmpi) #### clusterNumberNodes <- 4 #### typeCluster <- "MPI" #### TheCluster <- makeCluster(clusterNumberNodes, #### type = typeCluster) #### clusterSetupSPRNG(TheCluster) #### clusterEvalQ(TheCluster, library(randomForest)) ####} ####screePlotRF <- function(forest, randomImportances, #### nvars = 50, #### show.var.names = FALSE, #### vars.highlight = NULL, #### main = NULL, scale = TRUE) { #### nvars <- min(nvars, dim(randomImportances)[1]) #### if(scale) { #### original.imps <- importance(forest, type = 1, scale = TRUE) #### } else { #### original.imps <- importance(forest, type = 1, scale = FALSE) #### } #### ordered.imps <- sort(original.imps, decreasing = TRUE)[1:nvars] #### plot(ordered.imps, type = "b", axes = FALSE, lwd = 1.5, #### xlab = "Variable", ylab = "Importance", #### main = main) #### abline(h = 0, lty = 2, col = "blue") #### axis(2) #### box() #### if(show.var.names) { #### axis(1, labels = names(ordered.imps), #### at = 1:length(ordered.imps)) #### } else { #### axis(1) #### } #### if(!is.null(vars.highlight)) { #### if(length(vars.highlight) > nvars) { #### warning("Not all vars. to highlight will be shown; increase nvars\n") #### cat("\n Not all vars. to highlight will be shown; increase nvars\n") #### } #### pos.selected <- which(names(ordered.imps) %in% vars.highlight) #### if(!length(pos.selected)){ #### warning("No selected vars. among those to show\n") #### cat("\nNo selected vars. among those to show\n") #### } #### else segments(pos.selected, 0, #### pos.selected, ordered.imps[pos.selected], #### col = "blue", lwd = 2) #### if(length(pos.selected) < length(vars.highlight)) { #### warning("Not shown ", length(vars.highlight) - length(pos.selected), #### " of the 'to-highlight' variables\n") #### cat("Not shown ", length(vars.highlight) - length(pos.selected), #### " of the 'to-highlight' variables\n") #### } #### } #### randomImportances <- randomImportances[1:nvars, ] #### column.mean <- dim(randomImportances)[2] #### matlines(randomImportances[, -column.mean], #### col = "green", lty = 1) #### lines(x = 1:nvars, #### y = randomImportances[, column.mean], #### lwd = 1.3, col = "red") ####} ####screePlotRF(rf1, ii1, nvars = 20, vars.highligh = paste("v", c(8, 4), sep =""), show.var.names = TRUE) ####i1 <- randomVarImpsRF(xdata, Class, rf1, numrandom = 3) ####screePlotRF(rf1, i1) ### This version allows optimizing mtry and ntree; leave code here, ### but don't use. ExpedrimentalvarSelRF <- function(xdata, Class, vars.drop.num = NULL, vars.drop.frac = 0.5, c.sd = 1, whole.range = TRUE, recompute.var.imp = FALSE, verbose = FALSE, returnFirstForest = TRUE, ## next are all for tune2RF ## but this is a mess; should pass them as ## control. tuneMtry = FALSE, tuneNtree = FALSE, startNtree = 1000, startMtryFactor = 1, stepFactorMtry = 1.25, stepFactorNtree = 1.75, minCorNtree = 0.975, quantNtree = 0.025, returnForest = TRUE, ntreeTry = 2000) { if( (is.null(vars.drop.num) & is.null(vars.drop.frac)) | (!is.null(vars.drop.num) & !is.null(vars.drop.frac))) stop("One (and only one) of vars.drop.frac and vars.drop.num must be NULL and the other set") max.num.steps <- dim(xdata)[2] num.subjects <- dim(xdata)[1] if(is.null(colnames(xdata))) colnames(xdata) <- paste("v", 1:dim(xdata)[2], sep ="") ##oversize the vectors; will prune later. n.vars <- vars <- OOB.rf <- OOB.sd <- rep(NA, max.num.steps) ## First get optimal values for mtry and ntree, and get first run: rf <- tune2RF(x = xdata, y = Class, tuneMtry = tuneMtry, tuneNtree = tuneNtree, startNtree = startNtree, mtryFactor = mtryFactor, stepFactorMtry = stepFactorMtry, stepFactorNtree = stepFactorNtree, minCorNtree = minCorNtree, quantNtree = quantNtree, returnForest = TRUE, ntreeTry = ntreeTry, verbose = verbose) ## We'll need it for future runs ntree <- rf$ntree mtry <- rf$mtry if(returnFirstForest) FirstForest <- rf else FirstForest <- NULL # rf <- randomForest(x = xdata, y = Class, importance= TRUE, # ntree = ntree, keep.forest = FALSE) m.iterated.ob.error <- m.initial.ob.error <- oobError(rf) sd.iterated.ob.error <- sd.initial.ob.error <- sqrt(m.iterated.ob.error * (1 - m.iterated.ob.error) * (1/num.subjects)) if(verbose) { print(paste("Initial OOB error: mean = ", round(m.initial.ob.error, 4), "; sd = ", round(sd.initial.ob.error, 4), sep = "")) } ### previous code (before rF < 4.3.1) #selected.vars <- order(rf$importance[, (ncol(rf$importance) - 1)], decreasing = TRUE) # ordered.importance <- rf$importance[selected.vars,(ncol(rf$importance) -1)] importances <- importance(rf, type = 1, scale = FALSE) selected.vars <- order(importances, decreasing = TRUE) ordered.importances <- importances[selected.vars] initialImportances <- importances initialOrderedImportances <- ordered.importances j <- 1 n.vars[j] <- dim(xdata)[2] vars[j] <- paste(colnames(xdata), collapse = " + ") OOB.rf[j] <- m.iterated.ob.error OOB.sd[j] <- sd.iterated.ob.error var.simplify <- TRUE while(var.simplify) { last.rf <- rf last.vars <- selected.vars # print(paste(".........Number of variables before selection", # dim(xdata)[2])) ## debug previous.m.error <- m.iterated.ob.error previous.sd.error <- sd.iterated.ob.error if(recompute.var.imp & (j > 1)) { ## need to set indexes as absolute w.r.t. original data #### tmp.order <- order(rf$importance[, (ncol(rf$importance) - 1)], #### decreasing = TRUE) #### selected.vars <- selected.vars[tmp.order] #### ordered.importance <- rf$importance[tmp.order, (ncol(rf$importance) -1)] importances <- importance(rf, type = 1, scale = FALSE) tmp.order <- order(importances, decreasing = TRUE) selected.vars <- selected.vars[tmp.order] ordered.importances <- importances[tmp.order] } num.vars <- length(selected.vars) #### if(any(is.na(ordered.importances))) { #### print("********** Nas in ordered.importances ******") #### browser() #### } if(any(ordered.importances < 0)) { selected.vars <- selected.vars[-which(ordered.importances < 0)] ordered.importances <- ordered.importances[-which(ordered.importances < 0)] } else { if(is.null(vars.drop.num)) vars.drop <- round(num.vars * vars.drop.frac) else vars.drop <- vars.drop.num if(num.vars >= (vars.drop + 2)) { selected.vars <- selected.vars[1: (num.vars - vars.drop)] ordered.importances <- ordered.importances[1: (num.vars - vars.drop)] } else { selected.vars <- selected.vars[1:2] ordered.importances <- ordered.importances[1:2] } } ## couldn't we eliminate the following? if((length(selected.vars) < 2) | (any(selected.vars < 1))) { var.simplify <- FALSE break } if(length(selected.vars) <= 2) var.simplify <- FALSE if(recompute.var.imp) rf <- randomForest(x = xdata[, selected.vars], y = Class, importance= TRUE, ntree = ntree, mtry = mtry, keep.forest = FALSE) else rf <- randomForest(x = xdata[, selected.vars], y = Class, importance= FALSE, ntree = ntree, mtry = mtry, keep.forest = FALSE) m.iterated.ob.error <- oobError(rf) sd.iterated.ob.error <- sqrt(m.iterated.ob.error * (1 - m.iterated.ob.error) * (1/num.subjects)) if(verbose) { print(paste("..... iteration ", j, "; OOB error: mean = ", round(m.iterated.ob.error, 4), "; sd = ", round(sd.iterated.ob.error, 4), sep = "")) } j <- j + 1 n.vars[j] <- length(selected.vars) vars[j] <- paste(colnames(xdata)[selected.vars], collapse = " + ") OOB.rf[j] <- m.iterated.ob.error OOB.sd[j] <- sd.iterated.ob.error if(!whole.range & ( (m.iterated.ob.error > (m.initial.ob.error + c.sd*sd.initial.ob.error)) | (m.iterated.ob.error > (previous.m.error + c.sd*previous.sd.error))) ) var.simplify <- FALSE } if (!whole.range) { if(!is.null(colnames(xdata))) selected.vars <- sort(colnames(xdata)[last.vars]) else selected.vars <- last.vars out <- list(selec.history = data.frame( Number.Variables = n.vars, Vars.in.Forest = vars, OOB = OOB.rf, sd.OOB = OOB.sd)[1:j,], rf.model = last.rf, selected.vars = selected.vars, selected.model = paste(selected.vars, collapse = " + "), best.model.nvars = length(selected.vars), initialImportances = initialImportances, initialOrderedImportances = initialOrderedImportances, ntree = ntree, mtry = mtry, firstForest = FirstForest) class(out) <- "varSelRF" return(out) } else { n.vars <- n.vars[1:j] vars <- vars[1:j] OOB.rf<- OOB.rf[1:j] OOB.sd <- OOB.sd[1:j] ##browser() min.oob.ci <- min(OOB.rf) + c.sd * OOB.sd[which.min(OOB.rf)] best.pos <- which(OOB.rf <= min.oob.ci)[which.min(n.vars[which(OOB.rf <= min.oob.ci)])] selected.vars <- sort(unlist(strsplit(vars[best.pos], " + ", fixed = TRUE))) out <- list(selec.history = data.frame( Number.Variables = n.vars, Vars.in.Forest = vars, OOB = OOB.rf, sd.OOB = OOB.sd), rf.model = NA, selected.vars = selected.vars, selected.model = paste(selected.vars, collapse = " + "), best.model.nvars = n.vars[best.pos], initialImportances = initialImportances, initialOrderedImportances = initialOrderedImportances, ntree = ntree, mtry = mtry, firstForest = FirstForest) class(out) <- "varSelRF" return(out) } } tune2RF <- function(x, y, tuneMtry = TRUE, tuneNtree = TRUE, startNtree = 1000, startMtryFactor = 1, stepFactorMtry = 1.25, stepFactorNtree = 1.75, minCorNtree = 0.9, quantNtree = 0.025, ## with 4000, look at first 100 returnForest = TRUE, ntreeTry = 2000, verbose = TRUE, plot = FALSE, ...) { ## if tuneMtry = TRUE and tuneNtree = TRUE, ## ntree is used for tuneRF, mtryFactor is ignored ## if tuneMtry = TRUE and tuneNtree = FALSE ## ntree used for tuneRF and final ntree, and mtry factor is ignored ## if tuneMtry = FALSE and tuneRF = TRUE, ## ntree is used only as starting value for search of ntree and ## mtryFactor is used for mtry ## if tuneMtry = FALSE and tuneRF = FALSE, ## mtryFactor and ntree are the ntree and mtry used. if(plot) { op <- par(mfrow = c(1,2)) on.exit(par(op)) } if(tuneMtry) { tunedMtry <- tuneRF(x, y, stepFactor = stepFactorMtry, ntreeTry = ntreeTry, mtryStart = floor(sqrt(ncol(x)) * startMtryFactor), doBest = FALSE, plot = plot, trace = verbose) tunedMtry <- tunedMtry[which.min(tunedMtry[, 2]), 1] } else { tunedMtry <- floor(sqrt(ncol(x)) * startMtryFactor) } tunedNtree <- startNtree if(tuneNtree) { if(verbose) cat("\n Tunning ntree: initial forest construction \n") f1 <- randomForest(x, y, mtry = tunedMtry, importance = TRUE, keep.forest = FALSE, ntree = tunedNtree) repeat { f2 <- randomForest(x, y, mtry = tunedMtry, importance = TRUE, keep.forest = FALSE, ntree = round(stepFactorNtree * tunedNtree)) m1 <- cbind( importance(f1, type = 1, scale = FALSE), importance(f2, type = 1, scale = FALSE)) m1[m1 < quantile(m1, 1-quantNtree)] <- NA m1 <- na.omit(m1) m2 <- m1 m2[m2 <= 0] <- NA m2 <- na.omit(m2) mc1 <- cor(m1) mc.rob <- cov.rob(m2, method = "mcd", cor = TRUE)$cor if(verbose) { cat("\n ... tunning ntree; cor. importances successive ntrees\n") colnames(mc1) <- c(tunedNtree, round(stepFactorNtree * tunedNtree)) rownames(mc1) <- c(tunedNtree, round(stepFactorNtree * tunedNtree)) colnames(mc.rob) <- c(tunedNtree, round(stepFactorNtree * tunedNtree)) rownames(mc.rob) <- c(tunedNtree, round(stepFactorNtree * tunedNtree)) cat("\n correlation matrix\n") print(round(mc1, 4)) cat("\n robust correlation matrix\n") print(round(mc.rob, 4)) cat(" \n using ", dim(m2)[1], "observations\n") cat("\n") } if((min(mc1) > minCorNtree) & (min(mc.rob) > minCorNtree)) { plot(m1[, 1], m1[, 2], xlab = paste("Ntree", tunedNtree), ylab = paste("Ntree", round(stepFactorNtree * tunedNtree)), main = paste("Upper ", quantNtree, "th quantile of importances", sep = "")) break } tunedNtree <- round(stepFactorNtree * tunedNtree) f1 <- f2 } } if(returnForest) { if(tuneNtree) return(f1) else return(randomForest(x, y, mtry = tunedMtry, importance = TRUE, keep.forest = FALSE, ntree = ntree) ) } else { return(c(tunedMtry = tunedMtry, tunedNtree = tunedNtree)) } } boot.imp <- function(data, class, ntree = 20000, B = 200) { ## just checking the output from the pg.plots. is correct. N <- length(class) mat.out <- matrix(NA, nrow = dim(data)[2], ncol = B) for(i in 1:B) { sample.again <- TRUE while(sample.again) { bootsample <- unlist(tapply(1:N, class, function(x) sample(x, size = length(x), replace = TRUE))) ## sure, this isn't the fastest, but will do for now. nobootsample <- setdiff(1:N, bootsample) if(!length(nobootsample)) sample.again <- TRUE else sample.again <- FALSE } ## this is an ugly hack to prevetn nobootsamples ## of size 0. train.data <- data[bootsample, , drop = FALSE] train.class <- class[bootsample] rftmp <- randomForest(x = train.data, y = train.class, ntree = ntree, keep.forest = FALSE, importance = TRUE) mat.out[, i] <- importance(rftmp, type = 1, scale = FALSE) } return(mat.out) } ####gl.b.i <- boot.imp(gl.data, gl.class, B = 20) ####pairs(cbind(gl.b.i, importance(gl.20000.rf, type = 1, scale = FALSE)), #### pch = ".") ####pairs(cbind(gl.b.i[, 1:10], importance(gl.20000.rf, type = 1, scale = FALSE)), #### pch = ".") ####summary(cbind(gl.b.i, importance(gl.20000.rf, type = 1, scale = FALSE))) ####boxplot(data.frame(cbind(gl.b.i, importance(gl.20000.rf, type = 1, scale = FALSE)))) #####f.mtr <- function(x, mf = 3) { ##### x2 <- floor(sqrt(x) * mf) ##### x2[x2 > x] <- x[x2 > x] ##### return(x2) #####} #####plot(x = sqrt(2:200), f.mtr(2:200, mf = 5), type = "l")