Add optional penalties when splitting variables that have NAs.

Also include several bug fixes from the Java project.
2019-08-29 13:54:38 -07:00 · 2019-08-29 13:54:38 -07:00 · 3f0f6c0878
commit 3f0f6c0878
parent 48859b0249
7 changed files with 114 additions and 49 deletions
--- a/2
+++ b/2
@ -1,7 +1,7 @@
 Package: largeRCRF
 Type: Package
 Title: Large Random Competing Risks Forests
-Version: 1.0.4
+Version: 1.0.5
 Authors@R: c(
    person("Joel", "Therrien", email = "joel_therrien@sfu.ca", role = c("aut", "cre", "cph")),
    person("Jiguo", "Cao", email = "jiguo_cao@sfu.ca", role = c("aut", "dgs"))
--- a/R/create_java_covariates.R
+++ b/R/create_java_covariates.R
@ -29,8 +29,8 @@
 NULL
 # @rdname covariates
-Java_BooleanCovariate <- function(name, index){
+Java_BooleanCovariate <- function(name, index, na.penalty){
-    covariate <- .jnew(.class_BooleanCovariate, name, as.integer(index))
+    covariate <- .jnew(.class_BooleanCovariate, name, as.integer(index), na.penalty)
    covariate <- .jcast(covariate, .class_Object) # needed for later adding it into Java Lists
    return(covariate)
@ -38,19 +38,19 @@ Java_BooleanCovariate <- function(name, index){
 # @rdname covariates
 # @param levels The levels of the factor as a character vector
-Java_FactorCovariate <- function(name, index, levels){
+Java_FactorCovariate <- function(name, index, levels, na.penalty){
    levelsArray <- .jarray(levels, makeResponse(.class_String))
    levelsList <- .jcall("java/util/Arrays", "Ljava/util/List;", "asList", .jcast(levelsArray, "[Ljava/lang/Object;"))
-    covariate <- .jnew(.class_FactorCovariate, name, as.integer(index), levelsList)
+    covariate <- .jnew(.class_FactorCovariate, name, as.integer(index), levelsList, na.penalty)
    covariate <- .jcast(covariate, .class_Object) # needed for later adding it into Java Lists
    return(covariate)
 }
 # @rdname covariates
-Java_NumericCovariate <- function(name, index){
+Java_NumericCovariate <- function(name, index, na.penalty){
-    covariate <- .jnew(.class_NumericCovariate, name, as.integer(index))
+    covariate <- .jnew(.class_NumericCovariate, name, as.integer(index), na.penalty)
    covariate <- .jcast(covariate, .class_Object) # needed for later adding it into Java Lists
    return(covariate)
--- a/R/loadData.R
+++ b/R/loadData.R
@ -1,4 +1,4 @@
-loadData <- function(data, xVarNames, responses, covariateList.java = NULL){
+loadData <- function(data, xVarNames, responses, covariateList.java = NULL, na.penalty = NULL){
  if(class(responses) == "integer" | class(responses) == "numeric"){
    responses <- Numeric(responses)
@ -6,7 +6,7 @@ loadData <- function(data, xVarNames, responses, covariateList.java = NULL){
  # connectToData provides a pre-created covariate list we can re-use
  if(is.null(covariateList.java)){
-    covariateList.java <- getCovariateList(data, xVarNames)
+    covariateList.java <- getCovariateList(data, xVarNames, na.penalty)
  }
  textColumns <- list()
@ -22,7 +22,7 @@ loadData <- function(data, xVarNames, responses, covariateList.java = NULL){
 }
-getCovariateList <- function(data, xvarNames){
+getCovariateList <- function(data, xvarNames, na.penalty){
  covariateList <- .jcast(.jnew(.class_ArrayList, length(xvarNames)), .class_List)
  for(i in 1:length(xvarNames)){
@ -31,14 +31,14 @@ getCovariateList <- function(data, xvarNames){
    column <- data[,xName]
    if(class(column) == "numeric" | class(column) == "integer"){
-      covariate <- Java_NumericCovariate(xName, i-1)
+      covariate <- Java_NumericCovariate(xName, i-1, na.penalty[i])
    }
    else if(class(column) == "logical"){
-      covariate <- Java_BooleanCovariate(xName, i-1)
+      covariate <- Java_BooleanCovariate(xName, i-1, na.penalty[i])
    }
    else if(class(column) == "factor"){
      lvls <- levels(column)
-      covariate <- Java_FactorCovariate(xName, i-1, lvls)
+      covariate <- Java_FactorCovariate(xName, i-1, lvls, na.penalty[i])
    }
    else{
      stop("Unknown column type")
--- a/R/processFormula.R
+++ b/R/processFormula.R
@ -2,7 +2,7 @@
 # Internal function that takes a formula and processes it for use in the Java
 # code. existingCovariateList is optional; if not provided then a new one is
 # created internally.
-processFormula <- function(formula, data, covariateList.java = NULL){
+processFormula <- function(formula, data, covariateList.java = NULL, na.penalty = NULL){
  # Having an R copy of the data loaded at the same time can be wasteful; we
  # also allow users to provide an environment of the data which gets removed
@ -52,8 +52,44 @@ processFormula <- function(formula, data, covariateList.java = NULL){
  # remove any response variables on the right-hand-side
  covariateData <- filteredData[, !(names(filteredData) %in% variablesToDrop), drop=FALSE]
  # Now that we know how many predictor variables we have, we should check na.penalty
  if(!is.null(na.penalty)){
    if(!is.numeric(na.penalty) & !is.logical(na.penalty)){
      stop("na.penalty must be either logical or numeric.")
    }
    if(is.logical(na.penalty) & length(na.penalty) != 1 & length(na.penalty) != ncol(covariateData)){
      stop("na.penalty must have length of either 1 or the number of predictor variables if logical.")
    }
    if(is.numeric(na.penalty) & length(na.penalty) != 1){
      stop("na.penalty must have length 1 if logical.")
    }
    if(anyNA(na.penalty)){
      stop("na.penalty cannot contain NAs.")
    }
    # All good; now to transform it.
    if(is.numeric(na.penalty)){
      na.threshold <- na.penalty
      na.penalty <- apply(covariateData, 2, function(x){mean(is.na(x))}) >= na.threshold
    }
    else if(is.logical(na.penalty) & length(na.penalty) == 1){
      na.penalty <- rep(na.penalty, times = ncol(covariateData))
    }
    # else{} - na.penalty is logical and the correct length; no need to do anything to it
  }
-  dataset <- loadData(covariateData, colnames(covariateData), responses, covariateList.java = covariateList.java)
+  dataset <- loadData(
    covariateData, 
    colnames(covariateData), 
    responses, 
    covariateList.java = covariateList.java,
    na.penalty = na.penalty
  )
  return(dataset)
 }
--- a/R/train.R
+++ b/R/train.R
@ -178,9 +178,9 @@ train.internal <- function(dataset, splitFinder,
 #' @param data A data.frame containing the columns of the predictors and
 #'   responses.
 #' @param splitFinder A split finder that's used to score splits in the random
-#'   forest training algorithm. See \code{\link{CompetingRiskSplitFinders}}
+#'   forest training algorithm. See \code{\link{CompetingRiskSplitFinders}} or
-#'   or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
+#'   \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one, this
-#'   this function tries to pick one based on the response. For
+#'   function tries to pick one based on the response. For
 #'   \code{\link{CR_Response}} without censor times, it will pick a
 #'   \code{\link{LogRankSplitFinder}}; while if censor times were provided it
 #'   will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
@ -188,19 +188,19 @@ train.internal <- function(dataset, splitFinder,
 #' @param nodeResponseCombiner A response combiner that's used to combine
 #'   responses for each terminal node in a tree (regression example; average the
 #'   observations in each tree into a single number). See
-#'   \code{\link{CR_ResponseCombiner}} or
+#'   \code{\link{CR_ResponseCombiner}} or \code{\link{MeanResponseCombiner}}. If
-#'   \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
+#'   you don't specify one, this function tries to pick one based on the
-#'   tries to pick one based on the response. For \code{\link{CR_Response}} it
+#'   response. For \code{\link{CR_Response}} it picks a
-#'   picks a \code{\link{CR_ResponseCombiner}}; for integer or numeric
+#'   \code{\link{CR_ResponseCombiner}}; for integer or numeric responses it
-#'   responses it picks a \code{\link{MeanResponseCombiner}}.
+#'   picks a \code{\link{MeanResponseCombiner}}.
 #' @param forestResponseCombiner A response combiner that's used to combine
 #'   predictions across trees into one final result (regression example; average
 #'   the prediction of each tree into a single number). See
-#'   \code{\link{CR_FunctionCombiner}} or
+#'   \code{\link{CR_FunctionCombiner}} or \code{\link{MeanResponseCombiner}}. If
-#'   \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
+#'   you don't specify one, this function tries to pick one based on the
-#'   tries to pick one based on the response. For \code{\link{CR_Response}} it
+#'   response. For \code{\link{CR_Response}} it picks a
-#'   picks a \code{\link{CR_FunctionCombiner}}; for integer or numeric
+#'   \code{\link{CR_FunctionCombiner}}; for integer or numeric responses it
-#'   responses it picks a \code{\link{MeanResponseCombiner}}.
+#'   picks a \code{\link{MeanResponseCombiner}}.
 #' @param ntree An integer that specifies how many trees should be trained.
 #' @param numberOfSplits A tuning parameter specifying how many random splits
 #'   should be tried for a covariate; a value of 0 means all splits will be
@ -217,6 +217,20 @@ train.internal <- function(dataset, splitFinder,
 #' @param maxNodeDepth This parameter is analogous to \code{nodeSize} in that it
 #'   controls tree length; by default \code{maxNodeDepth} is an extremely high
 #'   number and tree depth is controlled by \code{nodeSize}.
 #' @param na.penalty This parameter controls whether predictor variables with
 #'   NAs should be penalized when being considered for a best split. Best splits
 #'   (and the associated score) are determined on only non-NA data; the penalty
 #'   is to take the best split identified, and to randomly assign any NAs
 #'   (according to the proportion of data split left and right), and then
 #'   recalculate the corresponding split score, when is then compared with the
 #'   other split candiate variables. This penalty adds some computational time,
 #'   so it may be disabled for some variables. \code{na.penalty} may be
 #'   specified as a vector of logicals indicating, for each predictor variable,
 #'   whether the penalty should be applied to that variable. If it's length 1
 #'   then it applies to all variables. Alternatively, a single numeric value may
 #'   be provided to indicate a threshold whereby the penalty is activated only
 #'   if the proportion of NAs for that variable in the training set exceeds that
 #'   threshold.
 #' @param splitPureNodes This parameter determines whether the algorithm will
 #'   split a pure node. If set to FALSE, then before every split it will check
 #'   that every response is the same, and if so, not split. If set to TRUE it
@ -290,17 +304,17 @@ train.internal <- function(dataset, splitFinder,
 #'
 #' forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
 #'    LogRankSplitFinder(1:2), CR_ResponseCombiner(1:2),
-#'    CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5, 
+#'    CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
 #'    mtry=1, nodeSize=10)
 #' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
 #' ypred <- predict(forest, newData)
 train <- function(formula, data, splitFinder = NULL, nodeResponseCombiner = NULL,
                  forestResponseCombiner = NULL, ntree, numberOfSplits, mtry,
-                  nodeSize, maxNodeDepth = 100000, splitPureNodes=TRUE, savePath=NULL,
+                  nodeSize, maxNodeDepth = 100000, na.penalty = TRUE, splitPureNodes=TRUE, 
-                  savePath.overwrite=c("warn", "delete", "merge"), cores = getCores(),
+                  savePath=NULL, savePath.overwrite=c("warn", "delete", "merge"), 
-                  randomSeed = NULL, displayProgress = TRUE){
+                  cores = getCores(), randomSeed = NULL, displayProgress = TRUE){
-  dataset <- processFormula(formula, data)
+  dataset <- processFormula(formula, data, na.penalty = na.penalty)
  forest <- train.internal(dataset, splitFinder = splitFinder,
                           nodeResponseCombiner = nodeResponseCombiner,
--- a/inst/java/largeRCRF-library-1.0-SNAPSHOT.jar
+++ b/inst/java/largeRCRF-library-1.0-SNAPSHOT.jar
--- a/man/train.Rd
+++ b/man/train.Rd
@ -6,8 +6,8 @@
 \usage{
 train(formula, data, splitFinder = NULL, nodeResponseCombiner = NULL,
  forestResponseCombiner = NULL, ntree, numberOfSplits, mtry, nodeSize,
-  maxNodeDepth = 1e+05, splitPureNodes = TRUE, savePath = NULL,
+  maxNodeDepth = 1e+05, na.penalty = TRUE, splitPureNodes = TRUE,
-  savePath.overwrite = c("warn", "delete", "merge"),
+  savePath = NULL, savePath.overwrite = c("warn", "delete", "merge"),
  cores = getCores(), randomSeed = NULL, displayProgress = TRUE)
 }
 \arguments{
@ -19,9 +19,9 @@ constructed.}
 responses.}
 \item{splitFinder}{A split finder that's used to score splits in the random
-forest training algorithm. See \code{\link{CompetingRiskSplitFinders}}
+forest training algorithm. See \code{\link{CompetingRiskSplitFinders}} or
-or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
+\code{\link{WeightedVarianceSplitFinder}}. If you don't specify one, this
-this function tries to pick one based on the response. For
+function tries to pick one based on the response. For
 \code{\link{CR_Response}} without censor times, it will pick a
 \code{\link{LogRankSplitFinder}}; while if censor times were provided it
 will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
@ -30,20 +30,20 @@ responses it picks a \code{\link{WeightedVarianceSplitFinder}}.}
 \item{nodeResponseCombiner}{A response combiner that's used to combine
 responses for each terminal node in a tree (regression example; average the
 observations in each tree into a single number). See
-\code{\link{CR_ResponseCombiner}} or
+\code{\link{CR_ResponseCombiner}} or \code{\link{MeanResponseCombiner}}. If
-\code{\link{MeanResponseCombiner}}. If you don't specify one, this function
+you don't specify one, this function tries to pick one based on the
-tries to pick one based on the response. For \code{\link{CR_Response}} it
+response. For \code{\link{CR_Response}} it picks a
-picks a \code{\link{CR_ResponseCombiner}}; for integer or numeric
+\code{\link{CR_ResponseCombiner}}; for integer or numeric responses it
-responses it picks a \code{\link{MeanResponseCombiner}}.}
+picks a \code{\link{MeanResponseCombiner}}.}
 \item{forestResponseCombiner}{A response combiner that's used to combine
 predictions across trees into one final result (regression example; average
 the prediction of each tree into a single number). See
-\code{\link{CR_FunctionCombiner}} or
+\code{\link{CR_FunctionCombiner}} or \code{\link{MeanResponseCombiner}}. If
-\code{\link{MeanResponseCombiner}}. If you don't specify one, this function
+you don't specify one, this function tries to pick one based on the
-tries to pick one based on the response. For \code{\link{CR_Response}} it
+response. For \code{\link{CR_Response}} it picks a
-picks a \code{\link{CR_FunctionCombiner}}; for integer or numeric
+\code{\link{CR_FunctionCombiner}}; for integer or numeric responses it
-responses it picks a \code{\link{MeanResponseCombiner}}.}
+picks a \code{\link{MeanResponseCombiner}}.}
 \item{ntree}{An integer that specifies how many trees should be trained.}
@ -66,6 +66,21 @@ as large as \code{nodeSize}.}
 controls tree length; by default \code{maxNodeDepth} is an extremely high
 number and tree depth is controlled by \code{nodeSize}.}
 \item{na.penalty}{This parameter controls whether predictor variables with
 NAs should be penalized when being considered for a best split. Best splits
 (and the associated score) are determined on only non-NA data; the penalty
 is to take the best split identified, and to randomly assign any NAs
 (according to the proportion of data split left and right), and then
 recalculate the corresponding split score, when is then compared with the
 other split candiate variables. This penalty adds some computational time,
 so it may be disabled for some variables. \code{na.penalty} may be
 specified as a vector of logicals indicating, for each predictor variable,
 whether the penalty should be applied to that variable. If it's length 1
 then it applies to all variables. Alternatively, a single numeric value may
 be provided to indicate a threshold whereby the penalty is activated only
 if the proportion of NAs for that variable in the training set exceeds that
 threshold.}
 \item{splitPureNodes}{This parameter determines whether the algorithm will
 split a pure node. If set to FALSE, then before every split it will check
 that every response is the same, and if so, not split. If set to TRUE it
@ -160,7 +175,7 @@ data <- data.frame(x1, x2)
 forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
   LogRankSplitFinder(1:2), CR_ResponseCombiner(1:2),
-   CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5, 
+   CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
   mtry=1, nodeSize=10)
 newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
 ypred <- predict(forest, newData)