Add optional penalties when splitting variables that have NAs.

Also include several bug fixes from the Java project.
Joel Therrien 2019-08-29 13:54:38 -07:00
parent 48859b0249
commit 3f0f6c0878
7 changed files with 114 additions and 49 deletions

View file

@ -1,7 +1,7 @@
Package: largeRCRF
Type: Package
Title: Large Random Competing Risks Forests
Version: 1.0.4
Version: 1.0.5
Authors@R: c(
person("Joel", "Therrien", email = "joel_therrien@sfu.ca", role = c("aut", "cre", "cph")),
person("Jiguo", "Cao", email = "jiguo_cao@sfu.ca", role = c("aut", "dgs"))

View file

@ -29,8 +29,8 @@
NULL
# @rdname covariates
Java_BooleanCovariate <- function(name, index){
covariate <- .jnew(.class_BooleanCovariate, name, as.integer(index))
Java_BooleanCovariate <- function(name, index, na.penalty){
covariate <- .jnew(.class_BooleanCovariate, name, as.integer(index), na.penalty)
covariate <- .jcast(covariate, .class_Object) # needed for later adding it into Java Lists
return(covariate)
@ -38,19 +38,19 @@ Java_BooleanCovariate <- function(name, index){
# @rdname covariates
# @param levels The levels of the factor as a character vector
Java_FactorCovariate <- function(name, index, levels){
Java_FactorCovariate <- function(name, index, levels, na.penalty){
levelsArray <- .jarray(levels, makeResponse(.class_String))
levelsList <- .jcall("java/util/Arrays", "Ljava/util/List;", "asList", .jcast(levelsArray, "[Ljava/lang/Object;"))
covariate <- .jnew(.class_FactorCovariate, name, as.integer(index), levelsList)
covariate <- .jnew(.class_FactorCovariate, name, as.integer(index), levelsList, na.penalty)
covariate <- .jcast(covariate, .class_Object) # needed for later adding it into Java Lists
return(covariate)
}
# @rdname covariates
Java_NumericCovariate <- function(name, index){
covariate <- .jnew(.class_NumericCovariate, name, as.integer(index))
Java_NumericCovariate <- function(name, index, na.penalty){
covariate <- .jnew(.class_NumericCovariate, name, as.integer(index), na.penalty)
covariate <- .jcast(covariate, .class_Object) # needed for later adding it into Java Lists
return(covariate)

View file

@ -1,4 +1,4 @@
loadData <- function(data, xVarNames, responses, covariateList.java = NULL){
loadData <- function(data, xVarNames, responses, covariateList.java = NULL, na.penalty = NULL){
if(class(responses) == "integer" | class(responses) == "numeric"){
responses <- Numeric(responses)
@ -6,7 +6,7 @@ loadData <- function(data, xVarNames, responses, covariateList.java = NULL){
# connectToData provides a pre-created covariate list we can re-use
if(is.null(covariateList.java)){
covariateList.java <- getCovariateList(data, xVarNames)
covariateList.java <- getCovariateList(data, xVarNames, na.penalty)
}
textColumns <- list()
@ -22,7 +22,7 @@ loadData <- function(data, xVarNames, responses, covariateList.java = NULL){
}
getCovariateList <- function(data, xvarNames){
getCovariateList <- function(data, xvarNames, na.penalty){
covariateList <- .jcast(.jnew(.class_ArrayList, length(xvarNames)), .class_List)
for(i in 1:length(xvarNames)){
@ -31,14 +31,14 @@ getCovariateList <- function(data, xvarNames){
column <- data[,xName]
if(class(column) == "numeric" | class(column) == "integer"){
covariate <- Java_NumericCovariate(xName, i-1)
covariate <- Java_NumericCovariate(xName, i-1, na.penalty[i])
}
else if(class(column) == "logical"){
covariate <- Java_BooleanCovariate(xName, i-1)
covariate <- Java_BooleanCovariate(xName, i-1, na.penalty[i])
}
else if(class(column) == "factor"){
lvls <- levels(column)
covariate <- Java_FactorCovariate(xName, i-1, lvls)
covariate <- Java_FactorCovariate(xName, i-1, lvls, na.penalty[i])
}
else{
stop("Unknown column type")

View file

@ -2,7 +2,7 @@
# Internal function that takes a formula and processes it for use in the Java
# code. existingCovariateList is optional; if not provided then a new one is
# created internally.
processFormula <- function(formula, data, covariateList.java = NULL){
processFormula <- function(formula, data, covariateList.java = NULL, na.penalty = NULL){
# Having an R copy of the data loaded at the same time can be wasteful; we
# also allow users to provide an environment of the data which gets removed
@ -52,8 +52,44 @@ processFormula <- function(formula, data, covariateList.java = NULL){
# remove any response variables on the right-hand-side
covariateData <- filteredData[, !(names(filteredData) %in% variablesToDrop), drop=FALSE]
# Now that we know how many predictor variables we have, we should check na.penalty
if(!is.null(na.penalty)){
if(!is.numeric(na.penalty) & !is.logical(na.penalty)){
stop("na.penalty must be either logical or numeric.")
}
if(is.logical(na.penalty) & length(na.penalty) != 1 & length(na.penalty) != ncol(covariateData)){
stop("na.penalty must have length of either 1 or the number of predictor variables if logical.")
}
if(is.numeric(na.penalty) & length(na.penalty) != 1){
stop("na.penalty must have length 1 if logical.")
}
if(anyNA(na.penalty)){
stop("na.penalty cannot contain NAs.")
}
# All good; now to transform it.
if(is.numeric(na.penalty)){
na.threshold <- na.penalty
na.penalty <- apply(covariateData, 2, function(x){mean(is.na(x))}) >= na.threshold
}
else if(is.logical(na.penalty) & length(na.penalty) == 1){
na.penalty <- rep(na.penalty, times = ncol(covariateData))
}
# else{} - na.penalty is logical and the correct length; no need to do anything to it
}
dataset <- loadData(covariateData, colnames(covariateData), responses, covariateList.java = covariateList.java)
dataset <- loadData(
covariateData,
colnames(covariateData),
responses,
covariateList.java = covariateList.java,
na.penalty = na.penalty
)
return(dataset)
}
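
For reference, the numeric-threshold branch above can be exercised on its own. The following is a small standalone sketch (the data frame and threshold value are made up for illustration) showing how a single numeric na.penalty becomes one logical flag per predictor column, based on that column's proportion of NAs:

# Toy illustration of the threshold branch above; data and threshold are invented.
covariateData <- data.frame(
  x1 = c(1, NA, 3, NA),   # 50% NA
  x2 = c(NA, 2, 3, 4),    # 25% NA
  x3 = c(1, 2, 3, 4)      # 0% NA
)
na.threshold <- 0.3
# One logical per column: TRUE when the NA proportion is at least the threshold.
na.penalty <- apply(covariateData, 2, function(x){mean(is.na(x))}) >= na.threshold
na.penalty
#    x1    x2    x3
#  TRUE FALSE FALSE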

View file

@ -178,9 +178,9 @@ train.internal <- function(dataset, splitFinder,
#' @param data A data.frame containing the columns of the predictors and
#' responses.
#' @param splitFinder A split finder that's used to score splits in the random
#' forest training algorithm. See \code{\link{CompetingRiskSplitFinders}}
#' or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
#' this function tries to pick one based on the response. For
#' forest training algorithm. See \code{\link{CompetingRiskSplitFinders}} or
#' \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one, this
#' function tries to pick one based on the response. For
#' \code{\link{CR_Response}} without censor times, it will pick a
#' \code{\link{LogRankSplitFinder}}; while if censor times were provided it
#' will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
@ -188,19 +188,19 @@ train.internal <- function(dataset, splitFinder,
#' @param nodeResponseCombiner A response combiner that's used to combine
#' responses for each terminal node in a tree (regression example; average the
#' observations in each tree into a single number). See
#' \code{\link{CR_ResponseCombiner}} or
#' \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
#' tries to pick one based on the response. For \code{\link{CR_Response}} it
#' picks a \code{\link{CR_ResponseCombiner}}; for integer or numeric
#' responses it picks a \code{\link{MeanResponseCombiner}}.
#' \code{\link{CR_ResponseCombiner}} or \code{\link{MeanResponseCombiner}}. If
#' you don't specify one, this function tries to pick one based on the
#' response. For \code{\link{CR_Response}} it picks a
#' \code{\link{CR_ResponseCombiner}}; for integer or numeric responses it
#' picks a \code{\link{MeanResponseCombiner}}.
#' @param forestResponseCombiner A response combiner that's used to combine
#' predictions across trees into one final result (regression example; average
#' the prediction of each tree into a single number). See
#' \code{\link{CR_FunctionCombiner}} or
#' \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
#' tries to pick one based on the response. For \code{\link{CR_Response}} it
#' picks a \code{\link{CR_FunctionCombiner}}; for integer or numeric
#' responses it picks a \code{\link{MeanResponseCombiner}}.
#' \code{\link{CR_FunctionCombiner}} or \code{\link{MeanResponseCombiner}}. If
#' you don't specify one, this function tries to pick one based on the
#' response. For \code{\link{CR_Response}} it picks a
#' \code{\link{CR_FunctionCombiner}}; for integer or numeric responses it
#' picks a \code{\link{MeanResponseCombiner}}.
#' @param ntree An integer that specifies how many trees should be trained.
#' @param numberOfSplits A tuning parameter specifying how many random splits
#' should be tried for a covariate; a value of 0 means all splits will be
@ -217,6 +217,20 @@ train.internal <- function(dataset, splitFinder,
#' @param maxNodeDepth This parameter is analogous to \code{nodeSize} in that it
#' controls tree length; by default \code{maxNodeDepth} is an extremely high
#' number and tree depth is controlled by \code{nodeSize}.
#' @param na.penalty This parameter controls whether predictor variables with
#' NAs should be penalized when being considered for a best split. Best splits
#' (and their associated scores) are determined using only the non-NA data; the
#' penalty then takes the best split identified, randomly assigns the NAs left
#' or right (according to the proportions of the non-NA data sent left and
#' right), and recalculates the split score, which is then compared with those
#' of the other candidate variables. This penalty adds some computational time,
#' so it may be disabled for some variables. \code{na.penalty} may be specified
#' as a vector of logicals indicating, for each predictor variable, whether the
#' penalty should be applied to that variable; if it has length 1 it applies to
#' all variables. Alternatively, a single numeric value may be provided as a
#' threshold: the penalty is activated only for variables whose proportion of
#' NAs in the training set is at least that threshold.
#' @param splitPureNodes This parameter determines whether the algorithm will
#' split a pure node. If set to FALSE, then before every split it will check
#' that every response is the same, and if so, not split. If set to TRUE it
@ -290,17 +304,17 @@ train.internal <- function(dataset, splitFinder,
#'
#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
#' LogRankSplitFinder(1:2), CR_ResponseCombiner(1:2),
#' CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
#' CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
#' mtry=1, nodeSize=10)
#' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
#' ypred <- predict(forest, newData)
train <- function(formula, data, splitFinder = NULL, nodeResponseCombiner = NULL,
forestResponseCombiner = NULL, ntree, numberOfSplits, mtry,
nodeSize, maxNodeDepth = 100000, splitPureNodes=TRUE, savePath=NULL,
savePath.overwrite=c("warn", "delete", "merge"), cores = getCores(),
randomSeed = NULL, displayProgress = TRUE){
nodeSize, maxNodeDepth = 100000, na.penalty = TRUE, splitPureNodes=TRUE,
savePath=NULL, savePath.overwrite=c("warn", "delete", "merge"),
cores = getCores(), randomSeed = NULL, displayProgress = TRUE){
dataset <- processFormula(formula, data)
dataset <- processFormula(formula, data, na.penalty = na.penalty)
forest <- train.internal(dataset, splitFinder = splitFinder,
nodeResponseCombiner = nodeResponseCombiner,
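
The na.penalty documentation above describes the penalty mechanism in prose; the sketch below is an illustrative, self-contained R translation of that description, not the package's actual Java split-scoring code. The weighted-variance-style splitScore and the penalizedSplitScore helper are invented for this illustration: a split is first scored on the non-NA rows only, then the NAs are randomly assigned left or right in proportion to the non-NA split and the score is recalculated.

# Illustration only: a crude weighted-variance split score (lower is better),
# standing in for the library's split finders.
splitScore <- function(yLeft, yRight){
  n <- length(yLeft) + length(yRight)
  (length(yLeft) * var(yLeft) + length(yRight) * var(yRight)) / n
}

# Hypothetical helper: score the split "x <= threshold", optionally applying
# the NA penalty described in the documentation above.
penalizedSplitScore <- function(x, y, threshold, na.penalty = TRUE){
  observed <- which(!is.na(x))
  missing  <- which(is.na(x))
  leftObs  <- observed[x[observed] <= threshold]
  rightObs <- setdiff(observed, leftObs)

  # Best-split score determined on the non-NA data only.
  score <- splitScore(y[leftObs], y[rightObs])

  if(na.penalty && length(missing) > 0){
    # The penalty: randomly assign NAs left/right according to the observed
    # split proportions, then recalculate the score.
    pLeft <- length(leftObs) / length(observed)
    sendLeft <- runif(length(missing)) < pLeft
    score <- splitScore(c(y[leftObs],  y[missing[sendLeft]]),
                        c(y[rightObs], y[missing[!sendLeft]]))
  }
  score
}

set.seed(42)
x <- c(rnorm(40), rep(NA_real_, 10))
y <- c(x[1:40], rnorm(10)) + rnorm(50, sd = 0.25)
penalizedSplitScore(x, y, threshold = 0, na.penalty = FALSE)
penalizedSplitScore(x, y, threshold = 0, na.penalty = TRUE)  # typically a worse (larger) score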

View file

@ -6,8 +6,8 @@
\usage{
train(formula, data, splitFinder = NULL, nodeResponseCombiner = NULL,
forestResponseCombiner = NULL, ntree, numberOfSplits, mtry, nodeSize,
maxNodeDepth = 1e+05, splitPureNodes = TRUE, savePath = NULL,
savePath.overwrite = c("warn", "delete", "merge"),
maxNodeDepth = 1e+05, na.penalty = TRUE, splitPureNodes = TRUE,
savePath = NULL, savePath.overwrite = c("warn", "delete", "merge"),
cores = getCores(), randomSeed = NULL, displayProgress = TRUE)
}
\arguments{
@ -19,9 +19,9 @@ constructed.}
responses.}
\item{splitFinder}{A split finder that's used to score splits in the random
forest training algorithm. See \code{\link{CompetingRiskSplitFinders}}
or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
this function tries to pick one based on the response. For
forest training algorithm. See \code{\link{CompetingRiskSplitFinders}} or
\code{\link{WeightedVarianceSplitFinder}}. If you don't specify one, this
function tries to pick one based on the response. For
\code{\link{CR_Response}} without censor times, it will pick a
\code{\link{LogRankSplitFinder}}; while if censor times were provided it
will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
@ -30,20 +30,20 @@ responses it picks a \code{\link{WeightedVarianceSplitFinder}}.}
\item{nodeResponseCombiner}{A response combiner that's used to combine
responses for each terminal node in a tree (regression example; average the
observations in each tree into a single number). See
\code{\link{CR_ResponseCombiner}} or
\code{\link{MeanResponseCombiner}}. If you don't specify one, this function
tries to pick one based on the response. For \code{\link{CR_Response}} it
picks a \code{\link{CR_ResponseCombiner}}; for integer or numeric
responses it picks a \code{\link{MeanResponseCombiner}}.}
\code{\link{CR_ResponseCombiner}} or \code{\link{MeanResponseCombiner}}. If
you don't specify one, this function tries to pick one based on the
response. For \code{\link{CR_Response}} it picks a
\code{\link{CR_ResponseCombiner}}; for integer or numeric responses it
picks a \code{\link{MeanResponseCombiner}}.}
\item{forestResponseCombiner}{A response combiner that's used to combine
predictions across trees into one final result (regression example; average
the prediction of each tree into a single number). See
\code{\link{CR_FunctionCombiner}} or
\code{\link{MeanResponseCombiner}}. If you don't specify one, this function
tries to pick one based on the response. For \code{\link{CR_Response}} it
picks a \code{\link{CR_FunctionCombiner}}; for integer or numeric
responses it picks a \code{\link{MeanResponseCombiner}}.}
\code{\link{CR_FunctionCombiner}} or \code{\link{MeanResponseCombiner}}. If
you don't specify one, this function tries to pick one based on the
response. For \code{\link{CR_Response}} it picks a
\code{\link{CR_FunctionCombiner}}; for integer or numeric responses it
picks a \code{\link{MeanResponseCombiner}}.}
\item{ntree}{An integer that specifies how many trees should be trained.}
@ -66,6 +66,21 @@ as large as \code{nodeSize}.}
controls tree length; by default \code{maxNodeDepth} is an extremely high
number and tree depth is controlled by \code{nodeSize}.}
\item{na.penalty}{This parameter controls whether predictor variables with
NAs should be penalized when being considered for a best split. Best splits
(and their associated scores) are determined using only the non-NA data; the
penalty then takes the best split identified, randomly assigns the NAs left
or right (according to the proportions of the non-NA data sent left and
right), and recalculates the split score, which is then compared with those
of the other candidate variables. This penalty adds some computational time,
so it may be disabled for some variables. \code{na.penalty} may be specified
as a vector of logicals indicating, for each predictor variable, whether the
penalty should be applied to that variable; if it has length 1 it applies to
all variables. Alternatively, a single numeric value may be provided as a
threshold: the penalty is activated only for variables whose proportion of
NAs in the training set is at least that threshold.}
\item{splitPureNodes}{This parameter determines whether the algorithm will
split a pure node. If set to FALSE, then before every split it will check
that every response is the same, and if so, not split. If set to TRUE it
@ -160,7 +175,7 @@ data <- data.frame(x1, x2)
forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
LogRankSplitFinder(1:2), CR_ResponseCombiner(1:2),
CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
mtry=1, nodeSize=10)
newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
ypred <- predict(forest, newData)
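
As a usage note, the new na.penalty argument slots into the call shown in the example above in any of its documented forms; these hypothetical variants reuse the delta, u, x1, x2, and data objects from that example:

# Threshold form: penalize only predictors whose NA proportion is at least 10%.
forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
    LogRankSplitFinder(1:2), CR_ResponseCombiner(1:2),
    CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
    mtry=1, nodeSize=10, na.penalty=0.1)

# Per-predictor form: penalize x1 but not x2.
forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
    LogRankSplitFinder(1:2), CR_ResponseCombiner(1:2),
    CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
    mtry=1, nodeSize=10, na.penalty=c(TRUE, FALSE))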