Add optional penalties when splitting variables that have NAs.

Also include several bug fixes from the Java project.
Joel Therrien 2019-08-29 13:54:38 -07:00
parent 48859b0249
commit 3f0f6c0878
7 changed files with 114 additions and 49 deletions

View file

@@ -1,7 +1,7 @@
 Package: largeRCRF
 Type: Package
 Title: Large Random Competing Risks Forests
-Version: 1.0.4
+Version: 1.0.5
 Authors@R: c(
     person("Joel", "Therrien", email = "joel_therrien@sfu.ca", role = c("aut", "cre", "cph")),
     person("Jiguo", "Cao", email = "jiguo_cao@sfu.ca", role = c("aut", "dgs"))

View file

@@ -29,8 +29,8 @@
 NULL

 # @rdname covariates
-Java_BooleanCovariate <- function(name, index){
-  covariate <- .jnew(.class_BooleanCovariate, name, as.integer(index))
+Java_BooleanCovariate <- function(name, index, na.penalty){
+  covariate <- .jnew(.class_BooleanCovariate, name, as.integer(index), na.penalty)
   covariate <- .jcast(covariate, .class_Object) # needed for later adding it into Java Lists
   return(covariate)
@@ -38,19 +38,19 @@ Java_BooleanCovariate <- function(name, index){
 # @rdname covariates
 # @param levels The levels of the factor as a character vector
-Java_FactorCovariate <- function(name, index, levels){
+Java_FactorCovariate <- function(name, index, levels, na.penalty){
   levelsArray <- .jarray(levels, makeResponse(.class_String))
   levelsList <- .jcall("java/util/Arrays", "Ljava/util/List;", "asList", .jcast(levelsArray, "[Ljava/lang/Object;"))
-  covariate <- .jnew(.class_FactorCovariate, name, as.integer(index), levelsList)
+  covariate <- .jnew(.class_FactorCovariate, name, as.integer(index), levelsList, na.penalty)
   covariate <- .jcast(covariate, .class_Object) # needed for later adding it into Java Lists
   return(covariate)
 }

 # @rdname covariates
-Java_NumericCovariate <- function(name, index){
-  covariate <- .jnew(.class_NumericCovariate, name, as.integer(index))
+Java_NumericCovariate <- function(name, index, na.penalty){
+  covariate <- .jnew(.class_NumericCovariate, name, as.integer(index), na.penalty)
   covariate <- .jcast(covariate, .class_Object) # needed for later adding it into Java Lists
   return(covariate)
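
Note: the new na.penalty argument is handed straight to each covariate's Java constructor via .jnew. For readers unfamiliar with rJava's argument marshalling, a plain R logical maps to a Java boolean, which is what makes this pass-through work. A minimal, self-contained illustration of that mapping (java.lang.Boolean is used only for demonstration; it is not part of this package):

library(rJava)
.jinit()

# An R logical passed to .jnew() is marshalled to a Java boolean,
# the same mechanism the na.penalty pass-through above relies on.
b <- .jnew("java/lang/Boolean", TRUE)
.jcall(b, "Z", "booleanValue")  # returns TRUE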

View file

@@ -1,4 +1,4 @@
-loadData <- function(data, xVarNames, responses, covariateList.java = NULL){
+loadData <- function(data, xVarNames, responses, covariateList.java = NULL, na.penalty = NULL){
   if(class(responses) == "integer" | class(responses) == "numeric"){
     responses <- Numeric(responses)
@@ -6,7 +6,7 @@ loadData <- function(data, xVarNames, responses, covariateList.java = NULL){
   # connectToData provides a pre-created covariate list we can re-use
   if(is.null(covariateList.java)){
-    covariateList.java <- getCovariateList(data, xVarNames)
+    covariateList.java <- getCovariateList(data, xVarNames, na.penalty)
   }

   textColumns <- list()
@@ -22,7 +22,7 @@ loadData <- function(data, xVarNames, responses, covariateList.java = NULL){
 }

-getCovariateList <- function(data, xvarNames){
+getCovariateList <- function(data, xvarNames, na.penalty){
   covariateList <- .jcast(.jnew(.class_ArrayList, length(xvarNames)), .class_List)

   for(i in 1:length(xvarNames)){
@@ -31,14 +31,14 @@ getCovariateList <- function(data, xvarNames){
     column <- data[,xName]

     if(class(column) == "numeric" | class(column) == "integer"){
-      covariate <- Java_NumericCovariate(xName, i-1)
+      covariate <- Java_NumericCovariate(xName, i-1, na.penalty[i])
     }
     else if(class(column) == "logical"){
-      covariate <- Java_BooleanCovariate(xName, i-1)
+      covariate <- Java_BooleanCovariate(xName, i-1, na.penalty[i])
     }
     else if(class(column) == "factor"){
       lvls <- levels(column)
-      covariate <- Java_FactorCovariate(xName, i-1, lvls)
+      covariate <- Java_FactorCovariate(xName, i-1, lvls, na.penalty[i])
     }
     else{
       stop("Unknown column type")

View file

@@ -2,7 +2,7 @@
 # Internal function that takes a formula and processes it for use in the Java
 # code. existingCovariateList is optional; if not provided then a new one is
 # created internally.
-processFormula <- function(formula, data, covariateList.java = NULL){
+processFormula <- function(formula, data, covariateList.java = NULL, na.penalty = NULL){
   # Having an R copy of the data loaded at the same time can be wasteful; we
   # also allow users to provide an environment of the data which gets removed
@@ -52,8 +52,44 @@ processFormula <- function(formula, data, covariateList.java = NULL){
   # remove any response variables on the right-hand-side
   covariateData <- filteredData[, !(names(filteredData) %in% variablesToDrop), drop=FALSE]

+  # Now that we know how many predictor variables we have, we should check na.penalty
+  if(!is.null(na.penalty)){
+    if(!is.numeric(na.penalty) & !is.logical(na.penalty)){
+      stop("na.penalty must be either logical or numeric.")
+    }
+
+    if(is.logical(na.penalty) & length(na.penalty) != 1 & length(na.penalty) != ncol(covariateData)){
+      stop("na.penalty must have length of either 1 or the number of predictor variables if logical.")
+    }
+
+    if(is.numeric(na.penalty) & length(na.penalty) != 1){
stop("na.penalty must have length 1 if logical.")
+    }
+
+    if(anyNA(na.penalty)){
+      stop("na.penalty cannot contain NAs.")
+    }
+
+    # All good; now to transform it.
+    if(is.numeric(na.penalty)){
+      na.threshold <- na.penalty
+      na.penalty <- apply(covariateData, 2, function(x){mean(is.na(x))}) >= na.threshold
+    }
+    else if(is.logical(na.penalty) & length(na.penalty) == 1){
+      na.penalty <- rep(na.penalty, times = ncol(covariateData))
+    }
+    # else{} - na.penalty is logical and the correct length; no need to do anything to it
+  }
-  dataset <- loadData(covariateData, colnames(covariateData), responses, covariateList.java = covariateList.java)
+  dataset <- loadData(
+    covariateData,
+    colnames(covariateData),
+    responses,
+    covariateList.java = covariateList.java,
+    na.penalty = na.penalty
+  )

   return(dataset)
 }
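
Note: the numeric branch above turns a single threshold into one logical flag per predictor by comparing each column's NA proportion against it. A standalone sketch of that conversion (na_flags and df are hypothetical names used only for illustration; sapply is used here instead of apply, which would first coerce the data.frame to a matrix):

# Flag the columns whose NA proportion reaches the threshold.
na_flags <- function(df, threshold){
  prop.na <- sapply(df, function(x){ mean(is.na(x)) })
  prop.na >= threshold
}

df <- data.frame(a = c(1, NA, 3, 4), b = c(NA, NA, 3, NA), c = 1:4)
na_flags(df, 0.5)  # a = FALSE (25% NA), b = TRUE (75% NA), c = FALSE (0% NA)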

View file

@@ -178,9 +178,9 @@ train.internal <- function(dataset, splitFinder,
 #' @param data A data.frame containing the columns of the predictors and
 #' responses.
 #' @param splitFinder A split finder that's used to score splits in the random
-#' forest training algorithm. See \code{\link{CompetingRiskSplitFinders}}
-#' or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
-#' this function tries to pick one based on the response. For
+#' forest training algorithm. See \code{\link{CompetingRiskSplitFinders}} or
+#' \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one, this
+#' function tries to pick one based on the response. For
 #' \code{\link{CR_Response}} without censor times, it will pick a
 #' \code{\link{LogRankSplitFinder}}; while if censor times were provided it
 #' will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
@@ -188,19 +188,19 @@ train.internal <- function(dataset, splitFinder,
 #' @param nodeResponseCombiner A response combiner that's used to combine
 #' responses for each terminal node in a tree (regression example; average the
 #' observations in each tree into a single number). See
-#' \code{\link{CR_ResponseCombiner}} or
-#' \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
-#' tries to pick one based on the response. For \code{\link{CR_Response}} it
-#' picks a \code{\link{CR_ResponseCombiner}}; for integer or numeric
-#' responses it picks a \code{\link{MeanResponseCombiner}}.
+#' \code{\link{CR_ResponseCombiner}} or \code{\link{MeanResponseCombiner}}. If
+#' you don't specify one, this function tries to pick one based on the
+#' response. For \code{\link{CR_Response}} it picks a
+#' \code{\link{CR_ResponseCombiner}}; for integer or numeric responses it
+#' picks a \code{\link{MeanResponseCombiner}}.
 #' @param forestResponseCombiner A response combiner that's used to combine
 #' predictions across trees into one final result (regression example; average
 #' the prediction of each tree into a single number). See
-#' \code{\link{CR_FunctionCombiner}} or
-#' \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
-#' tries to pick one based on the response. For \code{\link{CR_Response}} it
-#' picks a \code{\link{CR_FunctionCombiner}}; for integer or numeric
-#' responses it picks a \code{\link{MeanResponseCombiner}}.
+#' \code{\link{CR_FunctionCombiner}} or \code{\link{MeanResponseCombiner}}. If
+#' you don't specify one, this function tries to pick one based on the
+#' response. For \code{\link{CR_Response}} it picks a
+#' \code{\link{CR_FunctionCombiner}}; for integer or numeric responses it
+#' picks a \code{\link{MeanResponseCombiner}}.
 #' @param ntree An integer that specifies how many trees should be trained.
 #' @param numberOfSplits A tuning parameter specifying how many random splits
 #' should be tried for a covariate; a value of 0 means all splits will be
@@ -217,6 +217,20 @@ train.internal <- function(dataset, splitFinder,
 #' @param maxNodeDepth This parameter is analogous to \code{nodeSize} in that it
 #' controls tree length; by default \code{maxNodeDepth} is an extremely high
 #' number and tree depth is controlled by \code{nodeSize}.
+#' @param na.penalty This parameter controls whether predictor variables with
+#' NAs should be penalized when being considered for a best split. Best splits
+#' (and the associated scores) are determined using only non-NA data; the
+#' penalty is to take the best split identified, randomly assign any NAs
+#' (according to the proportion of data split left and right), and then
+#' recalculate the corresponding split score, which is then compared with
+#' those of the other candidate split variables. This penalty adds some
+#' computational time, so it may be disabled for some variables.
+#' \code{na.penalty} may be specified as a vector of logicals indicating, for
+#' each predictor variable, whether the penalty should be applied to that
+#' variable; if it has length 1 then it applies to all variables.
+#' Alternatively, a single numeric value may be provided as a threshold
+#' whereby the penalty is activated only if the proportion of NAs for that
+#' variable in the training set reaches or exceeds that threshold.
 #' @param splitPureNodes This parameter determines whether the algorithm will
 #' split a pure node. If set to FALSE, then before every split it will check
 #' that every response is the same, and if so, not split. If set to TRUE it
@@ -290,17 +304,17 @@ train.internal <- function(dataset, splitFinder,
 #'
 #' forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
 #'                 LogRankSplitFinder(1:2), CR_ResponseCombiner(1:2),
 #'                 CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
 #'                 mtry=1, nodeSize=10)
 #' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
 #' ypred <- predict(forest, newData)
 train <- function(formula, data, splitFinder = NULL, nodeResponseCombiner = NULL,
                   forestResponseCombiner = NULL, ntree, numberOfSplits, mtry,
-                  nodeSize, maxNodeDepth = 100000, splitPureNodes=TRUE, savePath=NULL,
-                  savePath.overwrite=c("warn", "delete", "merge"), cores = getCores(),
-                  randomSeed = NULL, displayProgress = TRUE){
+                  nodeSize, maxNodeDepth = 100000, na.penalty = TRUE, splitPureNodes=TRUE,
+                  savePath=NULL, savePath.overwrite=c("warn", "delete", "merge"),
+                  cores = getCores(), randomSeed = NULL, displayProgress = TRUE){
-  dataset <- processFormula(formula, data)
+  dataset <- processFormula(formula, data, na.penalty = na.penalty)

   forest <- train.internal(dataset, splitFinder = splitFinder,
                            nodeResponseCombiner = nodeResponseCombiner,
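
Note: the penalty itself is computed on the Java side, but the idea in the @param na.penalty text can be sketched in plain R: find the best split on the non-NA values, then send each NA row left at random with probability equal to the proportion of non-NA data that went left, and rescore on the full assignment. A rough illustration under those assumptions (not the package's actual implementation):

set.seed(1)
x <- c(1.2, NA, 3.5, NA, 0.7, 2.9)
cutoff <- 2.0                        # a best split found on the non-NA values

goesLeft <- x[!is.na(x)] <= cutoff   # split of the non-NA rows
pLeft <- mean(goesLeft)              # proportion of non-NA data going left

# Each NA row is randomly assigned left with probability pLeft
naGoesLeft <- runif(sum(is.na(x))) < pLeft

# Full left/right assignment on which the split score is recalculated
assignment <- logical(length(x))
assignment[!is.na(x)] <- goesLeft
assignment[is.na(x)] <- naGoesLeft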

View file

@@ -6,8 +6,8 @@
 \usage{
 train(formula, data, splitFinder = NULL, nodeResponseCombiner = NULL,
   forestResponseCombiner = NULL, ntree, numberOfSplits, mtry, nodeSize,
-  maxNodeDepth = 1e+05, splitPureNodes = TRUE, savePath = NULL,
-  savePath.overwrite = c("warn", "delete", "merge"),
+  maxNodeDepth = 1e+05, na.penalty = TRUE, splitPureNodes = TRUE,
+  savePath = NULL, savePath.overwrite = c("warn", "delete", "merge"),
   cores = getCores(), randomSeed = NULL, displayProgress = TRUE)
 }
 \arguments{
@@ -19,9 +19,9 @@ constructed.}
 responses.}

 \item{splitFinder}{A split finder that's used to score splits in the random
-forest training algorithm. See \code{\link{CompetingRiskSplitFinders}}
-or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
-this function tries to pick one based on the response. For
+forest training algorithm. See \code{\link{CompetingRiskSplitFinders}} or
+\code{\link{WeightedVarianceSplitFinder}}. If you don't specify one, this
+function tries to pick one based on the response. For
 \code{\link{CR_Response}} without censor times, it will pick a
 \code{\link{LogRankSplitFinder}}; while if censor times were provided it
 will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
@@ -30,20 +30,20 @@ responses it picks a \code{\link{WeightedVarianceSplitFinder}}.}
 \item{nodeResponseCombiner}{A response combiner that's used to combine
 responses for each terminal node in a tree (regression example; average the
 observations in each tree into a single number). See
-\code{\link{CR_ResponseCombiner}} or
-\code{\link{MeanResponseCombiner}}. If you don't specify one, this function
-tries to pick one based on the response. For \code{\link{CR_Response}} it
-picks a \code{\link{CR_ResponseCombiner}}; for integer or numeric
-responses it picks a \code{\link{MeanResponseCombiner}}.}
+\code{\link{CR_ResponseCombiner}} or \code{\link{MeanResponseCombiner}}. If
+you don't specify one, this function tries to pick one based on the
+response. For \code{\link{CR_Response}} it picks a
+\code{\link{CR_ResponseCombiner}}; for integer or numeric responses it
+picks a \code{\link{MeanResponseCombiner}}.}

 \item{forestResponseCombiner}{A response combiner that's used to combine
 predictions across trees into one final result (regression example; average
 the prediction of each tree into a single number). See
-\code{\link{CR_FunctionCombiner}} or
-\code{\link{MeanResponseCombiner}}. If you don't specify one, this function
-tries to pick one based on the response. For \code{\link{CR_Response}} it
-picks a \code{\link{CR_FunctionCombiner}}; for integer or numeric
-responses it picks a \code{\link{MeanResponseCombiner}}.}
+\code{\link{CR_FunctionCombiner}} or \code{\link{MeanResponseCombiner}}. If
+you don't specify one, this function tries to pick one based on the
+response. For \code{\link{CR_Response}} it picks a
+\code{\link{CR_FunctionCombiner}}; for integer or numeric responses it
+picks a \code{\link{MeanResponseCombiner}}.}

 \item{ntree}{An integer that specifies how many trees should be trained.}
@@ -66,6 +66,21 @@ as large as \code{nodeSize}.}
 controls tree length; by default \code{maxNodeDepth} is an extremely high
 number and tree depth is controlled by \code{nodeSize}.}

+\item{na.penalty}{This parameter controls whether predictor variables with
+NAs should be penalized when being considered for a best split. Best splits
+(and the associated scores) are determined using only non-NA data; the
+penalty is to take the best split identified, randomly assign any NAs
+(according to the proportion of data split left and right), and then
+recalculate the corresponding split score, which is then compared with
+those of the other candidate split variables. This penalty adds some
+computational time, so it may be disabled for some variables.
+\code{na.penalty} may be specified as a vector of logicals indicating, for
+each predictor variable, whether the penalty should be applied to that
+variable; if it has length 1 then it applies to all variables.
+Alternatively, a single numeric value may be provided as a threshold
+whereby the penalty is activated only if the proportion of NAs for that
+variable in the training set reaches or exceeds that threshold.}
+
 \item{splitPureNodes}{This parameter determines whether the algorithm will
 split a pure node. If set to FALSE, then before every split it will check
 that every response is the same, and if so, not split. If set to TRUE it
@@ -160,7 +175,7 @@ data <- data.frame(x1, x2)

 forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
                 LogRankSplitFinder(1:2), CR_ResponseCombiner(1:2),
                 CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
                 mtry=1, nodeSize=10)
 newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
 ypred <- predict(forest, newData)
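
Note: given the updated signature, na.penalty can be supplied in any of the three documented forms. Illustrative calls only, reusing the simulated data from the example above:

# Apply the penalty to every predictor (the default):
forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
                ntree=100, numberOfSplits=5, mtry=1, nodeSize=10,
                na.penalty = TRUE)

# Apply the penalty to x1 but not x2:
forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
                ntree=100, numberOfSplits=5, mtry=1, nodeSize=10,
                na.penalty = c(TRUE, FALSE))

# Apply the penalty only to predictors whose NA proportion is at least 10%:
forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
                ntree=100, numberOfSplits=5, mtry=1, nodeSize=10,
                na.penalty = 0.1)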