Add optional penalties when splitting variables that have NAs.
Also include several bug fixes from the Java project.
parent 48859b0249 · commit 3f0f6c0878
7 changed files with 114 additions and 49 deletions
DESCRIPTION
@@ -1,7 +1,7 @@
 Package: largeRCRF
 Type: Package
 Title: Large Random Competing Risks Forests
-Version: 1.0.4
+Version: 1.0.5
 Authors@R: c(
     person("Joel", "Therrien", email = "joel_therrien@sfu.ca", role = c("aut", "cre", "cph")),
     person("Jiguo", "Cao", email = "jiguo_cao@sfu.ca", role = c("aut", "dgs"))
@@ -29,8 +29,8 @@
 NULL
 
 # @rdname covariates
-Java_BooleanCovariate <- function(name, index){
-  covariate <- .jnew(.class_BooleanCovariate, name, as.integer(index))
+Java_BooleanCovariate <- function(name, index, na.penalty){
+  covariate <- .jnew(.class_BooleanCovariate, name, as.integer(index), na.penalty)
   covariate <- .jcast(covariate, .class_Object) # needed for later adding it into Java Lists
 
   return(covariate)
@@ -38,19 +38,19 @@ Java_BooleanCovariate <- function(name, index){
 
 # @rdname covariates
 # @param levels The levels of the factor as a character vector
-Java_FactorCovariate <- function(name, index, levels){
+Java_FactorCovariate <- function(name, index, levels, na.penalty){
   levelsArray <- .jarray(levels, makeResponse(.class_String))
   levelsList <- .jcall("java/util/Arrays", "Ljava/util/List;", "asList", .jcast(levelsArray, "[Ljava/lang/Object;"))
 
-  covariate <- .jnew(.class_FactorCovariate, name, as.integer(index), levelsList)
+  covariate <- .jnew(.class_FactorCovariate, name, as.integer(index), levelsList, na.penalty)
   covariate <- .jcast(covariate, .class_Object) # needed for later adding it into Java Lists
 
   return(covariate)
 }
 
 # @rdname covariates
-Java_NumericCovariate <- function(name, index){
-  covariate <- .jnew(.class_NumericCovariate, name, as.integer(index))
+Java_NumericCovariate <- function(name, index, na.penalty){
+  covariate <- .jnew(.class_NumericCovariate, name, as.integer(index), na.penalty)
   covariate <- .jcast(covariate, .class_Object) # needed for later adding it into Java Lists
 
   return(covariate)
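Aside: these Java_* constructors are internal helpers, but the new signatures are easy to sketch in use. A minimal illustration, assuming the package's Java classes are already initialized through rJava; the covariate names, indices, and penalty flags here are hypothetical:

    # Indices are zero-based because they are handed straight to the Java side.
    cov1 <- Java_NumericCovariate("age", 0L, TRUE)                # penalize NAs in 'age'
    cov2 <- Java_BooleanCovariate("smoker", 1L, FALSE)            # no NA penalty here
    cov3 <- Java_FactorCovariate("group", 2L, c("a", "b"), TRUE)  # levels plus penalty flag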
R/loadData.R (12 changed lines)
@@ -1,4 +1,4 @@
-loadData <- function(data, xVarNames, responses, covariateList.java = NULL){
+loadData <- function(data, xVarNames, responses, covariateList.java = NULL, na.penalty = NULL){
 
   if(class(responses) == "integer" | class(responses) == "numeric"){
     responses <- Numeric(responses)
@@ -6,7 +6,7 @@ loadData <- function(data, xVarNames, responses, covariateList.java = NULL){
 
   # connectToData provides a pre-created covariate list we can re-use
   if(is.null(covariateList.java)){
-    covariateList.java <- getCovariateList(data, xVarNames)
+    covariateList.java <- getCovariateList(data, xVarNames, na.penalty)
   }
 
   textColumns <- list()
@@ -22,7 +22,7 @@ loadData <- function(data, xVarNames, responses, covariateList.java = NULL){
 }
 
 
-getCovariateList <- function(data, xvarNames){
+getCovariateList <- function(data, xvarNames, na.penalty){
   covariateList <- .jcast(.jnew(.class_ArrayList, length(xvarNames)), .class_List)
 
   for(i in 1:length(xvarNames)){
@@ -31,14 +31,14 @@ getCovariateList <- function(data, xvarNames){
     column <- data[,xName]
 
     if(class(column) == "numeric" | class(column) == "integer"){
-      covariate <- Java_NumericCovariate(xName, i-1)
+      covariate <- Java_NumericCovariate(xName, i-1, na.penalty[i])
     }
     else if(class(column) == "logical"){
-      covariate <- Java_BooleanCovariate(xName, i-1)
+      covariate <- Java_BooleanCovariate(xName, i-1, na.penalty[i])
     }
     else if(class(column) == "factor"){
       lvls <- levels(column)
-      covariate <- Java_FactorCovariate(xName, i-1, lvls)
+      covariate <- Java_FactorCovariate(xName, i-1, lvls, na.penalty[i])
     }
     else{
       stop("Unknown column type")
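Aside: class(column) can have length greater than one (an ordered factor has class c("ordered", "factor")), and a length-two condition inside if() is an error in recent versions of R. The dispatch above is fine for plain numeric/logical/factor columns; a sketch of the more defensive idiom, should it ever be needed:

    col <- factor(c("lo", "hi"), ordered = TRUE)
    class(col)      # "ordered" "factor" -- length 2, so class(col) == "factor" is fragile
    is.factor(col)  # TRUE -- robust; likewise is.numeric(col) and is.logical(col)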
@@ -2,7 +2,7 @@
 # Internal function that takes a formula and processes it for use in the Java
 # code. existingCovariateList is optional; if not provided then a new one is
 # created internally.
-processFormula <- function(formula, data, covariateList.java = NULL){
+processFormula <- function(formula, data, covariateList.java = NULL, na.penalty = NULL){
 
   # Having an R copy of the data loaded at the same time can be wasteful; we
   # also allow users to provide an environment of the data which gets removed
@@ -52,8 +52,44 @@ processFormula <- function(formula, data, covariateList.java = NULL){
 
   # remove any response variables on the right-hand-side
   covariateData <- filteredData[, !(names(filteredData) %in% variablesToDrop), drop=FALSE]
 
+  # Now that we know how many predictor variables we have, we should check na.penalty
+  if(!is.null(na.penalty)){
+    if(!is.numeric(na.penalty) & !is.logical(na.penalty)){
+      stop("na.penalty must be either logical or numeric.")
+    }
+
+    if(is.logical(na.penalty) & length(na.penalty) != 1 & length(na.penalty) != ncol(covariateData)){
+      stop("na.penalty must have length of either 1 or the number of predictor variables if logical.")
+    }
+
+    if(is.numeric(na.penalty) & length(na.penalty) != 1){
+      stop("na.penalty must have length 1 if numeric.")
+    }
+
+    if(anyNA(na.penalty)){
+      stop("na.penalty cannot contain NAs.")
+    }
+
+    # All good; now to transform it.
+    if(is.numeric(na.penalty)){
+      na.threshold <- na.penalty
+      na.penalty <- apply(covariateData, 2, function(x){mean(is.na(x))}) >= na.threshold
+    }
+    else if(is.logical(na.penalty) & length(na.penalty) == 1){
+      na.penalty <- rep(na.penalty, times = ncol(covariateData))
+    }
+    # else{} - na.penalty is logical and the correct length; no need to do anything to it
+
+  }
+
-  dataset <- loadData(covariateData, colnames(covariateData), responses, covariateList.java = covariateList.java)
+  dataset <- loadData(
+    covariateData,
+    colnames(covariateData),
+    responses,
+    covariateList.java = covariateList.java,
+    na.penalty = na.penalty
+  )
 
   return(dataset)
 }
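To make the numeric branch above concrete: a single numeric na.penalty acts as a threshold and is expanded into one logical per predictor column. A small sketch of that transformation, using a hypothetical data frame and threshold:

    covariateData <- data.frame(x1 = c(1, NA, NA, 4), x2 = c(NA, 2, 3, 4))
    na.threshold <- 0.3
    # NA proportions per column: x1 = 0.50, x2 = 0.25
    apply(covariateData, 2, function(x){mean(is.na(x))}) >= na.threshold
    #    x1    x2
    #  TRUE FALSE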
R/train.R (50 changed lines)
@@ -178,9 +178,9 @@ train.internal <- function(dataset, splitFinder,
 #' @param data A data.frame containing the columns of the predictors and
 #'   responses.
 #' @param splitFinder A split finder that's used to score splits in the random
-#'   forest training algorithm. See \code{\link{CompetingRiskSplitFinders}}
-#'   or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
-#'   this function tries to pick one based on the response. For
+#'   forest training algorithm. See \code{\link{CompetingRiskSplitFinders}} or
+#'   \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one, this
+#'   function tries to pick one based on the response. For
 #'   \code{\link{CR_Response}} without censor times, it will pick a
 #'   \code{\link{LogRankSplitFinder}}; while if censor times were provided it
 #'   will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
@@ -188,19 +188,19 @@ train.internal <- function(dataset, splitFinder,
 #' @param nodeResponseCombiner A response combiner that's used to combine
 #'   responses for each terminal node in a tree (regression example; average the
 #'   observations in each tree into a single number). See
-#'   \code{\link{CR_ResponseCombiner}} or
-#'   \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
-#'   tries to pick one based on the response. For \code{\link{CR_Response}} it
-#'   picks a \code{\link{CR_ResponseCombiner}}; for integer or numeric
-#'   responses it picks a \code{\link{MeanResponseCombiner}}.
+#'   \code{\link{CR_ResponseCombiner}} or \code{\link{MeanResponseCombiner}}. If
+#'   you don't specify one, this function tries to pick one based on the
+#'   response. For \code{\link{CR_Response}} it picks a
+#'   \code{\link{CR_ResponseCombiner}}; for integer or numeric responses it
+#'   picks a \code{\link{MeanResponseCombiner}}.
 #' @param forestResponseCombiner A response combiner that's used to combine
 #'   predictions across trees into one final result (regression example; average
 #'   the prediction of each tree into a single number). See
-#'   \code{\link{CR_FunctionCombiner}} or
-#'   \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
-#'   tries to pick one based on the response. For \code{\link{CR_Response}} it
-#'   picks a \code{\link{CR_FunctionCombiner}}; for integer or numeric
-#'   responses it picks a \code{\link{MeanResponseCombiner}}.
+#'   \code{\link{CR_FunctionCombiner}} or \code{\link{MeanResponseCombiner}}. If
+#'   you don't specify one, this function tries to pick one based on the
+#'   response. For \code{\link{CR_Response}} it picks a
+#'   \code{\link{CR_FunctionCombiner}}; for integer or numeric responses it
+#'   picks a \code{\link{MeanResponseCombiner}}.
 #' @param ntree An integer that specifies how many trees should be trained.
 #' @param numberOfSplits A tuning parameter specifying how many random splits
 #'   should be tried for a covariate; a value of 0 means all splits will be
@@ -217,6 +217,20 @@ train.internal <- function(dataset, splitFinder,
 #' @param maxNodeDepth This parameter is analogous to \code{nodeSize} in that it
 #'   controls tree length; by default \code{maxNodeDepth} is an extremely high
 #'   number and tree depth is controlled by \code{nodeSize}.
+#' @param na.penalty This parameter controls whether predictor variables with
+#'   NAs should be penalized when being considered for a best split. Best splits
+#'   (and the associated score) are determined on only non-NA data; the penalty
+#'   is to take the best split identified, randomly assign any NAs
+#'   (according to the proportion of data split left and right), and then
+#'   recalculate the corresponding split score, which is then compared with the
+#'   scores of the other candidate variables. This penalty adds some computational
+#'   time, so it may be disabled for some variables. \code{na.penalty} may be
+#'   specified as a vector of logicals indicating, for each predictor variable,
+#'   whether the penalty should be applied to that variable. If it has length 1
+#'   then it applies to all variables. Alternatively, a single numeric value may
+#'   be provided as a threshold whereby the penalty is activated only if the
+#'   proportion of NAs for that variable in the training set meets or exceeds
+#'   that threshold.
 #' @param splitPureNodes This parameter determines whether the algorithm will
 #'   split a pure node. If set to FALSE, then before every split it will check
 #'   that every response is the same, and if so, not split. If set to TRUE it
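A usage sketch of the three accepted forms of na.penalty; the formula and data are hypothetical, and the remaining tuning parameters mirror the example later in this file:

    # Logical scalar: penalize every predictor (this is the default).
    train(y ~ x1 + x2, data, ntree = 100, numberOfSplits = 5, mtry = 1,
          nodeSize = 10, na.penalty = TRUE)
    # Logical vector, one entry per predictor: penalize x1 but not x2.
    train(y ~ x1 + x2, data, ntree = 100, numberOfSplits = 5, mtry = 1,
          nodeSize = 10, na.penalty = c(TRUE, FALSE))
    # Numeric threshold: penalize only predictors whose NA proportion is >= 0.5.
    train(y ~ x1 + x2, data, ntree = 100, numberOfSplits = 5, mtry = 1,
          nodeSize = 10, na.penalty = 0.5)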
@@ -290,17 +304,17 @@ train.internal <- function(dataset, splitFinder,
 #'
 #' forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
 #'                 LogRankSplitFinder(1:2), CR_ResponseCombiner(1:2),
-#'        CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
+#'                 CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
 #'                 mtry=1, nodeSize=10)
 #' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
 #' ypred <- predict(forest, newData)
 train <- function(formula, data, splitFinder = NULL, nodeResponseCombiner = NULL,
                   forestResponseCombiner = NULL, ntree, numberOfSplits, mtry,
-                  nodeSize, maxNodeDepth = 100000, splitPureNodes=TRUE, savePath=NULL,
-                  savePath.overwrite=c("warn", "delete", "merge"), cores = getCores(),
-                  randomSeed = NULL, displayProgress = TRUE){
+                  nodeSize, maxNodeDepth = 100000, na.penalty = TRUE, splitPureNodes=TRUE,
+                  savePath=NULL, savePath.overwrite=c("warn", "delete", "merge"),
+                  cores = getCores(), randomSeed = NULL, displayProgress = TRUE){
 
-  dataset <- processFormula(formula, data)
+  dataset <- processFormula(formula, data, na.penalty = na.penalty)
 
   forest <- train.internal(dataset, splitFinder = splitFinder,
                            nodeResponseCombiner = nodeResponseCombiner,
Binary file not shown.
man/train.Rd (47 changed lines)
@@ -6,8 +6,8 @@
 \usage{
 train(formula, data, splitFinder = NULL, nodeResponseCombiner = NULL,
   forestResponseCombiner = NULL, ntree, numberOfSplits, mtry, nodeSize,
-  maxNodeDepth = 1e+05, splitPureNodes = TRUE, savePath = NULL,
-  savePath.overwrite = c("warn", "delete", "merge"),
+  maxNodeDepth = 1e+05, na.penalty = TRUE, splitPureNodes = TRUE,
+  savePath = NULL, savePath.overwrite = c("warn", "delete", "merge"),
   cores = getCores(), randomSeed = NULL, displayProgress = TRUE)
 }
 \arguments{
@@ -19,9 +19,9 @@ constructed.}
 responses.}
 
 \item{splitFinder}{A split finder that's used to score splits in the random
-forest training algorithm. See \code{\link{CompetingRiskSplitFinders}}
-or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
-this function tries to pick one based on the response. For
+forest training algorithm. See \code{\link{CompetingRiskSplitFinders}} or
+\code{\link{WeightedVarianceSplitFinder}}. If you don't specify one, this
+function tries to pick one based on the response. For
 \code{\link{CR_Response}} without censor times, it will pick a
 \code{\link{LogRankSplitFinder}}; while if censor times were provided it
 will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
|
|||
\item{nodeResponseCombiner}{A response combiner that's used to combine
|
||||
responses for each terminal node in a tree (regression example; average the
|
||||
observations in each tree into a single number). See
|
||||
\code{\link{CR_ResponseCombiner}} or
|
||||
\code{\link{MeanResponseCombiner}}. If you don't specify one, this function
|
||||
tries to pick one based on the response. For \code{\link{CR_Response}} it
|
||||
picks a \code{\link{CR_ResponseCombiner}}; for integer or numeric
|
||||
responses it picks a \code{\link{MeanResponseCombiner}}.}
|
||||
\code{\link{CR_ResponseCombiner}} or \code{\link{MeanResponseCombiner}}. If
|
||||
you don't specify one, this function tries to pick one based on the
|
||||
response. For \code{\link{CR_Response}} it picks a
|
||||
\code{\link{CR_ResponseCombiner}}; for integer or numeric responses it
|
||||
picks a \code{\link{MeanResponseCombiner}}.}
|
||||
|
||||
\item{forestResponseCombiner}{A response combiner that's used to combine
|
||||
predictions across trees into one final result (regression example; average
|
||||
the prediction of each tree into a single number). See
|
||||
\code{\link{CR_FunctionCombiner}} or
|
||||
\code{\link{MeanResponseCombiner}}. If you don't specify one, this function
|
||||
tries to pick one based on the response. For \code{\link{CR_Response}} it
|
||||
picks a \code{\link{CR_FunctionCombiner}}; for integer or numeric
|
||||
responses it picks a \code{\link{MeanResponseCombiner}}.}
|
||||
\code{\link{CR_FunctionCombiner}} or \code{\link{MeanResponseCombiner}}. If
|
||||
you don't specify one, this function tries to pick one based on the
|
||||
response. For \code{\link{CR_Response}} it picks a
|
||||
\code{\link{CR_FunctionCombiner}}; for integer or numeric responses it
|
||||
picks a \code{\link{MeanResponseCombiner}}.}
|
||||
|
||||
\item{ntree}{An integer that specifies how many trees should be trained.}
|
||||
|
||||
|
@@ -66,6 +66,21 @@ as large as \code{nodeSize}.}
 controls tree length; by default \code{maxNodeDepth} is an extremely high
 number and tree depth is controlled by \code{nodeSize}.}
 
+\item{na.penalty}{This parameter controls whether predictor variables with
+NAs should be penalized when being considered for a best split. Best splits
+(and the associated score) are determined on only non-NA data; the penalty
+is to take the best split identified, randomly assign any NAs
+(according to the proportion of data split left and right), and then
+recalculate the corresponding split score, which is then compared with the
+scores of the other candidate variables. This penalty adds some computational
+time, so it may be disabled for some variables. \code{na.penalty} may be
+specified as a vector of logicals indicating, for each predictor variable,
+whether the penalty should be applied to that variable. If it has length 1
+then it applies to all variables. Alternatively, a single numeric value may
+be provided as a threshold whereby the penalty is activated only if the
+proportion of NAs for that variable in the training set meets or exceeds
+that threshold.}
+
 \item{splitPureNodes}{This parameter determines whether the algorithm will
 split a pure node. If set to FALSE, then before every split it will check
 that every response is the same, and if so, not split. If set to TRUE it
@@ -160,7 +175,7 @@ data <- data.frame(x1, x2)
 
 forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
                 LogRankSplitFinder(1:2), CR_ResponseCombiner(1:2),
-       CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
+                CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
                 mtry=1, nodeSize=10)
 newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
 ypred <- predict(forest, newData)