diff --git a/DESCRIPTION b/DESCRIPTION
index b8829b0..6dca363 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: largeRCRF
 Type: Package
 Title: Large Random Competing Risks Forests
-Version: 1.0.4
+Version: 1.0.5
 Authors@R: c(
     person("Joel", "Therrien", email = "joel_therrien@sfu.ca", role = c("aut", "cre", "cph")),
     person("Jiguo", "Cao", email = "jiguo_cao@sfu.ca", role = c("aut", "dgs"))
diff --git a/R/create_java_covariates.R b/R/create_java_covariates.R
index 3c60a25..4cb1bd0 100644
--- a/R/create_java_covariates.R
+++ b/R/create_java_covariates.R
@@ -29,8 +29,8 @@ NULL
 
 # @rdname covariates
-Java_BooleanCovariate <- function(name, index){
-  covariate <- .jnew(.class_BooleanCovariate, name, as.integer(index))
+Java_BooleanCovariate <- function(name, index, na.penalty){
+  covariate <- .jnew(.class_BooleanCovariate, name, as.integer(index), na.penalty)
   covariate <- .jcast(covariate, .class_Object) # needed for later adding it into Java Lists
 
   return(covariate)
@@ -38,19 +38,19 @@ Java_BooleanCovariate <- function(name, index){
 
 # @rdname covariates
 # @param levels The levels of the factor as a character vector
-Java_FactorCovariate <- function(name, index, levels){
+Java_FactorCovariate <- function(name, index, levels, na.penalty){
   levelsArray <- .jarray(levels, makeResponse(.class_String))
   levelsList <- .jcall("java/util/Arrays", "Ljava/util/List;", "asList", .jcast(levelsArray, "[Ljava/lang/Object;"))
 
-  covariate <- .jnew(.class_FactorCovariate, name, as.integer(index), levelsList)
+  covariate <- .jnew(.class_FactorCovariate, name, as.integer(index), levelsList, na.penalty)
   covariate <- .jcast(covariate, .class_Object) # needed for later adding it into Java Lists
 
   return(covariate)
 }
 
 # @rdname covariates
-Java_NumericCovariate <- function(name, index){
-  covariate <- .jnew(.class_NumericCovariate, name, as.integer(index))
+Java_NumericCovariate <- function(name, index, na.penalty){
+  covariate <- .jnew(.class_NumericCovariate, name, as.integer(index), na.penalty)
   covariate <- .jcast(covariate, .class_Object) # needed for later adding it into Java Lists
 
   return(covariate)
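
These constructor wrappers lean on rJava's default type mapping: the R logical handed in as na.penalty must resolve against a new boolean parameter on each Java constructor (the .class_* constants are internal to the package). A quick standalone check of that logical-to-boolean mapping, outside the package:

    library(rJava)
    .jinit()

    # An R logical scalar matches a Java boolean parameter during
    # constructor lookup; this is the same mapping the na.penalty
    # argument above relies on.
    b <- .jnew("java/lang/Boolean", TRUE)
    .jcall(b, "Z", "booleanValue")  # returns TRUE
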
diff --git a/R/loadData.R b/R/loadData.R
index 7a1c0df..5ee2383 100644
--- a/R/loadData.R
+++ b/R/loadData.R
@@ -1,4 +1,4 @@
-loadData <- function(data, xVarNames, responses, covariateList.java = NULL){
+loadData <- function(data, xVarNames, responses, covariateList.java = NULL, na.penalty = NULL){
 
   if(class(responses) == "integer" | class(responses) == "numeric"){
     responses <- Numeric(responses)
@@ -6,7 +6,7 @@ loadData <- function(data, xVarNames, responses, covariateList.java = NULL){
 
   # connectToData provides a pre-created covariate list we can re-use
   if(is.null(covariateList.java)){
-    covariateList.java <- getCovariateList(data, xVarNames)
+    covariateList.java <- getCovariateList(data, xVarNames, na.penalty)
   }
 
   textColumns <- list()
@@ -22,7 +22,7 @@ loadData <- function(data, xVarNames, responses, covariateList.java = NULL){
 
 }
 
-getCovariateList <- function(data, xvarNames){
+getCovariateList <- function(data, xvarNames, na.penalty){
   covariateList <- .jcast(.jnew(.class_ArrayList, length(xvarNames)), .class_List)
 
   for(i in 1:length(xvarNames)){
@@ -31,14 +31,14 @@ getCovariateList <- function(data, xvarNames){
     column <- data[,xName]
 
     if(class(column) == "numeric" | class(column) == "integer"){
-      covariate <- Java_NumericCovariate(xName, i-1)
+      covariate <- Java_NumericCovariate(xName, i-1, na.penalty[i])
     }
     else if(class(column) == "logical"){
-      covariate <- Java_BooleanCovariate(xName, i-1)
+      covariate <- Java_BooleanCovariate(xName, i-1, na.penalty[i])
     }
     else if(class(column) == "factor"){
       lvls <- levels(column)
-      covariate <- Java_FactorCovariate(xName, i-1, lvls)
+      covariate <- Java_FactorCovariate(xName, i-1, lvls, na.penalty[i])
     }
     else{
       stop("Unknown column type")
diff --git a/R/processFormula.R b/R/processFormula.R
index 26d1097..e86b369 100644
--- a/R/processFormula.R
+++ b/R/processFormula.R
@@ -2,7 +2,7 @@
 # Internal function that takes a formula and processes it for use in the Java
 # code. existingCovariateList is optional; if not provided then a new one is
 # created internally.
-processFormula <- function(formula, data, covariateList.java = NULL){
+processFormula <- function(formula, data, covariateList.java = NULL, na.penalty = NULL){
 
   # Having an R copy of the data loaded at the same time can be wasteful; we
   # also allow users to provide an environment of the data which gets removed
@@ -52,8 +52,44 @@ processFormula <- function(formula, data, covariateList.java = NULL){
   # remove any response variables on the right-hand-side
   covariateData <- filteredData[, !(names(filteredData) %in% variablesToDrop), drop=FALSE]
 
+  # Now that we know how many predictor variables we have, we should check na.penalty
+  if(!is.null(na.penalty)){
+    if(!is.numeric(na.penalty) & !is.logical(na.penalty)){
+      stop("na.penalty must be either logical or numeric.")
+    }
+
+    if(is.logical(na.penalty) & length(na.penalty) != 1 & length(na.penalty) != ncol(covariateData)){
+      stop("na.penalty must have length of either 1 or the number of predictor variables if logical.")
+    }
+
+    if(is.numeric(na.penalty) & length(na.penalty) != 1){
+      stop("na.penalty must have length 1 if numeric.")
+    }
+
+    if(anyNA(na.penalty)){
+      stop("na.penalty cannot contain NAs.")
+    }
+
+    # All good; now to transform it.
+    if(is.numeric(na.penalty)){
+      na.threshold <- na.penalty
+      na.penalty <- apply(covariateData, 2, function(x){mean(is.na(x))}) >= na.threshold
+    }
+    else if(is.logical(na.penalty) & length(na.penalty) == 1){
+      na.penalty <- rep(na.penalty, times = ncol(covariateData))
+    }
+    # else{} - na.penalty is logical and the correct length; no need to do anything to it
+
+  }
 
-  dataset <- loadData(covariateData, colnames(covariateData), responses, covariateList.java = covariateList.java)
+  dataset <- loadData(
+    covariateData,
+    colnames(covariateData),
+    responses,
+    covariateList.java = covariateList.java,
+    na.penalty = na.penalty
+  )
 
   return(dataset)
 }
\ No newline at end of file
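
To make the numeric branch of the transformation above concrete: a single threshold becomes one logical per predictor by comparing each column's NA proportion against it. A toy standalone illustration (sapply stands in for the apply call purely to skip the data.frame-to-matrix coercion; the NA proportions come out the same either way):

    covariateData <- data.frame(
      x1 = c(1, NA, 3, NA),  # 50% NA
      x2 = c(1, 2, 3, NA)    # 25% NA
    )

    na.penalty <- 0.3  # numeric form: a proportion-of-NAs threshold
    na.threshold <- na.penalty
    na.penalty <- sapply(covariateData, function(x){ mean(is.na(x)) }) >= na.threshold
    na.penalty  # x1 = TRUE (penalized), x2 = FALSE (not penalized)
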
diff --git a/R/train.R b/R/train.R
index 116bc09..a513413 100644
--- a/R/train.R
+++ b/R/train.R
@@ -178,9 +178,9 @@ train.internal <- function(dataset, splitFinder,
 #' @param data A data.frame containing the columns of the predictors and
 #' responses.
 #' @param splitFinder A split finder that's used to score splits in the random
-#' forest training algorithm. See \code{\link{CompetingRiskSplitFinders}}
-#' or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
-#' this function tries to pick one based on the response. For
+#' forest training algorithm. See \code{\link{CompetingRiskSplitFinders}} or
+#' \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one, this
+#' function tries to pick one based on the response. For
 #' \code{\link{CR_Response}} without censor times, it will pick a
 #' \code{\link{LogRankSplitFinder}}; while if censor times were provided it
 #' will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
@@ -188,19 +188,19 @@ train.internal <- function(dataset, splitFinder,
 #' @param nodeResponseCombiner A response combiner that's used to combine
 #' responses for each terminal node in a tree (regression example; average the
 #' observations in each tree into a single number). See
-#' \code{\link{CR_ResponseCombiner}} or
-#' \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
-#' tries to pick one based on the response. For \code{\link{CR_Response}} it
-#' picks a \code{\link{CR_ResponseCombiner}}; for integer or numeric
-#' responses it picks a \code{\link{MeanResponseCombiner}}.
+#' \code{\link{CR_ResponseCombiner}} or \code{\link{MeanResponseCombiner}}. If
+#' you don't specify one, this function tries to pick one based on the
+#' response. For \code{\link{CR_Response}} it picks a
+#' \code{\link{CR_ResponseCombiner}}; for integer or numeric responses it
+#' picks a \code{\link{MeanResponseCombiner}}.
 #' @param forestResponseCombiner A response combiner that's used to combine
 #' predictions across trees into one final result (regression example; average
 #' the prediction of each tree into a single number). See
-#' \code{\link{CR_FunctionCombiner}} or
-#' \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
-#' tries to pick one based on the response. For \code{\link{CR_Response}} it
-#' picks a \code{\link{CR_FunctionCombiner}}; for integer or numeric
-#' responses it picks a \code{\link{MeanResponseCombiner}}.
+#' \code{\link{CR_FunctionCombiner}} or \code{\link{MeanResponseCombiner}}. If
+#' you don't specify one, this function tries to pick one based on the
+#' response. For \code{\link{CR_Response}} it picks a
+#' \code{\link{CR_FunctionCombiner}}; for integer or numeric responses it
+#' picks a \code{\link{MeanResponseCombiner}}.
 #' @param ntree An integer that specifies how many trees should be trained.
 #' @param numberOfSplits A tuning parameter specifying how many random splits
 #' should be tried for a covariate; a value of 0 means all splits will be
@@ -217,6 +217,20 @@ train.internal <- function(dataset, splitFinder,
 #' @param maxNodeDepth This parameter is analogous to \code{nodeSize} in that it
 #' controls tree length; by default \code{maxNodeDepth} is an extremely high
 #' number and tree depth is controlled by \code{nodeSize}.
+#' @param na.penalty This parameter controls whether predictor variables with
+#' NAs should be penalized when being considered for a best split. Best splits
+#' (and the associated score) are determined on only non-NA data; the penalty
+#' is to take the best split identified, randomly assign any NAs
+#' (according to the proportion of data split left and right), and then
+#' recalculate the corresponding split score, which is then compared with the
+#' other candidate split variables. This penalty adds some computational time,
+#' so it may be disabled for some variables. \code{na.penalty} may be
+#' specified as a vector of logicals indicating, for each predictor variable,
+#' whether the penalty should be applied to that variable. If it has length 1
+#' then it applies to all variables. Alternatively, a single numeric value may
+#' be provided to indicate a threshold whereby the penalty is activated only
+#' if the proportion of NAs for that variable in the training set exceeds that
+#' threshold.
 #' @param splitPureNodes This parameter determines whether the algorithm will
 #' split a pure node. If set to FALSE, then before every split it will check
 #' that every response is the same, and if so, not split. If set to TRUE it
@@ -290,17 +304,17 @@ train.internal <- function(dataset, splitFinder,
 #'
 #' forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
 #'    LogRankSplitFinder(1:2), CR_ResponseCombiner(1:2),
-#'    CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
+#'    CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
 #'    mtry=1, nodeSize=10)
 #' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
 #' ypred <- predict(forest, newData)
 train <- function(formula, data, splitFinder = NULL, nodeResponseCombiner = NULL,
                   forestResponseCombiner = NULL, ntree, numberOfSplits, mtry,
-                  nodeSize, maxNodeDepth = 100000, splitPureNodes=TRUE, savePath=NULL,
-                  savePath.overwrite=c("warn", "delete", "merge"), cores = getCores(),
-                  randomSeed = NULL, displayProgress = TRUE){
+                  nodeSize, maxNodeDepth = 100000, na.penalty = TRUE, splitPureNodes=TRUE,
+                  savePath=NULL, savePath.overwrite=c("warn", "delete", "merge"),
+                  cores = getCores(), randomSeed = NULL, displayProgress = TRUE){
 
-  dataset <- processFormula(formula, data)
+  dataset <- processFormula(formula, data, na.penalty = na.penalty)
 
   forest <- train.internal(dataset, splitFinder = splitFinder,
                            nodeResponseCombiner = nodeResponseCombiner,
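
The penalty that the new na.penalty documentation describes can be pictured with a toy sketch. The real scoring lives in the Java library; the score function and split rule below are invented solely for illustration:

    set.seed(42)
    x <- c(rnorm(90), rep(NA, 10))  # one predictor with 10% NAs
    y <- rnorm(100)

    complete <- !is.na(x)
    split.point <- median(x, na.rm = TRUE)  # stand-in for the best split found on non-NA data
    goes.left <- x[complete] <= split.point

    # Toy score: separation in mean response between the two sides.
    score <- function(left, response){ abs(mean(response[left]) - mean(response[!left])) }
    unpenalized <- score(goes.left, y[complete])

    # The penalty: send each NA left or right at random, in proportion to
    # the observed split, then re-score using all observations.
    all.left <- logical(length(x))
    all.left[complete] <- goes.left
    all.left[!complete] <- runif(sum(!complete)) < mean(goes.left)
    penalized <- score(all.left, y)

    c(unpenalized = unpenalized, penalized = penalized)
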
diff --git a/inst/java/largeRCRF-library-1.0-SNAPSHOT.jar b/inst/java/largeRCRF-library-1.0-SNAPSHOT.jar
index ad8f858..9de61f0 100644
Binary files a/inst/java/largeRCRF-library-1.0-SNAPSHOT.jar and b/inst/java/largeRCRF-library-1.0-SNAPSHOT.jar differ
diff --git a/man/train.Rd b/man/train.Rd
index a07e47c..09ea41a 100644
--- a/man/train.Rd
+++ b/man/train.Rd
@@ -6,8 +6,8 @@
 \usage{
 train(formula, data, splitFinder = NULL, nodeResponseCombiner = NULL,
   forestResponseCombiner = NULL, ntree, numberOfSplits, mtry, nodeSize,
-  maxNodeDepth = 1e+05, splitPureNodes = TRUE, savePath = NULL,
-  savePath.overwrite = c("warn", "delete", "merge"),
+  maxNodeDepth = 1e+05, na.penalty = TRUE, splitPureNodes = TRUE,
+  savePath = NULL, savePath.overwrite = c("warn", "delete", "merge"),
   cores = getCores(), randomSeed = NULL, displayProgress = TRUE)
 }
 \arguments{
@@ -19,9 +19,9 @@ constructed.}
 responses.}
 
 \item{splitFinder}{A split finder that's used to score splits in the random
-forest training algorithm. See \code{\link{CompetingRiskSplitFinders}}
-or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
-this function tries to pick one based on the response. For
+forest training algorithm. See \code{\link{CompetingRiskSplitFinders}} or
+\code{\link{WeightedVarianceSplitFinder}}. If you don't specify one, this
+function tries to pick one based on the response. For
 \code{\link{CR_Response}} without censor times, it will pick a
 \code{\link{LogRankSplitFinder}}; while if censor times were provided it
 will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
@@ -30,20 +30,20 @@ responses it picks a \code{\link{WeightedVarianceSplitFinder}}.}
 
 \item{nodeResponseCombiner}{A response combiner that's used to combine
 responses for each terminal node in a tree (regression example; average the
 observations in each tree into a single number). See
-\code{\link{CR_ResponseCombiner}} or
-\code{\link{MeanResponseCombiner}}. If you don't specify one, this function
-tries to pick one based on the response. For \code{\link{CR_Response}} it
-picks a \code{\link{CR_ResponseCombiner}}; for integer or numeric
-responses it picks a \code{\link{MeanResponseCombiner}}.}
+\code{\link{CR_ResponseCombiner}} or \code{\link{MeanResponseCombiner}}. If
+you don't specify one, this function tries to pick one based on the
+response. For \code{\link{CR_Response}} it picks a
+\code{\link{CR_ResponseCombiner}}; for integer or numeric responses it
+picks a \code{\link{MeanResponseCombiner}}.}
 
 \item{forestResponseCombiner}{A response combiner that's used to combine
 predictions across trees into one final result (regression example; average
 the prediction of each tree into a single number). See
-\code{\link{CR_FunctionCombiner}} or
-\code{\link{MeanResponseCombiner}}. If you don't specify one, this function
-tries to pick one based on the response. For \code{\link{CR_Response}} it
-picks a \code{\link{CR_FunctionCombiner}}; for integer or numeric
-responses it picks a \code{\link{MeanResponseCombiner}}.}
+\code{\link{CR_FunctionCombiner}} or \code{\link{MeanResponseCombiner}}. If
+you don't specify one, this function tries to pick one based on the
+response. For \code{\link{CR_Response}} it picks a
+\code{\link{CR_FunctionCombiner}}; for integer or numeric responses it
+picks a \code{\link{MeanResponseCombiner}}.}
 
 \item{ntree}{An integer that specifies how many trees should be trained.}
 
@@ -66,6 +66,21 @@ as large as \code{nodeSize}.}
 controls tree length; by default \code{maxNodeDepth} is an extremely high
 number and tree depth is controlled by \code{nodeSize}.}
 
+\item{na.penalty}{This parameter controls whether predictor variables with
+NAs should be penalized when being considered for a best split. Best splits
+(and the associated score) are determined on only non-NA data; the penalty
+is to take the best split identified, randomly assign any NAs
+(according to the proportion of data split left and right), and then
+recalculate the corresponding split score, which is then compared with the
+other candidate split variables. This penalty adds some computational time,
+so it may be disabled for some variables. \code{na.penalty} may be
+specified as a vector of logicals indicating, for each predictor variable,
+whether the penalty should be applied to that variable. If it has length 1
+then it applies to all variables. Alternatively, a single numeric value may
+be provided to indicate a threshold whereby the penalty is activated only
+if the proportion of NAs for that variable in the training set exceeds that
+threshold.}
+
 \item{splitPureNodes}{This parameter determines whether the algorithm will
 split a pure node. If set to FALSE, then before every split it will check
 that every response is the same, and if so, not split. If set to TRUE it
@@ -160,7 +175,7 @@ data <- data.frame(x1, x2)
 
 forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
    LogRankSplitFinder(1:2), CR_ResponseCombiner(1:2),
-   CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
+   CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
    mtry=1, nodeSize=10)
 newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
 ypred <- predict(forest, newData)
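
Finally, a hypothetical usage sketch of the three accepted forms of na.penalty, reusing the data from the man page example above (not part of the generated documentation):

    # Introduce some NAs into the example's training data.
    data$x1[sample(nrow(data), nrow(data) %/% 10)] <- NA

    # Logical scalar (the default): penalize every predictor containing NAs.
    forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
        ntree=100, numberOfSplits=5, mtry=1, nodeSize=10, na.penalty = TRUE)

    # One logical per predictor: penalize x1 but never x2.
    forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
        ntree=100, numberOfSplits=5, mtry=1, nodeSize=10,
        na.penalty = c(TRUE, FALSE))

    # Numeric threshold: penalize only predictors that are at least 5% NA.
    forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
        ntree=100, numberOfSplits=5, mtry=1, nodeSize=10, na.penalty = 0.05)
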