From 30d90605176addbad5778560e350f971d1a67223 Mon Sep 17 00:00:00 2001 From: Joel Therrien Date: Thu, 6 Jun 2019 15:53:25 -0700 Subject: [PATCH] Cleanup function defintions and documentation --- DESCRIPTION | 2 +- NAMESPACE | 7 ++-- R/CR_Response.R | 2 +- R/cr_components.R | 20 ----------- R/cr_naiveConcordance.R | 19 +++++++++-- R/cr_predictions.R | 9 +++-- R/create_java_covariates.R | 2 +- R/java_classes_directory.R | 7 ++-- R/{load_forest.R => loadForest.R} | 21 +++++++----- R/misc.R | 16 +-------- R/predict.R | 10 +++--- R/recover_forest.R | 37 -------------------- R/regressionComponents.R | 33 +++++++----------- R/{save_forest.R => saveForest.R} | 8 ++--- R/train.R | 49 ++++++++++++--------------- R/wrapFunction.R | 12 ------- man/CR_FunctionCombiner.Rd | 11 ------ man/CR_Response.Rd | 2 +- man/CR_ResponseCombiner.Rd | 11 ------ man/MeanResponseCombiner.Rd | 13 +------ man/WeightedVarianceSplitFinder.Rd | 21 ++++++------ man/convertRListToJava.Rd | 27 --------------- man/{load_forest.Rd => loadForest.Rd} | 16 ++++----- man/naiveConcordance.Rd | 20 +++++++++-- man/predict.JRandomForest.Rd | 10 +++--- man/{save_forest.Rd => saveForest.Rd} | 14 ++++---- man/train.Rd | 34 +++++++++++-------- tests/testthat/test_saving_loading.R | 4 +-- tests/testthat/test_saving_offline.R | 2 +- 29 files changed, 160 insertions(+), 279 deletions(-) rename R/{load_forest.R => loadForest.R} (73%) delete mode 100644 R/recover_forest.R rename R/{save_forest.R => saveForest.R} (94%) delete mode 100644 R/wrapFunction.R delete mode 100644 man/convertRListToJava.Rd rename man/{load_forest.Rd => loadForest.Rd} (60%) rename man/{save_forest.Rd => saveForest.Rd} (76%) diff --git a/DESCRIPTION b/DESCRIPTION index 2188eef..2bc850f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: largeRCRF Type: Package Title: Large Random Competing Risk Forests, Java Implementation Run in R -Version: 0.0.0.9036 +Version: 0.0.0.9037 Authors@R: person("Joel", "Therrien", email = "joel@joeltherrien.ca", role = c("aut", "cre")) Description: This package is used for training competing risk random forests on larger scale datasets. It currently only supports training models, running predictions, plotting those predictions (they are curves), diff --git a/NAMESPACE b/NAMESPACE index 0892a93..22ff28f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,15 +25,12 @@ export(LogRankSplitFinder) export(MeanResponseCombiner) export(Numeric) export(WeightedVarianceSplitFinder) -export(convertRListToJava) export(extractCHF) export(extractCIF) export(extractMortalities) export(extractSurvivorCurve) -export(load_covariate_list_from_settings) -export(load_forest) -export(load_forest_args_provided) +export(loadForest) export(naiveConcordance) -export(save_forest) +export(saveForest) export(train) import(rJava) diff --git a/R/CR_Response.R b/R/CR_Response.R index e4be1a5..6e29d00 100644 --- a/R/CR_Response.R +++ b/R/CR_Response.R @@ -28,7 +28,7 @@ #' delta <- ifelse(u == T1, 1, ifelse(u == T2, 2, 0)) #' #' responses <- CR_Response(delta, u) -#' # Then use responses in train +#' # Then use responses in train or naiveConcordance CR_Response <- function(delta, u, C = NULL){ if(is.null(C)){ return(Java_CompetingRiskResponses(delta, u)) diff --git a/R/cr_components.R b/R/cr_components.R index 19c70c8..a958812 100644 --- a/R/cr_components.R +++ b/R/cr_components.R @@ -10,16 +10,6 @@ #' The user only needs to pass this object into \code{\link{train}} as the #' \code{forestResponseCombiner} parameter. #' -#' @return A response combiner object to be used in \code{\link{train}}; not -#' useful on its own. However, internally, a response combiner object is a -#' list consisting of the following objects: \describe{ -#' \item{\code{javaObject}}{The java object used in the algorithm} -#' \item{\code{call}}{The call (used in \code{print})} -#' \item{\code{outputClass}}{The R class of the outputs; used in -#' \code{\link{predict.JRandomForest}}} \item{\code{convertToRFunction}}{An R -#' function that converts a Java prediction from the combiner into R output -#' that is readable by a user.} } -#' #' @param events A vector of integers specifying which competing risk events's #' functions should be processed. This should correspond to all of the #' competing risk events that can occur, from 1 to the largest number. @@ -76,16 +66,6 @@ CR_FunctionCombiner <- function(events, times = NULL){ #' The user only needs to pass this object into \code{\link{train}} as the #' \code{nodeResponseCombiner} parameter. #' -#' @return A response combiner object to be used in \code{\link{train}}; not -#' useful on its own. However, internally, a response combiner object is a -#' list consisting of the following objects: \describe{ -#' \item{\code{javaObject}}{The java object used in the algorithm} -#' \item{\code{call}}{The call (used in \code{print})} -#' \item{\code{outputClass}}{The R class of the outputs; used in -#' \code{\link{predict.JRandomForest}}} \item{\code{convertToRFunction}}{An R -#' function that converts a Java prediction from the combiner into R output -#' that is readable by a user.} } -#' #' @param events A vector of integers specifying which competing risk events's #' functions should be processed. This should correspond to all of the #' competing risk events that can occur, from 1 to the largest number. diff --git a/R/cr_naiveConcordance.R b/R/cr_naiveConcordance.R index 356e4f6..eb6fc77 100644 --- a/R/cr_naiveConcordance.R +++ b/R/cr_naiveConcordance.R @@ -2,8 +2,8 @@ #' Naive Concordance #' #' Used to calculate a concordance index error. The user needs to supply a list -#' of mortalities, with each item in the list being a vector for the specific -#' events. To calculate mortalities a user should look to +#' of mortalities, with each item in the list being a vector for the +#' corresponding event. To calculate mortalities a user should look to #' \code{\link{extractMortalities}}. #' #' @return A vector of 1 minus the concordance scores, with each element @@ -16,6 +16,21 @@ #' list should correspond to one of the events in the order of event 1 to J, #' and should be a vector of the same length as responses. #' @export +#' @examples +#' data <- data.frame(delta=c(1,1,0,0,2,2), T=1:6, x=1:6) +#' +#' model <- train(CR_Response(delta, T) ~ x, data, ntree=100, numberOfSplits=0, mtry=1, nodeSize=1) +#' +#' newData <- data.frame(delta=c(1,0,2,1,0,2), T=1:6, x=1:6) +#' predictions <- predict(model, newData) +#' +#' mortalities <- list( +#' extractMortalities(predictions, 1, 6), +#' extractMortalities(predictions, 2, 6) +#' ) +#' +#' naiveConcordance(CR_Response(newData$delta, newData$T), mortalities) +#' naiveConcordance <- function(responses, predictedMortalities){ if(is.null(responses)){ stop("responses cannot be null") diff --git a/R/cr_predictions.R b/R/cr_predictions.R index ee0b7e8..6d0f03f 100644 --- a/R/cr_predictions.R +++ b/R/cr_predictions.R @@ -1,6 +1,7 @@ - -convertCompetingRiskFunctionsSlow <- function(javaObject, forest){ +# Internal function used to convert the Java functions into R functions +# Provided for use as a parameter in CR_FunctionCombiner & CR_ResponseCombiner +convertCompetingRiskFunctions <- compiler::cmpfun(function(javaObject, forest){ events <- forest$params$forestResponseCombiner$events lst <- list(javaObject = javaObject, events = events) @@ -24,9 +25,7 @@ convertCompetingRiskFunctionsSlow <- function(javaObject, forest){ class(lst) <- "CompetingRiskFunctions" return(lst) -} - -convertCompetingRiskFunctions <- compiler::cmpfun(convertCompetingRiskFunctionsSlow) +}) #' Competing Risk Predictions diff --git a/R/create_java_covariates.R b/R/create_java_covariates.R index a096d98..3c60a25 100644 --- a/R/create_java_covariates.R +++ b/R/create_java_covariates.R @@ -1,4 +1,4 @@ -# These functions are not exported, so I won't create their documentation either. +# These functions are not exported, so I won't provide their documentation either. # I.e. it's not a mistake that the documentation below lacks the " ' " on each line. # Covariates diff --git a/R/java_classes_directory.R b/R/java_classes_directory.R index f382cf7..c94728f 100644 --- a/R/java_classes_directory.R +++ b/R/java_classes_directory.R @@ -1,5 +1,6 @@ -# This file keeps track of the different Java classes used -# Whenever refactoring happens in the Java code, this file should be updated and (hopefully) nothing will break. +# This file keeps track of the different Java classes used. Whenever refactoring +# happens in the Java code, this file should be updated and (hopefully) nothing +# will break. # General Java objects .class_Object <- "java/lang/Object" @@ -51,7 +52,7 @@ # When a class object is returned, rJava often often wants L prepended and ; appended. # So a list that returns "java/lang/Object" should show "Ljava/lang/Object;" -# This function does that +# This function does that. makeResponse <- function(className){ return(paste0("L", className, ";")) } \ No newline at end of file diff --git a/R/load_forest.R b/R/loadForest.R similarity index 73% rename from R/load_forest.R rename to R/loadForest.R index 22aa554..524a24f 100644 --- a/R/load_forest.R +++ b/R/loadForest.R @@ -2,12 +2,12 @@ #' Load Random Forest #' -#' Loads a random forest that was saved using \code{\link{save_forest}}. +#' Loads a random forest that was saved using \code{\link{saveForest}}. #' #' @param forest The directory created that saved the previous forest. #' @return A JForest object; see \code{\link{train}} for details. #' @export -#' @seealso \code{\link{train}}, \code{\link{save_forest}} +#' @seealso \code{\link{train}}, \code{\link{saveForest}}, \code{\link{loadForestArg}} #' @examples #' # Regression Example #' x1 <- rnorm(1000) @@ -18,9 +18,9 @@ #' forest <- train(y ~ x1 + x2, data, #' ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5) #' -#' save_forest(forest, "trees") -#' new_forest <- load_forest("trees") -load_forest <- function(directory){ +#' saveForest(forest, "trees") +#' new_forest <- loadForest("trees") +loadForest <- function(directory){ # First load the response combiners and the split finders nodeResponseCombiner.java <- .jcall(.class_DataUtils, makeResponse(.class_Object), "loadObject", paste0(directory, "/nodeResponseCombiner.jData")) @@ -42,15 +42,20 @@ load_forest <- function(directory){ params$splitFinder$javaObject <- splitFinder.java params$forestResponseCombiner$javaObject <- forestResponseCombiner.java - forest <- load_forest_args_provided(directory, params$nodeResponseCombiner, params$splitFinder, params$forestResponseCombiner, covariateList, call, + forest <- loadForestArgumentsSpecified(directory, params$nodeResponseCombiner, params$splitFinder, params$forestResponseCombiner, covariateList, call, params$ntree, params$numberOfSplits, params$mtry, params$nodeSize, params$maxNodeDepth, params$splitPureNodes) return(forest) } -#' @export -load_forest_args_provided <- function(treeDirectory, nodeResponseCombiner, splitFinder, forestResponseCombiner, +# Internal function - if you really need to use it yourself (say to load forests +# saved directly through the Java interface into R), then look at the loadForest +# function to see how this function is used. I'm also open to writing a function +# that uses the Java version's settings yaml file to recreate the forest, but +# I'd appreciate knowing that someone's going to use it first (email me; see +# README). +loadForestArgumentsSpecified <- function(treeDirectory, nodeResponseCombiner, splitFinder, forestResponseCombiner, covariateList.java, call, ntree, numberOfSplits, mtry, nodeSize, maxNodeDepth = 100000, splitPureNodes=TRUE){ params <- list( diff --git a/R/misc.R b/R/misc.R index 2f931ed..c818704 100644 --- a/R/misc.R +++ b/R/misc.R @@ -1,18 +1,4 @@ -#' convertRListToJava -#' -#' An internal function that converts an R list of rJava objects into a -#' java.util.List rJava object containing those objects. It's used internally, -#' and is only available because it's used in some examples that demonstrate what -#' other objects do. -#' @param lst The R list containing rJava objects -#' @export -#' @return An rJava List object to be used internally. -#' @keywords internal -#' @examples -#' x <- Numeric(1:5) -#' class(x) -#' x <- convertRListToJava(x) -#' class(x) +# Internal function convertRListToJava <- function(lst){ javaList <- .jnew(.class_ArrayList, as.integer(length(lst))) javaList <- .jcast(javaList, .class_List) diff --git a/R/predict.R b/R/predict.R index bf54099..383aa7e 100644 --- a/R/predict.R +++ b/R/predict.R @@ -11,8 +11,9 @@ #' the dataset after the forest is trained. #' @param parallel A logical indicating whether multiple cores should be #' utilized when making the predictions. Available as an option because it's -#' been observed by this author that using Java's \code{parallelStream} can be -#' unstable on some systems. Default value is \code{TRUE}. +#' been observed that using Java's \code{parallelStream} can be unstable on +#' some systems. Default value is \code{TRUE}; only set to \code{FALSE} if you +#' get strange errors while predicting. #' @param out.of.bag A logical indicating whether predictions should be based on #' 'out of bag' trees; set only to \code{TRUE} if you're running predictions #' on data that was used in the training. Default value is \code{FALSE}. @@ -26,7 +27,7 @@ #' y <- 1 + x1 + x2 + rnorm(1000) #' #' data <- data.frame(x1, x2, y) -#' forest <- train(y ~ x1 + x2, data, WeightedVarianceSplitFinder(), MeanResponseCombiner(), MeanResponseCombiner(), ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5) +#' forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5) #' #' # Fix x2 to be 0 #' newData <- data.frame(x1 = seq(from=-2, to=2, by=0.5), x2 = 0) @@ -46,8 +47,7 @@ #' #' data <- data.frame(x1, x2) #' -#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data, -#' LogRankSplitFinder(1:2), CompetingRiskResponseCombiner(1:2), CompetingRiskFunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10) +#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data, ntree=100, numberOfSplits=5, mtry=1, nodeSize=10) #' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0) #' ypred <- predict(forest, newData) predict.JRandomForest <- function(forest, newData=NULL, parallel=TRUE, out.of.bag=FALSE){ diff --git a/R/recover_forest.R b/R/recover_forest.R deleted file mode 100644 index fc6f267..0000000 --- a/R/recover_forest.R +++ /dev/null @@ -1,37 +0,0 @@ - -recover_forest_predictable <- function(tree_directory, settingsPath) { - - settings.java <- load_settings(settingsPath) - - nodeResponseCombiner.java <- .jcall(settings.java, makeResponse(.class_ResponseCombiner), "getResponseCombiner") - splitFinder.java <- .jcall(settings.java, makeResponse(.class_SplitFinder), "getSplitFinder") - forestResponseCombiner.java <- .jcall(settings.java, makeResponse(.class_ResponseCombiner), "getTreeCombiner") - - covariateList <- .jcall(settings.java, makeResponse(.class_List), "getCovariates") - - params <- readRDS(paste0(directory, "/parameters.rData")) - call <- readRDS(paste0(directory, "/call.rData")) - - params$nodeResponseCombiner$javaObject <- nodeResponseCombiner.java - params$splitFinder$javaObject <- splitFinder.java - params$forestResponseCombiner$javaObject <- forestResponseCombiner.java - - forest <- load_forest_args_provided(directory, params$nodeResponseCombiner, params$splitFinder, params$forestResponseCombiner, covariateList, params, call) - - return(forest) - -} - -load_settings <- function(settingsPath) { - settingsFile <- .jnew(.class_File, settingsPath) - settings.java <- .jcall(.class_Settings, makeResponse(.class_Settings), "load", settingsFile) - - return(settings.java) -} - -#' @export -load_covariate_list_from_settings <- function(settingsPath){ - settings.java = load_settings(settingsPath) - covariateList <- .jcall(settings.java, makeResponse(.class_List), "getCovariates") - return(covariateList) -} \ No newline at end of file diff --git a/R/regressionComponents.R b/R/regressionComponents.R index 6d304d6..094d9cf 100644 --- a/R/regressionComponents.R +++ b/R/regressionComponents.R @@ -3,20 +3,21 @@ #' #' This split finder is used in regression random forests. When a split is made, #' this finder computes the sample variance in each group (divided by n, not -#' n-1); it then minimizes the the sum of these variances, each of them weighted -#' by their sample size divided by the total sample size of that node. +#' n-1); it then minimizes the sum of these variances, each of them weighted by +#' their sample size divided by the total sample size of that node. #' #' @note There are other split finders that are used in regression random #' forests that are not included in this package. This package is oriented -#' toward the competing risk side of survival analysis; the regression options -#' are provided as an example of how extensible the back-end Java package is. -#' If you are interested in using this package for regression (or other uses), -#' feel free to write your own components. It's really not hard to write these -#' components; the WeightedVarianceSplitFinder Java class is quite short; most -#' of the code is to reuse calculations from previous considered splits. +#' toward the competing risks side of survival analysis; the regression +#' options are provided as an example of how extensible the back-end Java +#' package is. If you are interested in using this package for regression (or +#' other uses), feel free to write your own components. It's not too hard to +#' write these components; the WeightedVarianceSplitFinder Java class is quite +#' short; most of the code is to reuse calculations from previous considered +#' splits. I (the author) am also willing to assist if you have any questions. #' @export -#' @return A split finder object to be used in \code{\link{train}}; not -#' useful on its own. +#' @return A split finder object to be used in \code{\link{train}}; not useful +#' on its own. #' @examples #' splitFinder <- WeightedVarianceSplitFinder() #' # You would then use it in train() @@ -41,16 +42,6 @@ WeightedVarianceSplitFinder <- function(){ #' \code{forestResponseCombiner} parameters in \code{\link{train}} when doing #' regression. #' @export -#' @return A response combiner object to be used in \code{\link{train}}; not -#' useful on its own. However, internally, a response combiner object is a -#' list consisting of the following objects: -#' \describe{ -#' \item{\code{javaObject}}{The java object used in the algorithm} -#' \item{\code{call}}{The call (used in \code{print})} -#' \item{\code{outputClass}}{The R class of the outputs; used in \code{\link{predict.JRandomForest}}} -#' \item{\code{convertToRFunction}}{An R function that converts a Java prediction from the combiner into R output that is readable by a user.} -#' } -#' #' @examples #' responseCombiner <- MeanResponseCombiner() #' # You would then use it in train() @@ -58,7 +49,7 @@ WeightedVarianceSplitFinder <- function(){ #' # However; I'll show an internal Java method to make it clear what it does #' # Note that you should never have to do the following #' x <- 1:3 -#' x <- convertRListToJava(Numeric(x)) +#' x <- largeRCRF:::convertRListToJava(Numeric(x)) #' #' # will output a Java object containing 2 #' output <- rJava::.jcall(responseCombiner$javaObject, "Ljava/lang/Double;", "combine", x) diff --git a/R/save_forest.R b/R/saveForest.R similarity index 94% rename from R/save_forest.R rename to R/saveForest.R index d5c1d3a..4b2eb71 100644 --- a/R/save_forest.R +++ b/R/saveForest.R @@ -13,7 +13,7 @@ #' default. #' @export -#' @seealso \code{\link{train}}, \code{\link{load_forest}} +#' @seealso \code{\link{train}}, \code{\link{loadForest}} #' @examples #' # Regression Example #' x1 <- rnorm(1000) @@ -24,9 +24,9 @@ #' forest <- train(y ~ x1 + x2, data, #' ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5) #' -#' save_forest(forest, "trees") -#' new_forest <- load_forest("trees") -save_forest <- function(forest, directory, overwrite=FALSE){ +#' saveForest(forest, "trees") +#' new_forest <- loadForest("trees") +saveForest <- function(forest, directory, overwrite=FALSE){ check_and_create_directory(directory, overwrite) saveTrees(forest, directory) diff --git a/R/train.R b/R/train.R index a70752a..2756130 100644 --- a/R/train.R +++ b/R/train.R @@ -1,4 +1,5 @@ +# Internal function to calculate how many CPU cores are available. getCores <- function(){ cores <- NA if (requireNamespace("parallel", quietly = TRUE)){ @@ -22,7 +23,10 @@ getCores <- function(){ #' response you plug in. \code{splitFinder} should work on the responses you are #' providing; \code{nodeResponseCombiner} should combine these responses into #' some intermediate product, and \code{forestResponseCombiner} combines these -#' intermediate products into the final output product. +#' intermediate products into the final output product. Note that +#' \code{nodeResponseCombiner} and \code{forestResponseCombiner} can be inferred +#' from the data (so feel free to not specify them), and \code{splitFinder} can +#' be inferred but you might want to change its default. #' #' @param responses An R list of the responses. See \code{\link{CR_Response}} #' for an example function. @@ -34,7 +38,7 @@ getCores <- function(){ #' forest training algorithm. See \code{\link{Competing Risk Split Finders}} #' or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one, #' this function tries to pick one based on the response. For -#' \code{\link{CR_Response}} wihtout censor times, it will pick a +#' \code{\link{CR_Response}} without censor times, it will pick a #' \code{\link{LogRankSplitFinder}}; while if censor times were provided it #' will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric #' responses it picks a \code{\link{WeightedVarianceSplitFinder}}. @@ -63,23 +67,24 @@ getCores <- function(){ #' randomly chosen to be tried in the splitting process. This value must be at #' least 1. #' @param nodeSize The algorithm will not attempt to split a node that has -#' observations less than 2*\code{nodeSize}; this results in terminal nodes -#' having a size of roughly \code{nodeSize} (true sizes may be both smaller or -#' greater). This value must be at least 1. +#' observations less than 2*\code{nodeSize}; this guarantees that any two +#' sibling terminal nodes together have an average size of at least +#' \code{nodeSize}; note that it doesn't guarantee that every node is at least +#' as large as \code{nodeSize}. #' @param maxNodeDepth This parameter is analogous to \code{nodeSize} in that it -#' helps keep trees shorter; by default maxNodeDepth is an extremely high +#' controls tree length; by default \code{maxNodeDepth} is an extremely high #' number and tree depth is controlled by \code{nodeSize}. #' @param splitPureNodes This parameter determines whether the algorithm will #' split a pure node. If set to FALSE, then before every split it will check #' that every response is the same, and if so, not split. If set to TRUE it -#' forgoes that check and just splits. Prediction accuracy won't change under -#' any sensible \code{nodeResponseCombiner} as all terminal nodes from a split +#' forgoes that check and splits it. Prediction accuracy won't change under +#' any sensible \code{nodeResponseCombiner}; as all terminal nodes from a split #' pure node should give the same prediction, so this parameter only affects #' performance. If your response is continuous you'll likely experience faster #' train times by setting it to TRUE. Default value is TRUE. #' @param savePath If set, this parameter will save each tree of the random #' forest in this directory as the forest is trained. Use this parameter if -#' you need to save memory while training. See also \code{\link{load_forest}} +#' you need to save memory while training. See also \code{\link{loadForest}} #' @param savePath.overwrite This parameter controls the behaviour for what #' happens if \code{savePath} is pointing to an existing directory. If set to #' \code{warn} (default) then \code{train} refuses to proceed. If set to @@ -93,12 +98,12 @@ getCores <- function(){ #' a crash. #' @param cores This parameter specifies how many trees will be simultaneously #' trained. By default the package attempts to detect how many cores you have -#' by using the \code{parallel} package, and using all of them. You may +#' by using the \code{parallel} package and using all of them. You may #' specify a lower number if you wish. It is not recommended to specify a #' number greater than the number of available cores as this will hurt #' performance with no available benefit. #' @param randomSeed This parameter specifies a random seed if reproducible, -#' deterministic forests are desired. The number o1 +#' deterministic forests are desired. #' @export #' @return A \code{JRandomForest} object. You may call \code{predict} or #' \code{print} on it. @@ -135,8 +140,8 @@ getCores <- function(){ #' #' data <- data.frame(x1, x2) #' -#' forest <- train(CompetingRiskResponses(delta, u) ~ x1 + x2, data, -#' LogRankSplitFinder(1:2), CompetingRiskResponseCombiner(1:2), CompetingRiskFunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10) +#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data, +#' LogRankSplitFinder(1:2), CR_kResponseCombiner(1:2), CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10) #' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0) #' ypred <- predict(forest, newData) train <- function(x, ...) UseMethod("train") @@ -280,20 +285,6 @@ train.default <- function(responses, covariateData, splitFinder = splitFinderDef forestObject <- list(call=match.call(), params=params, javaObject=forest.java, covariateList=dataset$covariateList) - # TODO - remove redundant code if tests pass - #forestObject$params <- list( - # splitFinder=splitFinder, - # nodeResponseCombiner=nodeResponseCombiner, - # forestResponseCombiner=forestResponseCombiner, - # ntree=ntree, - # numberOfSplits=numberOfSplits, - # mtry=mtry, - # nodeSize=nodeSize, - # splitPureNodes=splitPureNodes, - # maxNodeDepth = maxNodeDepth, - # savePath=savePath - #) - class(forestObject) <- "JRandomForest" return(forestObject) @@ -304,7 +295,9 @@ train.default <- function(responses, covariateData, splitFinder = splitFinderDef #' @rdname train #' @export -#' @param formula You may specify the response and covariates as a formula instead; make sure the response in the formula is still properly constructed; see \code{responses} +#' @param formula You may specify the response and covariates as a formula +#' instead; make sure the response in the formula is still properly +#' constructed; see \code{responses} train.formula <- function(formula, covariateData, ...){ # Having an R copy of the data loaded at the same time can be wasteful; we diff --git a/R/wrapFunction.R b/R/wrapFunction.R deleted file mode 100644 index 1151fac..0000000 --- a/R/wrapFunction.R +++ /dev/null @@ -1,12 +0,0 @@ -wrapFunction <- function(mf){ - f <- function(x){ - - y <- vector(mode="numeric", length=length(x)) - for(i in 1:length(x)){ - y[i] <- .jcall(mf, "D", "evaluate", x[i]) - } - - return(y) - - } -} diff --git a/man/CR_FunctionCombiner.Rd b/man/CR_FunctionCombiner.Rd index 8ece552..929ec68 100644 --- a/man/CR_FunctionCombiner.Rd +++ b/man/CR_FunctionCombiner.Rd @@ -17,17 +17,6 @@ faster performance when predicting, however if the times are not exhaustive then the resulting curves will not update at that point (they'll be flat). If left blank, the package will default to using all of the time points.} } -\value{ -A response combiner object to be used in \code{\link{train}}; not - useful on its own. However, internally, a response combiner object is a - list consisting of the following objects: \describe{ - \item{\code{javaObject}}{The java object used in the algorithm} - \item{\code{call}}{The call (used in \code{print})} - \item{\code{outputClass}}{The R class of the outputs; used in - \code{\link{predict.JRandomForest}}} \item{\code{convertToRFunction}}{An R - function that converts a Java prediction from the combiner into R output - that is readable by a user.} } -} \description{ Creates a CompetingRiskFunctionCombiner rJava object, which is used internally for constructing a forest. The forest uses it when creating diff --git a/man/CR_Response.Rd b/man/CR_Response.Rd index bd05ac6..a09a694 100644 --- a/man/CR_Response.Rd +++ b/man/CR_Response.Rd @@ -38,5 +38,5 @@ u <- pmin(T1, T2, C) delta <- ifelse(u == T1, 1, ifelse(u == T2, 2, 0)) responses <- CR_Response(delta, u) -# Then use responses in train +# Then use responses in train or naiveConcordance } diff --git a/man/CR_ResponseCombiner.Rd b/man/CR_ResponseCombiner.Rd index 30cb366..61a53bb 100644 --- a/man/CR_ResponseCombiner.Rd +++ b/man/CR_ResponseCombiner.Rd @@ -11,17 +11,6 @@ CR_ResponseCombiner(events) functions should be processed. This should correspond to all of the competing risk events that can occur, from 1 to the largest number.} } -\value{ -A response combiner object to be used in \code{\link{train}}; not - useful on its own. However, internally, a response combiner object is a - list consisting of the following objects: \describe{ - \item{\code{javaObject}}{The java object used in the algorithm} - \item{\code{call}}{The call (used in \code{print})} - \item{\code{outputClass}}{The R class of the outputs; used in - \code{\link{predict.JRandomForest}}} \item{\code{convertToRFunction}}{An R - function that converts a Java prediction from the combiner into R output - that is readable by a user.} } -} \description{ Creates a CompetingRiskResponseCombiner rJava object, which is used internally for constructing a forest. It is used when each tree in the forest diff --git a/man/MeanResponseCombiner.Rd b/man/MeanResponseCombiner.Rd index 9629125..19cb8e1 100644 --- a/man/MeanResponseCombiner.Rd +++ b/man/MeanResponseCombiner.Rd @@ -6,17 +6,6 @@ \usage{ MeanResponseCombiner() } -\value{ -A response combiner object to be used in \code{\link{train}}; not - useful on its own. However, internally, a response combiner object is a - list consisting of the following objects: - \describe{ - \item{\code{javaObject}}{The java object used in the algorithm} - \item{\code{call}}{The call (used in \code{print})} - \item{\code{outputClass}}{The R class of the outputs; used in \code{\link{predict.JRandomForest}}} - \item{\code{convertToRFunction}}{An R function that converts a Java prediction from the combiner into R output that is readable by a user.} -} -} \description{ This response combiner is used in regression random forests, where the response in the data is a single number that needs to be averaged in each @@ -32,7 +21,7 @@ responseCombiner <- MeanResponseCombiner() # However; I'll show an internal Java method to make it clear what it does # Note that you should never have to do the following x <- 1:3 -x <- convertRListToJava(Numeric(x)) +x <- largeRCRF:::convertRListToJava(Numeric(x)) # will output a Java object containing 2 output <- rJava::.jcall(responseCombiner$javaObject, "Ljava/lang/Double;", "combine", x) diff --git a/man/WeightedVarianceSplitFinder.Rd b/man/WeightedVarianceSplitFinder.Rd index 1ce6c55..7338ad4 100644 --- a/man/WeightedVarianceSplitFinder.Rd +++ b/man/WeightedVarianceSplitFinder.Rd @@ -7,24 +7,25 @@ WeightedVarianceSplitFinder() } \value{ -A split finder object to be used in \code{\link{train}}; not - useful on its own. +A split finder object to be used in \code{\link{train}}; not useful + on its own. } \description{ This split finder is used in regression random forests. When a split is made, this finder computes the sample variance in each group (divided by n, not -n-1); it then minimizes the the sum of these variances, each of them weighted -by their sample size divided by the total sample size of that node. +n-1); it then minimizes the sum of these variances, each of them weighted by +their sample size divided by the total sample size of that node. } \note{ There are other split finders that are used in regression random forests that are not included in this package. This package is oriented - toward the competing risk side of survival analysis; the regression options - are provided as an example of how extensible the back-end Java package is. - If you are interested in using this package for regression (or other uses), - feel free to write your own components. It's really not hard to write these - components; the WeightedVarianceSplitFinder Java class is quite short; most - of the code is to reuse calculations from previous considered splits. + toward the competing risks side of survival analysis; the regression + options are provided as an example of how extensible the back-end Java + package is. If you are interested in using this package for regression (or + other uses), feel free to write your own components. It's not too hard to + write these components; the WeightedVarianceSplitFinder Java class is quite + short; most of the code is to reuse calculations from previous considered + splits. I (the author) am also willing to assist if you have any questions. } \examples{ splitFinder <- WeightedVarianceSplitFinder() diff --git a/man/convertRListToJava.Rd b/man/convertRListToJava.Rd deleted file mode 100644 index 6283943..0000000 --- a/man/convertRListToJava.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/misc.R -\name{convertRListToJava} -\alias{convertRListToJava} -\title{convertRListToJava} -\usage{ -convertRListToJava(lst) -} -\arguments{ -\item{lst}{The R list containing rJava objects} -} -\value{ -An rJava List object to be used internally. -} -\description{ -An internal function that converts an R list of rJava objects into a -java.util.List rJava object containing those objects. It's used internally, -and is only available because it's used in some examples that demonstrate what -other objects do. -} -\examples{ -x <- Numeric(1:5) -class(x) -x <- convertRListToJava(x) -class(x) -} -\keyword{internal} diff --git a/man/load_forest.Rd b/man/loadForest.Rd similarity index 60% rename from man/load_forest.Rd rename to man/loadForest.Rd index 0625c82..ceaa29c 100644 --- a/man/load_forest.Rd +++ b/man/loadForest.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/load_forest.R -\name{load_forest} -\alias{load_forest} +% Please edit documentation in R/loadForest.R +\name{loadForest} +\alias{loadForest} \title{Load Random Forest} \usage{ -load_forest(directory) +loadForest(directory) } \arguments{ \item{forest}{The directory created that saved the previous forest.} @@ -13,7 +13,7 @@ load_forest(directory) A JForest object; see \code{\link{train}} for details. } \description{ -Loads a random forest that was saved using \code{\link{save_forest}}. +Loads a random forest that was saved using \code{\link{saveForest}}. } \examples{ # Regression Example @@ -25,9 +25,9 @@ data <- data.frame(x1, x2, y) forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5) -save_forest(forest, "trees") -new_forest <- load_forest("trees") +saveForest(forest, "trees") +new_forest <- loadForest("trees") } \seealso{ -\code{\link{train}}, \code{\link{save_forest}} +\code{\link{train}}, \code{\link{saveForest}}, \code{\link{loadForestArg}} } diff --git a/man/naiveConcordance.Rd b/man/naiveConcordance.Rd index feb8e50..e289e54 100644 --- a/man/naiveConcordance.Rd +++ b/man/naiveConcordance.Rd @@ -21,7 +21,23 @@ A vector of 1 minus the concordance scores, with each element } \description{ Used to calculate a concordance index error. The user needs to supply a list -of mortalities, with each item in the list being a vector for the specific -events. To calculate mortalities a user should look to +of mortalities, with each item in the list being a vector for the +corresponding event. To calculate mortalities a user should look to \code{\link{extractMortalities}}. } +\examples{ +data <- data.frame(delta=c(1,1,0,0,2,2), T=1:6, x=1:6) + +model <- train(CR_Response(delta, T) ~ x, data, ntree=100, numberOfSplits=0, mtry=1, nodeSize=1) + +newData <- data.frame(delta=c(1,0,2,1,0,2), T=1:6, x=1:6) +predictions <- predict(model, newData) + +mortalities <- list( + extractMortalities(predictions, 1, 6), + extractMortalities(predictions, 2, 6) +) + +naiveConcordance(CR_Response(newData$delta, newData$T), mortalities) + +} diff --git a/man/predict.JRandomForest.Rd b/man/predict.JRandomForest.Rd index 6041b1c..7bf71d8 100644 --- a/man/predict.JRandomForest.Rd +++ b/man/predict.JRandomForest.Rd @@ -17,8 +17,9 @@ the dataset after the forest is trained.} \item{parallel}{A logical indicating whether multiple cores should be utilized when making the predictions. Available as an option because it's -been observed by this author that using Java's \code{parallelStream} can be -unstable on some systems. Default value is \code{TRUE}.} +been observed that using Java's \code{parallelStream} can be unstable on +some systems. Default value is \code{TRUE}; only set to \code{FALSE} if you +get strange errors while predicting.} \item{out.of.bag}{A logical indicating whether predictions should be based on 'out of bag' trees; set only to \code{TRUE} if you're running predictions @@ -38,7 +39,7 @@ x2 <- rnorm(1000) y <- 1 + x1 + x2 + rnorm(1000) data <- data.frame(x1, x2, y) -forest <- train(y ~ x1 + x2, data, WeightedVarianceSplitFinder(), MeanResponseCombiner(), MeanResponseCombiner(), ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5) +forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5) # Fix x2 to be 0 newData <- data.frame(x1 = seq(from=-2, to=2, by=0.5), x2 = 0) @@ -58,8 +59,7 @@ delta <- ifelse(u==T1, 1, ifelse(u==T2, 2, 0)) data <- data.frame(x1, x2) -forest <- train(CR_Response(delta, u) ~ x1 + x2, data, -LogRankSplitFinder(1:2), CompetingRiskResponseCombiner(1:2), CompetingRiskFunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10) +forest <- train(CR_Response(delta, u) ~ x1 + x2, data, ntree=100, numberOfSplits=5, mtry=1, nodeSize=10) newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0) ypred <- predict(forest, newData) } diff --git a/man/save_forest.Rd b/man/saveForest.Rd similarity index 76% rename from man/save_forest.Rd rename to man/saveForest.Rd index 39a8ef4..0efd402 100644 --- a/man/save_forest.Rd +++ b/man/saveForest.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/save_forest.R -\name{save_forest} -\alias{save_forest} +% Please edit documentation in R/saveForest.R +\name{saveForest} +\alias{saveForest} \title{Save Random Forests} \usage{ -save_forest(forest, directory, overwrite = FALSE) +saveForest(forest, directory, overwrite = FALSE) } \arguments{ \item{forest}{The forest to save.} @@ -30,9 +30,9 @@ data <- data.frame(x1, x2, y) forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5) -save_forest(forest, "trees") -new_forest <- load_forest("trees") +saveForest(forest, "trees") +new_forest <- loadForest("trees") } \seealso{ -\code{\link{train}}, \code{\link{load_forest}} +\code{\link{train}}, \code{\link{loadForest}} } diff --git a/man/train.Rd b/man/train.Rd index c5e3ad7..e6beb19 100644 --- a/man/train.Rd +++ b/man/train.Rd @@ -32,7 +32,7 @@ response as well).} forest training algorithm. See \code{\link{Competing Risk Split Finders}} or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one, this function tries to pick one based on the response. For -\code{\link{CR_Response}} wihtout censor times, it will pick a +\code{\link{CR_Response}} without censor times, it will pick a \code{\link{LogRankSplitFinder}}; while if censor times were provided it will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric responses it picks a \code{\link{WeightedVarianceSplitFinder}}.} @@ -67,26 +67,27 @@ randomly chosen to be tried in the splitting process. This value must be at least 1.} \item{nodeSize}{The algorithm will not attempt to split a node that has -observations less than 2*\code{nodeSize}; this results in terminal nodes -having a size of roughly \code{nodeSize} (true sizes may be both smaller or -greater). This value must be at least 1.} +observations less than 2*\code{nodeSize}; this guarantees that any two +sibling terminal nodes together have an average size of at least +\code{nodeSize}; note that it doesn't guarantee that every node is at least +as large as \code{nodeSize}.} \item{maxNodeDepth}{This parameter is analogous to \code{nodeSize} in that it -helps keep trees shorter; by default maxNodeDepth is an extremely high +controls tree length; by default \code{maxNodeDepth} is an extremely high number and tree depth is controlled by \code{nodeSize}.} \item{splitPureNodes}{This parameter determines whether the algorithm will split a pure node. If set to FALSE, then before every split it will check that every response is the same, and if so, not split. If set to TRUE it -forgoes that check and just splits. Prediction accuracy won't change under -any sensible \code{nodeResponseCombiner} as all terminal nodes from a split +forgoes that check and splits it. Prediction accuracy won't change under +any sensible \code{nodeResponseCombiner}; as all terminal nodes from a split pure node should give the same prediction, so this parameter only affects performance. If your response is continuous you'll likely experience faster train times by setting it to TRUE. Default value is TRUE.} \item{savePath}{If set, this parameter will save each tree of the random forest in this directory as the forest is trained. Use this parameter if -you need to save memory while training. See also \code{\link{load_forest}}} +you need to save memory while training. See also \code{\link{loadForest}}} \item{savePath.overwrite}{This parameter controls the behaviour for what happens if \code{savePath} is pointing to an existing directory. If set to @@ -102,15 +103,17 @@ a crash.} \item{cores}{This parameter specifies how many trees will be simultaneously trained. By default the package attempts to detect how many cores you have -by using the \code{parallel} package, and using all of them. You may +by using the \code{parallel} package and using all of them. You may specify a lower number if you wish. It is not recommended to specify a number greater than the number of available cores as this will hurt performance with no available benefit.} \item{randomSeed}{This parameter specifies a random seed if reproducible, -deterministic forests are desired. The number o1} +deterministic forests are desired.} -\item{formula}{You may specify the response and covariates as a formula instead; make sure the response in the formula is still properly constructed; see \code{responses}} +\item{formula}{You may specify the response and covariates as a formula +instead; make sure the response in the formula is still properly +constructed; see \code{responses}} } \value{ A \code{JRandomForest} object. You may call \code{predict} or @@ -124,7 +127,10 @@ parameters. Make sure these are compatible with each other, and with the response you plug in. \code{splitFinder} should work on the responses you are providing; \code{nodeResponseCombiner} should combine these responses into some intermediate product, and \code{forestResponseCombiner} combines these -intermediate products into the final output product. +intermediate products into the final output product. Note that +\code{nodeResponseCombiner} and \code{forestResponseCombiner} can be inferred +from the data (so feel free to not specify them), and \code{splitFinder} can +be inferred but you might want to change its default. } \note{ If saving memory is a concern, you can replace \code{covariateData} @@ -160,8 +166,8 @@ delta <- ifelse(u==T1, 1, ifelse(u==T2, 2, 0)) data <- data.frame(x1, x2) -forest <- train(CompetingRiskResponses(delta, u) ~ x1 + x2, data, -LogRankSplitFinder(1:2), CompetingRiskResponseCombiner(1:2), CompetingRiskFunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10) +forest <- train(CR_Response(delta, u) ~ x1 + x2, data, +LogRankSplitFinder(1:2), CR_kResponseCombiner(1:2), CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10) newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0) ypred <- predict(forest, newData) } diff --git a/tests/testthat/test_saving_loading.R b/tests/testthat/test_saving_loading.R index 10e09a8..19ae578 100644 --- a/tests/testthat/test_saving_loading.R +++ b/tests/testthat/test_saving_loading.R @@ -13,8 +13,8 @@ test_that("Can save & load regression example", { ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5) - save_forest(forest, "trees_saving_loading") - new_forest <- load_forest("trees_saving_loading") + saveForest(forest, "trees_saving_loading") + new_forest <- loadForest("trees_saving_loading") # try making a little prediction to verify it works newData <- data.frame(x1=seq(from=-3, to=3, by=0.5), x2=0) diff --git a/tests/testthat/test_saving_offline.R b/tests/testthat/test_saving_offline.R index 50d39af..9d2ac9b 100644 --- a/tests/testthat/test_saving_offline.R +++ b/tests/testthat/test_saving_offline.R @@ -20,7 +20,7 @@ test_that("Can save a random forest while training, and use it afterward", { predictions <- predict(forest, newData) # Also make sure we can load the forest too - newforest <- load_forest("trees") + newforest <- loadForest("trees") predictions <- predict(newforest, newData)