Cleanup function definitions and documentation
parent a3551694bd
commit 30d9060517

29 changed files with 160 additions and 279 deletions
DESCRIPTION
@@ -1,7 +1,7 @@
Package: largeRCRF
Type: Package
Title: Large Random Competing Risk Forests, Java Implementation Run in R
Version: 0.0.0.9036
Version: 0.0.0.9037
Authors@R: person("Joel", "Therrien", email = "joel@joeltherrien.ca", role = c("aut", "cre"))
Description: This package is used for training competing risk random forests on larger scale datasets.
    It currently only supports training models, running predictions, plotting those predictions (they are curves),
NAMESPACE
@@ -25,15 +25,12 @@ export(LogRankSplitFinder)
export(MeanResponseCombiner)
export(Numeric)
export(WeightedVarianceSplitFinder)
export(convertRListToJava)
export(extractCHF)
export(extractCIF)
export(extractMortalities)
export(extractSurvivorCurve)
export(load_covariate_list_from_settings)
export(load_forest)
export(load_forest_args_provided)
export(loadForest)
export(naiveConcordance)
export(save_forest)
export(saveForest)
export(train)
import(rJava)
@@ -28,7 +28,7 @@
#' delta <- ifelse(u == T1, 1, ifelse(u == T2, 2, 0))
#'
#' responses <- CR_Response(delta, u)
#' # Then use responses in train
#' # Then use responses in train or naiveConcordance
CR_Response <- function(delta, u, C = NULL){
  if(is.null(C)){
    return(Java_CompetingRiskResponses(delta, u))
@@ -10,16 +10,6 @@
#' The user only needs to pass this object into \code{\link{train}} as the
#' \code{forestResponseCombiner} parameter.
#'
#' @return A response combiner object to be used in \code{\link{train}}; not
#' useful on its own. However, internally, a response combiner object is a
#' list consisting of the following objects: \describe{
#' \item{\code{javaObject}}{The java object used in the algorithm}
#' \item{\code{call}}{The call (used in \code{print})}
#' \item{\code{outputClass}}{The R class of the outputs; used in
#' \code{\link{predict.JRandomForest}}} \item{\code{convertToRFunction}}{An R
#' function that converts a Java prediction from the combiner into R output
#' that is readable by a user.} }
#'
#' @param events A vector of integers specifying which competing risk events's
#' functions should be processed. This should correspond to all of the
#' competing risk events that can occur, from 1 to the largest number.

@@ -76,16 +66,6 @@ CR_FunctionCombiner <- function(events, times = NULL){
#' The user only needs to pass this object into \code{\link{train}} as the
#' \code{nodeResponseCombiner} parameter.
#'
#' @return A response combiner object to be used in \code{\link{train}}; not
#' useful on its own. However, internally, a response combiner object is a
#' list consisting of the following objects: \describe{
#' \item{\code{javaObject}}{The java object used in the algorithm}
#' \item{\code{call}}{The call (used in \code{print})}
#' \item{\code{outputClass}}{The R class of the outputs; used in
#' \code{\link{predict.JRandomForest}}} \item{\code{convertToRFunction}}{An R
#' function that converts a Java prediction from the combiner into R output
#' that is readable by a user.} }
#'
#' @param events A vector of integers specifying which competing risk events's
#' functions should be processed. This should correspond to all of the
#' competing risk events that can occur, from 1 to the largest number.
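
For context, an editorial sketch (not part of the commit) of the list structure the roxygen text above describes; the field names are taken from that documentation and CR_FunctionCombiner is used as the example:

# Hedged sketch: inspecting the documented fields of a combiner object.
combiner <- CR_FunctionCombiner(events = 1:2)  # events 1 and 2, as in the docs
combiner$javaObject                        # rJava object handed to the Java algorithm
combiner$call                              # the recorded call, used by print()
combiner$outputClass                       # R class of predictions, used by predict.JRandomForest
is.function(combiner$convertToRFunction)   # TRUE; converts Java predictions into R output
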
@@ -2,8 +2,8 @@
#' Naive Concordance
#'
#' Used to calculate a concordance index error. The user needs to supply a list
#' of mortalities, with each item in the list being a vector for the specific
#' events. To calculate mortalities a user should look to
#' of mortalities, with each item in the list being a vector for the
#' corresponding event. To calculate mortalities a user should look to
#' \code{\link{extractMortalities}}.
#'
#' @return A vector of 1 minus the concordance scores, with each element

@@ -16,6 +16,21 @@
#' list should correspond to one of the events in the order of event 1 to J,
#' and should be a vector of the same length as responses.
#' @export
#' @examples
#' data <- data.frame(delta=c(1,1,0,0,2,2), T=1:6, x=1:6)
#'
#' model <- train(CR_Response(delta, T) ~ x, data, ntree=100, numberOfSplits=0, mtry=1, nodeSize=1)
#'
#' newData <- data.frame(delta=c(1,0,2,1,0,2), T=1:6, x=1:6)
#' predictions <- predict(model, newData)
#'
#' mortalities <- list(
#'   extractMortalities(predictions, 1, 6),
#'   extractMortalities(predictions, 2, 6)
#' )
#'
#' naiveConcordance(CR_Response(newData$delta, newData$T), mortalities)
#'
naiveConcordance <- function(responses, predictedMortalities){
  if(is.null(responses)){
    stop("responses cannot be null")
@@ -1,6 +1,7 @@

convertCompetingRiskFunctionsSlow <- function(javaObject, forest){
# Internal function used to convert the Java functions into R functions
# Provided for use as a parameter in CR_FunctionCombiner & CR_ResponseCombiner
convertCompetingRiskFunctions <- compiler::cmpfun(function(javaObject, forest){
  events <- forest$params$forestResponseCombiner$events
  lst <- list(javaObject = javaObject, events = events)

@@ -24,9 +25,7 @@ convertCompetingRiskFunctionsSlow <- function(javaObject, forest){
  class(lst) <- "CompetingRiskFunctions"
  return(lst)
}

convertCompetingRiskFunctions <- compiler::cmpfun(convertCompetingRiskFunctionsSlow)
})


#' Competing Risk Predictions
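
As an aside, the refactoring above collapses a separately named function plus a compiler::cmpfun() call into a single byte-compiled anonymous function. A minimal sketch of the two equivalent patterns, using a placeholder body rather than the package's real conversion code:

# Old pattern: define a named function, then byte-compile it.
slowVersion <- function(x){ x + 1 }                    # placeholder body
fastVersion <- compiler::cmpfun(slowVersion)

# New pattern (as in the commit): byte-compile the anonymous function directly.
fastVersion <- compiler::cmpfun(function(x){ x + 1 })  # placeholder body
fastVersion(1)  # 2
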
@@ -1,4 +1,4 @@
# These functions are not exported, so I won't create their documentation either.
# These functions are not exported, so I won't provide their documentation either.
# I.e. it's not a mistake that the documentation below lacks the " ' " on each line.

# Covariates
@@ -1,5 +1,6 @@
# This file keeps track of the different Java classes used
# Whenever refactoring happens in the Java code, this file should be updated and (hopefully) nothing will break.
# This file keeps track of the different Java classes used. Whenever refactoring
# happens in the Java code, this file should be updated and (hopefully) nothing
# will break.

# General Java objects
.class_Object <- "java/lang/Object"

@@ -51,7 +52,7 @@

# When a class object is returned, rJava often often wants L prepended and ; appended.
# So a list that returns "java/lang/Object" should show "Ljava/lang/Object;"
# This function does that
# This function does that.
makeResponse <- function(className){
  return(paste0("L", className, ";"))
}
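
A quick illustration (editorial, not part of the diff) of the JNI-style signatures the comment above describes:

makeResponse(.class_Object)     # "Ljava/lang/Object;"
makeResponse("java/util/List")  # "Ljava/util/List;"
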
@@ -2,12 +2,12 @@

#' Load Random Forest
#'
#' Loads a random forest that was saved using \code{\link{save_forest}}.
#' Loads a random forest that was saved using \code{\link{saveForest}}.
#'
#' @param forest The directory created that saved the previous forest.
#' @return A JForest object; see \code{\link{train}} for details.
#' @export
#' @seealso \code{\link{train}}, \code{\link{save_forest}}
#' @seealso \code{\link{train}}, \code{\link{saveForest}}, \code{\link{loadForestArg}}
#' @examples
#' # Regression Example
#' x1 <- rnorm(1000)

@@ -18,9 +18,9 @@
#' forest <- train(y ~ x1 + x2, data,
#' ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
#'
#' save_forest(forest, "trees")
#' new_forest <- load_forest("trees")
load_forest <- function(directory){
#' saveForest(forest, "trees")
#' new_forest <- loadForest("trees")
loadForest <- function(directory){

  # First load the response combiners and the split finders
  nodeResponseCombiner.java <- .jcall(.class_DataUtils, makeResponse(.class_Object), "loadObject", paste0(directory, "/nodeResponseCombiner.jData"))

@@ -42,15 +42,20 @@ load_forest <- function(directory){
  params$splitFinder$javaObject <- splitFinder.java
  params$forestResponseCombiner$javaObject <- forestResponseCombiner.java

  forest <- load_forest_args_provided(directory, params$nodeResponseCombiner, params$splitFinder, params$forestResponseCombiner, covariateList, call,
  forest <- loadForestArgumentsSpecified(directory, params$nodeResponseCombiner, params$splitFinder, params$forestResponseCombiner, covariateList, call,
    params$ntree, params$numberOfSplits, params$mtry, params$nodeSize, params$maxNodeDepth, params$splitPureNodes)

  return(forest)

}

#' @export
load_forest_args_provided <- function(treeDirectory, nodeResponseCombiner, splitFinder, forestResponseCombiner,
# Internal function - if you really need to use it yourself (say to load forests
# saved directly through the Java interface into R), then look at the loadForest
# function to see how this function is used. I'm also open to writing a function
# that uses the Java version's settings yaml file to recreate the forest, but
# I'd appreciate knowing that someone's going to use it first (email me; see
# README).
loadForestArgumentsSpecified <- function(treeDirectory, nodeResponseCombiner, splitFinder, forestResponseCombiner,
  covariateList.java, call, ntree, numberOfSplits, mtry, nodeSize, maxNodeDepth = 100000, splitPureNodes=TRUE){

  params <- list(
R/misc.R
@@ -1,18 +1,4 @@
#' convertRListToJava
#'
#' An internal function that converts an R list of rJava objects into a
#' java.util.List rJava object containing those objects. It's used internally,
#' and is only available because it's used in some examples that demonstrate what
#' other objects do.
#' @param lst The R list containing rJava objects
#' @export
#' @return An rJava List object to be used internally.
#' @keywords internal
#' @examples
#' x <- Numeric(1:5)
#' class(x)
#' x <- convertRListToJava(x)
#' class(x)
# Internal function
convertRListToJava <- function(lst){
  javaList <- .jnew(.class_ArrayList, as.integer(length(lst)))
  javaList <- .jcast(javaList, .class_List)
R/predict.R
@@ -11,8 +11,9 @@
#' the dataset after the forest is trained.
#' @param parallel A logical indicating whether multiple cores should be
#' utilized when making the predictions. Available as an option because it's
#' been observed by this author that using Java's \code{parallelStream} can be
#' unstable on some systems. Default value is \code{TRUE}.
#' been observed that using Java's \code{parallelStream} can be unstable on
#' some systems. Default value is \code{TRUE}; only set to \code{FALSE} if you
#' get strange errors while predicting.
#' @param out.of.bag A logical indicating whether predictions should be based on
#' 'out of bag' trees; set only to \code{TRUE} if you're running predictions
#' on data that was used in the training. Default value is \code{FALSE}.

@@ -26,7 +27,7 @@
#' y <- 1 + x1 + x2 + rnorm(1000)
#'
#' data <- data.frame(x1, x2, y)
#' forest <- train(y ~ x1 + x2, data, WeightedVarianceSplitFinder(), MeanResponseCombiner(), MeanResponseCombiner(), ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
#' forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
#'
#' # Fix x2 to be 0
#' newData <- data.frame(x1 = seq(from=-2, to=2, by=0.5), x2 = 0)

@@ -46,8 +47,7 @@
#'
#' data <- data.frame(x1, x2)
#'
#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
#' LogRankSplitFinder(1:2), CompetingRiskResponseCombiner(1:2), CompetingRiskFunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data, ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
#' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
#' ypred <- predict(forest, newData)
predict.JRandomForest <- function(forest, newData=NULL, parallel=TRUE, out.of.bag=FALSE){
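
An editorial sketch (not part of the diff) showing how the documented arguments combine; forest, data and newData are assumed to come from the regression example above:

predictions <- predict(forest, newData)                          # default: parallel = TRUE
serialPredictions <- predict(forest, newData, parallel = FALSE)  # if parallelStream misbehaves
oobPredictions <- predict(forest, data, out.of.bag = TRUE)       # only for the training data
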
@@ -1,37 +0,0 @@

recover_forest_predictable <- function(tree_directory, settingsPath) {

  settings.java <- load_settings(settingsPath)

  nodeResponseCombiner.java <- .jcall(settings.java, makeResponse(.class_ResponseCombiner), "getResponseCombiner")
  splitFinder.java <- .jcall(settings.java, makeResponse(.class_SplitFinder), "getSplitFinder")
  forestResponseCombiner.java <- .jcall(settings.java, makeResponse(.class_ResponseCombiner), "getTreeCombiner")

  covariateList <- .jcall(settings.java, makeResponse(.class_List), "getCovariates")

  params <- readRDS(paste0(directory, "/parameters.rData"))
  call <- readRDS(paste0(directory, "/call.rData"))

  params$nodeResponseCombiner$javaObject <- nodeResponseCombiner.java
  params$splitFinder$javaObject <- splitFinder.java
  params$forestResponseCombiner$javaObject <- forestResponseCombiner.java

  forest <- load_forest_args_provided(directory, params$nodeResponseCombiner, params$splitFinder, params$forestResponseCombiner, covariateList, params, call)

  return(forest)

}

load_settings <- function(settingsPath) {
  settingsFile <- .jnew(.class_File, settingsPath)
  settings.java <- .jcall(.class_Settings, makeResponse(.class_Settings), "load", settingsFile)

  return(settings.java)
}

#' @export
load_covariate_list_from_settings <- function(settingsPath){
  settings.java = load_settings(settingsPath)
  covariateList <- .jcall(settings.java, makeResponse(.class_List), "getCovariates")
  return(covariateList)
}
@@ -3,20 +3,21 @@
#'
#' This split finder is used in regression random forests. When a split is made,
#' this finder computes the sample variance in each group (divided by n, not
#' n-1); it then minimizes the the sum of these variances, each of them weighted
#' by their sample size divided by the total sample size of that node.
#' n-1); it then minimizes the sum of these variances, each of them weighted by
#' their sample size divided by the total sample size of that node.
#'
#' @note There are other split finders that are used in regression random
#' forests that are not included in this package. This package is oriented
#' toward the competing risk side of survival analysis; the regression options
#' are provided as an example of how extensible the back-end Java package is.
#' If you are interested in using this package for regression (or other uses),
#' feel free to write your own components. It's really not hard to write these
#' components; the WeightedVarianceSplitFinder Java class is quite short; most
#' of the code is to reuse calculations from previous considered splits.
#' toward the competing risks side of survival analysis; the regression
#' options are provided as an example of how extensible the back-end Java
#' package is. If you are interested in using this package for regression (or
#' other uses), feel free to write your own components. It's not too hard to
#' write these components; the WeightedVarianceSplitFinder Java class is quite
#' short; most of the code is to reuse calculations from previous considered
#' splits. I (the author) am also willing to assist if you have any questions.
#' @export
#' @return A split finder object to be used in \code{\link{train}}; not
#' useful on its own.
#' @return A split finder object to be used in \code{\link{train}}; not useful
#' on its own.
#' @examples
#' splitFinder <- WeightedVarianceSplitFinder()
#' # You would then use it in train()
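
For clarity, an editorial sketch (in R, not the package's Java code) of the split criterion the text above describes: per-group sample variance divided by n, each weighted by the group's share of the node's observations.

weightedVarianceCriterion <- function(leftY, rightY){
  n <- length(leftY) + length(rightY)
  varN <- function(y) mean((y - mean(y))^2)   # divide by n, not n-1
  (length(leftY) / n) * varN(leftY) + (length(rightY) / n) * varN(rightY)
}
# A split finder of this kind picks the candidate split minimizing this value.
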
@@ -41,16 +42,6 @@ WeightedVarianceSplitFinder <- function(){
#' \code{forestResponseCombiner} parameters in \code{\link{train}} when doing
#' regression.
#' @export
#' @return A response combiner object to be used in \code{\link{train}}; not
#' useful on its own. However, internally, a response combiner object is a
#' list consisting of the following objects:
#' \describe{
#' \item{\code{javaObject}}{The java object used in the algorithm}
#' \item{\code{call}}{The call (used in \code{print})}
#' \item{\code{outputClass}}{The R class of the outputs; used in \code{\link{predict.JRandomForest}}}
#' \item{\code{convertToRFunction}}{An R function that converts a Java prediction from the combiner into R output that is readable by a user.}
#' }
#'
#' @examples
#' responseCombiner <- MeanResponseCombiner()
#' # You would then use it in train()

@@ -58,7 +49,7 @@ WeightedVarianceSplitFinder <- function(){
#' # However; I'll show an internal Java method to make it clear what it does
#' # Note that you should never have to do the following
#' x <- 1:3
#' x <- convertRListToJava(Numeric(x))
#' x <- largeRCRF:::convertRListToJava(Numeric(x))
#'
#' # will output a Java object containing 2
#' output <- rJava::.jcall(responseCombiner$javaObject, "Ljava/lang/Double;", "combine", x)
@@ -13,7 +13,7 @@
#' default.

#' @export
#' @seealso \code{\link{train}}, \code{\link{load_forest}}
#' @seealso \code{\link{train}}, \code{\link{loadForest}}
#' @examples
#' # Regression Example
#' x1 <- rnorm(1000)

@@ -24,9 +24,9 @@
#' forest <- train(y ~ x1 + x2, data,
#' ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
#'
#' save_forest(forest, "trees")
#' new_forest <- load_forest("trees")
save_forest <- function(forest, directory, overwrite=FALSE){
#' saveForest(forest, "trees")
#' new_forest <- loadForest("trees")
saveForest <- function(forest, directory, overwrite=FALSE){
  check_and_create_directory(directory, overwrite)

  saveTrees(forest, directory)
R/train.R
@@ -1,4 +1,5 @@

# Internal function to calculate how many CPU cores are available.
getCores <- function(){
  cores <- NA
  if (requireNamespace("parallel", quietly = TRUE)){
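
An editorial sketch (not necessarily the package's exact code) of the pattern this internal helper appears to follow, assuming parallel::detectCores() as referenced in the cores documentation further down:

getCoresSketch <- function(){
  cores <- NA
  if (requireNamespace("parallel", quietly = TRUE)){
    cores <- parallel::detectCores()
  }
  if (is.na(cores)){
    cores <- 1  # assumed fallback when detection fails
  }
  return(cores)
}
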
@@ -22,7 +23,10 @@ getCores(){
#' response you plug in. \code{splitFinder} should work on the responses you are
#' providing; \code{nodeResponseCombiner} should combine these responses into
#' some intermediate product, and \code{forestResponseCombiner} combines these
#' intermediate products into the final output product.
#' intermediate products into the final output product. Note that
#' \code{nodeResponseCombiner} and \code{forestResponseCombiner} can be inferred
#' from the data (so feel free to not specify them), and \code{splitFinder} can
#' be inferred but you might want to change its default.
#'
#' @param responses An R list of the responses. See \code{\link{CR_Response}}
#' for an example function.

@@ -34,7 +38,7 @@ getCores(){
#' forest training algorithm. See \code{\link{Competing Risk Split Finders}}
#' or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
#' this function tries to pick one based on the response. For
#' \code{\link{CR_Response}} wihtout censor times, it will pick a
#' \code{\link{CR_Response}} without censor times, it will pick a
#' \code{\link{LogRankSplitFinder}}; while if censor times were provided it
#' will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
#' responses it picks a \code{\link{WeightedVarianceSplitFinder}}.

@@ -63,23 +67,24 @@ getCores(){
#' randomly chosen to be tried in the splitting process. This value must be at
#' least 1.
#' @param nodeSize The algorithm will not attempt to split a node that has
#' observations less than 2*\code{nodeSize}; this results in terminal nodes
#' having a size of roughly \code{nodeSize} (true sizes may be both smaller or
#' greater). This value must be at least 1.
#' observations less than 2*\code{nodeSize}; this guarantees that any two
#' sibling terminal nodes together have an average size of at least
#' \code{nodeSize}; note that it doesn't guarantee that every node is at least
#' as large as \code{nodeSize}.
#' @param maxNodeDepth This parameter is analogous to \code{nodeSize} in that it
#' helps keep trees shorter; by default maxNodeDepth is an extremely high
#' controls tree length; by default \code{maxNodeDepth} is an extremely high
#' number and tree depth is controlled by \code{nodeSize}.
#' @param splitPureNodes This parameter determines whether the algorithm will
#' split a pure node. If set to FALSE, then before every split it will check
#' that every response is the same, and if so, not split. If set to TRUE it
#' forgoes that check and just splits. Prediction accuracy won't change under
#' any sensible \code{nodeResponseCombiner} as all terminal nodes from a split
#' forgoes that check and splits it. Prediction accuracy won't change under
#' any sensible \code{nodeResponseCombiner}; as all terminal nodes from a split
#' pure node should give the same prediction, so this parameter only affects
#' performance. If your response is continuous you'll likely experience faster
#' train times by setting it to TRUE. Default value is TRUE.
#' @param savePath If set, this parameter will save each tree of the random
#' forest in this directory as the forest is trained. Use this parameter if
#' you need to save memory while training. See also \code{\link{load_forest}}
#' you need to save memory while training. See also \code{\link{loadForest}}
#' @param savePath.overwrite This parameter controls the behaviour for what
#' happens if \code{savePath} is pointing to an existing directory. If set to
#' \code{warn} (default) then \code{train} refuses to proceed. If set to
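
To tie the parameters documented above together, an editorial sketch (not part of the diff) of a call that sets them explicitly; the data mirror the package's own regression example and the values are illustrative only:

x1 <- rnorm(1000); x2 <- rnorm(1000)
data <- data.frame(x1, x2, y = 1 + x1 + x2 + rnorm(1000))
forest <- train(y ~ x1 + x2, data,
                ntree = 100, numberOfSplits = 5, mtry = 1, nodeSize = 5,
                maxNodeDepth = 100000,   # extremely high by default; nodeSize governs depth
                splitPureNodes = TRUE)   # skip the purity check; faster for continuous responses
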
@@ -93,12 +98,12 @@ getCores(){
#' a crash.
#' @param cores This parameter specifies how many trees will be simultaneously
#' trained. By default the package attempts to detect how many cores you have
#' by using the \code{parallel} package, and using all of them. You may
#' by using the \code{parallel} package and using all of them. You may
#' specify a lower number if you wish. It is not recommended to specify a
#' number greater than the number of available cores as this will hurt
#' performance with no available benefit.
#' @param randomSeed This parameter specifies a random seed if reproducible,
#' deterministic forests are desired. The number o1
#' deterministic forests are desired.
#' @export
#' @return A \code{JRandomForest} object. You may call \code{predict} or
#' \code{print} on it.

@@ -135,8 +140,8 @@ getCores(){
#'
#' data <- data.frame(x1, x2)
#'
#' forest <- train(CompetingRiskResponses(delta, u) ~ x1 + x2, data,
#' LogRankSplitFinder(1:2), CompetingRiskResponseCombiner(1:2), CompetingRiskFunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
#' LogRankSplitFinder(1:2), CR_kResponseCombiner(1:2), CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
#' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
#' ypred <- predict(forest, newData)
train <- function(x, ...) UseMethod("train")

@@ -280,20 +285,6 @@ train.default <- function(responses, covariateData, splitFinder = splitFinderDef

  forestObject <- list(call=match.call(), params=params, javaObject=forest.java, covariateList=dataset$covariateList)

  # TODO - remove redundant code if tests pass
  #forestObject$params <- list(
  #  splitFinder=splitFinder,
  #  nodeResponseCombiner=nodeResponseCombiner,
  #  forestResponseCombiner=forestResponseCombiner,
  #  ntree=ntree,
  #  numberOfSplits=numberOfSplits,
  #  mtry=mtry,
  #  nodeSize=nodeSize,
  #  splitPureNodes=splitPureNodes,
  #  maxNodeDepth = maxNodeDepth,
  #  savePath=savePath
  #)

  class(forestObject) <- "JRandomForest"
  return(forestObject)

@@ -304,7 +295,9 @@ train.default <- function(responses, covariateData, splitFinder = splitFinderDef

#' @rdname train
#' @export
#' @param formula You may specify the response and covariates as a formula instead; make sure the response in the formula is still properly constructed; see \code{responses}
#' @param formula You may specify the response and covariates as a formula
#' instead; make sure the response in the formula is still properly
#' constructed; see \code{responses}
train.formula <- function(formula, covariateData, ...){

  # Having an R copy of the data loaded at the same time can be wasteful; we
@@ -1,12 +0,0 @@
wrapFunction <- function(mf){
  f <- function(x){

    y <- vector(mode="numeric", length=length(x))
    for(i in 1:length(x)){
      y[i] <- .jcall(mf, "D", "evaluate", x[i])
    }

    return(y)

  }
}
@@ -17,17 +17,6 @@ faster performance when predicting, however if the times are not exhaustive
then the resulting curves will not update at that point (they'll be flat).
If left blank, the package will default to using all of the time points.}
}
\value{
A response combiner object to be used in \code{\link{train}}; not
useful on its own. However, internally, a response combiner object is a
list consisting of the following objects: \describe{
\item{\code{javaObject}}{The java object used in the algorithm}
\item{\code{call}}{The call (used in \code{print})}
\item{\code{outputClass}}{The R class of the outputs; used in
\code{\link{predict.JRandomForest}}} \item{\code{convertToRFunction}}{An R
function that converts a Java prediction from the combiner into R output
that is readable by a user.} }
}
\description{
Creates a CompetingRiskFunctionCombiner rJava object, which is used
internally for constructing a forest. The forest uses it when creating
@@ -38,5 +38,5 @@ u <- pmin(T1, T2, C)
delta <- ifelse(u == T1, 1, ifelse(u == T2, 2, 0))

responses <- CR_Response(delta, u)
# Then use responses in train
# Then use responses in train or naiveConcordance
}
@@ -11,17 +11,6 @@ CR_ResponseCombiner(events)
functions should be processed. This should correspond to all of the
competing risk events that can occur, from 1 to the largest number.}
}
\value{
A response combiner object to be used in \code{\link{train}}; not
useful on its own. However, internally, a response combiner object is a
list consisting of the following objects: \describe{
\item{\code{javaObject}}{The java object used in the algorithm}
\item{\code{call}}{The call (used in \code{print})}
\item{\code{outputClass}}{The R class of the outputs; used in
\code{\link{predict.JRandomForest}}} \item{\code{convertToRFunction}}{An R
function that converts a Java prediction from the combiner into R output
that is readable by a user.} }
}
\description{
Creates a CompetingRiskResponseCombiner rJava object, which is used
internally for constructing a forest. It is used when each tree in the forest
@@ -6,17 +6,6 @@
\usage{
MeanResponseCombiner()
}
\value{
A response combiner object to be used in \code{\link{train}}; not
useful on its own. However, internally, a response combiner object is a
list consisting of the following objects:
\describe{
\item{\code{javaObject}}{The java object used in the algorithm}
\item{\code{call}}{The call (used in \code{print})}
\item{\code{outputClass}}{The R class of the outputs; used in \code{\link{predict.JRandomForest}}}
\item{\code{convertToRFunction}}{An R function that converts a Java prediction from the combiner into R output that is readable by a user.}
}
}
\description{
This response combiner is used in regression random forests, where the
response in the data is a single number that needs to be averaged in each

@@ -32,7 +21,7 @@ responseCombiner <- MeanResponseCombiner()
# However; I'll show an internal Java method to make it clear what it does
# Note that you should never have to do the following
x <- 1:3
x <- convertRListToJava(Numeric(x))
x <- largeRCRF:::convertRListToJava(Numeric(x))

# will output a Java object containing 2
output <- rJava::.jcall(responseCombiner$javaObject, "Ljava/lang/Double;", "combine", x)
@@ -7,24 +7,25 @@
WeightedVarianceSplitFinder()
}
\value{
A split finder object to be used in \code{\link{train}}; not
useful on its own.
A split finder object to be used in \code{\link{train}}; not useful
on its own.
}
\description{
This split finder is used in regression random forests. When a split is made,
this finder computes the sample variance in each group (divided by n, not
n-1); it then minimizes the the sum of these variances, each of them weighted
by their sample size divided by the total sample size of that node.
n-1); it then minimizes the sum of these variances, each of them weighted by
their sample size divided by the total sample size of that node.
}
\note{
There are other split finders that are used in regression random
forests that are not included in this package. This package is oriented
toward the competing risk side of survival analysis; the regression options
are provided as an example of how extensible the back-end Java package is.
If you are interested in using this package for regression (or other uses),
feel free to write your own components. It's really not hard to write these
components; the WeightedVarianceSplitFinder Java class is quite short; most
of the code is to reuse calculations from previous considered splits.
toward the competing risks side of survival analysis; the regression
options are provided as an example of how extensible the back-end Java
package is. If you are interested in using this package for regression (or
other uses), feel free to write your own components. It's not too hard to
write these components; the WeightedVarianceSplitFinder Java class is quite
short; most of the code is to reuse calculations from previous considered
splits. I (the author) am also willing to assist if you have any questions.
}
\examples{
splitFinder <- WeightedVarianceSplitFinder()
@@ -1,27 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/misc.R
\name{convertRListToJava}
\alias{convertRListToJava}
\title{convertRListToJava}
\usage{
convertRListToJava(lst)
}
\arguments{
\item{lst}{The R list containing rJava objects}
}
\value{
An rJava List object to be used internally.
}
\description{
An internal function that converts an R list of rJava objects into a
java.util.List rJava object containing those objects. It's used internally,
and is only available because it's used in some examples that demonstrate what
other objects do.
}
\examples{
x <- Numeric(1:5)
class(x)
x <- convertRListToJava(x)
class(x)
}
\keyword{internal}
@@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/load_forest.R
\name{load_forest}
\alias{load_forest}
% Please edit documentation in R/loadForest.R
\name{loadForest}
\alias{loadForest}
\title{Load Random Forest}
\usage{
load_forest(directory)
loadForest(directory)
}
\arguments{
\item{forest}{The directory created that saved the previous forest.}

@@ -13,7 +13,7 @@ load_forest(directory)
A JForest object; see \code{\link{train}} for details.
}
\description{
Loads a random forest that was saved using \code{\link{save_forest}}.
Loads a random forest that was saved using \code{\link{saveForest}}.
}
\examples{
# Regression Example

@@ -25,9 +25,9 @@ data <- data.frame(x1, x2, y)
forest <- train(y ~ x1 + x2, data,
ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)

save_forest(forest, "trees")
new_forest <- load_forest("trees")
saveForest(forest, "trees")
new_forest <- loadForest("trees")
}
\seealso{
\code{\link{train}}, \code{\link{save_forest}}
\code{\link{train}}, \code{\link{saveForest}}, \code{\link{loadForestArg}}
}
@@ -21,7 +21,23 @@ A vector of 1 minus the concordance scores, with each element
}
\description{
Used to calculate a concordance index error. The user needs to supply a list
of mortalities, with each item in the list being a vector for the specific
events. To calculate mortalities a user should look to
of mortalities, with each item in the list being a vector for the
corresponding event. To calculate mortalities a user should look to
\code{\link{extractMortalities}}.
}
\examples{
data <- data.frame(delta=c(1,1,0,0,2,2), T=1:6, x=1:6)

model <- train(CR_Response(delta, T) ~ x, data, ntree=100, numberOfSplits=0, mtry=1, nodeSize=1)

newData <- data.frame(delta=c(1,0,2,1,0,2), T=1:6, x=1:6)
predictions <- predict(model, newData)

mortalities <- list(
  extractMortalities(predictions, 1, 6),
  extractMortalities(predictions, 2, 6)
)

naiveConcordance(CR_Response(newData$delta, newData$T), mortalities)

}
@@ -17,8 +17,9 @@ the dataset after the forest is trained.}

\item{parallel}{A logical indicating whether multiple cores should be
utilized when making the predictions. Available as an option because it's
been observed by this author that using Java's \code{parallelStream} can be
unstable on some systems. Default value is \code{TRUE}.}
been observed that using Java's \code{parallelStream} can be unstable on
some systems. Default value is \code{TRUE}; only set to \code{FALSE} if you
get strange errors while predicting.}

\item{out.of.bag}{A logical indicating whether predictions should be based on
'out of bag' trees; set only to \code{TRUE} if you're running predictions

@@ -38,7 +39,7 @@ x2 <- rnorm(1000)
y <- 1 + x1 + x2 + rnorm(1000)

data <- data.frame(x1, x2, y)
forest <- train(y ~ x1 + x2, data, WeightedVarianceSplitFinder(), MeanResponseCombiner(), MeanResponseCombiner(), ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)

# Fix x2 to be 0
newData <- data.frame(x1 = seq(from=-2, to=2, by=0.5), x2 = 0)

@@ -58,8 +59,7 @@ delta <- ifelse(u==T1, 1, ifelse(u==T2, 2, 0))

data <- data.frame(x1, x2)

forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
LogRankSplitFinder(1:2), CompetingRiskResponseCombiner(1:2), CompetingRiskFunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
forest <- train(CR_Response(delta, u) ~ x1 + x2, data, ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
ypred <- predict(forest, newData)
}
@@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/save_forest.R
\name{save_forest}
\alias{save_forest}
% Please edit documentation in R/saveForest.R
\name{saveForest}
\alias{saveForest}
\title{Save Random Forests}
\usage{
save_forest(forest, directory, overwrite = FALSE)
saveForest(forest, directory, overwrite = FALSE)
}
\arguments{
\item{forest}{The forest to save.}

@@ -30,9 +30,9 @@ data <- data.frame(x1, x2, y)
forest <- train(y ~ x1 + x2, data,
ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)

save_forest(forest, "trees")
new_forest <- load_forest("trees")
saveForest(forest, "trees")
new_forest <- loadForest("trees")
}
\seealso{
\code{\link{train}}, \code{\link{load_forest}}
\code{\link{train}}, \code{\link{loadForest}}
}
man/train.Rd
@@ -32,7 +32,7 @@ response as well).}
forest training algorithm. See \code{\link{Competing Risk Split Finders}}
or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
this function tries to pick one based on the response. For
\code{\link{CR_Response}} wihtout censor times, it will pick a
\code{\link{CR_Response}} without censor times, it will pick a
\code{\link{LogRankSplitFinder}}; while if censor times were provided it
will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
responses it picks a \code{\link{WeightedVarianceSplitFinder}}.}

@@ -67,26 +67,27 @@ randomly chosen to be tried in the splitting process. This value must be at
least 1.}

\item{nodeSize}{The algorithm will not attempt to split a node that has
observations less than 2*\code{nodeSize}; this results in terminal nodes
having a size of roughly \code{nodeSize} (true sizes may be both smaller or
greater). This value must be at least 1.}
observations less than 2*\code{nodeSize}; this guarantees that any two
sibling terminal nodes together have an average size of at least
\code{nodeSize}; note that it doesn't guarantee that every node is at least
as large as \code{nodeSize}.}

\item{maxNodeDepth}{This parameter is analogous to \code{nodeSize} in that it
helps keep trees shorter; by default maxNodeDepth is an extremely high
controls tree length; by default \code{maxNodeDepth} is an extremely high
number and tree depth is controlled by \code{nodeSize}.}

\item{splitPureNodes}{This parameter determines whether the algorithm will
split a pure node. If set to FALSE, then before every split it will check
that every response is the same, and if so, not split. If set to TRUE it
forgoes that check and just splits. Prediction accuracy won't change under
any sensible \code{nodeResponseCombiner} as all terminal nodes from a split
forgoes that check and splits it. Prediction accuracy won't change under
any sensible \code{nodeResponseCombiner}; as all terminal nodes from a split
pure node should give the same prediction, so this parameter only affects
performance. If your response is continuous you'll likely experience faster
train times by setting it to TRUE. Default value is TRUE.}

\item{savePath}{If set, this parameter will save each tree of the random
forest in this directory as the forest is trained. Use this parameter if
you need to save memory while training. See also \code{\link{load_forest}}}
you need to save memory while training. See also \code{\link{loadForest}}}

\item{savePath.overwrite}{This parameter controls the behaviour for what
happens if \code{savePath} is pointing to an existing directory. If set to

@@ -102,15 +103,17 @@ a crash.}

\item{cores}{This parameter specifies how many trees will be simultaneously
trained. By default the package attempts to detect how many cores you have
by using the \code{parallel} package, and using all of them. You may
by using the \code{parallel} package and using all of them. You may
specify a lower number if you wish. It is not recommended to specify a
number greater than the number of available cores as this will hurt
performance with no available benefit.}

\item{randomSeed}{This parameter specifies a random seed if reproducible,
deterministic forests are desired. The number o1}
deterministic forests are desired.}

\item{formula}{You may specify the response and covariates as a formula instead; make sure the response in the formula is still properly constructed; see \code{responses}}
\item{formula}{You may specify the response and covariates as a formula
instead; make sure the response in the formula is still properly
constructed; see \code{responses}}
}
\value{
A \code{JRandomForest} object. You may call \code{predict} or

@@ -124,7 +127,10 @@ parameters. Make sure these are compatible with each other, and with the
response you plug in. \code{splitFinder} should work on the responses you are
providing; \code{nodeResponseCombiner} should combine these responses into
some intermediate product, and \code{forestResponseCombiner} combines these
intermediate products into the final output product.
intermediate products into the final output product. Note that
\code{nodeResponseCombiner} and \code{forestResponseCombiner} can be inferred
from the data (so feel free to not specify them), and \code{splitFinder} can
be inferred but you might want to change its default.
}
\note{
If saving memory is a concern, you can replace \code{covariateData}

@@ -160,8 +166,8 @@ delta <- ifelse(u==T1, 1, ifelse(u==T2, 2, 0))

data <- data.frame(x1, x2)

forest <- train(CompetingRiskResponses(delta, u) ~ x1 + x2, data,
LogRankSplitFinder(1:2), CompetingRiskResponseCombiner(1:2), CompetingRiskFunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
LogRankSplitFinder(1:2), CR_kResponseCombiner(1:2), CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
ypred <- predict(forest, newData)
}
@@ -13,8 +13,8 @@ test_that("Can save & load regression example", {
ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)


save_forest(forest, "trees_saving_loading")
new_forest <- load_forest("trees_saving_loading")
saveForest(forest, "trees_saving_loading")
new_forest <- loadForest("trees_saving_loading")

# try making a little prediction to verify it works
newData <- data.frame(x1=seq(from=-3, to=3, by=0.5), x2=0)
@@ -20,7 +20,7 @@ test_that("Can save a random forest while training, and use it afterward", {
predictions <- predict(forest, newData)

# Also make sure we can load the forest too
newforest <- load_forest("trees")
newforest <- loadForest("trees")
predictions <- predict(newforest, newData)