Cleanup function defintions and documentation

This commit is contained in:
Joel Therrien 2019-06-06 15:53:25 -07:00
parent a3551694bd
commit 30d9060517
29 changed files with 160 additions and 279 deletions

View file

@ -1,7 +1,7 @@
Package: largeRCRF
Type: Package
Title: Large Random Competing Risk Forests, Java Implementation Run in R
Version: 0.0.0.9036
Version: 0.0.0.9037
Authors@R: person("Joel", "Therrien", email = "joel@joeltherrien.ca", role = c("aut", "cre"))
Description: This package is used for training competing risk random forests on larger scale datasets.
It currently only supports training models, running predictions, plotting those predictions (they are curves),

View file

@ -25,15 +25,12 @@ export(LogRankSplitFinder)
export(MeanResponseCombiner)
export(Numeric)
export(WeightedVarianceSplitFinder)
export(convertRListToJava)
export(extractCHF)
export(extractCIF)
export(extractMortalities)
export(extractSurvivorCurve)
export(load_covariate_list_from_settings)
export(load_forest)
export(load_forest_args_provided)
export(loadForest)
export(naiveConcordance)
export(save_forest)
export(saveForest)
export(train)
import(rJava)

View file

@ -28,7 +28,7 @@
#' delta <- ifelse(u == T1, 1, ifelse(u == T2, 2, 0))
#'
#' responses <- CR_Response(delta, u)
#' # Then use responses in train
#' # Then use responses in train or naiveConcordance
CR_Response <- function(delta, u, C = NULL){
if(is.null(C)){
return(Java_CompetingRiskResponses(delta, u))

View file

@ -10,16 +10,6 @@
#' The user only needs to pass this object into \code{\link{train}} as the
#' \code{forestResponseCombiner} parameter.
#'
#' @return A response combiner object to be used in \code{\link{train}}; not
#' useful on its own. However, internally, a response combiner object is a
#' list consisting of the following objects: \describe{
#' \item{\code{javaObject}}{The java object used in the algorithm}
#' \item{\code{call}}{The call (used in \code{print})}
#' \item{\code{outputClass}}{The R class of the outputs; used in
#' \code{\link{predict.JRandomForest}}} \item{\code{convertToRFunction}}{An R
#' function that converts a Java prediction from the combiner into R output
#' that is readable by a user.} }
#'
#' @param events A vector of integers specifying which competing risk events's
#' functions should be processed. This should correspond to all of the
#' competing risk events that can occur, from 1 to the largest number.
@ -76,16 +66,6 @@ CR_FunctionCombiner <- function(events, times = NULL){
#' The user only needs to pass this object into \code{\link{train}} as the
#' \code{nodeResponseCombiner} parameter.
#'
#' @return A response combiner object to be used in \code{\link{train}}; not
#' useful on its own. However, internally, a response combiner object is a
#' list consisting of the following objects: \describe{
#' \item{\code{javaObject}}{The java object used in the algorithm}
#' \item{\code{call}}{The call (used in \code{print})}
#' \item{\code{outputClass}}{The R class of the outputs; used in
#' \code{\link{predict.JRandomForest}}} \item{\code{convertToRFunction}}{An R
#' function that converts a Java prediction from the combiner into R output
#' that is readable by a user.} }
#'
#' @param events A vector of integers specifying which competing risk events's
#' functions should be processed. This should correspond to all of the
#' competing risk events that can occur, from 1 to the largest number.

View file

@ -2,8 +2,8 @@
#' Naive Concordance
#'
#' Used to calculate a concordance index error. The user needs to supply a list
#' of mortalities, with each item in the list being a vector for the specific
#' events. To calculate mortalities a user should look to
#' of mortalities, with each item in the list being a vector for the
#' corresponding event. To calculate mortalities a user should look to
#' \code{\link{extractMortalities}}.
#'
#' @return A vector of 1 minus the concordance scores, with each element
@ -16,6 +16,21 @@
#' list should correspond to one of the events in the order of event 1 to J,
#' and should be a vector of the same length as responses.
#' @export
#' @examples
#' data <- data.frame(delta=c(1,1,0,0,2,2), T=1:6, x=1:6)
#'
#' model <- train(CR_Response(delta, T) ~ x, data, ntree=100, numberOfSplits=0, mtry=1, nodeSize=1)
#'
#' newData <- data.frame(delta=c(1,0,2,1,0,2), T=1:6, x=1:6)
#' predictions <- predict(model, newData)
#'
#' mortalities <- list(
#' extractMortalities(predictions, 1, 6),
#' extractMortalities(predictions, 2, 6)
#' )
#'
#' naiveConcordance(CR_Response(newData$delta, newData$T), mortalities)
#'
naiveConcordance <- function(responses, predictedMortalities){
if(is.null(responses)){
stop("responses cannot be null")

View file

@ -1,6 +1,7 @@
convertCompetingRiskFunctionsSlow <- function(javaObject, forest){
# Internal function used to convert the Java functions into R functions
# Provided for use as a parameter in CR_FunctionCombiner & CR_ResponseCombiner
convertCompetingRiskFunctions <- compiler::cmpfun(function(javaObject, forest){
events <- forest$params$forestResponseCombiner$events
lst <- list(javaObject = javaObject, events = events)
@ -24,9 +25,7 @@ convertCompetingRiskFunctionsSlow <- function(javaObject, forest){
class(lst) <- "CompetingRiskFunctions"
return(lst)
}
convertCompetingRiskFunctions <- compiler::cmpfun(convertCompetingRiskFunctionsSlow)
})
#' Competing Risk Predictions

View file

@ -1,4 +1,4 @@
# These functions are not exported, so I won't create their documentation either.
# These functions are not exported, so I won't provide their documentation either.
# I.e. it's not a mistake that the documentation below lacks the " ' " on each line.
# Covariates

View file

@ -1,5 +1,6 @@
# This file keeps track of the different Java classes used
# Whenever refactoring happens in the Java code, this file should be updated and (hopefully) nothing will break.
# This file keeps track of the different Java classes used. Whenever refactoring
# happens in the Java code, this file should be updated and (hopefully) nothing
# will break.
# General Java objects
.class_Object <- "java/lang/Object"
@ -51,7 +52,7 @@
# When a class object is returned, rJava often often wants L prepended and ; appended.
# So a list that returns "java/lang/Object" should show "Ljava/lang/Object;"
# This function does that
# This function does that.
makeResponse <- function(className){
return(paste0("L", className, ";"))
}

View file

@ -2,12 +2,12 @@
#' Load Random Forest
#'
#' Loads a random forest that was saved using \code{\link{save_forest}}.
#' Loads a random forest that was saved using \code{\link{saveForest}}.
#'
#' @param forest The directory created that saved the previous forest.
#' @return A JForest object; see \code{\link{train}} for details.
#' @export
#' @seealso \code{\link{train}}, \code{\link{save_forest}}
#' @seealso \code{\link{train}}, \code{\link{saveForest}}, \code{\link{loadForestArg}}
#' @examples
#' # Regression Example
#' x1 <- rnorm(1000)
@ -18,9 +18,9 @@
#' forest <- train(y ~ x1 + x2, data,
#' ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
#'
#' save_forest(forest, "trees")
#' new_forest <- load_forest("trees")
load_forest <- function(directory){
#' saveForest(forest, "trees")
#' new_forest <- loadForest("trees")
loadForest <- function(directory){
# First load the response combiners and the split finders
nodeResponseCombiner.java <- .jcall(.class_DataUtils, makeResponse(.class_Object), "loadObject", paste0(directory, "/nodeResponseCombiner.jData"))
@ -42,15 +42,20 @@ load_forest <- function(directory){
params$splitFinder$javaObject <- splitFinder.java
params$forestResponseCombiner$javaObject <- forestResponseCombiner.java
forest <- load_forest_args_provided(directory, params$nodeResponseCombiner, params$splitFinder, params$forestResponseCombiner, covariateList, call,
forest <- loadForestArgumentsSpecified(directory, params$nodeResponseCombiner, params$splitFinder, params$forestResponseCombiner, covariateList, call,
params$ntree, params$numberOfSplits, params$mtry, params$nodeSize, params$maxNodeDepth, params$splitPureNodes)
return(forest)
}
#' @export
load_forest_args_provided <- function(treeDirectory, nodeResponseCombiner, splitFinder, forestResponseCombiner,
# Internal function - if you really need to use it yourself (say to load forests
# saved directly through the Java interface into R), then look at the loadForest
# function to see how this function is used. I'm also open to writing a function
# that uses the Java version's settings yaml file to recreate the forest, but
# I'd appreciate knowing that someone's going to use it first (email me; see
# README).
loadForestArgumentsSpecified <- function(treeDirectory, nodeResponseCombiner, splitFinder, forestResponseCombiner,
covariateList.java, call, ntree, numberOfSplits, mtry, nodeSize, maxNodeDepth = 100000, splitPureNodes=TRUE){
params <- list(

View file

@ -1,18 +1,4 @@
#' convertRListToJava
#'
#' An internal function that converts an R list of rJava objects into a
#' java.util.List rJava object containing those objects. It's used internally,
#' and is only available because it's used in some examples that demonstrate what
#' other objects do.
#' @param lst The R list containing rJava objects
#' @export
#' @return An rJava List object to be used internally.
#' @keywords internal
#' @examples
#' x <- Numeric(1:5)
#' class(x)
#' x <- convertRListToJava(x)
#' class(x)
# Internal function
convertRListToJava <- function(lst){
javaList <- .jnew(.class_ArrayList, as.integer(length(lst)))
javaList <- .jcast(javaList, .class_List)

View file

@ -11,8 +11,9 @@
#' the dataset after the forest is trained.
#' @param parallel A logical indicating whether multiple cores should be
#' utilized when making the predictions. Available as an option because it's
#' been observed by this author that using Java's \code{parallelStream} can be
#' unstable on some systems. Default value is \code{TRUE}.
#' been observed that using Java's \code{parallelStream} can be unstable on
#' some systems. Default value is \code{TRUE}; only set to \code{FALSE} if you
#' get strange errors while predicting.
#' @param out.of.bag A logical indicating whether predictions should be based on
#' 'out of bag' trees; set only to \code{TRUE} if you're running predictions
#' on data that was used in the training. Default value is \code{FALSE}.
@ -26,7 +27,7 @@
#' y <- 1 + x1 + x2 + rnorm(1000)
#'
#' data <- data.frame(x1, x2, y)
#' forest <- train(y ~ x1 + x2, data, WeightedVarianceSplitFinder(), MeanResponseCombiner(), MeanResponseCombiner(), ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
#' forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
#'
#' # Fix x2 to be 0
#' newData <- data.frame(x1 = seq(from=-2, to=2, by=0.5), x2 = 0)
@ -46,8 +47,7 @@
#'
#' data <- data.frame(x1, x2)
#'
#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
#' LogRankSplitFinder(1:2), CompetingRiskResponseCombiner(1:2), CompetingRiskFunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data, ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
#' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
#' ypred <- predict(forest, newData)
predict.JRandomForest <- function(forest, newData=NULL, parallel=TRUE, out.of.bag=FALSE){

View file

@ -1,37 +0,0 @@
recover_forest_predictable <- function(tree_directory, settingsPath) {
settings.java <- load_settings(settingsPath)
nodeResponseCombiner.java <- .jcall(settings.java, makeResponse(.class_ResponseCombiner), "getResponseCombiner")
splitFinder.java <- .jcall(settings.java, makeResponse(.class_SplitFinder), "getSplitFinder")
forestResponseCombiner.java <- .jcall(settings.java, makeResponse(.class_ResponseCombiner), "getTreeCombiner")
covariateList <- .jcall(settings.java, makeResponse(.class_List), "getCovariates")
params <- readRDS(paste0(directory, "/parameters.rData"))
call <- readRDS(paste0(directory, "/call.rData"))
params$nodeResponseCombiner$javaObject <- nodeResponseCombiner.java
params$splitFinder$javaObject <- splitFinder.java
params$forestResponseCombiner$javaObject <- forestResponseCombiner.java
forest <- load_forest_args_provided(directory, params$nodeResponseCombiner, params$splitFinder, params$forestResponseCombiner, covariateList, params, call)
return(forest)
}
load_settings <- function(settingsPath) {
settingsFile <- .jnew(.class_File, settingsPath)
settings.java <- .jcall(.class_Settings, makeResponse(.class_Settings), "load", settingsFile)
return(settings.java)
}
#' @export
load_covariate_list_from_settings <- function(settingsPath){
settings.java = load_settings(settingsPath)
covariateList <- .jcall(settings.java, makeResponse(.class_List), "getCovariates")
return(covariateList)
}

View file

@ -3,20 +3,21 @@
#'
#' This split finder is used in regression random forests. When a split is made,
#' this finder computes the sample variance in each group (divided by n, not
#' n-1); it then minimizes the the sum of these variances, each of them weighted
#' by their sample size divided by the total sample size of that node.
#' n-1); it then minimizes the sum of these variances, each of them weighted by
#' their sample size divided by the total sample size of that node.
#'
#' @note There are other split finders that are used in regression random
#' forests that are not included in this package. This package is oriented
#' toward the competing risk side of survival analysis; the regression options
#' are provided as an example of how extensible the back-end Java package is.
#' If you are interested in using this package for regression (or other uses),
#' feel free to write your own components. It's really not hard to write these
#' components; the WeightedVarianceSplitFinder Java class is quite short; most
#' of the code is to reuse calculations from previous considered splits.
#' toward the competing risks side of survival analysis; the regression
#' options are provided as an example of how extensible the back-end Java
#' package is. If you are interested in using this package for regression (or
#' other uses), feel free to write your own components. It's not too hard to
#' write these components; the WeightedVarianceSplitFinder Java class is quite
#' short; most of the code is to reuse calculations from previous considered
#' splits. I (the author) am also willing to assist if you have any questions.
#' @export
#' @return A split finder object to be used in \code{\link{train}}; not
#' useful on its own.
#' @return A split finder object to be used in \code{\link{train}}; not useful
#' on its own.
#' @examples
#' splitFinder <- WeightedVarianceSplitFinder()
#' # You would then use it in train()
@ -41,16 +42,6 @@ WeightedVarianceSplitFinder <- function(){
#' \code{forestResponseCombiner} parameters in \code{\link{train}} when doing
#' regression.
#' @export
#' @return A response combiner object to be used in \code{\link{train}}; not
#' useful on its own. However, internally, a response combiner object is a
#' list consisting of the following objects:
#' \describe{
#' \item{\code{javaObject}}{The java object used in the algorithm}
#' \item{\code{call}}{The call (used in \code{print})}
#' \item{\code{outputClass}}{The R class of the outputs; used in \code{\link{predict.JRandomForest}}}
#' \item{\code{convertToRFunction}}{An R function that converts a Java prediction from the combiner into R output that is readable by a user.}
#' }
#'
#' @examples
#' responseCombiner <- MeanResponseCombiner()
#' # You would then use it in train()
@ -58,7 +49,7 @@ WeightedVarianceSplitFinder <- function(){
#' # However; I'll show an internal Java method to make it clear what it does
#' # Note that you should never have to do the following
#' x <- 1:3
#' x <- convertRListToJava(Numeric(x))
#' x <- largeRCRF:::convertRListToJava(Numeric(x))
#'
#' # will output a Java object containing 2
#' output <- rJava::.jcall(responseCombiner$javaObject, "Ljava/lang/Double;", "combine", x)

View file

@ -13,7 +13,7 @@
#' default.
#' @export
#' @seealso \code{\link{train}}, \code{\link{load_forest}}
#' @seealso \code{\link{train}}, \code{\link{loadForest}}
#' @examples
#' # Regression Example
#' x1 <- rnorm(1000)
@ -24,9 +24,9 @@
#' forest <- train(y ~ x1 + x2, data,
#' ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
#'
#' save_forest(forest, "trees")
#' new_forest <- load_forest("trees")
save_forest <- function(forest, directory, overwrite=FALSE){
#' saveForest(forest, "trees")
#' new_forest <- loadForest("trees")
saveForest <- function(forest, directory, overwrite=FALSE){
check_and_create_directory(directory, overwrite)
saveTrees(forest, directory)

View file

@ -1,4 +1,5 @@
# Internal function to calculate how many CPU cores are available.
getCores <- function(){
cores <- NA
if (requireNamespace("parallel", quietly = TRUE)){
@ -22,7 +23,10 @@ getCores <- function(){
#' response you plug in. \code{splitFinder} should work on the responses you are
#' providing; \code{nodeResponseCombiner} should combine these responses into
#' some intermediate product, and \code{forestResponseCombiner} combines these
#' intermediate products into the final output product.
#' intermediate products into the final output product. Note that
#' \code{nodeResponseCombiner} and \code{forestResponseCombiner} can be inferred
#' from the data (so feel free to not specify them), and \code{splitFinder} can
#' be inferred but you might want to change its default.
#'
#' @param responses An R list of the responses. See \code{\link{CR_Response}}
#' for an example function.
@ -34,7 +38,7 @@ getCores <- function(){
#' forest training algorithm. See \code{\link{Competing Risk Split Finders}}
#' or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
#' this function tries to pick one based on the response. For
#' \code{\link{CR_Response}} wihtout censor times, it will pick a
#' \code{\link{CR_Response}} without censor times, it will pick a
#' \code{\link{LogRankSplitFinder}}; while if censor times were provided it
#' will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
#' responses it picks a \code{\link{WeightedVarianceSplitFinder}}.
@ -63,23 +67,24 @@ getCores <- function(){
#' randomly chosen to be tried in the splitting process. This value must be at
#' least 1.
#' @param nodeSize The algorithm will not attempt to split a node that has
#' observations less than 2*\code{nodeSize}; this results in terminal nodes
#' having a size of roughly \code{nodeSize} (true sizes may be both smaller or
#' greater). This value must be at least 1.
#' observations less than 2*\code{nodeSize}; this guarantees that any two
#' sibling terminal nodes together have an average size of at least
#' \code{nodeSize}; note that it doesn't guarantee that every node is at least
#' as large as \code{nodeSize}.
#' @param maxNodeDepth This parameter is analogous to \code{nodeSize} in that it
#' helps keep trees shorter; by default maxNodeDepth is an extremely high
#' controls tree length; by default \code{maxNodeDepth} is an extremely high
#' number and tree depth is controlled by \code{nodeSize}.
#' @param splitPureNodes This parameter determines whether the algorithm will
#' split a pure node. If set to FALSE, then before every split it will check
#' that every response is the same, and if so, not split. If set to TRUE it
#' forgoes that check and just splits. Prediction accuracy won't change under
#' any sensible \code{nodeResponseCombiner} as all terminal nodes from a split
#' forgoes that check and splits it. Prediction accuracy won't change under
#' any sensible \code{nodeResponseCombiner}; as all terminal nodes from a split
#' pure node should give the same prediction, so this parameter only affects
#' performance. If your response is continuous you'll likely experience faster
#' train times by setting it to TRUE. Default value is TRUE.
#' @param savePath If set, this parameter will save each tree of the random
#' forest in this directory as the forest is trained. Use this parameter if
#' you need to save memory while training. See also \code{\link{load_forest}}
#' you need to save memory while training. See also \code{\link{loadForest}}
#' @param savePath.overwrite This parameter controls the behaviour for what
#' happens if \code{savePath} is pointing to an existing directory. If set to
#' \code{warn} (default) then \code{train} refuses to proceed. If set to
@ -93,12 +98,12 @@ getCores <- function(){
#' a crash.
#' @param cores This parameter specifies how many trees will be simultaneously
#' trained. By default the package attempts to detect how many cores you have
#' by using the \code{parallel} package, and using all of them. You may
#' by using the \code{parallel} package and using all of them. You may
#' specify a lower number if you wish. It is not recommended to specify a
#' number greater than the number of available cores as this will hurt
#' performance with no available benefit.
#' @param randomSeed This parameter specifies a random seed if reproducible,
#' deterministic forests are desired. The number o1
#' deterministic forests are desired.
#' @export
#' @return A \code{JRandomForest} object. You may call \code{predict} or
#' \code{print} on it.
@ -135,8 +140,8 @@ getCores <- function(){
#'
#' data <- data.frame(x1, x2)
#'
#' forest <- train(CompetingRiskResponses(delta, u) ~ x1 + x2, data,
#' LogRankSplitFinder(1:2), CompetingRiskResponseCombiner(1:2), CompetingRiskFunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
#' LogRankSplitFinder(1:2), CR_kResponseCombiner(1:2), CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
#' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
#' ypred <- predict(forest, newData)
train <- function(x, ...) UseMethod("train")
@ -280,20 +285,6 @@ train.default <- function(responses, covariateData, splitFinder = splitFinderDef
forestObject <- list(call=match.call(), params=params, javaObject=forest.java, covariateList=dataset$covariateList)
# TODO - remove redundant code if tests pass
#forestObject$params <- list(
# splitFinder=splitFinder,
# nodeResponseCombiner=nodeResponseCombiner,
# forestResponseCombiner=forestResponseCombiner,
# ntree=ntree,
# numberOfSplits=numberOfSplits,
# mtry=mtry,
# nodeSize=nodeSize,
# splitPureNodes=splitPureNodes,
# maxNodeDepth = maxNodeDepth,
# savePath=savePath
#)
class(forestObject) <- "JRandomForest"
return(forestObject)
@ -304,7 +295,9 @@ train.default <- function(responses, covariateData, splitFinder = splitFinderDef
#' @rdname train
#' @export
#' @param formula You may specify the response and covariates as a formula instead; make sure the response in the formula is still properly constructed; see \code{responses}
#' @param formula You may specify the response and covariates as a formula
#' instead; make sure the response in the formula is still properly
#' constructed; see \code{responses}
train.formula <- function(formula, covariateData, ...){
# Having an R copy of the data loaded at the same time can be wasteful; we

View file

@ -1,12 +0,0 @@
wrapFunction <- function(mf){
f <- function(x){
y <- vector(mode="numeric", length=length(x))
for(i in 1:length(x)){
y[i] <- .jcall(mf, "D", "evaluate", x[i])
}
return(y)
}
}

View file

@ -17,17 +17,6 @@ faster performance when predicting, however if the times are not exhaustive
then the resulting curves will not update at that point (they'll be flat).
If left blank, the package will default to using all of the time points.}
}
\value{
A response combiner object to be used in \code{\link{train}}; not
useful on its own. However, internally, a response combiner object is a
list consisting of the following objects: \describe{
\item{\code{javaObject}}{The java object used in the algorithm}
\item{\code{call}}{The call (used in \code{print})}
\item{\code{outputClass}}{The R class of the outputs; used in
\code{\link{predict.JRandomForest}}} \item{\code{convertToRFunction}}{An R
function that converts a Java prediction from the combiner into R output
that is readable by a user.} }
}
\description{
Creates a CompetingRiskFunctionCombiner rJava object, which is used
internally for constructing a forest. The forest uses it when creating

View file

@ -38,5 +38,5 @@ u <- pmin(T1, T2, C)
delta <- ifelse(u == T1, 1, ifelse(u == T2, 2, 0))
responses <- CR_Response(delta, u)
# Then use responses in train
# Then use responses in train or naiveConcordance
}

View file

@ -11,17 +11,6 @@ CR_ResponseCombiner(events)
functions should be processed. This should correspond to all of the
competing risk events that can occur, from 1 to the largest number.}
}
\value{
A response combiner object to be used in \code{\link{train}}; not
useful on its own. However, internally, a response combiner object is a
list consisting of the following objects: \describe{
\item{\code{javaObject}}{The java object used in the algorithm}
\item{\code{call}}{The call (used in \code{print})}
\item{\code{outputClass}}{The R class of the outputs; used in
\code{\link{predict.JRandomForest}}} \item{\code{convertToRFunction}}{An R
function that converts a Java prediction from the combiner into R output
that is readable by a user.} }
}
\description{
Creates a CompetingRiskResponseCombiner rJava object, which is used
internally for constructing a forest. It is used when each tree in the forest

View file

@ -6,17 +6,6 @@
\usage{
MeanResponseCombiner()
}
\value{
A response combiner object to be used in \code{\link{train}}; not
useful on its own. However, internally, a response combiner object is a
list consisting of the following objects:
\describe{
\item{\code{javaObject}}{The java object used in the algorithm}
\item{\code{call}}{The call (used in \code{print})}
\item{\code{outputClass}}{The R class of the outputs; used in \code{\link{predict.JRandomForest}}}
\item{\code{convertToRFunction}}{An R function that converts a Java prediction from the combiner into R output that is readable by a user.}
}
}
\description{
This response combiner is used in regression random forests, where the
response in the data is a single number that needs to be averaged in each
@ -32,7 +21,7 @@ responseCombiner <- MeanResponseCombiner()
# However; I'll show an internal Java method to make it clear what it does
# Note that you should never have to do the following
x <- 1:3
x <- convertRListToJava(Numeric(x))
x <- largeRCRF:::convertRListToJava(Numeric(x))
# will output a Java object containing 2
output <- rJava::.jcall(responseCombiner$javaObject, "Ljava/lang/Double;", "combine", x)

View file

@ -7,24 +7,25 @@
WeightedVarianceSplitFinder()
}
\value{
A split finder object to be used in \code{\link{train}}; not
useful on its own.
A split finder object to be used in \code{\link{train}}; not useful
on its own.
}
\description{
This split finder is used in regression random forests. When a split is made,
this finder computes the sample variance in each group (divided by n, not
n-1); it then minimizes the the sum of these variances, each of them weighted
by their sample size divided by the total sample size of that node.
n-1); it then minimizes the sum of these variances, each of them weighted by
their sample size divided by the total sample size of that node.
}
\note{
There are other split finders that are used in regression random
forests that are not included in this package. This package is oriented
toward the competing risk side of survival analysis; the regression options
are provided as an example of how extensible the back-end Java package is.
If you are interested in using this package for regression (or other uses),
feel free to write your own components. It's really not hard to write these
components; the WeightedVarianceSplitFinder Java class is quite short; most
of the code is to reuse calculations from previous considered splits.
toward the competing risks side of survival analysis; the regression
options are provided as an example of how extensible the back-end Java
package is. If you are interested in using this package for regression (or
other uses), feel free to write your own components. It's not too hard to
write these components; the WeightedVarianceSplitFinder Java class is quite
short; most of the code is to reuse calculations from previous considered
splits. I (the author) am also willing to assist if you have any questions.
}
\examples{
splitFinder <- WeightedVarianceSplitFinder()

View file

@ -1,27 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/misc.R
\name{convertRListToJava}
\alias{convertRListToJava}
\title{convertRListToJava}
\usage{
convertRListToJava(lst)
}
\arguments{
\item{lst}{The R list containing rJava objects}
}
\value{
An rJava List object to be used internally.
}
\description{
An internal function that converts an R list of rJava objects into a
java.util.List rJava object containing those objects. It's used internally,
and is only available because it's used in some examples that demonstrate what
other objects do.
}
\examples{
x <- Numeric(1:5)
class(x)
x <- convertRListToJava(x)
class(x)
}
\keyword{internal}

View file

@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/load_forest.R
\name{load_forest}
\alias{load_forest}
% Please edit documentation in R/loadForest.R
\name{loadForest}
\alias{loadForest}
\title{Load Random Forest}
\usage{
load_forest(directory)
loadForest(directory)
}
\arguments{
\item{forest}{The directory created that saved the previous forest.}
@ -13,7 +13,7 @@ load_forest(directory)
A JForest object; see \code{\link{train}} for details.
}
\description{
Loads a random forest that was saved using \code{\link{save_forest}}.
Loads a random forest that was saved using \code{\link{saveForest}}.
}
\examples{
# Regression Example
@ -25,9 +25,9 @@ data <- data.frame(x1, x2, y)
forest <- train(y ~ x1 + x2, data,
ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
save_forest(forest, "trees")
new_forest <- load_forest("trees")
saveForest(forest, "trees")
new_forest <- loadForest("trees")
}
\seealso{
\code{\link{train}}, \code{\link{save_forest}}
\code{\link{train}}, \code{\link{saveForest}}, \code{\link{loadForestArg}}
}

View file

@ -21,7 +21,23 @@ A vector of 1 minus the concordance scores, with each element
}
\description{
Used to calculate a concordance index error. The user needs to supply a list
of mortalities, with each item in the list being a vector for the specific
events. To calculate mortalities a user should look to
of mortalities, with each item in the list being a vector for the
corresponding event. To calculate mortalities a user should look to
\code{\link{extractMortalities}}.
}
\examples{
data <- data.frame(delta=c(1,1,0,0,2,2), T=1:6, x=1:6)
model <- train(CR_Response(delta, T) ~ x, data, ntree=100, numberOfSplits=0, mtry=1, nodeSize=1)
newData <- data.frame(delta=c(1,0,2,1,0,2), T=1:6, x=1:6)
predictions <- predict(model, newData)
mortalities <- list(
extractMortalities(predictions, 1, 6),
extractMortalities(predictions, 2, 6)
)
naiveConcordance(CR_Response(newData$delta, newData$T), mortalities)
}

View file

@ -17,8 +17,9 @@ the dataset after the forest is trained.}
\item{parallel}{A logical indicating whether multiple cores should be
utilized when making the predictions. Available as an option because it's
been observed by this author that using Java's \code{parallelStream} can be
unstable on some systems. Default value is \code{TRUE}.}
been observed that using Java's \code{parallelStream} can be unstable on
some systems. Default value is \code{TRUE}; only set to \code{FALSE} if you
get strange errors while predicting.}
\item{out.of.bag}{A logical indicating whether predictions should be based on
'out of bag' trees; set only to \code{TRUE} if you're running predictions
@ -38,7 +39,7 @@ x2 <- rnorm(1000)
y <- 1 + x1 + x2 + rnorm(1000)
data <- data.frame(x1, x2, y)
forest <- train(y ~ x1 + x2, data, WeightedVarianceSplitFinder(), MeanResponseCombiner(), MeanResponseCombiner(), ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
# Fix x2 to be 0
newData <- data.frame(x1 = seq(from=-2, to=2, by=0.5), x2 = 0)
@ -58,8 +59,7 @@ delta <- ifelse(u==T1, 1, ifelse(u==T2, 2, 0))
data <- data.frame(x1, x2)
forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
LogRankSplitFinder(1:2), CompetingRiskResponseCombiner(1:2), CompetingRiskFunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
forest <- train(CR_Response(delta, u) ~ x1 + x2, data, ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
ypred <- predict(forest, newData)
}

View file

@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/save_forest.R
\name{save_forest}
\alias{save_forest}
% Please edit documentation in R/saveForest.R
\name{saveForest}
\alias{saveForest}
\title{Save Random Forests}
\usage{
save_forest(forest, directory, overwrite = FALSE)
saveForest(forest, directory, overwrite = FALSE)
}
\arguments{
\item{forest}{The forest to save.}
@ -30,9 +30,9 @@ data <- data.frame(x1, x2, y)
forest <- train(y ~ x1 + x2, data,
ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
save_forest(forest, "trees")
new_forest <- load_forest("trees")
saveForest(forest, "trees")
new_forest <- loadForest("trees")
}
\seealso{
\code{\link{train}}, \code{\link{load_forest}}
\code{\link{train}}, \code{\link{loadForest}}
}

View file

@ -32,7 +32,7 @@ response as well).}
forest training algorithm. See \code{\link{Competing Risk Split Finders}}
or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
this function tries to pick one based on the response. For
\code{\link{CR_Response}} wihtout censor times, it will pick a
\code{\link{CR_Response}} without censor times, it will pick a
\code{\link{LogRankSplitFinder}}; while if censor times were provided it
will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
responses it picks a \code{\link{WeightedVarianceSplitFinder}}.}
@ -67,26 +67,27 @@ randomly chosen to be tried in the splitting process. This value must be at
least 1.}
\item{nodeSize}{The algorithm will not attempt to split a node that has
observations less than 2*\code{nodeSize}; this results in terminal nodes
having a size of roughly \code{nodeSize} (true sizes may be both smaller or
greater). This value must be at least 1.}
observations less than 2*\code{nodeSize}; this guarantees that any two
sibling terminal nodes together have an average size of at least
\code{nodeSize}; note that it doesn't guarantee that every node is at least
as large as \code{nodeSize}.}
\item{maxNodeDepth}{This parameter is analogous to \code{nodeSize} in that it
helps keep trees shorter; by default maxNodeDepth is an extremely high
controls tree length; by default \code{maxNodeDepth} is an extremely high
number and tree depth is controlled by \code{nodeSize}.}
\item{splitPureNodes}{This parameter determines whether the algorithm will
split a pure node. If set to FALSE, then before every split it will check
that every response is the same, and if so, not split. If set to TRUE it
forgoes that check and just splits. Prediction accuracy won't change under
any sensible \code{nodeResponseCombiner} as all terminal nodes from a split
forgoes that check and splits it. Prediction accuracy won't change under
any sensible \code{nodeResponseCombiner}; as all terminal nodes from a split
pure node should give the same prediction, so this parameter only affects
performance. If your response is continuous you'll likely experience faster
train times by setting it to TRUE. Default value is TRUE.}
\item{savePath}{If set, this parameter will save each tree of the random
forest in this directory as the forest is trained. Use this parameter if
you need to save memory while training. See also \code{\link{load_forest}}}
you need to save memory while training. See also \code{\link{loadForest}}}
\item{savePath.overwrite}{This parameter controls the behaviour for what
happens if \code{savePath} is pointing to an existing directory. If set to
@ -102,15 +103,17 @@ a crash.}
\item{cores}{This parameter specifies how many trees will be simultaneously
trained. By default the package attempts to detect how many cores you have
by using the \code{parallel} package, and using all of them. You may
by using the \code{parallel} package and using all of them. You may
specify a lower number if you wish. It is not recommended to specify a
number greater than the number of available cores as this will hurt
performance with no available benefit.}
\item{randomSeed}{This parameter specifies a random seed if reproducible,
deterministic forests are desired. The number o1}
deterministic forests are desired.}
\item{formula}{You may specify the response and covariates as a formula instead; make sure the response in the formula is still properly constructed; see \code{responses}}
\item{formula}{You may specify the response and covariates as a formula
instead; make sure the response in the formula is still properly
constructed; see \code{responses}}
}
\value{
A \code{JRandomForest} object. You may call \code{predict} or
@ -124,7 +127,10 @@ parameters. Make sure these are compatible with each other, and with the
response you plug in. \code{splitFinder} should work on the responses you are
providing; \code{nodeResponseCombiner} should combine these responses into
some intermediate product, and \code{forestResponseCombiner} combines these
intermediate products into the final output product.
intermediate products into the final output product. Note that
\code{nodeResponseCombiner} and \code{forestResponseCombiner} can be inferred
from the data (so feel free to not specify them), and \code{splitFinder} can
be inferred but you might want to change its default.
}
\note{
If saving memory is a concern, you can replace \code{covariateData}
@ -160,8 +166,8 @@ delta <- ifelse(u==T1, 1, ifelse(u==T2, 2, 0))
data <- data.frame(x1, x2)
forest <- train(CompetingRiskResponses(delta, u) ~ x1 + x2, data,
LogRankSplitFinder(1:2), CompetingRiskResponseCombiner(1:2), CompetingRiskFunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
LogRankSplitFinder(1:2), CR_kResponseCombiner(1:2), CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
ypred <- predict(forest, newData)
}

View file

@ -13,8 +13,8 @@ test_that("Can save & load regression example", {
ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
save_forest(forest, "trees_saving_loading")
new_forest <- load_forest("trees_saving_loading")
saveForest(forest, "trees_saving_loading")
new_forest <- loadForest("trees_saving_loading")
# try making a little prediction to verify it works
newData <- data.frame(x1=seq(from=-3, to=3, by=0.5), x2=0)

View file

@ -20,7 +20,7 @@ test_that("Can save a random forest while training, and use it afterward", {
predictions <- predict(forest, newData)
# Also make sure we can load the forest too
newforest <- load_forest("trees")
newforest <- loadForest("trees")
predictions <- predict(newforest, newData)