Version 1.0.1; cleaned up documentation.
Moving towards a CRAN release.
This commit is contained in:
parent
8f140beeb7
commit
7a759d9dea
1022 changed files with 1180 additions and 237 deletions
|
@ -1,2 +1,3 @@
|
|||
^.*\.Rproj$
|
||||
^\.Rproj\.user$
|
||||
copyJar
|
17
DESCRIPTION
17
DESCRIPTION
|
@ -1,12 +1,19 @@
|
|||
Package: largeRCRF
|
||||
Type: Package
|
||||
Title: Large Random Competing Risk Forests, Java Implementation Run in R
|
||||
Version: 1.0.0.0
|
||||
Authors@R: person("Joel", "Therrien", email = "joel@joeltherrien.ca", role = c("aut", "cre"))
|
||||
Title: Large Random Competing Risks Forests
|
||||
Version: 1.0.1
|
||||
Authors@R: c(
|
||||
person("Joel", "Therrien", email = "joel_therrien@sfu.ca", role = c("aut", "cre", "cph")),
|
||||
person("Jiguo", "Cao", email = "jiguo_cao@sfu.ca", role = c("aut", "dgs"))
|
||||
)
|
||||
Description: This package is used for training competing risk random forests on larger scale datasets.
|
||||
It currently only supports training models, running predictions, plotting those predictions (they are curves),
|
||||
It currently supports training models, running predictions, plotting those predictions (they are curves),
|
||||
and some simple error analysis using concordance measures.
|
||||
License: GPL-3
|
||||
Copyright: All provided source code is copyrighted and owned by Joel Therrien.
|
||||
There are two dependencies (partially provided) used in the Java code, both
|
||||
of which are licensed under the Apache 2.0 License. Please see the NOTICE
|
||||
file for more information.
|
||||
Encoding: UTF-8
|
||||
LazyData: true
|
||||
Imports:
|
||||
|
@ -14,6 +21,6 @@ Imports:
|
|||
Suggests:
|
||||
parallel,
|
||||
testthat
|
||||
Depends: R (>= 3.4.2)
|
||||
Depends: R (>= 3.4.0)
|
||||
SystemRequirements: Java JDK 1.8 or higher
|
||||
RoxygenNote: 6.1.1
|
||||
|
|
|
@ -15,8 +15,6 @@ S3method(print,CompetingRiskFunctions.List)
|
|||
S3method(print,JRandomForest)
|
||||
S3method(print,ResponseCombiner)
|
||||
S3method(print,SplitFinder)
|
||||
S3method(train,default)
|
||||
S3method(train,formula)
|
||||
export(CR_FunctionCombiner)
|
||||
export(CR_Response)
|
||||
export(CR_ResponseCombiner)
|
||||
|
|
44
NOTICE
Normal file
44
NOTICE
Normal file
|
@ -0,0 +1,44 @@
|
|||
# largeRCRF
|
||||
Copyright 2018-2019 Joel Therrien
|
||||
|
||||
largeRCRF is licensed under the GPL-3 license.
|
||||
|
||||
largeRCRF contains in object form some Java classes from the
|
||||
Apache Commons CSV project and the Jackson JSON processor, none of
|
||||
which were modified.
|
||||
|
||||
Their copyright notices are displayed below, and the license files
|
||||
they provided may be found in the licenses/ subfolder.
|
||||
|
||||
|
||||
|
||||
Apache Commons CSV
|
||||
Copyright 2005-2017 The Apache Software Foundation
|
||||
|
||||
This product includes software developed at
|
||||
The Apache Software Foundation (http://www.apache.org/).
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Jackson JSON processor
|
||||
|
||||
Jackson is a high-performance, Free/Open Source JSON processing library.
|
||||
It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has
|
||||
been in development since 2007.
|
||||
It is currently developed by a community of developers, as well as supported
|
||||
commercially by FasterXML.com.
|
||||
|
||||
## Licensing
|
||||
|
||||
Jackson core and extension components may be licensed under different licenses.
|
||||
To find the details that apply to this artifact see the accompanying LICENSE file.
|
||||
For more information, including possible other licensing options, contact
|
||||
FasterXML.com (http://fasterxml.com).
|
||||
|
||||
## Credits
|
||||
|
||||
A list of contributors may be found from CREDITS file, which is included
|
||||
in some artifacts (usually source distributions); but is always available
|
||||
from the source code management (SCM) system project uses.
|
|
@ -48,7 +48,7 @@ extractCIF <- function (x, event) {
|
|||
|
||||
#' @export
|
||||
extractCIF.CompetingRiskFunctions <- function(prediction, event){
|
||||
fun <- stepfun(prediction$time.interest, c(0, prediction$cif[,event]))
|
||||
fun <- stats::stepfun(prediction$time.interest, c(0, prediction$cif[,event]))
|
||||
|
||||
class(fun) <- "function"
|
||||
attr(fun, "call") <- sys.call()
|
||||
|
@ -70,7 +70,7 @@ extractCHF <- function (x, event) {
|
|||
|
||||
#' @export
|
||||
extractCHF.CompetingRiskFunctions <- function(prediction, event){
|
||||
fun <- stepfun(prediction$time.interest, c(0, prediction$chf[,event]))
|
||||
fun <- stats::stepfun(prediction$time.interest, c(0, prediction$chf[,event]))
|
||||
|
||||
class(fun) <- "function"
|
||||
attr(fun, "call") <- sys.call()
|
||||
|
@ -93,7 +93,7 @@ extractSurvivorCurve <- function (x) {
|
|||
|
||||
#' @export
|
||||
extractSurvivorCurve.CompetingRiskFunctions <- function(prediction){
|
||||
fun <- stepfun(prediction$time.interest, c(1, prediction$survivorCurve))
|
||||
fun <- stats::stepfun(prediction$time.interest, c(1, prediction$survivorCurve))
|
||||
|
||||
class(fun) <- "function"
|
||||
attr(fun, "call") <- sys.call()
|
||||
|
|
|
@ -4,10 +4,10 @@
|
|||
#'
|
||||
#' Loads a random forest that was saved using \code{\link{saveForest}}.
|
||||
#'
|
||||
#' @param forest The directory created that saved the previous forest.
|
||||
#' @param directory The directory in which the forest was previously saved.
|
||||
#' @return A JForest object; see \code{\link{train}} for details.
|
||||
#' @export
|
||||
#' @seealso \code{\link{train}}, \code{\link{saveForest}}, \code{\link{loadForestArg}}
|
||||
#' @seealso \code{\link{train}}, \code{\link{saveForest}}
|
||||
#' @examples
|
||||
#' # Regression Example
|
||||
#' x1 <- rnorm(1000)
|
||||
|
|
4
R/misc.R
4
R/misc.R
|
@ -78,10 +78,10 @@ plot.JMatrixPlottable <- function(mat, add=FALSE, type="s", xlab="Time", ylab=NU
|
|||
|
||||
}
|
||||
|
||||
plot(mat[,2] ~ mat[,1], col=col, type=type, xlab=xlab, ylab=ylab, ...)
|
||||
graphics::plot(mat[,2] ~ mat[,1], col=col, type=type, xlab=xlab, ylab=ylab, ...)
|
||||
}
|
||||
else{
|
||||
points(mat[,2] ~ mat[,1], col=col, type=type, xlab=xlab, ylab=ylab, ...)
|
||||
graphics::points(mat[,2] ~ mat[,1], col=col, type=type, xlab=xlab, ylab=ylab, ...)
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -28,7 +28,8 @@
|
|||
#' y <- 1 + x1 + x2 + rnorm(1000)
|
||||
#'
|
||||
#' data <- data.frame(x1, x2, y)
|
||||
#' forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
|
||||
#' forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits = 5,
|
||||
#' mtry = 1, nodeSize = 5)
|
||||
#'
|
||||
#' # Fix x2 to be 0
|
||||
#' newData <- data.frame(x1 = seq(from=-2, to=2, by=0.5), x2 = 0)
|
||||
|
@ -48,7 +49,8 @@
|
|||
#'
|
||||
#' data <- data.frame(x1, x2)
|
||||
#'
|
||||
#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data, ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
|
||||
#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data, ntree=100,
|
||||
#' numberOfSplits=5, mtry=1, nodeSize=10)
|
||||
#' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
|
||||
#' ypred <- predict(forest, newData)
|
||||
predict.JRandomForest <- function(forest, newData=NULL, parallel=TRUE, out.of.bag=NULL){
|
||||
|
|
|
@ -46,15 +46,6 @@ WeightedVarianceSplitFinder <- function(){
|
|||
#' responseCombiner <- MeanResponseCombiner()
|
||||
#' # You would then use it in train()
|
||||
#'
|
||||
#' # However; I'll show an internal Java method to make it clear what it does
|
||||
#' # Note that you should never have to do the following
|
||||
#' x <- 1:3
|
||||
#' x <- largeRCRF:::convertRListToJava(Numeric(x))
|
||||
#'
|
||||
#' # will output a Java object containing 2
|
||||
#' output <- rJava::.jcall(responseCombiner$javaObject, "Ljava/lang/Double;", "combine", x)
|
||||
#' responseCombiner$convertToRFunction(output)
|
||||
#'
|
||||
MeanResponseCombiner <- function(){
|
||||
javaObject <- .jnew(.class_MeanResponseCombiner)
|
||||
javaObject <- .jcast(javaObject, .class_ResponseCombiner)
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#' Save Random Forests
|
||||
#'
|
||||
#' Saves a random forest for later use, given that the base R
|
||||
#' \code{\link{base::save}} function doesn't work for this package.
|
||||
#' \code{\link[base]{save}} function doesn't work for this package.
|
||||
#'
|
||||
#' @param forest The forest to save.
|
||||
#' @param directory The directory that should be created to save the trees in.
|
||||
|
@ -24,8 +24,8 @@
|
|||
#' forest <- train(y ~ x1 + x2, data,
|
||||
#' ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
|
||||
#'
|
||||
#' saveForest(forest, "trees")
|
||||
#' new_forest <- loadForest("trees")
|
||||
#' saveForest(forest, "saved_forest")
|
||||
#' new_forest <- loadForest("saved_forest")
|
||||
saveForest <- function(forest, directory, overwrite=FALSE){
|
||||
check_and_create_directory(directory, overwrite)
|
||||
|
||||
|
|
332
R/train.R
332
R/train.R
|
@ -14,148 +14,11 @@ getCores <- function(){
|
|||
return(cores)
|
||||
}
|
||||
|
||||
#' Train Random Forests
|
||||
#'
|
||||
#' Trains the random forest. The type of response the random forest can be
|
||||
#' trained on varies depending on the \code{splitFinder},
|
||||
#' \code{nodeResponseCombiner}, and the \code{forestResponseCombiner}
|
||||
#' parameters. Make sure these are compatible with each other, and with the
|
||||
#' response you plug in. \code{splitFinder} should work on the responses you are
|
||||
#' providing; \code{nodeResponseCombiner} should combine these responses into
|
||||
#' some intermediate product, and \code{forestResponseCombiner} combines these
|
||||
#' intermediate products into the final output product. Note that
|
||||
#' \code{nodeResponseCombiner} and \code{forestResponseCombiner} can be inferred
|
||||
#' from the data (so feel free to not specify them), and \code{splitFinder} can
|
||||
#' be inferred but you might want to change its default.
|
||||
#'
|
||||
#' @param responses An R list of the responses. See \code{\link{CR_Response}}
|
||||
#' for an example function.
|
||||
#' @param data A data.frame containing the columns of the predictors and
|
||||
#' responses; not relevant if you're not using the formula version of
|
||||
#' \code{train}.
|
||||
#' @param covariateData A data.frame containing only the columns of the
|
||||
#' covariates you wish to use in your training (not relevant if you're using
|
||||
#' the formula version of \code{train}).
|
||||
#' @param splitFinder A split finder that's used to score splits in the random
|
||||
#' forest training algorithm. See \code{\link{Competing Risk Split Finders}}
|
||||
#' or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
|
||||
#' this function tries to pick one based on the response. For
|
||||
#' \code{\link{CR_Response}} without censor times, it will pick a
|
||||
#' \code{\link{LogRankSplitFinder}}; while if censor times were provided it
|
||||
#' will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
|
||||
#' responses it picks a \code{\link{WeightedVarianceSplitFinder}}.
|
||||
#' @param nodeResponseCombiner A response combiner that's used to combine
|
||||
#' responses for each terminal node in a tree (regression example; average the
|
||||
#' observations in each tree into a single number). See
|
||||
#' \code{\link{CompetingRiskResponseCombiner}} or
|
||||
#' \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
|
||||
#' tries to pick one based on the response. For \code{\link{CR_Response}} it
|
||||
#' picks a \code{\link{CompetingRiskResponseCombiner}}; for integer or numeric
|
||||
#' responses it picks a \code{\link{MeanResponseCombiner}}.
|
||||
#' @param forestResponseCombiner A response combiner that's used to combine
|
||||
#' predictions across trees into one final result (regression example; average
|
||||
#' the prediction of each tree into a single number). See
|
||||
#' \code{\link{CompetingRiskFunctionCombiner}} or
|
||||
#' \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
|
||||
#' tries to pick one based on the response. For \code{\link{CR_Response}} it
|
||||
#' picks a \code{\link{CompetingRiskFunctionCombiner}}; for integer or numeric
|
||||
#' responses it picks a \code{\link{MeanResponseCombiner}}.
|
||||
#' @param ntree An integer that specifies how many trees should be trained.
|
||||
#' @param numberOfSplits A tuning parameter specifying how many random splits
|
||||
#' should be tried for a covariate; a value of 0 means all splits will be
|
||||
#' tried (with an exception for factors, which might have too many splits to
|
||||
#' feasibly compute).
|
||||
#' @param mtry A tuning parameter specifying how many covariates will be
|
||||
#' randomly chosen to be tried in the splitting process. This value must be at
|
||||
#' least 1.
|
||||
#' @param nodeSize The algorithm will not attempt to split a node that has
|
||||
#' observations less than 2*\code{nodeSize}; this guarantees that any two
|
||||
#' sibling terminal nodes together have an average size of at least
|
||||
#' \code{nodeSize}; note that it doesn't guarantee that every node is at least
|
||||
#' as large as \code{nodeSize}.
|
||||
#' @param maxNodeDepth This parameter is analogous to \code{nodeSize} in that it
|
||||
#' controls tree length; by default \code{maxNodeDepth} is an extremely high
|
||||
#' number and tree depth is controlled by \code{nodeSize}.
|
||||
#' @param splitPureNodes This parameter determines whether the algorithm will
|
||||
#' split a pure node. If set to FALSE, then before every split it will check
|
||||
#' that every response is the same, and if so, not split. If set to TRUE it
|
||||
#' forgoes that check and splits it. Prediction accuracy won't change under
|
||||
#' any sensible \code{nodeResponseCombiner}; as all terminal nodes from a
|
||||
#' split pure node should give the same prediction, so this parameter only
|
||||
#' affects performance. If your response is continuous you'll likely
|
||||
#' experience faster train times by setting it to TRUE. Default value is TRUE.
|
||||
#' @param savePath If set, this parameter will save each tree of the random
|
||||
#' forest in this directory as the forest is trained. Use this parameter if
|
||||
#' you need to save memory while training. See also \code{\link{loadForest}}
|
||||
#' @param savePath.overwrite This parameter controls the behaviour for what
|
||||
#' happens if \code{savePath} is pointing to an existing directory. If set to
|
||||
#' \code{warn} (default) then \code{train} refuses to proceed. If set to
|
||||
#' \code{delete} then all the contents in that folder are deleted for the new
|
||||
#' forest to be trained. Note that all contents are deleted, even those files
|
||||
#' not related to \code{largeRCRF}. Use only if you're sure it's safe. If set
|
||||
#' to \code{merge}, then the files describing the forest (such as its
|
||||
#' parameters) are overwritten but the saved trees are not. The algorithm
|
||||
#' assumes (without checking) that the existing trees are from a previous run
|
||||
#' and starts from where it left off. This option is useful if recovering from
|
||||
#' a crash.
|
||||
#' @param cores This parameter specifies how many trees will be simultaneously
|
||||
#' trained. By default the package attempts to detect how many cores you have
|
||||
#' by using the \code{parallel} package and using all of them. You may specify
|
||||
#' a lower number if you wish. It is not recommended to specify a number
|
||||
#' greater than the number of available cores as this will hurt performance
|
||||
#' with no available benefit.
|
||||
#' @param randomSeed This parameter specifies a random seed if reproducible,
|
||||
#' deterministic forests are desired.
|
||||
#' @param displayProgress A logical indicating whether the progress should be
|
||||
#' displayed to console; default is \code{TRUE}. Useful to set to FALSE in
|
||||
#' some automated situations.
|
||||
#' @export
|
||||
#' @return A \code{JRandomForest} object. You may call \code{predict} or
|
||||
#' \code{print} on it.
|
||||
#' @seealso \code{\link{predict.JRandomForest}}
|
||||
#' @note If saving memory is a concern, you can replace \code{covariateData} or
|
||||
#' \code{data} with an environment containing one element called \code{data}
|
||||
#' as the actual dataset. After the data has been imported into Java, but
|
||||
#' before the forest training begins, the dataset in the environment is
|
||||
#' deleted, freeing up memory in R.
|
||||
#' @examples
|
||||
#' # Regression Example
|
||||
#' x1 <- rnorm(1000)
|
||||
#' x2 <- rnorm(1000)
|
||||
#' y <- 1 + x1 + x2 + rnorm(1000)
|
||||
#'
|
||||
#' data <- data.frame(x1, x2, y)
|
||||
#' forest <- train(y ~ x1 + x2, data, WeightedVarianceSplitFinder(), MeanResponseCombiner(), MeanResponseCombiner(), ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
|
||||
#'
|
||||
#' # Fix x2 to be 0
|
||||
#' newData <- data.frame(x1 = seq(from=-2, to=2, by=0.5), x2 = 0)
|
||||
#' ypred <- predict(forest, newData)
|
||||
#'
|
||||
#' plot(ypred ~ newData$x1, type="l")
|
||||
#'
|
||||
#' # Competing Risk Example
|
||||
#' x1 <- abs(rnorm(1000))
|
||||
#' x2 <- abs(rnorm(1000))
|
||||
#'
|
||||
#' T1 <- rexp(1000, rate=x1)
|
||||
#' T2 <- rweibull(1000, shape=x1, scale=x2)
|
||||
#' C <- rexp(1000)
|
||||
#' u <- pmin(T1, T2, C)
|
||||
#' delta <- ifelse(u==T1, 1, ifelse(u==T2, 2, 0))
|
||||
#'
|
||||
#' data <- data.frame(x1, x2)
|
||||
#'
|
||||
#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
|
||||
#' LogRankSplitFinder(1:2), CR_kResponseCombiner(1:2), CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
|
||||
#' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
|
||||
#' ypred <- predict(forest, newData)
|
||||
train <- function(x, ...) UseMethod("train")
|
||||
|
||||
|
||||
|
||||
#' @rdname train
|
||||
#' @export
|
||||
train.default <- function(responses, covariateData, splitFinder = splitFinderDefault(responses), nodeResponseCombiner = nodeResponseCombinerDefault(responses), forestResponseCombiner = forestResponseCombinerDefault(responses), ntree, numberOfSplits, mtry, nodeSize, maxNodeDepth = 100000, splitPureNodes=TRUE, savePath=NULL, savePath.overwrite=c("warn", "delete", "merge"), cores = getCores(), randomSeed = NULL, displayProgress = TRUE){
|
||||
train.internal <- function(responses, covariateData, splitFinder,
|
||||
nodeResponseCombiner, forestResponseCombiner, ntree,
|
||||
numberOfSplits, mtry, nodeSize, maxNodeDepth,
|
||||
splitPureNodes, savePath, savePath.overwrite,
|
||||
cores, randomSeed, displayProgress){
|
||||
|
||||
# Some quick checks on parameters
|
||||
ntree <- as.integer(ntree)
|
||||
|
@ -187,6 +50,19 @@ train.default <- function(responses, covariateData, splitFinder = splitFinderDef
|
|||
if(is.null(savePath.overwrite) | length(savePath.overwrite)==0 | !(savePath.overwrite[1] %in% c("warn", "delete", "merge"))){
|
||||
stop("savePath.overwrite must be one of c(\"warn\", \"delete\", \"merge\")")
|
||||
}
|
||||
|
||||
if(is.null(splitFinder)){
|
||||
splitFinder <- splitFinderDefault(responses)
|
||||
}
|
||||
|
||||
if(is.null(nodeResponseCombiner)){
|
||||
nodeResponseCombiner <- nodeResponseCombinerDefault(responses)
|
||||
}
|
||||
|
||||
if(is.null(forestResponseCombiner)){
|
||||
forestResponseCombiner <- forestResponseCombinerDefault(responses)
|
||||
}
|
||||
|
||||
|
||||
|
||||
if(class(nodeResponseCombiner) != "ResponseCombiner"){
|
||||
|
@ -287,9 +163,8 @@ train.default <- function(responses, covariateData, splitFinder = splitFinderDef
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
forestObject <- list(call=match.call(), params=params, javaObject=forest.java, covariateList=dataset$covariateList, dataset=dataset$dataset)
|
||||
forestObject <- list(params=params, javaObject=forest.java,
|
||||
covariateList=dataset$covariateList, dataset=dataset$dataset)
|
||||
|
||||
class(forestObject) <- "JRandomForest"
|
||||
return(forestObject)
|
||||
|
@ -297,14 +172,147 @@ train.default <- function(responses, covariateData, splitFinder = splitFinderDef
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#' @rdname train
|
||||
#' @export
|
||||
#' Train Random Forests
|
||||
#'
|
||||
#' Trains the random forest. The type of response the random forest can be
|
||||
#' trained on varies depending on the \code{splitFinder},
|
||||
#' \code{nodeResponseCombiner}, and the \code{forestResponseCombiner}
|
||||
#' parameters. Make sure these are compatible with each other, and with the
|
||||
#' response you plug in. \code{splitFinder} should work on the responses you are
|
||||
#' providing; \code{nodeResponseCombiner} should combine these responses into
|
||||
#' some intermediate product, and \code{forestResponseCombiner} combines these
|
||||
#' intermediate products into the final output product. Note that
|
||||
#' \code{nodeResponseCombiner} and \code{forestResponseCombiner} can be inferred
|
||||
#' from the data (so feel free to not specify them), and \code{splitFinder} can
|
||||
#' be inferred but you might want to change its default.
|
||||
#'
|
||||
#' @param formula A formula specifying the response and the covariates; make
|
||||
#' sure the response in the formula is properly constructed (see
|
||||
#' \code{\link{CR_Response}} for an example).
|
||||
train.formula <- function(formula, data, ...){
|
||||
#' @param data A data.frame containing the columns of the predictors and
|
||||
#' responses.
|
||||
#' @param splitFinder A split finder that's used to score splits in the random
|
||||
#' forest training algorithm. See \code{\link{CompetingRiskSplitFinders}}
|
||||
#' or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
|
||||
#' this function tries to pick one based on the response. For
|
||||
#' \code{\link{CR_Response}} without censor times, it will pick a
|
||||
#' \code{\link{LogRankSplitFinder}}; while if censor times were provided it
|
||||
#' will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
|
||||
#' responses it picks a \code{\link{WeightedVarianceSplitFinder}}.
|
||||
#' @param nodeResponseCombiner A response combiner that's used to combine
|
||||
#' responses for each terminal node in a tree (regression example; average the
|
||||
#' observations in each tree into a single number). See
|
||||
#' \code{\link{CR_ResponseCombiner}} or
|
||||
#' \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
|
||||
#' tries to pick one based on the response. For \code{\link{CR_Response}} it
|
||||
#' picks a \code{\link{CR_ResponseCombiner}}; for integer or numeric
|
||||
#' responses it picks a \code{\link{MeanResponseCombiner}}.
|
||||
#' @param forestResponseCombiner A response combiner that's used to combine
|
||||
#' predictions across trees into one final result (regression example; average
|
||||
#' the prediction of each tree into a single number). See
|
||||
#' \code{\link{CR_FunctionCombiner}} or
|
||||
#' \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
|
||||
#' tries to pick one based on the response. For \code{\link{CR_Response}} it
|
||||
#' picks a \code{\link{CR_FunctionCombiner}}; for integer or numeric
|
||||
#' responses it picks a \code{\link{MeanResponseCombiner}}.
|
||||
#' @param ntree An integer that specifies how many trees should be trained.
|
||||
#' @param numberOfSplits A tuning parameter specifying how many random splits
|
||||
#' should be tried for a covariate; a value of 0 means all splits will be
|
||||
#' tried (with an exception for factors, which might have too many splits to
|
||||
#' feasibly compute).
|
||||
#' @param mtry A tuning parameter specifying how many covariates will be
|
||||
#' randomly chosen to be tried in the splitting process. This value must be at
|
||||
#' least 1.
|
||||
#' @param nodeSize The algorithm will not attempt to split a node that has
|
||||
#' fewer than 2*\code{nodeSize} observations; this guarantees that any two
|
||||
#' sibling terminal nodes together have an average size of at least
|
||||
#' \code{nodeSize}; note that it doesn't guarantee that every node is at least
|
||||
#' as large as \code{nodeSize}.
|
||||
#' @param maxNodeDepth This parameter is analogous to \code{nodeSize} in that it
|
||||
#' controls tree length; by default \code{maxNodeDepth} is an extremely high
|
||||
#' number and tree depth is controlled by \code{nodeSize}.
|
||||
#' @param splitPureNodes This parameter determines whether the algorithm will
|
||||
#' split a pure node. If set to FALSE, then before every split it will check
|
||||
#' that every response is the same, and if so, not split. If set to TRUE it
|
||||
#' forgoes that check and splits it. Prediction accuracy won't change under
|
||||
#' any sensible \code{nodeResponseCombiner}; as all terminal nodes from a
|
||||
#' split pure node should give the same prediction, so this parameter only
|
||||
#' affects performance. If your response is continuous you'll likely
|
||||
#' experience faster train times by setting it to TRUE. Default value is TRUE.
|
||||
#' @param savePath If set, this parameter will save each tree of the random
|
||||
#' forest in this directory as the forest is trained. Use this parameter if
|
||||
#' you need to save memory while training. See also \code{\link{loadForest}}.
|
||||
#' @param savePath.overwrite This parameter controls the behaviour for what
|
||||
#' happens if \code{savePath} is pointing to an existing directory. If set to
|
||||
#' \code{warn} (default) then \code{train} refuses to proceed. If set to
|
||||
#' \code{delete} then all the contents in that folder are deleted for the new
|
||||
#' forest to be trained. Note that all contents are deleted, even those files
|
||||
#' not related to \code{largeRCRF}. Use only if you're sure it's safe. If set
|
||||
#' to \code{merge}, then the files describing the forest (such as its
|
||||
#' parameters) are overwritten but the saved trees are not. The algorithm
|
||||
#' assumes (without checking) that the existing trees are from a previous run
|
||||
#' and starts from where it left off. This option is useful if recovering from
|
||||
#' a crash.
|
||||
#' @param cores This parameter specifies how many trees will be simultaneously
|
||||
#' trained. By default the package attempts to detect how many cores you have
|
||||
#' by using the \code{parallel} package and using all of them. You may specify
|
||||
#' a lower number if you wish. It is not recommended to specify a number
|
||||
#' greater than the number of available cores as this will hurt performance
|
||||
#' with no available benefit.
|
||||
#' @param randomSeed This parameter specifies a random seed if reproducible,
|
||||
#' deterministic forests are desired.
|
||||
#' @param displayProgress A logical indicating whether the progress should be
|
||||
#' displayed to console; default is \code{TRUE}. Useful to set to FALSE in
|
||||
#' some automated situations.
|
||||
#' @export
|
||||
#' @return A \code{JRandomForest} object. You may call \code{predict} or
|
||||
#' \code{print} on it.
|
||||
#' @seealso \code{\link{predict.JRandomForest}}
|
||||
#' @note If saving memory is a concern, you can replace \code{data} with an
|
||||
#' environment containing one element called \code{data}
|
||||
#' as the actual dataset. After the data has been imported into Java, but
|
||||
#' before the forest training begins, the dataset in the environment is
|
||||
#' deleted, freeing up memory in R.
|
||||
#' @examples
|
||||
#' # Regression Example
|
||||
#' x1 <- rnorm(1000)
|
||||
#' x2 <- rnorm(1000)
|
||||
#' y <- 1 + x1 + x2 + rnorm(1000)
|
||||
#'
|
||||
#' data <- data.frame(x1, x2, y)
|
||||
#' forest <- train(y ~ x1 + x2, data, WeightedVarianceSplitFinder(),
|
||||
#' MeanResponseCombiner(), MeanResponseCombiner(), ntree=100,
|
||||
#' numberOfSplits = 5, mtry = 1, nodeSize = 5)
|
||||
#'
|
||||
#' # Fix x2 to be 0
|
||||
#' newData <- data.frame(x1 = seq(from=-2, to=2, by=0.5), x2 = 0)
|
||||
#' ypred <- predict(forest, newData)
|
||||
#'
|
||||
#' plot(ypred ~ newData$x1, type="l")
|
||||
#'
|
||||
#' # Competing Risk Example
|
||||
#' x1 <- abs(rnorm(1000))
|
||||
#' x2 <- abs(rnorm(1000))
|
||||
#'
|
||||
#' T1 <- rexp(1000, rate=x1)
|
||||
#' T2 <- rweibull(1000, shape=x1, scale=x2)
|
||||
#' C <- rexp(1000)
|
||||
#' u <- pmin(T1, T2, C)
|
||||
#' delta <- ifelse(u==T1, 1, ifelse(u==T2, 2, 0))
|
||||
#'
|
||||
#' data <- data.frame(x1, x2)
|
||||
#'
|
||||
#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
|
||||
#' LogRankSplitFinder(1:2), CR_ResponseCombiner(1:2),
|
||||
#' CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5,
|
||||
#' mtry=1, nodeSize=10)
|
||||
#' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
|
||||
#' ypred <- predict(forest, newData)
|
||||
train <- function(formula, data, splitFinder = NULL, nodeResponseCombiner = NULL,
|
||||
forestResponseCombiner = NULL, ntree, numberOfSplits, mtry,
|
||||
nodeSize, maxNodeDepth = 100000, splitPureNodes=TRUE, savePath=NULL,
|
||||
savePath.overwrite=c("warn", "delete", "merge"), cores = getCores(),
|
||||
randomSeed = NULL, displayProgress = TRUE){
|
||||
|
||||
# Having an R copy of the data loaded at the same time can be wasteful; we
|
||||
# also allow users to provide an environment of the data which gets removed
|
||||
|
@ -343,10 +351,10 @@ train.formula <- function(formula, data, ...){
|
|||
}
|
||||
|
||||
# Includes responses which we may need to later cut out
|
||||
mf <- model.frame(formula=formula, data=data, na.action=na.pass)
|
||||
mf <- stats::model.frame(formula=formula, data=data, na.action=stats::na.pass)
|
||||
|
||||
if(is.null(responses)){
|
||||
responses <- model.response(mf)
|
||||
responses <- stats::model.response(mf)
|
||||
}
|
||||
|
||||
# remove any response variables
|
||||
|
@ -356,13 +364,25 @@ train.formula <- function(formula, data, ...){
|
|||
if(!is.null(env)){
|
||||
env$data <- mf
|
||||
rm(data)
|
||||
forest <- train.default(responses, env, ...)
|
||||
forest <- train.internal(responses, env, splitFinder = splitFinder,
|
||||
nodeResponseCombiner = nodeResponseCombiner,
|
||||
forestResponseCombiner = forestResponseCombiner,
|
||||
ntree = ntree, numberOfSplits = numberOfSplits,
|
||||
mtry = mtry, nodeSize = nodeSize, maxNodeDepth = maxNodeDepth,
|
||||
splitPureNodes = splitPureNodes, savePath = savePath,
|
||||
savePath.overwrite = savePath.overwrite, cores = cores,
|
||||
randomSeed = randomSeed, displayProgress = displayProgress)
|
||||
} else{
|
||||
forest <- train.default(responses, mf, ...)
|
||||
forest <- train.internal(responses, mf, splitFinder = splitFinder,
|
||||
nodeResponseCombiner = nodeResponseCombiner,
|
||||
forestResponseCombiner = forestResponseCombiner,
|
||||
ntree = ntree, numberOfSplits = numberOfSplits,
|
||||
mtry = mtry, nodeSize = nodeSize, maxNodeDepth = maxNodeDepth,
|
||||
splitPureNodes = splitPureNodes, savePath = savePath,
|
||||
savePath.overwrite = savePath.overwrite, cores = cores,
|
||||
randomSeed = randomSeed, displayProgress = displayProgress)
|
||||
}
|
||||
|
||||
|
||||
|
||||
forest$call <- match.call()
|
||||
forest$formula <- formula
|
||||
|
||||
|
|
9
R/zzz.R
9
R/zzz.R
|
@ -1,6 +1,11 @@
|
|||
.onLoad <- function(libname, pkgname) {
|
||||
# rJava needs to be initialized with the path to the class files
|
||||
.jpackage(pkgname, lib.loc=libname, morePaths = "inst/java/")
|
||||
# rJava expects a folder called 'java' to be present in the root of the
|
||||
# package directory; but this isn't true when devtools::test() is run (instead
|
||||
# it's in the inst folder which won't be present if the package was actually
|
||||
# loaded from the R library). Thus with morePaths we make sure it loads up the
|
||||
# jar in this situation.
|
||||
.jpackage(pkgname, lib.loc=libname, morePaths="inst/java/largeRCRF-1.0-SNAPSHOT.jar")
|
||||
|
||||
}
|
||||
|
||||
#' @import rJava
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue