% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/train.R
\name{train}
\alias{train}
\alias{train.default}
\alias{train.formula}
\title{Train Random Forests}
\usage{
train(x, ...)

\method{train}{default}(responses, covariateData,
  splitFinder = splitFinderDefault(responses),
  nodeResponseCombiner = nodeResponseCombinerDefault(responses),
  forestResponseCombiner = forestResponseCombinerDefault(responses),
  ntree, numberOfSplits, mtry, nodeSize, maxNodeDepth = 1e+05,
  splitPureNodes = TRUE, savePath = NULL,
  savePath.overwrite = c("warn", "delete", "merge"),
  cores = getCores(), randomSeed = NULL)

\method{train}{formula}(formula, covariateData, ...)
}
\arguments{
\item{responses}{An R list of the responses. See \code{\link{CR_Response}}
for an example function.}

\item{covariateData}{A data.frame containing only the columns of the
covariates you wish to use in your training (unless you're using the
\code{formula} version of \code{train}, in which case it should contain the
response as well).}

\item{splitFinder}{A split finder that's used to score splits in the random
forest training algorithm. See \code{\link{Competing Risk Split Finders}}
or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
this function tries to pick one based on the response. For
\code{\link{CR_Response}} without censor times, it will pick a
\code{\link{LogRankSplitFinder}}; while if censor times were provided it
will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
responses it picks a \code{\link{WeightedVarianceSplitFinder}}.}

\item{nodeResponseCombiner}{A response combiner that's used to combine
responses for each terminal node in a tree (regression example; average the
observations in each tree into a single number). See
\code{\link{CompetingRiskResponseCombiner}} or
\code{\link{MeanResponseCombiner}}. If you don't specify one, this function
tries to pick one based on the response. For \code{\link{CR_Response}} it
picks a \code{\link{CompetingRiskResponseCombiner}}; for integer or numeric
responses it picks a \code{\link{MeanResponseCombiner}}.}

\item{forestResponseCombiner}{A response combiner that's used to combine
predictions across trees into one final result (regression example; average
the prediction of each tree into a single number). See
\code{\link{CompetingRiskFunctionCombiner}} or
\code{\link{MeanResponseCombiner}}. If you don't specify one, this function
tries to pick one based on the response. For \code{\link{CR_Response}} it
picks a \code{\link{CompetingRiskFunctionCombiner}}; for integer or numeric
responses it picks a \code{\link{MeanResponseCombiner}}.}

\item{ntree}{An integer that specifies how many trees should be trained.}

\item{numberOfSplits}{A tuning parameter specifying how many random splits
should be tried for a covariate; a value of 0 means all splits will be
tried (with an exception for factors, who might have too many splits to
feasibly compute).}

\item{mtry}{A tuning parameter specifying how many covariates will be
randomly chosen to be tried in the splitting process. This value must be at
least 1.}

\item{nodeSize}{The algorithm will not attempt to split a node that has
observations less than 2*\code{nodeSize}; this results in terminal nodes
having a size of roughly \code{nodeSize} (true sizes may be both smaller or
greater). This value must be at least 1.}

\item{maxNodeDepth}{This parameter is analogous to \code{nodeSize} in that it
helps keep trees shorter; by default maxNodeDepth is an extremely high
number and tree depth is controlled by \code{nodeSize}.}

\item{splitPureNodes}{This parameter determines whether the algorithm will
split a pure node. If set to FALSE, then before every split it will check
that every response is the same, and if so, not split. If set to TRUE it
forgoes that check and just splits. Prediction accuracy won't change under
any sensible \code{nodeResponseCombiner} as all terminal nodes from a split
pure node should give the same prediction, so this parameter only affects
performance. If your response is continuous you'll likely experience faster
train times by setting it to TRUE. Default value is TRUE.}

\item{savePath}{If set, this parameter will save each tree of the random
forest in this directory as the forest is trained. Use this parameter if
you need to save memory while training. See also \code{\link{load_forest}}.}

\item{savePath.overwrite}{This parameter controls the behaviour for what
happens if \code{savePath} is pointing to an existing directory. If set to
\code{warn} (default) then \code{train} refuses to proceed. If set to
\code{delete} then all the contents in that folder are deleted for the new
forest to be trained. Note that all contents are deleted, even those files
not related to \code{largeRCRF}. Use only if you're sure it's safe. If set
to \code{merge}, then the files describing the forest (such as its
parameters) are overwritten but the saved trees are not. The algorithm
assumes (without checking) that the existing trees are from a previous run
and starts from where it left off. This option is useful if recovering from
a crash.}

\item{cores}{This parameter specifies how many trees will be simultaneously
trained. By default the package attempts to detect how many cores you have
by using the \code{parallel} package, and using all of them. You may
specify a lower number if you wish. It is not recommended to specify a
number greater than the number of available cores as this will hurt
performance with no available benefit.}

\item{randomSeed}{This parameter specifies a random seed if reproducible,
deterministic forests are desired.}

\item{formula}{You may specify the response and covariates as a formula instead; make sure the response in the formula is still properly constructed; see \code{responses}}
}
\value{
A \code{JRandomForest} object. You may call \code{predict} or
\code{print} on it.
}
\description{
Trains the random forest. The type of response the random forest can be
trained on varies depending on the \code{splitFinder},
\code{nodeResponseCombiner}, and the \code{forestResponseCombiner}
parameters. Make sure these are compatible with each other, and with the
response you plug in. \code{splitFinder} should work on the responses you are
providing; \code{nodeResponseCombiner} should combine these responses into
some intermediate product, and \code{forestResponseCombiner} combines these
intermediate products into the final output product.
}
\note{
If saving memory is a concern, you can replace \code{covariateData}
with an environment containing one element called \code{data} as the actual
dataset. After the data has been imported into Java, but before the forest
training begins, the dataset in the environment is deleted, freeing up
memory in R.
}
\examples{
# Regression Example
x1 <- rnorm(1000)
x2 <- rnorm(1000)
y <- 1 + x1 + x2 + rnorm(1000)

data <- data.frame(x1, x2, y)
forest <- train(y ~ x1 + x2, data, WeightedVarianceSplitFinder(), MeanResponseCombiner(), MeanResponseCombiner(), ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)

# Fix x2 to be 0
newData <- data.frame(x1 = seq(from=-2, to=2, by=0.5), x2 = 0)
ypred <- predict(forest, newData)

plot(ypred ~ newData$x1, type="l")

# Competing Risk Example
x1 <- abs(rnorm(1000))
x2 <- abs(rnorm(1000))

T1 <- rexp(1000, rate=x1)
T2 <- rweibull(1000, shape=x1, scale=x2)
C <- rexp(1000)
u <- pmin(T1, T2, C)
delta <- ifelse(u==T1, 1, ifelse(u==T2, 2, 0))

data <- data.frame(x1, x2)

forest <- train(CompetingRiskResponses(delta, u) ~ x1 + x2, data,
                LogRankSplitFinder(1:2), CompetingRiskResponseCombiner(1:2), CompetingRiskFunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
ypred <- predict(forest, newData)
}
\seealso{
\code{\link{predict.JRandomForest}}
}