Version 1.0.1; cleaned up documentation.

Moving towards a CRAN release.
2019-06-30 15:07:29 -07:00 · 2019-06-30 15:07:29 -07:00 · 7a759d9dea
commit 7a759d9dea
parent 8f140beeb7
1022 changed files with 1180 additions and 237 deletions
--- a/.Rbuildignore
+++ b/.Rbuildignore
@ -1,2 +1,3 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
+copyJar
--- a/17
+++ b/17
@ -1,12 +1,19 @@
 Package: largeRCRF
 Type: Package
-Title: Large Random Competing Risk Forests, Java Implementation Run in R
-Version: 1.0.0.0
-Authors@R: person("Joel", "Therrien", email = "joel@joeltherrien.ca", role = c("aut", "cre"))
+Title: Large Random Competing Risks Forests
+Version: 1.0.1
+Authors@R: c(
+    person("Joel", "Therrien", email = "joel_therrien@sfu.ca", role = c("aut", "cre", "cph")),
+    person("Jiguo", "Cao", email = "jiguo_cao@sfu.ca", role = c("aut", "dgs"))
+    )
 Description: This package is used for training competing risk random forests on larger scale datasets. 
-    It currently only supports training models, running predictions, plotting those predictions (they are curves),
+    It currently supports training models, running predictions, plotting those predictions (they are curves),
    and some simple error analysis using concordance measures.
 License: GPL-3
+Copyright: All provided source code is copyrighted and owned by Joel Therrien. 
+     There are two dependencies (partially provided) used in the Java code, both
+     of which are licensed under the Apache 2.0 License. Please see the NOTICE
+     file for more information.
 Encoding: UTF-8
 LazyData: true
 Imports:
@ -14,6 +21,6 @@ Imports:
 Suggests:
    parallel,
    testthat
-Depends: R (>= 3.4.2)
+Depends: R (>= 3.4.0)
 SystemRequirements: Java JDK 1.8 or higher
 RoxygenNote: 6.1.1
--- a/2
+++ b/2
@ -15,8 +15,6 @@ S3method(print,CompetingRiskFunctions.List)
 S3method(print,JRandomForest)
 S3method(print,ResponseCombiner)
 S3method(print,SplitFinder)
-S3method(train,default)
-S3method(train,formula)
 export(CR_FunctionCombiner)
 export(CR_Response)
 export(CR_ResponseCombiner)
--- a/44
+++ b/44
@ -0,0 +1,44 @@
+# largeRCRF
+Copyright 2018-2019 Joel Therrien
+
+largeRCRF is licensed under the GPL-3 license. 
+
+largeRCRF contains, in object form, some Java classes from the
+Apache Commons CSV project and the Jackson JSON processor, none of
+which were modified.
+
+Their copyright notices are displayed below, and the license files
+they provided may be found in the licenses/ subfolder.
+
+
+
+Apache Commons CSV
+Copyright 2005-2017 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+
+
+
+
+# Jackson JSON processor
+
+Jackson is a high-performance, Free/Open Source JSON processing library.
+It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has
+been in development since 2007.
+It is currently developed by a community of developers, as well as supported
+commercially by FasterXML.com.
+
+## Licensing
+
+Jackson core and extension components may be licensed under different licenses.
+To find the details that apply to this artifact see the accompanying LICENSE file.
+For more information, including possible other licensing options, contact
+FasterXML.com (http://fasterxml.com).
+
+## Credits
+
+A list of contributors may be found from CREDITS file, which is included
+in some artifacts (usually source distributions); but is always available
+from the source code management (SCM) system project uses.
--- a/R/cr_predictions.R
+++ b/R/cr_predictions.R
@ -48,7 +48,7 @@ extractCIF <- function (x, event) {

 #' @export
 extractCIF.CompetingRiskFunctions <- function(prediction, event){
-  fun <- stepfun(prediction$time.interest, c(0, prediction$cif[,event]))
+  fun <- stats::stepfun(prediction$time.interest, c(0, prediction$cif[,event]))
  
  class(fun) <- "function"
  attr(fun, "call") <- sys.call()
@ -70,7 +70,7 @@ extractCHF <- function (x, event) {

 #' @export
 extractCHF.CompetingRiskFunctions <- function(prediction, event){
-  fun <- stepfun(prediction$time.interest, c(0, prediction$chf[,event]))
+  fun <- stats::stepfun(prediction$time.interest, c(0, prediction$chf[,event]))
  
  class(fun) <- "function"
  attr(fun, "call") <- sys.call()
@ -93,7 +93,7 @@ extractSurvivorCurve <- function (x) {

 #' @export
 extractSurvivorCurve.CompetingRiskFunctions <- function(prediction){
-  fun <- stepfun(prediction$time.interest, c(1, prediction$survivorCurve))
+  fun <- stats::stepfun(prediction$time.interest, c(1, prediction$survivorCurve))
  
  class(fun) <- "function"
  attr(fun, "call") <- sys.call()
--- a/R/loadForest.R
+++ b/R/loadForest.R
@ -4,10 +4,10 @@
 #'
 #' Loads a random forest that was saved using \code{\link{saveForest}}.
 #'
-#' @param forest The directory created that saved the previous forest.
+#' @param directory The directory that the forest was previously saved in.
 #' @return A JForest object; see \code{\link{train}} for details.
 #' @export
-#' @seealso \code{\link{train}}, \code{\link{saveForest}}, \code{\link{loadForestArg}}
+#' @seealso \code{\link{train}}, \code{\link{saveForest}}
 #' @examples
 #' # Regression Example
 #' x1 <- rnorm(1000)
--- a/R/misc.R
+++ b/R/misc.R
@ -78,10 +78,10 @@ plot.JMatrixPlottable <- function(mat, add=FALSE, type="s", xlab="Time", ylab=NU

    }

-    plot(mat[,2] ~ mat[,1], col=col, type=type, xlab=xlab, ylab=ylab, ...)
+    graphics::plot(mat[,2] ~ mat[,1], col=col, type=type, xlab=xlab, ylab=ylab, ...)
  }
  else{
-    points(mat[,2] ~ mat[,1], col=col, type=type, xlab=xlab, ylab=ylab, ...)
+    graphics::points(mat[,2] ~ mat[,1], col=col, type=type, xlab=xlab, ylab=ylab, ...)
  }

 }
--- a/R/predict.R
+++ b/R/predict.R
@ -28,7 +28,8 @@
 #' y <- 1 + x1 + x2 + rnorm(1000)
 #'
 #' data <- data.frame(x1, x2, y)
-#' forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
+#' forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits = 5, 
+#'     mtry = 1, nodeSize = 5)
 #'
 #' # Fix x2 to be 0
 #' newData <- data.frame(x1 = seq(from=-2, to=2, by=0.5), x2 = 0)
@ -48,7 +49,8 @@
 #'
 #' data <- data.frame(x1, x2)
 #'
-#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data, ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
+#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data, ntree=100,
+#'    numberOfSplits=5, mtry=1, nodeSize=10)
 #' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
 #' ypred <- predict(forest, newData)
 predict.JRandomForest <- function(forest, newData=NULL, parallel=TRUE, out.of.bag=NULL){
--- a/R/regressionComponents.R
+++ b/R/regressionComponents.R
@ -46,15 +46,6 @@ WeightedVarianceSplitFinder <- function(){
 #' responseCombiner <- MeanResponseCombiner()
 #' # You would then use it in train()
 #'
-#' # However; I'll show an internal Java method to make it clear what it does
-#' # Note that you should never have to do the following
-#' x <- 1:3
-#' x <- largeRCRF:::convertRListToJava(Numeric(x))
-#'
-#' # will output a Java object containing 2
-#' output <- rJava::.jcall(responseCombiner$javaObject, "Ljava/lang/Double;", "combine", x)
-#' responseCombiner$convertToRFunction(output)
-#'
 MeanResponseCombiner <- function(){
  javaObject <- .jnew(.class_MeanResponseCombiner)
  javaObject <- .jcast(javaObject, .class_ResponseCombiner)
--- a/R/saveForest.R
+++ b/R/saveForest.R
@ -3,7 +3,7 @@
 #' Save Random Forests
 #'
 #' Saves a random forest for later use, given that the base R
-#' \code{\link{base::save}} function doesn't work for this package.
+#' \code{\link[base]{save}} function doesn't work for this package.
 #'
 #' @param forest The forest to save.
 #' @param directory The directory that should be created to save the trees in.
@ -24,8 +24,8 @@
 #' forest <- train(y ~ x1 + x2, data,
 #'  ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
 #'
-#' saveForest(forest, "trees")
-#' new_forest <- loadForest("trees")
+#' saveForest(forest, "saved_forest")
+#' new_forest <- loadForest("saved_forest")
 saveForest <- function(forest, directory, overwrite=FALSE){
  check_and_create_directory(directory, overwrite)
  
--- a/R/train.R
+++ b/R/train.R
@ -14,148 +14,11 @@ getCores <- function(){
  return(cores)
 }

-#' Train Random Forests
-#'
-#' Trains the random forest. The type of response the random forest can be
-#' trained on varies depending on the \code{splitFinder},
-#' \code{nodeResponseCombiner}, and the \code{forestResponseCombiner}
-#' parameters. Make sure these are compatible with each other, and with the
-#' response you plug in. \code{splitFinder} should work on the responses you are
-#' providing; \code{nodeResponseCombiner} should combine these responses into
-#' some intermediate product, and \code{forestResponseCombiner} combines these
-#' intermediate products into the final output product. Note that
-#' \code{nodeResponseCombiner} and \code{forestResponseCombiner} can be inferred
-#' from the data (so feel free to not specify them), and \code{splitFinder} can
-#' be inferred but you might want to change its default.
-#'
-#' @param responses An R list of the responses. See \code{\link{CR_Response}}
-#'   for an example function.
-#' @param data A data.frame containing the columns of the predictors and
-#'   responses; not relevant if you're not using the formula version of
-#'   \code{train}.
-#' @param covariateData A data.frame containing only the columns of the
-#'   covariates you wish to use in your training (not relevant if you're using
-#'   the formula version of \code{train}).
-#' @param splitFinder A split finder that's used to score splits in the random
-#'   forest training algorithm. See \code{\link{Competing Risk Split Finders}}
-#'   or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
-#'   this function tries to pick one based on the response. For
-#'   \code{\link{CR_Response}} without censor times, it will pick a
-#'   \code{\link{LogRankSplitFinder}}; while if censor times were provided it
-#'   will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
-#'   responses it picks a \code{\link{WeightedVarianceSplitFinder}}.
-#' @param nodeResponseCombiner A response combiner that's used to combine
-#'   responses for each terminal node in a tree (regression example; average the
-#'   observations in each tree into a single number). See
-#'   \code{\link{CompetingRiskResponseCombiner}} or
-#'   \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
-#'   tries to pick one based on the response. For \code{\link{CR_Response}} it
-#'   picks a \code{\link{CompetingRiskResponseCombiner}}; for integer or numeric
-#'   responses it picks a \code{\link{MeanResponseCombiner}}.
-#' @param forestResponseCombiner A response combiner that's used to combine
-#'   predictions across trees into one final result (regression example; average
-#'   the prediction of each tree into a single number). See
-#'   \code{\link{CompetingRiskFunctionCombiner}} or
-#'   \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
-#'   tries to pick one based on the response. For \code{\link{CR_Response}} it
-#'   picks a \code{\link{CompetingRiskFunctionCombiner}}; for integer or numeric
-#'   responses it picks a \code{\link{MeanResponseCombiner}}.
-#' @param ntree An integer that specifies how many trees should be trained.
-#' @param numberOfSplits A tuning parameter specifying how many random splits
-#'   should be tried for a covariate; a value of 0 means all splits will be
-#'   tried (with an exception for factors, who might have too many splits to
-#'   feasibly compute).
-#' @param mtry A tuning parameter specifying how many covariates will be
-#'   randomly chosen to be tried in the splitting process. This value must be at
-#'   least 1.
-#' @param nodeSize The algorithm will not attempt to split a node that has
-#'   observations less than 2*\code{nodeSize}; this guarantees that any two
-#'   sibling terminal nodes together have an average size of at least
-#'   \code{nodeSize}; note that it doesn't guarantee that every node is at least
-#'   as large as \code{nodeSize}.
-#' @param maxNodeDepth This parameter is analogous to \code{nodeSize} in that it
-#'   controls tree length; by default \code{maxNodeDepth} is an extremely high
-#'   number and tree depth is controlled by \code{nodeSize}.
-#' @param splitPureNodes This parameter determines whether the algorithm will
-#'   split a pure node. If set to FALSE, then before every split it will check
-#'   that every response is the same, and if so, not split. If set to TRUE it
-#'   forgoes that check and splits it. Prediction accuracy won't change under
-#'   any sensible \code{nodeResponseCombiner}; as all terminal nodes from a
-#'   split pure node should give the same prediction, so this parameter only
-#'   affects performance. If your response is continuous you'll likely
-#'   experience faster train times by setting it to TRUE. Default value is TRUE.
-#' @param savePath If set, this parameter will save each tree of the random
-#'   forest in this directory as the forest is trained. Use this parameter if
-#'   you need to save memory while training. See also \code{\link{loadForest}}
-#' @param savePath.overwrite This parameter controls the behaviour for what
-#'   happens if \code{savePath} is pointing to an existing directory. If set to
-#'   \code{warn} (default) then \code{train} refuses to proceed. If set to
-#'   \code{delete} then all the contents in that folder are deleted for the new
-#'   forest to be trained. Note that all contents are deleted, even those files
-#'   not related to \code{largeRCRF}. Use only if you're sure it's safe. If set
-#'   to \code{merge}, then the files describing the forest (such as its
-#'   parameters) are overwritten but the saved trees are not. The algorithm
-#'   assumes (without checking) that the existing trees are from a previous run
-#'   and starts from where it left off. This option is useful if recovering from
-#'   a crash.
-#' @param cores This parameter specifies how many trees will be simultaneously
-#'   trained. By default the package attempts to detect how many cores you have
-#'   by using the \code{parallel} package and using all of them. You may specify
-#'   a lower number if you wish. It is not recommended to specify a number
-#'   greater than the number of available cores as this will hurt performance
-#'   with no available benefit.
-#' @param randomSeed This parameter specifies a random seed if reproducible,
-#'   deterministic forests are desired.
-#' @param displayProgress A logical indicating whether the progress should be
-#'   displayed to console; default is \code{TRUE}. Useful to set to FALSE in
-#'   some automated situations.
-#' @export
-#' @return A \code{JRandomForest} object. You may call \code{predict} or
-#'   \code{print} on it.
-#' @seealso \code{\link{predict.JRandomForest}}
-#' @note If saving memory is a concern, you can replace \code{covariateData} or
-#'   \code{data} with an environment containing one element called \code{data}
-#'   as the actual dataset. After the data has been imported into Java, but
-#'   before the forest training begins, the dataset in the environment is
-#'   deleted, freeing up memory in R.
-#' @examples
-#' # Regression Example
-#' x1 <- rnorm(1000)
-#' x2 <- rnorm(1000)
-#' y <- 1 + x1 + x2 + rnorm(1000)
-#'
-#' data <- data.frame(x1, x2, y)
-#' forest <- train(y ~ x1 + x2, data, WeightedVarianceSplitFinder(), MeanResponseCombiner(), MeanResponseCombiner(), ntree=100, numberOfSplits = 5, mtry = 1, nodeSize = 5)
-#'
-#' # Fix x2 to be 0
-#' newData <- data.frame(x1 = seq(from=-2, to=2, by=0.5), x2 = 0)
-#' ypred <- predict(forest, newData)
-#'
-#' plot(ypred ~ newData$x1, type="l")
-#'
-#' # Competing Risk Example
-#' x1 <- abs(rnorm(1000))
-#' x2 <- abs(rnorm(1000))
-#'
-#' T1 <- rexp(1000, rate=x1)
-#' T2 <- rweibull(1000, shape=x1, scale=x2)
-#' C <- rexp(1000)
-#' u <- pmin(T1, T2, C)
-#' delta <- ifelse(u==T1, 1, ifelse(u==T2, 2, 0))
-#'
-#' data <- data.frame(x1, x2)
-#'
-#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
-#' LogRankSplitFinder(1:2), CR_kResponseCombiner(1:2), CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5, mtry=1, nodeSize=10)
-#' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
-#' ypred <- predict(forest, newData)
-train <- function(x, ...) UseMethod("train")
-
-
-
-#' @rdname train
-#' @export
-train.default <- function(responses, covariateData, splitFinder = splitFinderDefault(responses), nodeResponseCombiner = nodeResponseCombinerDefault(responses), forestResponseCombiner = forestResponseCombinerDefault(responses), ntree, numberOfSplits, mtry, nodeSize, maxNodeDepth = 100000, splitPureNodes=TRUE, savePath=NULL, savePath.overwrite=c("warn", "delete", "merge"), cores = getCores(), randomSeed = NULL, displayProgress = TRUE){
+train.internal <- function(responses, covariateData, splitFinder, 
+                           nodeResponseCombiner, forestResponseCombiner, ntree, 
+                           numberOfSplits, mtry, nodeSize, maxNodeDepth, 
+                           splitPureNodes, savePath, savePath.overwrite, 
+                           cores, randomSeed, displayProgress){
  
  # Some quick checks on parameters
  ntree <- as.integer(ntree)
@ -187,6 +50,19 @@ train.default <- function(responses, covariateData, splitFinder = splitFinderDef
  if(is.null(savePath.overwrite) | length(savePath.overwrite)==0 | !(savePath.overwrite[1] %in% c("warn", "delete", "merge"))){
    stop("savePath.overwrite must be one of c(\"warn\", \"delete\", \"merge\")")
  }
+  
+  if(is.null(splitFinder)){
+    splitFinder <- splitFinderDefault(responses)
+  }
+  
+  if(is.null(nodeResponseCombiner)){
+    nodeResponseCombiner <- nodeResponseCombinerDefault(responses)
+  }
+  
+  if(is.null(forestResponseCombiner)){
+    forestResponseCombiner <- forestResponseCombinerDefault(responses)
+  }
+  


  if(class(nodeResponseCombiner) != "ResponseCombiner"){
@ -287,9 +163,8 @@ train.default <- function(responses, covariateData, splitFinder = splitFinderDef
  }
  

-  
-
-  forestObject <- list(call=match.call(), params=params, javaObject=forest.java, covariateList=dataset$covariateList, dataset=dataset$dataset)
+  forestObject <- list(params=params, javaObject=forest.java,
+                       covariateList=dataset$covariateList, dataset=dataset$dataset)

  class(forestObject) <- "JRandomForest"
  return(forestObject)
@ -297,14 +172,147 @@ train.default <- function(responses, covariateData, splitFinder = splitFinderDef
 }


-
-
-#' @rdname train
-#' @export
+#' Train Random Forests
+#'
+#' Trains the random forest. The type of response the random forest can be
+#' trained on varies depending on the \code{splitFinder},
+#' \code{nodeResponseCombiner}, and the \code{forestResponseCombiner}
+#' parameters. Make sure these are compatible with each other, and with the
+#' response you plug in. \code{splitFinder} should work on the responses you are
+#' providing; \code{nodeResponseCombiner} should combine these responses into
+#' some intermediate product, and \code{forestResponseCombiner} combines these
+#' intermediate products into the final output product. Note that
+#' \code{nodeResponseCombiner} and \code{forestResponseCombiner} can be inferred
+#' from the data (so feel free to not specify them), and \code{splitFinder} can
+#' be inferred but you might want to change its default.
+#'
 #' @param formula You may specify the response and covariates as a formula
 #'   instead; make sure the response in the formula is still properly
 #'   constructed; see \code{responses}
-train.formula <- function(formula, data, ...){
+#' @param data A data.frame containing the columns of the predictors and
+#'   responses.
+#' @param splitFinder A split finder that's used to score splits in the random
+#'   forest training algorithm. See \code{\link{CompetingRiskSplitFinders}}
+#'   or \code{\link{WeightedVarianceSplitFinder}}. If you don't specify one,
+#'   this function tries to pick one based on the response. For
+#'   \code{\link{CR_Response}} without censor times, it will pick a
+#'   \code{\link{LogRankSplitFinder}}; while if censor times were provided it
+#'   will pick \code{\link{GrayLogRankSplitFinder}}; for integer or numeric
+#'   responses it picks a \code{\link{WeightedVarianceSplitFinder}}.
+#' @param nodeResponseCombiner A response combiner that's used to combine
+#'   responses for each terminal node in a tree (regression example; average the
+#'   observations in each tree into a single number). See
+#'   \code{\link{CR_ResponseCombiner}} or
+#'   \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
+#'   tries to pick one based on the response. For \code{\link{CR_Response}} it
+#'   picks a \code{\link{CR_ResponseCombiner}}; for integer or numeric
+#'   responses it picks a \code{\link{MeanResponseCombiner}}.
+#' @param forestResponseCombiner A response combiner that's used to combine
+#'   predictions across trees into one final result (regression example; average
+#'   the prediction of each tree into a single number). See
+#'   \code{\link{CR_FunctionCombiner}} or
+#'   \code{\link{MeanResponseCombiner}}. If you don't specify one, this function
+#'   tries to pick one based on the response. For \code{\link{CR_Response}} it
+#'   picks a \code{\link{CR_FunctionCombiner}}; for integer or numeric
+#'   responses it picks a \code{\link{MeanResponseCombiner}}.
+#' @param ntree An integer that specifies how many trees should be trained.
+#' @param numberOfSplits A tuning parameter specifying how many random splits
+#'   should be tried for a covariate; a value of 0 means all splits will be
+#'   tried (with an exception for factors, who might have too many splits to
+#'   feasibly compute).
+#' @param mtry A tuning parameter specifying how many covariates will be
+#'   randomly chosen to be tried in the splitting process. This value must be at
+#'   least 1.
+#' @param nodeSize The algorithm will not attempt to split a node that has
+#'   observations less than 2*\code{nodeSize}; this guarantees that any two
+#'   sibling terminal nodes together have an average size of at least
+#'   \code{nodeSize}; note that it doesn't guarantee that every node is at least
+#'   as large as \code{nodeSize}.
+#' @param maxNodeDepth This parameter is analogous to \code{nodeSize} in that it
+#'   controls tree length; by default \code{maxNodeDepth} is an extremely high
+#'   number and tree depth is controlled by \code{nodeSize}.
+#' @param splitPureNodes This parameter determines whether the algorithm will
+#'   split a pure node. If set to FALSE, then before every split it will check
+#'   that every response is the same, and if so, not split. If set to TRUE it
+#'   forgoes that check and splits it. Prediction accuracy won't change under
+#'   any sensible \code{nodeResponseCombiner}; as all terminal nodes from a
+#'   split pure node should give the same prediction, so this parameter only
+#'   affects performance. If your response is continuous you'll likely
+#'   experience faster train times by setting it to TRUE. Default value is TRUE.
+#' @param savePath If set, this parameter will save each tree of the random
+#'   forest in this directory as the forest is trained. Use this parameter if
+#'   you need to save memory while training. See also \code{\link{loadForest}}
+#' @param savePath.overwrite This parameter controls the behaviour for what
+#'   happens if \code{savePath} is pointing to an existing directory. If set to
+#'   \code{warn} (default) then \code{train} refuses to proceed. If set to
+#'   \code{delete} then all the contents in that folder are deleted for the new
+#'   forest to be trained. Note that all contents are deleted, even those files
+#'   not related to \code{largeRCRF}. Use only if you're sure it's safe. If set
+#'   to \code{merge}, then the files describing the forest (such as its
+#'   parameters) are overwritten but the saved trees are not. The algorithm
+#'   assumes (without checking) that the existing trees are from a previous run
+#'   and starts from where it left off. This option is useful if recovering from
+#'   a crash.
+#' @param cores This parameter specifies how many trees will be simultaneously
+#'   trained. By default the package attempts to detect how many cores you have
+#'   by using the \code{parallel} package and using all of them. You may specify
+#'   a lower number if you wish. It is not recommended to specify a number
+#'   greater than the number of available cores as this will hurt performance
+#'   with no available benefit.
+#' @param randomSeed This parameter specifies a random seed if reproducible,
+#'   deterministic forests are desired.
+#' @param displayProgress A logical indicating whether the progress should be
+#'   displayed to console; default is \code{TRUE}. Useful to set to FALSE in
+#'   some automated situations.
+#' @export
+#' @return A \code{JRandomForest} object. You may call \code{predict} or
+#'   \code{print} on it.
+#' @seealso \code{\link{predict.JRandomForest}}
+#' @note If saving memory is a concern, you can replace \code{data} with an
+#'   environment containing one element called \code{data} as the actual
+#'   dataset. After the data has been imported into Java, but before the
+#'   forest training begins, the dataset in the environment is deleted,
+#'   freeing up memory in R.
+#' @examples
+#' # Regression Example
+#' x1 <- rnorm(1000)
+#' x2 <- rnorm(1000)
+#' y <- 1 + x1 + x2 + rnorm(1000)
+#'
+#' data <- data.frame(x1, x2, y)
+#' forest <- train(y ~ x1 + x2, data, WeightedVarianceSplitFinder(),
+#'   MeanResponseCombiner(), MeanResponseCombiner(), ntree=100,
+#'   numberOfSplits = 5, mtry = 1, nodeSize = 5)
+#'
+#' # Fix x2 to be 0
+#' newData <- data.frame(x1 = seq(from=-2, to=2, by=0.5), x2 = 0)
+#' ypred <- predict(forest, newData)
+#'
+#' plot(ypred ~ newData$x1, type="l")
+#'
+#' # Competing Risk Example
+#' x1 <- abs(rnorm(1000))
+#' x2 <- abs(rnorm(1000))
+#'
+#' T1 <- rexp(1000, rate=x1)
+#' T2 <- rweibull(1000, shape=x1, scale=x2)
+#' C <- rexp(1000)
+#' u <- pmin(T1, T2, C)
+#' delta <- ifelse(u==T1, 1, ifelse(u==T2, 2, 0))
+#'
+#' data <- data.frame(x1, x2)
+#'
+#' forest <- train(CR_Response(delta, u) ~ x1 + x2, data,
+#'    LogRankSplitFinder(1:2), CR_ResponseCombiner(1:2),
+#'    CR_FunctionCombiner(1:2), ntree=100, numberOfSplits=5, 
+#'    mtry=1, nodeSize=10)
+#' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
+#' ypred <- predict(forest, newData)
+train <- function(formula, data, splitFinder = NULL, nodeResponseCombiner = NULL,
+                  forestResponseCombiner = NULL, ntree, numberOfSplits, mtry,
+                  nodeSize, maxNodeDepth = 100000, splitPureNodes=TRUE, savePath=NULL,
+                  savePath.overwrite=c("warn", "delete", "merge"), cores = getCores(),
+                  randomSeed = NULL, displayProgress = TRUE){
  
  # Having an R copy of the data loaded at the same time can be wasteful; we
  # also allow users to provide an environment of the data which gets removed
@ -343,10 +351,10 @@ train.formula <- function(formula, data, ...){
  }

  # Includes responses which we may need to later cut out
-  mf <- model.frame(formula=formula, data=data, na.action=na.pass)
+  mf <- stats::model.frame(formula=formula, data=data, na.action=stats::na.pass)

  if(is.null(responses)){
-    responses <- model.response(mf)
+    responses <- stats::model.response(mf)
  }

  # remove any response variables
@ -356,13 +364,25 @@ train.formula <- function(formula, data, ...){
  if(!is.null(env)){
    env$data <- mf
    rm(data)
-    forest <- train.default(responses, env, ...)
+    forest <- train.internal(responses, env, splitFinder = splitFinder,
+                             nodeResponseCombiner = nodeResponseCombiner,
+                             forestResponseCombiner = forestResponseCombiner,
+                             ntree = ntree, numberOfSplits = numberOfSplits,
+                             mtry = mtry, nodeSize = nodeSize, maxNodeDepth = maxNodeDepth,
+                             splitPureNodes = splitPureNodes, savePath = savePath,
+                             savePath.overwrite = savePath.overwrite, cores = cores,
+                             randomSeed = randomSeed, displayProgress = displayProgress)
  } else{
-    forest <- train.default(responses, mf, ...)
+    forest <- train.internal(responses, mf, splitFinder = splitFinder,
+                             nodeResponseCombiner = nodeResponseCombiner,
+                             forestResponseCombiner = forestResponseCombiner,
+                             ntree = ntree, numberOfSplits = numberOfSplits,
+                             mtry = mtry, nodeSize = nodeSize, maxNodeDepth = maxNodeDepth,
+                             splitPureNodes = splitPureNodes, savePath = savePath,
+                             savePath.overwrite = savePath.overwrite, cores = cores,
+                             randomSeed = randomSeed, displayProgress = displayProgress)
  }
  
-
-  
  forest$call <- match.call()
  forest$formula <- formula

--- a/R/zzz.R
+++ b/R/zzz.R
@ -1,6 +1,11 @@
 .onLoad <- function(libname, pkgname) {
-  # rJava needs to be initialized with the path to the class files
-  .jpackage(pkgname, lib.loc=libname, morePaths = "inst/java/")
+  # rJava expects a folder called 'java' in the root of the package directory,
+  # but that is not the case when devtools::test() is run: there the jar still
+  # lives in the 'inst' folder, which won't be present if the package was
+  # actually loaded from the R library. Passing morePaths makes sure the jar is
+  # still loaded in this situation.
+  .jpackage(pkgname, lib.loc=libname, morePaths="inst/java/largeRCRF-1.0-SNAPSHOT.jar")
+  
 }

 #' @import rJava
--- a/inst/java/ca/joeltherrien/randomforest/Bootstrapper.class
+++ b/inst/java/ca/joeltherrien/randomforest/Bootstrapper.class
--- a/inst/java/ca/joeltherrien/randomforest/CovariateRow.class
+++ b/inst/java/ca/joeltherrien/randomforest/CovariateRow.class
--- a/inst/java/ca/joeltherrien/randomforest/Main.class
+++ b/inst/java/ca/joeltherrien/randomforest/Main.class
--- a/inst/java/ca/joeltherrien/randomforest/Row.class
+++ b/inst/java/ca/joeltherrien/randomforest/Row.class
--- a/inst/java/ca/joeltherrien/randomforest/Settings$SettingsBuilder.class
+++ b/inst/java/ca/joeltherrien/randomforest/Settings$SettingsBuilder.class
--- a/inst/java/ca/joeltherrien/randomforest/Settings.class
+++ b/inst/java/ca/joeltherrien/randomforest/Settings.class
--- a/inst/java/ca/joeltherrien/randomforest/VisibleForTesting.class
+++ b/inst/java/ca/joeltherrien/randomforest/VisibleForTesting.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/Covariate$SplitRuleUpdater.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/Covariate$SplitRuleUpdater.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/Covariate$SplitUpdate.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/Covariate$SplitUpdate.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/Covariate$Value.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/Covariate$Value.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/Covariate.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/Covariate.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/SplitRule.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/SplitRule.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/bool/BooleanCovariate$1.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/bool/BooleanCovariate$1.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/bool/BooleanCovariate$BooleanValue.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/bool/BooleanCovariate$BooleanValue.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/bool/BooleanCovariate.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/bool/BooleanCovariate.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/bool/BooleanSplitRule.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/bool/BooleanSplitRule.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/factor/FactorCovariate$1.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/factor/FactorCovariate$1.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/factor/FactorCovariate$FactorValue.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/factor/FactorCovariate$FactorValue.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/factor/FactorCovariate.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/factor/FactorCovariate.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/factor/FactorSplitRule.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/factor/FactorSplitRule.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/numeric/NumericCovariate$1.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/numeric/NumericCovariate$1.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/numeric/NumericCovariate$NumericValue.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/numeric/NumericCovariate$NumericValue.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/numeric/NumericCovariate.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/numeric/NumericCovariate.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/numeric/NumericSplitRule.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/numeric/NumericSplitRule.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/numeric/NumericSplitRuleUpdater.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/numeric/NumericSplitRuleUpdater.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/numeric/NumericSplitUpdate.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/numeric/NumericSplitUpdate.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/settings/BooleanCovariateSettings.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/settings/BooleanCovariateSettings.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/settings/CovariateSettings.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/settings/CovariateSettings.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/settings/FactorCovariateSettings.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/settings/FactorCovariateSettings.class
--- a/inst/java/ca/joeltherrien/randomforest/covariates/settings/NumericCovariateSettings.class
+++ b/inst/java/ca/joeltherrien/randomforest/covariates/settings/NumericCovariateSettings.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskErrorRateCalculator.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskErrorRateCalculator.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskFunctions$CompetingRiskFunctionsBuilder.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskFunctions$CompetingRiskFunctionsBuilder.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskFunctions.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskFunctions.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskGraySetsImpl.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskGraySetsImpl.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskResponse$CompetingResponseLoader.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskResponse$CompetingResponseLoader.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskResponse.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskResponse.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskResponseWithCensorTime$CompetingResponseWithCensorTimeLoader.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskResponseWithCensorTime$CompetingResponseWithCensorTimeLoader.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskResponseWithCensorTime.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskResponseWithCensorTime.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskSets.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskSets.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskSetsImpl.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskSetsImpl.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskUtils.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskUtils.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/combiner/CompetingRiskFunctionCombiner.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/combiner/CompetingRiskFunctionCombiner.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/combiner/CompetingRiskResponseCombiner.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/combiner/CompetingRiskResponseCombiner.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/splitfinder/CompetingRiskSplitFinder$LogRankValue.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/splitfinder/CompetingRiskSplitFinder$LogRankValue.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/splitfinder/CompetingRiskSplitFinder.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/splitfinder/CompetingRiskSplitFinder.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/splitfinder/GrayLogRankSplitFinder.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/splitfinder/GrayLogRankSplitFinder.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/splitfinder/LogRankSplitFinder.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/competingrisk/splitfinder/LogRankSplitFinder.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/regression/MeanResponseCombiner.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/regression/MeanResponseCombiner.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/regression/WeightedVarianceSplitFinder$1.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/regression/WeightedVarianceSplitFinder$1.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/regression/WeightedVarianceSplitFinder$Set.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/regression/WeightedVarianceSplitFinder$Set.class
--- a/inst/java/ca/joeltherrien/randomforest/responses/regression/WeightedVarianceSplitFinder.class
+++ b/inst/java/ca/joeltherrien/randomforest/responses/regression/WeightedVarianceSplitFinder.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/Forest$ForestBuilder.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/Forest$ForestBuilder.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/Forest.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/Forest.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/ForestTrainer$1.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/ForestTrainer$1.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/ForestTrainer$ForestTrainerBuilder.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/ForestTrainer$ForestTrainerBuilder.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/ForestTrainer$TreeInMemoryWorker.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/ForestTrainer$TreeInMemoryWorker.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/ForestTrainer$TreeSavedWorker.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/ForestTrainer$TreeSavedWorker.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/ForestTrainer.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/ForestTrainer.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/Node.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/Node.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/ResponseCombiner.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/ResponseCombiner.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/SimpleSplitFinder.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/SimpleSplitFinder.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/Split.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/Split.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/SplitAndScore.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/SplitAndScore.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/SplitFinder.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/SplitFinder.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/SplitNode$SplitNodeBuilder.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/SplitNode$SplitNodeBuilder.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/SplitNode.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/SplitNode.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/TerminalNode.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/TerminalNode.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/Tree.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/Tree.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/TreeTrainer$1.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/TreeTrainer$1.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/TreeTrainer$TreeTrainerBuilder.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/TreeTrainer$TreeTrainerBuilder.class
--- a/inst/java/ca/joeltherrien/randomforest/tree/TreeTrainer.class
+++ b/inst/java/ca/joeltherrien/randomforest/tree/TreeTrainer.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/DataUtils$DoubleLoader.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/DataUtils$DoubleLoader.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/DataUtils$ResponseLoader.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/DataUtils$ResponseLoader.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/DataUtils.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/DataUtils.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/DiscontinuousStepFunction.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/DiscontinuousStepFunction.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/IndexedIterator.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/IndexedIterator.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/LeftContinuousStepFunction.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/LeftContinuousStepFunction.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/MathFunction.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/MathFunction.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/Point.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/Point.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/RUtils.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/RUtils.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/RightContinuousStepFunction.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/RightContinuousStepFunction.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/SingletonIterator.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/SingletonIterator.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/StepFunction.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/StepFunction.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/SumFunction.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/SumFunction.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/UniqueSubsetValueIterator.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/UniqueSubsetValueIterator.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/UniqueValueIterator.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/UniqueValueIterator.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/Utils.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/Utils.class
--- a/inst/java/ca/joeltherrien/randomforest/utils/VeryDiscontinuousStepFunction.class
+++ b/inst/java/ca/joeltherrien/randomforest/utils/VeryDiscontinuousStepFunction.class
--- a/Show more
+++ b/Show more