diff --git a/DESCRIPTION b/DESCRIPTION
index d608011..350fd85 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: largeRCRF
 Type: Package
 Title: Large Random Competing Risks Forests
-Version: 1.0.1
+Version: 1.0.2
 Authors@R: c(
     person("Joel", "Therrien", email = "joel_therrien@sfu.ca", role = c("aut", "cre", "cph")),
     person("Jiguo", "Cao", email = "jiguo_cao@sfu.ca", role = c("aut", "dgs"))
diff --git a/NAMESPACE b/NAMESPACE
index 2a9c930..d26b166 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,7 +8,6 @@ S3method(extractMortalities,CompetingRiskFunctions)
 S3method(extractMortalities,CompetingRiskFunctions.List)
 S3method(extractSurvivorCurve,CompetingRiskFunctions)
 S3method(extractSurvivorCurve,CompetingRiskFunctions.List)
-S3method(plot,JMatrixPlottable)
 S3method(predict,JRandomForest)
 S3method(print,CompetingRiskFunctions)
 S3method(print,CompetingRiskFunctions.List)
diff --git a/R/cr_predictions.R b/R/cr_predictions.R
index 673c89a..8c515f0 100644
--- a/R/cr_predictions.R
+++ b/R/cr_predictions.R
@@ -47,8 +47,8 @@ extractCIF <- function (x, event) {
 }
 
 #' @export
-extractCIF.CompetingRiskFunctions <- function(prediction, event){
-  fun <- stats::stepfun(prediction$time.interest, c(0, prediction$cif[,event]))
+extractCIF.CompetingRiskFunctions <- function(x, event){
+  fun <- stats::stepfun(x$time.interest, c(0, x$cif[,event]))
 
   class(fun) <- "function"
   attr(fun, "call") <- sys.call()
@@ -56,8 +56,8 @@ extractCIF.CompetingRiskFunctions <- function(prediction, event){
 }
 
 #' @export
-extractCIF.CompetingRiskFunctions.List <- function(predictions, event){
-  return(lapply(predictions, extractCIF.CompetingRiskFunctions, event))
+extractCIF.CompetingRiskFunctions.List <- function(x, event){
+  return(lapply(x, extractCIF.CompetingRiskFunctions, event))
 }
 
 #' @rdname CompetingRiskPredictions
@@ -69,8 +69,8 @@ extractCHF <- function (x, event) {
 }
 
 #' @export
-extractCHF.CompetingRiskFunctions <- function(prediction, event){
-  fun <- stats::stepfun(prediction$time.interest, c(0, prediction$chf[,event]))
+extractCHF.CompetingRiskFunctions <- function(x, event){
+  fun <- stats::stepfun(x$time.interest, c(0, x$chf[,event]))
 
   class(fun) <- "function"
   attr(fun, "call") <- sys.call()
@@ -78,8 +78,8 @@ extractCHF.CompetingRiskFunctions <- function(prediction, event){
 }
 
 #' @export
-extractCHF.CompetingRiskFunctions.List <- function(predictions, event){
-  return(lapply(predictions, extractCHF.CompetingRiskFunctions, event))
+extractCHF.CompetingRiskFunctions.List <- function(x, event){
+  return(lapply(x, extractCHF.CompetingRiskFunctions, event))
 }
 
 
@@ -92,8 +92,8 @@ extractSurvivorCurve <- function (x) {
 }
 
 #' @export
-extractSurvivorCurve.CompetingRiskFunctions <- function(prediction){
-  fun <- stats::stepfun(prediction$time.interest, c(1, prediction$survivorCurve))
+extractSurvivorCurve.CompetingRiskFunctions <- function(x){
+  fun <- stats::stepfun(x$time.interest, c(1, x$survivorCurve))
 
   class(fun) <- "function"
   attr(fun, "call") <- sys.call()
@@ -101,8 +101,8 @@ extractSurvivorCurve.CompetingRiskFunctions <- function(prediction){
 }
 
 #' @export
-extractSurvivorCurve.CompetingRiskFunctions.List <- function(predictions){
-  return(lapply(predictions, extractSurvivorCurve.CompetingRiskFunctions))
+extractSurvivorCurve.CompetingRiskFunctions.List <- function(x){
+  return(lapply(x, extractSurvivorCurve.CompetingRiskFunctions))
 }
 
 #' @rdname CompetingRiskPredictions
@@ -115,7 +115,7 @@ extractMortalities <- function(x, event, time){
 }
 
 #' @export
-extractMortalities.CompetingRiskFunctions <- function(prediction, event, time){
+extractMortalities.CompetingRiskFunctions <- function(x, event, time){
   if(is.null(event) | anyNA(event)){
     stop("event must be specified")
   }
@@ -124,11 +124,11 @@ extractMortalities.CompetingRiskFunctions <- function(prediction, event, time){
     stop("time must be specified")
   }
 
-  return(.jcall(prediction$javaObject, "D", "calculateEventSpecificMortality", as.integer(event), time))
+  return(.jcall(x$javaObject, "D", "calculateEventSpecificMortality", as.integer(event), time))
 }
 
 #' @export
-extractMortalities.CompetingRiskFunctions.List <- function(predictions, event, time){
+extractMortalities.CompetingRiskFunctions.List <- function(x, event, time){
   if(is.null(event) | anyNA(event)){
     stop("event must be specified")
   }
@@ -137,5 +137,5 @@ extractMortalities.CompetingRiskFunctions.List <- function(predictions, event, t
     stop("time must be specified")
   }
 
-  return(as.numeric(lapply(predictions, extractMortalities.CompetingRiskFunctions, event, time)))
+  return(as.numeric(lapply(x, extractMortalities.CompetingRiskFunctions, event, time)))
 }
\ No newline at end of file
diff --git a/R/misc.R b/R/misc.R
index 2073041..dc4f040 100644
--- a/R/misc.R
+++ b/R/misc.R
@@ -15,39 +15,39 @@ convertRListToJava <- function(lst){
 }
 
 #' @export
-print.SplitFinder = function(splitFinder) print(splitFinder$call)
+print.SplitFinder = function(x, ...) print(x$call)
 
 #' @export
-print.ResponseCombiner = function(combiner) print(combiner$call)
+print.ResponseCombiner = function(x, ...) print(x$call)
 
 #' @export
-print.JRandomForest <- function(forest){
+print.JRandomForest <- function(x, ...){
   cat("Call:\n")
-  print(forest$call)
+  print(x$call)
 
   cat("\nParameters:\n")
-  cat("\tSplit Finder: "); print(forest$params$splitFinder$call)
-  cat("\tTerminal Node Response Combiner: "); print(forest$params$nodeResponseCombiner$call)
-  cat("\tForest Response Combiner: "); print(forest$params$forestResponseCombiner$call)
-  cat("\t# of trees: "); cat(forest$params$ntree); cat("\n")
-  cat("\t# of Splits: "); cat(forest$params$numberOfSplits); cat("\n")
-  cat("\t# of Covariates to try: "); cat(forest$params$mtry); cat("\n")
-  cat("\tNode Size: "); cat(forest$params$nodeSize); cat("\n")
-  cat("\tMax Node Depth: "); cat(forest$params$maxNodeDepth); cat("\n")
+  cat("\tSplit Finder: "); print(x$params$splitFinder$call)
+  cat("\tTerminal Node Response Combiner: "); print(x$params$nodeResponseCombiner$call)
+  cat("\tForest Response Combiner: "); print(x$params$forestResponseCombiner$call)
+  cat("\t# of trees: "); cat(x$params$ntree); cat("\n")
+  cat("\t# of Splits: "); cat(x$params$numberOfSplits); cat("\n")
+  cat("\t# of Covariates to try: "); cat(x$params$mtry); cat("\n")
+  cat("\tNode Size: "); cat(x$params$nodeSize); cat("\n")
+  cat("\tMax Node Depth: "); cat(x$params$maxNodeDepth); cat("\n")
 
   cat("Try using me with predict() or one of the relevant commands to determine error\n")
 }
 
 #' @export
-print.CompetingRiskFunctions.List <- function(lst){
+print.CompetingRiskFunctions.List <- function(x, ...){
   cat("Number of predictions: ")
-  cat(length(lst))
+  cat(length(x))
   cat("\n\nSee the help page ?CompetingRiskPredictions for a list of relevant functions on how to use this object.\n")
 }
 
 #' @export
-print.CompetingRiskFunctions <- function(functions){
-  mx <- ncol(functions$cif)
+print.CompetingRiskFunctions <- function(x, ...){
+  mx <- ncol(x$cif)
   cat(mx); cat(" CIFs available\n")
   cat(mx); cat(" CHFs available\n")
   cat("An overall survival curve available\n")
@@ -55,33 +55,4 @@ print.CompetingRiskFunctions <- function(functions){
 }
 
-#' @export
-plot.JMatrixPlottable <- function(mat, add=FALSE, type="s", xlab="Time", ylab=NULL, col="black", ...){
-  if(!add){
-    if(is.null(ylab)){
-      matType <- attr(mat, "type")
-      event <- attr(mat, "event")
-      if(matType == "cif"){
-        ylab <- paste0("CIF-", event, "(t)")
-      }
-      else if(matType == "chf"){
-        ylab <- paste0("CHF(t)-", event, "(t)")
-      }
-      else if(matType == "kaplanMeier"){
-        ylab <- "S-hat(t)"
-      }
-      else{
-        ylab <- "Y"
-        warning("Unknown type attribute in plottable object")
-      }
-
-    }
-
-    graphics::plot(mat[,2] ~ mat[,1], col=col, type=type, xlab=xlab, ylab=ylab, ...)
-  }
-  else{
-    graphics::points(mat[,2] ~ mat[,1], col=col, type=type, xlab=xlab, ylab=ylab, ...)
-  }
-
-}
diff --git a/R/predict.R b/R/predict.R
index bcd84bd..8d7ff5a 100644
--- a/R/predict.R
+++ b/R/predict.R
@@ -4,10 +4,10 @@
 #'
 #' Predict on the random forest.
 #'
-#' @param forest A forest that was previously \code{\link{train}}ed
+#' @param object A forest that was previously \code{\link{train}}ed
 #' @param newData The new data containing all of the previous predictor
 #'   covariates. Can be NULL if you want to use the training dataset, and
-#'   \code{forest} hasn't been loaded from the disk; otherwise you'll have to
+#'   \code{object} hasn't been loaded from the disk; otherwise you'll have to
 #'   specify it.
 #' @param parallel A logical indicating whether multiple cores should be
 #'   utilized when making the predictions. Available as an option because it's
@@ -18,6 +18,8 @@
 #'   'out of bag' trees; set only to \code{TRUE} if you're running predictions
 #'   on data that was used in the training. Default value is \code{TRUE} if
 #'   \code{newData} is \code{NULL}, otherwise \code{FALSE}.
+#' @param ... Other parameters that may one day get passed onto other functions;
+#'   currently not used.
 #' @return A list of responses corresponding with each row of \code{newData} if
 #'   it's a non-regression random forest; otherwise it returns a numeric vector.
 #' @export
@@ -28,7 +30,7 @@
 #' y <- 1 + x1 + x2 + rnorm(1000)
 #'
 #' data <- data.frame(x1, x2, y)
-#' forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits = 5, 
+#' forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits = 5,
 #'   mtry = 1, nodeSize = 5)
 #'
 #' # Fix x2 to be 0
@@ -53,7 +55,10 @@
 #'   numberOfSplits=5, mtry=1, nodeSize=10)
 #' newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0)
 #' ypred <- predict(forest, newData)
-predict.JRandomForest <- function(forest, newData=NULL, parallel=TRUE, out.of.bag=NULL){
+predict.JRandomForest <- function(object, newData=NULL, parallel=TRUE, out.of.bag=NULL, ...){
+
+  # slight renaming
+  forest <- object
 
   if(is.null(newData) & is.null(forest$dataset)){
     stop("forest doesn't have a copy of the training data loaded (this happens if you just loaded it); please manually specify newData and possibly out.of.bag")
diff --git a/README.md b/README.md
index bde971a..f959099 100644
--- a/README.md
+++ b/README.md
@@ -1,66 +1,66 @@
-# README
-
-This R package is used to train random competing risks forests, ideally for large data.
-It's based heavily off of [randomForestSRC](https://github.com/kogalur/randomForestSRC/), although there are some differences.
-
-This package is still in a pre-release state and so it not yet available on CRAN.
-To install it now, in R install the `devtools` package and run the following command:
-```
-R> devtools::install_git("https://github.com/jatherrien/largeRCRF.git")
-```
-
-## System Requirements
-
-You need:
-
-* R version 3.4.2 or greater
-* The `rJava` package version 0.9-9 or greater
-* A Java runtime version 1.8 or greater
-
-## Troubleshooting
-
-### I get an `OutOfMemoryException` error but I have plenty of RAM
-
-`largeRCRF` makes use of the Java virtual machine, which unfortunately restricts itself by default to a quarter of your system memory.
-You can override the default by including **before** loading `largeRCRF` or any other `rJava` based package the following line:
-
-```
-R> options(java.parameters <- c("-Xmx13G", "-Xms13G"))
-```
-
-with `13G` replaced with a little less than your available system memory.
-
-### I get an `OutOfMemoryException` error and I'm short on RAM
-
-Obviously if you're short on RAM there is a limit on how large of a dataset you can train,
-but there are some techniques you can use to limit how much `largeRCRF` needs.
-
-* If your training dataset is large you might not want both R and `largeRCRF` to have their own separate copies
-(limitations due to Java require `largeRCRF` have its own copy). When specifying the `data` parameter into `train`,
-instead provide an environment containing one object called `data` which is the dataset. `largeRCRF` will delete that variable
-after importing it into the Java environment.
-
-Example:
-```
-R> data.env <- new.env()
-R> data.env$data <- trainingData
-R> rm(trainingData)
-R> model <- train(..., data=data.env, ...)
-```
-
-* Each core that is training trees requires its own memory; you can try limiting `largeRCRF` to train only one tree at a time by specifiying `cores=1`.
-
-* By default `largeRCRF` keeps the entire forest loaded in memory during training,
-when in practice only the trees being trained on need to be loaded.
-You can specify `savePath` to give a directory for `largeRCRF` to save trees in during training,
-which will allow to `largeRCRF` to conserve memory for only those trees being currently trained.
-
-### Training stalls immediately at 0 trees and the CPU is idle
-
-This issue has been observed before on one particular system (and only on that system) but it's not clear what causes it.
-It would be appreciated if you could report this bug to [joelt@sfu.ca](mailto:joelt@sfu.ca) and give your operating system
-and the version of Java installed (the entire output of `java --version`).
-
-As a workaround, this issue seems to occur randomly; so try restarting your code to see if it runs.
-
-
+# README
+
+This R package is used to train random competing risks forests, ideally for large data.
+It's based heavily on [randomForestSRC](https://github.com/kogalur/randomForestSRC/), although there are some differences.
+
+This package is still in a pre-release state and so it is not yet available on CRAN.
+To install it now, in R install the `devtools` package and run the following command:
+```
+R> devtools::install_git("https://github.com/jatherrien/largeRCRF.git")
+```
+
+## System Requirements
+
+You need:
+
+* R version 3.4.0 or greater
+* The `rJava` package version 0.9-9 or greater
+* A Java runtime version 1.8 or greater
+
+## Troubleshooting
+
+### I get an `OutOfMemoryException` error but I have plenty of RAM
+
+`largeRCRF` makes use of the Java virtual machine, which unfortunately restricts itself by default to a quarter of your system memory.
+You can override the default by including the following line **before** loading `largeRCRF` or any other `rJava`-based package:
+
+```
+R> options(java.parameters = c("-Xmx13G", "-Xms13G"))
+```
+
+with `13G` replaced with a little less than your available system memory.
+
+### I get an `OutOfMemoryException` error and I'm short on RAM
+
+If you're short on RAM there is of course a limit on how large a dataset you can train on,
+but there are some techniques you can use to limit how much memory `largeRCRF` needs.
+
+* If your training dataset is large you might not want both R and `largeRCRF` to have their own separate copies
+(limitations due to Java require that `largeRCRF` have its own copy). When specifying the `data` parameter to `train`,
+instead provide an environment containing one object called `data` which is the dataset. `largeRCRF` will delete that variable
+after importing it into the Java environment.
+
+Example:
+```
+R> data.env <- new.env()
+R> data.env$data <- trainingData
+R> rm(trainingData)
+R> model <- train(..., data=data.env, ...)
+```
+
+* Each core that is training trees requires its own memory; you can try limiting `largeRCRF` to train only one tree at a time by specifying `cores=1`.
+
+* By default `largeRCRF` keeps the entire forest loaded in memory during training,
+when in practice only the trees currently being trained need to be loaded.
+You can specify `savePath` to give a directory for `largeRCRF` to save trees in during training,
+which allows `largeRCRF` to keep in memory only those trees currently being trained.
+
+### Training stalls immediately at 0 trees and the CPU is idle
+
+This issue has been observed before on one particular system (and only on that system), but it's not clear what causes it.
+It would be appreciated if you could report this bug to [joelt@sfu.ca](mailto:joelt@sfu.ca) and give your operating system
+and the version of Java installed (the entire output of `java --version`).
+
+This issue seems to occur randomly, so as a workaround try restarting your code to see if it runs.
+
+
diff --git a/man/predict.JRandomForest.Rd b/man/predict.JRandomForest.Rd
index adf4272..bba20a7 100644
--- a/man/predict.JRandomForest.Rd
+++ b/man/predict.JRandomForest.Rd
@@ -4,15 +4,15 @@
 \alias{predict.JRandomForest}
 \title{Predict}
 \usage{
-\method{predict}{JRandomForest}(forest, newData = NULL,
-  parallel = TRUE, out.of.bag = NULL)
+\method{predict}{JRandomForest}(object, newData = NULL,
+  parallel = TRUE, out.of.bag = NULL, ...)
 }
 \arguments{
-\item{forest}{A forest that was previously \code{\link{train}}ed}
+\item{object}{A forest that was previously \code{\link{train}}ed}
 
 \item{newData}{The new data containing all of the previous predictor
 covariates. Can be NULL if you want to use the training dataset, and
-\code{forest} hasn't been loaded from the disk; otherwise you'll have to
+\code{object} hasn't been loaded from the disk; otherwise you'll have to
 specify it.}
 
 \item{parallel}{A logical indicating whether multiple cores should be
@@ -25,6 +25,9 @@ get strange errors while predicting.}
 'out of bag' trees; set only to \code{TRUE} if you're running predictions
 on data that was used in the training. Default value is \code{TRUE} if
 \code{newData} is \code{NULL}, otherwise \code{FALSE}.}
+
+\item{...}{Other parameters that may one day get passed onto other functions;
+currently not used.}
 }
 \value{
 A list of responses corresponding with each row of \code{newData} if
 it's a non-regression random forest; otherwise it returns a numeric vector.
@@ -40,7 +43,7 @@ x2 <- rnorm(1000)
 y <- 1 + x1 + x2 + rnorm(1000)
 
 data <- data.frame(x1, x2, y)
-forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits = 5, 
+forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits = 5,
   mtry = 1, nodeSize = 5)
 
 # Fix x2 to be 0
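
---

A note on the `forest` → `object` and `prediction` → `x` renames above: R dispatches S3 generics like `predict()` on the class of the first argument, not on its name, so positional calls are unaffected by this change. A minimal sketch adapted from the package's own example (assuming `largeRCRF` and a Java 1.8+ runtime are installed):

```
R> x1 <- rnorm(1000)
R> x2 <- rnorm(1000)
R> y <- 1 + x1 + x2 + rnorm(1000)
R> data <- data.frame(x1, x2, y)
R> forest <- train(y ~ x1 + x2, data, ntree=100, numberOfSplits=5, mtry=1, nodeSize=5)
R> # predict() dispatches on the class of `forest`, so this call works the
R> # same whether the method's first parameter is named `forest` or `object`
R> ypred <- predict(forest, data.frame(x1 = c(-1, 0, 1), x2 = 0))
```

Only code that named the first argument explicitly (e.g. `predict(forest = forest, ...)`) would need updating to `object = ...`; matching the generic's argument names is what CRAN's `R CMD check` requires of S3 methods.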