Add integrated Brier score error measure

2019-07-23 11:17:21 -07:00 · 2019-07-23 11:17:21 -07:00 · 360be1f80e
commit 360be1f80e
parent cb4c9b73ae
10 changed files with 329 additions and 5 deletions
--- a/2
+++ b/2
@ -1,7 +1,7 @@
 Package: largeRCRF
 Type: Package
 Title: Large Random Competing Risks Forests
-Version: 1.0.3
+Version: 1.0.4
 Authors@R: c(
    person("Joel", "Therrien", email = "joel_therrien@sfu.ca", role = c("aut", "cre", "cph")),
    person("Jiguo", "Cao", email = "jiguo_cao@sfu.ca", role = c("aut", "dgs"))
--- a/3
+++ b/3
@ -1,5 +1,7 @@
 # Generated by roxygen2: do not edit by hand

+S3method("[",CompetingRiskResponses)
+S3method("[",CompetingRiskResponsesWithCensorTimes)
 S3method(extractCHF,CompetingRiskFunctions)
 S3method(extractCHF,CompetingRiskFunctions.List)
 S3method(extractCIF,CompetingRiskFunctions)
@ -28,6 +30,7 @@ export(extractCHF)
 export(extractCIF)
 export(extractMortalities)
 export(extractSurvivorCurve)
+export(integratedBrierScore)
 export(loadForest)
 export(naiveConcordance)
 export(saveForest)
--- a/R/CR_Response.R
+++ b/R/CR_Response.R
@ -38,6 +38,51 @@ CR_Response <- function(delta, u, C = NULL){
 }


+# This function is useful is we ever want to do something like CR_Response(c(1,1,2), c(0.1,0.2,0.3))[1]
+#' @export
+"[.CompetingRiskResponses" <- function(object, indices){
+  newList <- list(
+    eventIndicator = object$eventIndicator[indices],
+    eventTime = object$eventTime[indices]
+    )
+  
+  previous.java.list <- object$javaObject
+  
+  new.java.list <- .jcall(.class_RUtils,
+                          makeResponse(.class_List),
+                          "produceSublist",
+                          previous.java.list,
+                          .jarray(as.integer(indices - 1)))
+  
+  newList$javaObject <- new.java.list
+  
+  class(newList) <- "CompetingRiskResponses"
+  return(newList)
+}
+
+# This function is useful is we ever want to do something like CR_Response(c(1,1,2), c(0.1,0.2,0.3), c(2,3,4))[1]
+#' @export
+"[.CompetingRiskResponsesWithCensorTimes" <- function(object, indices){
+  newList <- list(
+    eventIndicator = object$eventIndicator[indices],
+    eventTime = object$eventTime[indices],
+    censorTime = object$censorTime[indices]
+    )
+  
+  previous.java.list <- object$javaObject
+  
+  new.java.list <- .jcall(.class_RUtils,
+                          makeResponse(.class_List),
+                          "produceSublist",
+                          previous.java.list,
+                          .jarray(as.integer(indices - 1)))
+  
+  newList$javaObject <- new.java.list
+  
+  class(newList) <- "CompetingRiskResponsesWithCensorTimes"
+  return(newList)
+}
+
 # Internal function
 Java_CompetingRiskResponses <- function(delta, u){
  
--- a/R/cr_integratedBrierScore.R
+++ b/R/cr_integratedBrierScore.R
@ -0,0 +1,114 @@
+
+
+#' Integrated Brier Score
+#'
+#' Used to calculate the Integrated Brier Score, which for the competing risks
+#' setting is the integral of the squared difference between each observed
+#' cumulative incidence function (CIF) for each observation and the
+#' corresponding predicted CIF. If the survivor function (1 - CDF) of the
+#' censoring distribution is provided, weights can be calculated to account for
+#' the censoring.
+#'
+#' @return A numeric vector of the Integrated Brier Score for each prediction.
+#' @param responses A list of responses corresponding to the provided
+#'   mortalities; use \code{\link{CR_Response}}.
+#' @param predictions The predictions to be tested against.
+#' @param event The event type for the error to be calculated on.
+#' @param time \code{time} specifies the upper bound of the integral.
+#' @param censoringDistribution Optional; if provided then weights are
+#'   calculated on the errors. There are three ways to provide it - \itemize{
+#'   \item{If you have all the censor times and just want to use a simple
+#'   empirical estimate of the distribution, just provide a numeric vector of
+#'   all of the censor times and it will be automatically calculated.} \item{You
+#'   can directly specify the survivor function by providing a list with two
+#'   numeric vectors called \code{x} and \code{y}. They should be of the same
+#'   length and correspond to each point. It is assumed that previous to the
+#'   first value in \code{y} the \code{y} value is 1.0; and that the function
+#'   you provide is a right-continuous step function.} \item{You can provide a
+#'   function from \code{\link[stats]{stepfun}}. Note that this only supports
+#'   functions where \code{right = FALSE} (default), and that the first y value
+#'   (corresponding to y before the first x value) will be to set to 1.0
+#'   regardless of what is specified.}
+#'
+#'   }
+#' @param parallel A logical indicating whether multiple cores should be
+#'   utilized when calculating the error. Available as an option because it's
+#'   been observed that using Java's \code{parallelStream} can be unstable on
+#'   some systems. Default value is \code{TRUE}; only set to \code{FALSE} if you
+#'   get strange errors while predicting.
+#'
+#' @export
+#' @references Section 4.2 of Ishwaran H, Gerds TA, Kogalur UB, Moore RD, Gange
+#'   SJ, Lau BM (2014). “Random Survival Forests for Competing Risks.”
+#'   Biostatistics, 15(4), 757–773. doi:10.1093/ biostatistics/kxu010.
+#'
+#' @examples
+#' data <- data.frame(delta=c(1,1,0,0,2,2), T=1:6, x=1:6)
+#'
+#' model <- train(CR_Response(delta, T) ~ x, data, ntree=100, numberOfSplits=0, mtry=1, nodeSize=1)
+#'
+#' newData <- data.frame(delta=c(1,0,2,1,0,2), T=1:6, x=1:6)
+#' predictions <- predict(model, newData)
+#'
+#' scores <- integratedBrierScore(CR_Response(data$delta, data$T), predictions, 1, 6.0)
+#' 
+integratedBrierScore <- function(responses, predictions, event, time, censoringDistribution = NULL, parallel = TRUE){
+  if(length(responses$eventTime) != length(predictions)){
+    stop("Length of responses and predictions must be equal.")
+  } 
+  
+  java.censoringDistribution <- NULL
+  if(!is.null(censoringDistribution)){
+    if(is.numeric(censoringDistribution)){
+      # estimate ECDF
+      censoringTimes <- .jarray(censoringDistribution, "D")
+      java.censoringDistribution <- .jcall(.class_Utils, makeResponse(.class_RightContinuousStepFunction), "estimateOneMinusECDF", censoringTimes)
+      
+    } else if(is.list(censoringDistribution)){
+      # First check that censoringDistribution fits the correct format
+      if(is.null(censoringDistribution$x) | is.null(censoringDistribution$y)){
+        stop("If the censoringDistribution is provided as a list, it must have an x and a y item that are numeric.")
+      }
+      
+      if(length(censoringDistribution$x) != length(censoringDistribution$y)){
+        stop("x and y in censoringDistribution must have the same length.")
+      }
+      
+      if(!is.numeric(censoringDistribution$x) | !is.numeric(censoringDistribution$y)){
+        stop("x and y in censoringDistribution must both be numeric.")
+      }
+      
+      java.censoringDistribution <- createRightContinuousStepFunction(censoringDistribution$x, censoringDistribution$y, defaultY = 1.0)
+      
+    } else if("stepfun" %in% class(censoringDistribution)){
+      x <- knots(censoringDistribution)
+      y <- censoringDistribution(x)
+      
+      java.censoringDistribution <- createRightContinuousStepFunction(x, y, defaultY = 1.0)
+    }
+    else{
+      stop("Invalid censoringDistribution")
+    }
+    
+    # Make sure we wrap it in an Optional
+    java.censoringDistribution <- .object_Optional(java.censoringDistribution)
+    
+  }
+  else{
+    java.censoringDistribution <- .object_Optional(NULL)
+  }
+  
+  predictions.java <- lapply(predictions, function(x){return(x$javaObject)})
+  predictions.java <- convertRListToJava(predictions.java)
+  
+  errors <- .jcall(.class_CompetingRiskUtils, "[D", "calculateIBSError", 
+         responses$javaObject, 
+         predictions.java, 
+         java.censoringDistribution,
+         as.integer(event), 
+         time,
+         parallel)
+  
+  return(errors)
+  
+}
--- a/R/createRightContinuousStepFunction.R
+++ b/R/createRightContinuousStepFunction.R
@ -0,0 +1,11 @@
+# Internal function
+createRightContinuousStepFunction <- function(x, y, defaultY){
+  x.java <- .jarray(as.numeric(x))
+  y.java <- .jarray(as.numeric(y))
+  
+  # as.numeric is explicitly required in case integers were accidently passed
+  # in.
+  newFun <- .jnew(.class_RightContinuousStepFunction, as.numeric(x), as.numeric(y), as.numeric(defaultY))
+  return(newFun)
+  
+}
--- a/R/java_classes_directory.R
+++ b/R/java_classes_directory.R
@ -14,11 +14,14 @@
 # Utility Classes
 .class_DataUtils <- "ca/joeltherrien/randomforest/utils/DataUtils"
 .class_RUtils <- "ca/joeltherrien/randomforest/utils/RUtils"
+.class_Utils <- "ca/joeltherrien/randomforest/utils/Utils"
 .class_CompetingRiskUtils <- "ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskUtils"
 .class_Settings <- "ca/joeltherrien/randomforest/Settings"

 # Misc. Classes
 .class_RightContinuousStepFunction <- "ca/joeltherrien/randomforest/utils/RightContinuousStepFunction"
+.class_CompetingRiskResponse <- "ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskResponse"
+.class_CompetingRiskResponseWithCensorTime <- "ca/joeltherrien/randomforest/responses/competingrisk/CompetingRiskResponseWithCensorTime"

 # TreeTrainer & its Builder
 .class_TreeTrainer <- "ca/joeltherrien/randomforest/tree/TreeTrainer"
@ -50,12 +53,12 @@
 .class_LogRankSplitFinder <- "ca/joeltherrien/randomforest/responses/competingrisk/splitfinder/LogRankSplitFinder"
 .class_WeightedVarianceSplitFinder <- "ca/joeltherrien/randomforest/responses/regression/WeightedVarianceSplitFinder"

-.object_Optional <- function(forest=NULL){
-  if(is.null(forest)){
+.object_Optional <- function(object=NULL){
+  if(is.null(object)){
    return(.jcall("java/util/Optional", "Ljava/util/Optional;", "empty"))
  } else{
-    forest <- .jcast(forest, .class_Object)
-    return(.jcall("java/util/Optional", "Ljava/util/Optional;", "of", forest))
+    object <- .jcast(object, .class_Object)
+    return(.jcall("java/util/Optional", "Ljava/util/Optional;", "of", object))
  }
  
 }
--- a/inst/java/largeRCRF-library-1.0-SNAPSHOT.jar
+++ b/inst/java/largeRCRF-library-1.0-SNAPSHOT.jar
--- a/man/integratedBrierScore.Rd
+++ b/man/integratedBrierScore.Rd
@ -0,0 +1,69 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cr_integratedBrierScore.R
+\name{integratedBrierScore}
+\alias{integratedBrierScore}
+\title{Integrated Brier Score}
+\usage{
+integratedBrierScore(responses, predictions, event, time,
+  censoringDistribution = NULL, parallel = TRUE)
+}
+\arguments{
+\item{responses}{A list of responses corresponding to the provided
+mortalities; use \code{\link{CR_Response}}.}
+
+\item{predictions}{The predictions to be tested against.}
+
+\item{event}{The event type for the error to be calculated on.}
+
+\item{time}{\code{time} specifies the upper bound of the integral.}
+
+\item{censoringDistribution}{Optional; if provided then weights are
+  calculated on the errors. There are three ways to provide it - \itemize{
+  \item{If you have all the censor times and just want to use a simple
+  empirical estimate of the distribution, just provide a numeric vector of
+  all of the censor times and it will be automatically calculated.} \item{You
+  can directly specify the survivor function by providing a list with two
+  numeric vectors called \code{x} and \code{y}. They should be of the same
+  length and correspond to each point. It is assumed that previous to the
+  first value in \code{y} the \code{y} value is 1.0; and that the function
+  you provide is a right-continuous step function.} \item{You can provide a
+  function from \code{\link[stats]{stepfun}}. Note that this only supports
+  functions where \code{right = FALSE} (default), and that the first y value
+  (corresponding to y before the first x value) will be to set to 1.0
+  regardless of what is specified.}
+
+  }}
+
+\item{parallel}{A logical indicating whether multiple cores should be
+utilized when calculating the error. Available as an option because it's
+been observed that using Java's \code{parallelStream} can be unstable on
+some systems. Default value is \code{TRUE}; only set to \code{FALSE} if you
+get strange errors while predicting.}
+}
+\value{
+A numeric vector of the Integrated Brier Score for each prediction.
+}
+\description{
+Used to calculate the Integrated Brier Score, which for the competing risks
+setting is the integral of the squared difference between each observed
+cumulative incidence function (CIF) for each observation and the
+corresponding predicted CIF. If the survivor function (1 - CDF) of the
+censoring distribution is provided, weights can be calculated to account for
+the censoring.
+}
+\examples{
+data <- data.frame(delta=c(1,1,0,0,2,2), T=1:6, x=1:6)
+
+model <- train(CR_Response(delta, T) ~ x, data, ntree=100, numberOfSplits=0, mtry=1, nodeSize=1)
+
+newData <- data.frame(delta=c(1,0,2,1,0,2), T=1:6, x=1:6)
+predictions <- predict(model, newData)
+
+scores <- integratedBrierScore(CR_Response(data$delta, data$T), predictions, 1, 6.0)
+
+}
+\references{
+Section 4.2 of Ishwaran H, Gerds TA, Kogalur UB, Moore RD, Gange
+  SJ, Lau BM (2014). “Random Survival Forests for Competing Risks.”
+  Biostatistics, 15(4), 757–773. doi:10.1093/ biostatistics/kxu010.
+}
--- a/tests/testthat/test_brier_score.R
+++ b/tests/testthat/test_brier_score.R
@ -0,0 +1,38 @@
+context("Calculate integrated Brier score without error")
+
+# This code is more concerned that the code runs without error. The tests in the
+# Java code check that the results it returns are accurate.
+test_that("Can calculate Integrated Brier Score", {
+
+  sampleData <- data.frame(x=rnorm(100))
+  sampleData$T <- sample(0:4, size=100, replace = TRUE) # the censor distribution we provide needs to conform to the data or we can get NaNs
+  sampleData$delta <- sample(0:2, size = 100, replace = TRUE)
+  
+  testData <- sampleData[1:5,]
+  trainingData <- sampleData[6:100,]
+  
+  forest <- train(CR_Response(delta, T) ~ x, trainingData, ntree=50, numberOfSplits=0, mtry=1, nodeSize=5, cores=2, displayProgress=FALSE)
+  
+  predictions <- predict(forest, testData)
+  
+  scores_test <- integratedBrierScore(CR_Response(testData$delta, testData$T), predictions, event = 1, time = 4,
+                                 censoringDistribution = NULL)
+  # Check that we don't get a crash if we calculate the error for only one observation
+  scores_one <- integratedBrierScore(CR_Response(testData$delta, testData$T)[1], predictions[1], event = 1, time = 4,
+                                 censoringDistribution = NULL)
+  
+  # Make sure our error didn't somehow change
+  expect_equal(scores_one, scores_test[1])
+  
+  # Provide a censoring distribution via censor times
+  scores_censoring1 <- integratedBrierScore(CR_Response(testData$delta, testData$T), predictions, event = 1, time = 4,
+                                      censoringDistribution = c(0,1,1,2,3,4))
+  scores_censoring2 <- integratedBrierScore(CR_Response(testData$delta, testData$T), predictions, event = 1, time = 4,
+                                     censoringDistribution = list(x = 0:4, y = 1 - c(1/6, 3/6, 4/6, 5/6, 6/6)))
+  scores_censoring3 <- integratedBrierScore(CR_Response(testData$delta, testData$T), predictions, event = 1, time = 4,
+                                            censoringDistribution = stepfun(x=0:4, y=1 - c(0, 1/6, 3/6, 4/6, 5/6, 6/6)))
+  
+  expect_equal(scores_censoring1, scores_censoring2)
+  expect_equal(scores_censoring1, scores_censoring3)
+  
+})
--- a/tests/testthat/test_responses_length_1.R
+++ b/tests/testthat/test_responses_length_1.R
@ -15,3 +15,44 @@ test_that("CR_Response of length 1 - no censor times", {
  expect_true(T) # show Ok if we got this far
  
 })
+
+test_that("Can sub-index CR_Response - no censor times", {
+  
+  x <- CR_Response(1:5, 1:5)
+  
+  index <- 5
+  
+  y <- x[index]
+  
+  expect_equal(y$eventTime, index)
+  expect_equal(y$eventIndicator, index)
+  
+  expect_equal(rJava::.jcall(y$javaObject, "I", "size"), 1)
+  oneJavaItem <- rJava::.jcall(y$javaObject, largeRCRF:::makeResponse(largeRCRF:::.class_Object), "get", 0L)
+  oneJavaItem <- rJava::.jcast(oneJavaItem, largeRCRF:::.class_CompetingRiskResponse)
+  delta <- rJava::.jcall(oneJavaItem, "I", "getDelta")
+  
+  expect_equal(delta, index)
+  
+})
+
+test_that("Can sub-index CR_Response - censor times", {
+  
+  x <- CR_Response(1:5, 1:5, 1:5)
+  
+  index <- 5
+  
+  y <- x[index]
+  
+  expect_equal(y$eventTime, index)
+  expect_equal(y$eventIndicator, index)
+  expect_equal(y$censorTime, index)
+  
+  expect_equal(rJava::.jcall(y$javaObject, "I", "size"), 1)
+  oneJavaItem <- rJava::.jcall(y$javaObject, largeRCRF:::makeResponse(largeRCRF:::.class_Object), "get", 0L)
+  oneJavaItem <- rJava::.jcast(oneJavaItem, largeRCRF:::.class_CompetingRiskResponseWithCensorTime)
+  delta <- rJava::.jcall(oneJavaItem, "D", "getC")
+  
+  expect_equal(delta, index)
+  
+})