Add a parameter controlling whether node purity is checked before splitting

2018-10-15 11:03:35 -07:00 · 2018-10-15 11:03:35 -07:00 · cce5ad1e0f
commit cce5ad1e0f
parent 7fba964af9
2 changed files with 24 additions and 2 deletions
--- a/src/main/java/ca/joeltherrien/randomforest/Settings.java
+++ b/src/main/java/ca/joeltherrien/randomforest/Settings.java
@ -177,6 +177,7 @@ public class Settings {
    private int numberOfSplits = 5;
    private int nodeSize = 5;
    private int maxNodeDepth = 1000000; // basically no maxNodeDepth
    // Off by default: TreeTrainer reads this flag, and while it is false its purity test never prevents a split.
    private boolean checkNodePurity = false;
    private ObjectNode responseCombinerSettings = new ObjectNode(JsonNodeFactory.instance);
    private ObjectNode groupDifferentiatorSettings = new ObjectNode(JsonNodeFactory.instance);
--- a/src/main/java/ca/joeltherrien/randomforest/tree/TreeTrainer.java
+++ b/src/main/java/ca/joeltherrien/randomforest/tree/TreeTrainer.java
@ -27,6 +27,12 @@ public class TreeTrainer<Y, O> {
    private final int maxNodeDepth;
    private final int mtry;
    /**
     * Whether to check if a node is pure (every row has an equal response) before deciding to split it. Splitting a pure
     * node won't change predictive accuracy, but (depending on conditions) may hurt performance.
     */
    private final boolean checkNodePurity;
    private final List<Covariate> covariates;
    public TreeTrainer(final Settings settings, final List<Covariate> covariates){
@ -34,6 +40,7 @@ public class TreeTrainer<Y, O> {
        this.nodeSize = settings.getNodeSize();
        this.maxNodeDepth = settings.getMaxNodeDepth();
        this.mtry = settings.getMtry();
        this.checkNodePurity = settings.isCheckNodePurity();
        this.responseCombiner = settings.getResponseCombiner();
        this.groupDifferentiator = settings.getGroupDifferentiator();
@ -48,7 +55,7 @@ public class TreeTrainer<Y, O> {
    }
    private Node<O> growNode(List<Row<Y>> data, int depth){
-        // TODO; what is minimum per tree?
+        // See https://kogalur.github.io/randomForestSRC/theory.html#section3.1 (near bottom)
        if(data.size() >= 2*nodeSize && depth < maxNodeDepth && !nodeIsPure(data)){
            final List<Covariate> covariatesToTry = selectCovariates(this.mtry);
            final SplitRuleAndSplit bestSplitRuleAndSplit = findBestSplitRule(data, covariatesToTry);
@ -161,8 +168,22 @@ public class TreeTrainer<Y, O> {
    }
    /**
     * Reports whether every row in {@code data} carries an equal response, so that growNode can skip splitting it.
     *
     * When {@code checkNodePurity} is disabled this always answers {@code false}, which means purity never
     * blocks a split.
     *
     * @param data the rows that reached the node being considered for a split
     * @return {@code true} if the purity check is enabled and {@code data} has at most one row or all responses
     *         are {@code equals} to the first row's response; {@code false} otherwise
     */
    private boolean nodeIsPure(List<Row<Y>> data){
        // Purity checking is optional; when switched off, report "not pure" without inspecting the data.
        if(!checkNodePurity){
            return false;
        }
        // Zero or one row has nothing to disagree with, so it counts as pure.
        if(data.size() <= 1){
            return true;
        }
        // Compare every remaining response against the first one (relies on Y's equals) and stop at the first mismatch.
        final Y first = data.get(0).getResponse();
-        return data.stream().allMatch(row -> row.getResponse().equals(first));
+        for(int i = 1; i< data.size(); i++){
            if(!data.get(i).getResponse().equals(first)){
                return false;
            }
        }
        return true;
    }
    private class SplitRuleAndSplit{