Mondrian Forests #10
Open · wants to merge 60 commits into base: main
Changes from 1 commit

Commits (60)
6023dd1  Update ClassifierOutput docstring (MarcoDiFrancesco, Apr 11, 2024)
feba8a0  Add RegressionOutput to common (MarcoDiFrancesco, Apr 11, 2024)
c13d3c6  Merge branch 'online-ml:main' into main (MarcoDiFrancesco, Apr 11, 2024)
308a082  Add boilerplate code for mondrian forest (MarcoDiFrancesco, Apr 12, 2024)
3ba0e3a  Add keystroke dataset (MarcoDiFrancesco, Apr 12, 2024)
2f9e03d  Add all functions calls with unimplemented errors (MarcoDiFrancesco, Apr 15, 2024)
7b63db5  Add predict steps to be refactored (MarcoDiFrancesco, Apr 15, 2024)
d5bb6db  Add get features function (MarcoDiFrancesco, Apr 16, 2024)
b5b7ec4  Add Array library (MarcoDiFrancesco, Apr 16, 2024)
d613df2  Add randomization for cache tests (MarcoDiFrancesco, Apr 16, 2024)
2174472  Disable test github actions and enable only check (MarcoDiFrancesco, Apr 17, 2024)
1c91530  Remove verbose from build and test (MarcoDiFrancesco, Apr 17, 2024)
44cfba4  Add Stats struct and impl (MarcoDiFrancesco, Apr 18, 2024)
4c6ebe4  Add rust caching in actions (MarcoDiFrancesco, Apr 18, 2024)
1ccabc4  Split MondrianTree and MondrianForest (MarcoDiFrancesco, Apr 22, 2024)
ac71b06  Refactor to use Tree Vector indicies instead of pointers (MarcoDiFrancesco, Apr 23, 2024)
8aad4ed  Change actions cargo.lock to cargo.toml (MarcoDiFrancesco, Apr 23, 2024)
8c91dd8  Add print function for MondrianTree (MarcoDiFrancesco, Apr 23, 2024)
6b38849  Adding print functions to mondriantree and node (MarcoDiFrancesco, Apr 23, 2024)
107354a  Implement and test predict_proba (MarcoDiFrancesco, Apr 24, 2024)
4385fe8  Add unit test for predict_proba (MarcoDiFrancesco, Apr 24, 2024)
49d4e3e  Add final implementation of inference (predict_proba) (MarcoDiFrancesco, Apr 24, 2024)
a16d3e7  Add random distribution to extend mondrian block (MarcoDiFrancesco, Apr 25, 2024)
de5d67a  Add full extend_mondrian_block implementation (MarcoDiFrancesco, Apr 25, 2024)
667d35e  Add synthetic dataset and tree integrity tests (MarcoDiFrancesco, Apr 25, 2024)
f79864d  Fix pointer of grandpa on extend_mondrian_block (MarcoDiFrancesco, Apr 26, 2024)
989c176  Add recursive repr mondrian forest (MarcoDiFrancesco, Apr 26, 2024)
75e5feb  Add score function (MarcoDiFrancesco, Apr 29, 2024)
da4a00a  Remove debug statements (MarcoDiFrancesco, Apr 30, 2024)
717161f  Adjust code to River behaviour (MarcoDiFrancesco, Apr 30, 2024)
a9ca4bc  Adapt _go_downwards from River (MarcoDiFrancesco, May 3, 2024)
ccc9b1d  Update function names from nel215 to River (MarcoDiFrancesco, May 3, 2024)
30fb86b  Comment debug prints (MarcoDiFrancesco, May 3, 2024)
a619415  Remove unused imports (MarcoDiFrancesco, May 3, 2024)
da23d14  Add synthetic dataset download (MarcoDiFrancesco, May 3, 2024)
85030ad  Rename MondrianForest to MondrianForestClassifier (MarcoDiFrancesco, May 6, 2024)
c4753f1  Update readme with classification run instructions (MarcoDiFrancesco, May 6, 2024)
a08f922  Add update_leaf flag to create_leaf (MarcoDiFrancesco, May 13, 2024)
a00cfe5  Fix mondrian forest classifier test (MarcoDiFrancesco, May 13, 2024)
4d9ef48  Remove create_leaf flag (MarcoDiFrancesco, May 20, 2024)
0217db2  Add create leafs when reaching a leaf (MarcoDiFrancesco, May 24, 2024)
1e5a874  Add assert to check for NaN probability (MarcoDiFrancesco, May 24, 2024)
6971c21  Revert removal of split_time (MarcoDiFrancesco, May 24, 2024)
782d1f2  Add test cases (MarcoDiFrancesco, May 29, 2024)
a5bd895  Remove unused `child_is_on_edge_parent` test case (MarcoDiFrancesco, May 29, 2024)
3544c28  Add debug statement for overwriting variance aware estimation (MarcoDiFrancesco, May 29, 2024)
9083d8e  Add synthetic regression target boilerplate (MarcoDiFrancesco, Jun 4, 2024)
43cce28  Add Classification and Regression division of MF (MarcoDiFrancesco, Jun 7, 2024)
e58638b  Add regression task and parent_has_finite_values test (MarcoDiFrancesco, Jun 11, 2024)
fed6daf  Fix child_inside_parent test (MarcoDiFrancesco, Jun 11, 2024)
760de79  Remove prints in excess (MarcoDiFrancesco, Jun 11, 2024)
54bb202  Add regression metrics (MarcoDiFrancesco, Jun 12, 2024)
0d74d3f  Fix test keystroke dataset (MarcoDiFrancesco, Jun 12, 2024)
c60b381  Change description of synthetic dataset (MarcoDiFrancesco, Jun 12, 2024)
ec2109a  Add baseline comparison for regression (MarcoDiFrancesco, Jun 24, 2024)
b77ba69  Add machine degradation dataset (MarcoDiFrancesco, Jul 9, 2024)
a6c1b8b  Add genesis demostrator dataset (MarcoDiFrancesco, Jul 10, 2024)
4a4b9f5  Update machine degradation with redirect (MarcoDiFrancesco, Jul 10, 2024)
23c109e  Update src/datasets/synthetic_regression.rs (smastelini, Jul 29, 2024)
38e64ee  Update src/datasets/synthetic.rs (smastelini, Jul 29, 2024)
Comment debug prints
MarcoDiFrancesco committed May 3, 2024
commit 30fb86b3307aa215c77996f50310395bb2bc408d
examples/classification/synthetic.rs (10 changes: 3 additions & 7 deletions)
@@ -4,9 +4,6 @@ use light_river::classification::mondrian_tree::MondrianTree;
use light_river::common::ClassifierOutput;
use light_river::common::ClassifierTarget;
use light_river::datasets::synthetic::Synthetic;
use light_river::metrics::rocauc::ROCAUC;
use light_river::metrics::traits::ClassificationMetric;
use light_river::stream::data_stream::DataStream;
use light_river::stream::iter_csv::IterCsv;
use ndarray::{s, Array1};
use num::ToPrimitive;
@@ -43,17 +40,16 @@ fn main() {
let window_size: usize = 1000;
let n_trees: usize = 1;

let transactions_f = Synthetic::load_data().unwrap();
let transactions_f = Synthetic::load_data();
let features = get_features(transactions_f);

let transactions_c = Synthetic::load_data().unwrap();
let transactions_c = Synthetic::load_data();
let labels = get_labels(transactions_c);
println!("labels: {labels:?}, features: {features:?}");
let mut mf: MondrianForest<f32> = MondrianForest::new(window_size, n_trees, &features, &labels);

let mut score_total = 0.0;

let transactions = Synthetic::load_data().unwrap();
let transactions = Synthetic::load_data();
for (idx, transaction) in transactions.enumerate() {
let data = transaction.unwrap();

src/classification/mondrian_forest.rs (2 changes: 1 addition & 1 deletion)
@@ -69,7 +69,7 @@ impl<F: FType> MondrianForest<F> {
"Probability should not be NaN. Found: {:?}.",
probs.to_vec()
);
total_probs += &probs; // Assuming `probs` is an Array1<F>
total_probs += &probs;
}

// Average the probabilities by the number of trees
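The collapsed lines around this hunk accumulate one probability vector per tree and then divide by the number of trees. A minimal standalone sketch of that accumulate-and-average pattern, using plain `f32` and `ndarray` in place of the crate's generic `FType` (the function name and signature here are illustrative, not the crate's API):

```rust
use ndarray::Array1;

/// Average per-tree class probabilities, assuming every tree returns a
/// vector of the same length (one entry per known label).
fn average_tree_probs(per_tree: &[Array1<f32>], n_labels: usize) -> Array1<f32> {
    let mut total_probs = Array1::<f32>::zeros(n_labels);
    for probs in per_tree {
        // Mirrors the assert in the diff: probabilities must never be NaN.
        assert!(!probs.iter().any(|p| p.is_nan()), "Probability should not be NaN");
        total_probs += probs; // element-wise accumulation
    }
    // Average over the number of trees (assumes per_tree is non-empty).
    total_probs / per_tree.len() as f32
}
```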
src/classification/mondrian_node.rs (24 changes: 4 additions & 20 deletions)
@@ -50,8 +50,6 @@ impl<F: FType> Node<F> {
/// e.g. y=2, stats.counts=[0, 1, 10] -> False
/// e.g. y=2, stats.counts=[0, 0, 10] -> True
/// e.g. y=1, stats.counts=[0, 0, 10] -> False
///
/// From: River function
pub fn is_dirac(&self, y: usize) -> bool {
return self.stats.counts.sum() == self.stats.counts[y];
}
@@ -123,11 +121,6 @@ impl<F: FType> Stats<F> {
}
/// Return probabilities of sample 'x' belonging to each class.
///
/// e.g. probs: [0.1, 0.2, 0.7]
///
/// TODO: Remove the assert that check for exact values, I was testing if unit tests make sense, but as
/// shown below this does not show the error. The function is just too complex.
///
/// # Example
/// ```
/// use light_river::classification::alias::FType;
@@ -146,15 +139,10 @@ impl<F: FType> Stats<F> {
///
/// let x = Array1::from_vec(vec![1.5, 3.0]);
/// let probs = stats.predict_proba(&x);
/// let expected = vec![0.998075, 0.001924008, 0.0];
/// assert!(
/// (probs.clone() - Array1::from_vec(expected)).mapv(|a: f32| a.abs()).iter().all(|&x| x < 1e-4),
/// "Probabilities do not match expected values"
/// );
/// // Check all values inside [0, 1] range
/// assert!(probs.clone().iter().all(|&x| x >= 0.0 && x <= 1.0), "Probabilities should be in [0, 1] range");
/// // Check sum is 1
/// assert!((probs.clone().sum() - 1.0).abs() < 1e-4, "Sum of probabilities should be 1");
/// assert!((probs.clone().sum() - 1.0f32).abs() < 1e-4, "Sum of probabilities should be 1");
/// ```
pub fn predict_proba(&self, x: &Array1<F>) -> Array1<F> {
let mut probs = Array1::zeros(self.num_labels);
@@ -169,16 +157,14 @@ impl<F: FType> Stats<F> {
.zip(self.counts.iter())
.enumerate()
{
// println!("predict_proba() - mid - index: {:?}, sum: {:?}, sq_sum: {:?}, count: {:?}", index, sum.to_vec(), sq_sum.to_vec(), count);
let epsilon = F::epsilon(); // F::from_f32(1e-9).unwrap();
let epsilon = F::epsilon();
let count_f = F::from_usize(count).unwrap();
let avg = &sum / count_f;
let var = (&sq_sum / count_f) - (&avg * &avg) + epsilon;
let sigma = (&var * count_f) / (count_f - F::one() + epsilon);
// println!("predict_proba() - mid - avg: {:?}, var: {:?}, sigma: {:?}", avg.to_vec(), var.to_vec(), sigma.to_vec());
let pi = F::from_f32(std::f32::consts::PI).unwrap() * F::from_f32(2.0).unwrap();
let z = pi.powi(x.len() as i32) * sigma.mapv(|s| s * s).sum().sqrt();
// Same as dot product
// Dot product
let dot_feature = (&(x - &avg) * &(x - &avg)).sum();
let dot_sigma = (&sigma * &sigma).sum();
let exponent = -F::from_f32(0.5).unwrap() * dot_feature / dot_sigma;
@@ -192,9 +178,6 @@ impl<F: FType> Stats<F> {
probs[index] = prob;
}

// println!("predict_proba() post - probs: {:?}", probs.to_vec());
// println!();

// Check at least one probability is non-zero. Otherwise we have division by zero.
assert!(
!probs.iter().all(|&x| x == F::zero()),
@@ -205,6 +188,7 @@ impl<F: FType> Stats<F> {
for prob in probs.iter_mut() {
*prob /= sum_prob;
}
// println!("predict_proba() post - probs: {:?}", probs.to_vec());
probs
}
}
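Reading only the lines of `predict_proba` visible above (the final per-class weighting is collapsed in the diff), the function keeps, for every class $k$, element-wise running sums $s_k$, squared sums $q_k$ and a count $n_k$, and turns them into a diagonal-Gaussian score for an input $x$ with $d$ features ($\varepsilon$ is machine epsilon; vector operations are element-wise):

```latex
\mu_k = \frac{s_k}{n_k}, \qquad
v_k = \frac{q_k}{n_k} - \mu_k^{2} + \varepsilon, \qquad
\sigma_k = \frac{n_k\, v_k}{n_k - 1 + \varepsilon},
\\[6pt]
z_k = (2\pi)^{d} \sqrt{\sum\nolimits_j \sigma_{k,j}^{2}}, \qquad
e_k = -\frac{1}{2}\,\frac{\sum_j (x_j - \mu_{k,j})^{2}}{\sum_j \sigma_{k,j}^{2}} .
```

The collapsed lines turn $\exp(e_k)$ and $z_k$ into an unnormalised per-class score, and the visible tail of the function normalises the scores so they sum to one, after asserting that at least one score is non-zero.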
src/classification/mondrian_tree.rs (81 changes: 32 additions & 49 deletions)
@@ -113,7 +113,7 @@ impl<F: FType> MondrianTree<F> {
/// Note: in the nel215 codebase this operates on multiple records; here it
/// handles a single record, so it is equivalent to "predict()".
pub fn predict_proba(&self, x: &Array1<F>) -> Array1<F> {
println!("predict_proba() - tree size: {}", self.nodes.len());
// println!("predict_proba() - tree size: {}", self.nodes.len());
// self.test_tree();
self.predict(x, self.root.unwrap(), F::one())
}
@@ -155,10 +155,10 @@ impl<F: FType> MondrianTree<F> {
extensions_sum: F,
) -> F {
if self.nodes[node_idx].is_dirac(y) {
println!(
"go_downwards() - node: {node_idx} - extensions_sum: {:?} - all same class",
extensions_sum
);
// println!(
// "go_downwards() - node: {node_idx} - extensions_sum: {:?} - all same class",
// extensions_sum
// );
return F::zero();
}

@@ -167,10 +167,10 @@ impl<F: FType> MondrianTree<F> {

// From River: If the node is a leaf we must split it
if self.nodes[node_idx].is_leaf {
println!(
"go_downwards() - node: {node_idx} - extensions_sum: {:?} - split is_leaf",
extensions_sum
);
// println!(
// "go_downwards() - node: {node_idx} - extensions_sum: {:?} - split is_leaf",
// extensions_sum
// );
return split_time;
}

@@ -180,19 +180,18 @@ impl<F: FType> MondrianTree<F> {
let child_time = self.nodes[child_idx].time;
// 2. We check if splitting time occurs before child creation time
if split_time < child_time {
println!(
"go_downwards() - node: {node_idx} - extensions_sum: {:?} - split mid tree",
extensions_sum
);
// Go to next child????
// println!(
// "go_downwards() - node: {node_idx} - extensions_sum: {:?} - split mid tree",
// extensions_sum
// );
return split_time;
}
println!("go_downwards() - node: {node_idx} - extensions_sum: {:?} - not increased enough to split (mid node)", extensions_sum);
// println!("go_downwards() - node: {node_idx} - extensions_sum: {:?} - not increased enough to split (mid node)", extensions_sum);
} else {
println!(
"go_downwards() - node: {node_idx} - extensions_sum: {:?} - not outside box",
extensions_sum
);
// println!(
// "go_downwards() - node: {node_idx} - extensions_sum: {:?} - not outside box",
// extensions_sum
// );
}

F::zero()
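The computation of `split_time` itself is collapsed above. For orientation only: in the Mondrian forest extension step (Lakshminarayanan et al., 2014, "Mondrian Forests: Efficient Online Random Forests"), the candidate split time at a node $u$ is the parent's creation time plus an exponential draw whose rate is the total box extension needed to cover $x$, which is the role played by the `extensions_sum` argument (computed the same way as `eta` in `predict` further down). This is a reading of the published algorithm, not of the collapsed code:

```latex
\eta(x) = \sum_j \big[ \max(x_j - h_{u,j},\, 0) + \max(l_{u,j} - x_j,\, 0) \big],
\qquad E \sim \mathrm{Exp}\!\big(\eta(x)\big),
\qquad t_{\mathrm{split}} = \tau_{\mathrm{parent}(u)} + E,
```

where $[l_u, h_u]$ is the node's bounding box (`min_list`, `max_list`). The visible branches then return $t_{\mathrm{split}}$ when the split should actually take place (at a leaf, or when it precedes the child's creation time, as the comments say) and zero otherwise.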
@@ -345,58 +344,42 @@ impl<F: FType> MondrianTree<F> {
None => Some(self.create_leaf(x, y, None, F::zero())),
Some(root_idx) => Some(self.go_downwards(root_idx, x, y)),
};
println!("partial_fit() tree post {}", self);
// println!("partial_fit() tree post {}", self);
}

fn fit(&self) {
unimplemented!("Make the program first work with 'partial_fit', then implement this")
}

/// Function in River: "go_downwards()"
///
/// Recursive function to predict probabilities.
fn predict(&self, x: &Array1<F>, node_idx: usize, p_not_separated_yet: F) -> Array1<F> {
let node = &self.nodes[node_idx];

// Step 1: Calculate the time feature from the parent node.
let d = node.time - self.get_parent_time(node_idx);

// Step 2: If 'x' is outside the box, calculate distance of 'x' from the box
let dist_max = (x - &node.max_list).mapv(|v| F::max(v, F::zero()));
let dist_min = (&node.min_list - x).mapv(|v| F::max(v, F::zero()));
let eta = dist_min.sum() + dist_max.sum();
// It works, but check again once 'max_list' and 'min_list' are not 0s
// println!("x: {:?}, node.max_list {:?}, max(max_list) {:?}, node.min_list {:?}, max(min_list) {:?}",
// x.to_vec(), node.max_list.to_vec(), dist_max.to_vec(), node.min_list.to_vec(), dist_min.to_vec());

// Step 3: Probability 'p' of the box not splitting.
// Probability 'p' of the box not splitting.
// eta (box dist): larger distance, more prob of splitting
// d (time diff with parent): more dist with parent, more prob of splitting
let p = F::one() - (-d * eta).exp();
// println!("predict() -> pre create_result() - node_idx {}", node.stats);
// d (time delta with parent): more dist with parent, more prob of splitting
let p = {
let d = node.time - self.get_parent_time(node_idx);
// If 'x' is outside the box, calculate distance of 'x' from the box
let dist_max = (x - &node.max_list).mapv(|v| F::max(v, F::zero()));
let dist_min = (&node.min_list - x).mapv(|v| F::max(v, F::zero()));
let eta = dist_min.sum() + dist_max.sum();
F::one() - (-d * eta).exp()
};

// Step 4: Generate a result for the current node using its statistics.
// Generate a result for the current node using its statistics.
let res = node.stats.create_result(x, p_not_separated_yet * p);
// println!("predict() -> post create_result() - node.stats: {}", node.stats);
// println!(
// "predict() - res: {:?}, p_not_separated_yet: {:?}, p: {:?}",
// res, p_not_separated_yet, p
// );

let w = p_not_separated_yet * (F::one() - p);
if node.is_leaf {
let w = p_not_separated_yet * (F::one() - p);
let res2 = node.stats.create_result(x, w);
// println!("predict() - ischild - res: {:?}, res2: {:?}", res.to_vec(), res2.to_vec());
return res + res2;
} else {
let child_idx = if x[node.feature] <= node.threshold {
node.left
} else {
node.right
};
let child_res =
self.predict(x, child_idx.unwrap(), p_not_separated_yet * (F::one() - p));
// println!("predict() - notchild - res: {:?}, child_res: {:?}", res.to_vec(), child_res.to_vec());
let child_res = self.predict(x, child_idx.unwrap(), w);
return res + child_res;
}
}
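Putting the visible pieces of `predict` together, and writing $R_u(x, w)$ for `node.stats.create_result(x, w)`, the recursion above can be summarised as follows for a node $u$ with creation time $\tau_u$, parent creation time $\tau_{\mathrm{parent}(u)}$ and bounding box $[l_u, h_u]$ (`min_list`, `max_list`). This is a paraphrase of the code, not an additional specification:

```latex
\eta_u(x) = \sum_j \big[ \max(x_j - h_{u,j},\,0) + \max(l_{u,j} - x_j,\,0) \big],
\qquad p_u = 1 - \exp\!\big( -(\tau_u - \tau_{\mathrm{parent}(u)})\,\eta_u(x) \big),
\\[6pt]
\mathrm{pred}_u(x, w) =
\begin{cases}
R_u(x,\, w\,p_u) + R_u\big(x,\, w\,(1 - p_u)\big) & u \text{ is a leaf},\\[2pt]
R_u(x,\, w\,p_u) + \mathrm{pred}_{c(u,x)}\big(x,\, w\,(1 - p_u)\big) & \text{otherwise},
\end{cases}
```

where $w$ is `p_not_separated_yet` (1 at the root) and $c(u, x)$ is the left child when $x_{\mathrm{feature}} \le \mathrm{threshold}$ and the right child otherwise.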
src/datasets/synthetic.rs (9 changes: 3 additions & 6 deletions)
@@ -8,14 +8,11 @@ use std::{fs::File, path::Path};
/// Add 'synthetic.csv' to project root directory.
pub struct Synthetic;
impl Synthetic {
pub fn load_data() -> Result<IterCsv<f32, File>, Box<dyn std::error::Error>> {
pub fn load_data() -> IterCsv<f32, File> {
// let file_name = "syntetic_dataset_paper.csv";
let file_name = "syntetic_dataset_int.csv";
let file = File::open(file_name)?;
let file = File::open(file_name).unwrap();
let y_cols = Some(Target::Name("label".to_string()));
match IterCsv::<f32, File>::new(file, y_cols) {
Ok(x) => Ok(x),
Err(e) => Err(Box::new(e)),
}
IterCsv::<f32, File>::new(file, y_cols).unwrap()
}
}
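After this change, callers can no longer propagate I/O or parsing errors from `load_data`; a missing `syntetic_dataset_int.csv` in the project root now panics inside the loader. A minimal usage sketch mirroring the example file earlier in the diff (the loop body is up to the caller):

```rust
use light_river::datasets::synthetic::Synthetic;

fn main() {
    // Panics if syntetic_dataset_int.csv is not in the project root.
    let transactions = Synthetic::load_data();
    let mut n_rows = 0usize;
    for transaction in transactions {
        // Each item is still a Result: row-level CSV errors surface here.
        let _data = transaction.unwrap();
        n_rows += 1;
    }
    println!("read {n_rows} rows");
}
```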