Skip to content

Commit

Permalink
ChunkFillNone for PolarsNumeric
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Aug 23, 2020
1 parent 7ba7a16 commit ef54fca
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 18 deletions.
39 changes: 24 additions & 15 deletions polars/src/chunked_array/aggregate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@ use crate::chunked_array::ChunkedArray;
use crate::datatypes::BooleanChunked;
use crate::{datatypes::PolarsNumericType, prelude::*};
use arrow::compute;
use num::{Num, NumCast, ToPrimitive};
use std::cmp::PartialOrd;
use std::ops::{Add, Div};

impl<T> ChunkAgg<T::Native> for ChunkedArray<T>
where
T: PolarsNumericType,
T::Native: std::ops::Add<Output = T::Native> + std::cmp::PartialOrd,
T::Native: Add<Output = T::Native> + PartialOrd + Div<Output = T::Native> + Num + NumCast,
{
/// Returns `None` if the array is empty or only contains null values.
fn sum(&self) -> Option<T::Native> {
self.downcast_chunks()
.iter()
Expand All @@ -23,29 +25,31 @@ where
})
}

/// Smallest value across all chunks, according to the natural order.
///
/// Each chunk contributes its own minimum (chunks that yield none are
/// skipped); the per-chunk minima are then reduced pairwise.
/// Returns `None` when no chunk yields a minimum.
fn min(&self) -> Option<T::Native> {
    let chunks = self.downcast_chunks();
    chunks
        .iter()
        .filter_map(|&chunk| compute::min(chunk))
        .fold_first(|lowest, candidate| {
            if lowest < candidate {
                lowest
            } else {
                candidate
            }
        })
}

/// Largest value across all chunks, according to the natural order.
///
/// Each chunk contributes its own maximum (chunks that yield none are
/// skipped); the per-chunk maxima are then reduced pairwise.
/// Returns `None` when no chunk yields a maximum.
fn max(&self) -> Option<T::Native> {
    let chunks = self.downcast_chunks();
    chunks
        .iter()
        .filter_map(|&chunk| compute::max(chunk))
        .fold_first(|highest, candidate| {
            if highest > candidate {
                highest
            } else {
                candidate
            }
        })
}

/// Mean of the non-null values, converted back to the native type.
///
/// The sum is divided by the number of non-null values in `f64` and the
/// quotient is cast to `T::Native` with `NumCast`.
/// Returns `None` whenever `sum()` returns `None`.
///
/// # Panics
/// Panics if the sum cannot be represented as `f64` or the quotient cannot
/// be cast back to `T::Native`.
fn mean(&self) -> Option<T::Native> {
    let valid_count = (self.len() - self.null_count()) as f64;
    let total = self.sum()?;
    let average = total.to_f64().unwrap() / valid_count;
    Some(NumCast::from(average).unwrap())
}
}

fn min_max_helper(ca: &BooleanChunked, min: bool) -> Option<u64> {
let min_max = ca.into_iter().fold(0, |acc: u64, x| match x {
fn min_max_helper(ca: &BooleanChunked, min: bool) -> Option<u8> {
let min_max = ca.into_iter().fold(0, |acc: u8, x| match x {
Some(v) => {
let v = v as u64;
let v = v as u8;
if min {
if acc < v {
acc
Expand All @@ -66,30 +70,35 @@ fn min_max_helper(ca: &BooleanChunked, min: bool) -> Option<u64> {
}

/// Booleans are cast to 1 or 0.
impl ChunkAgg<u64> for BooleanChunked {
impl ChunkAgg<u8> for BooleanChunked {
/// Returns `None` if the array is empty or only contains null values.
fn sum(&self) -> Option<u64> {
fn sum(&self) -> Option<u8> {
if self.len() == 0 {
return None;
}
let sum = self.into_iter().fold(0, |acc: u64, x| match x {
Some(v) => acc + v as u64,
let sum = self.into_iter().fold(0, |acc: u8, x| match x {
Some(v) => acc + v as u8,
None => acc,
});
Some(sum)
}

fn min(&self) -> Option<u64> {
fn min(&self) -> Option<u8> {
if self.len() == 0 {
return None;
}
min_max_helper(self, true)
}

fn max(&self) -> Option<u64> {
fn max(&self) -> Option<u8> {
if self.len() == 0 {
return None;
}
min_max_helper(self, false)
}

/// Mean of the non-null booleans (cast to 1 or 0), using integer division.
///
/// Returns `None` if the array is empty or only contains null values.
fn mean(&self) -> Option<u8> {
    let valid_count = self.len() - self.null_count();
    // `sum()` only returns `None` for a zero-length array; an array that
    // holds nothing but nulls still sums to `Some(0)`, so the divisor must
    // be checked here to avoid a division-by-zero panic.
    if valid_count == 0 {
        return None;
    }
    self.sum()
        .map(|total| (total as usize / valid_count) as u8)
}
}
1 change: 0 additions & 1 deletion polars/src/chunked_array/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -571,7 +571,6 @@ pub fn get_large_list_builder(
mod test {
use super::*;
use arrow::array::Int32Array;
use itertools::Itertools;

#[test]
fn test_existing_null_bitmap() {
Expand Down
133 changes: 133 additions & 0 deletions polars/src/chunked_array/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ use crate::prelude::*;
use crate::utils::Xob;
use arrow::compute;
use itertools::Itertools;
use num::{Num, NumCast};
use std::cmp::Ordering;
use std::ops::{Add, Div};

/// Cast `ChunkedArray<T>` to `ChunkedArray<N>`
pub trait ChunkCast {
Expand Down Expand Up @@ -32,6 +34,10 @@ pub trait ChunkAgg<T> {
/// Returns the maximum value in the array, according to the natural order.
/// Returns an option because the array is nullable.
fn max(&self) -> Option<T>;

/// Returns the mean value in the array.
/// Returns an option because the array is nullable.
fn mean(&self) -> Option<T>;
}

/// Compare [Series](series/series/enum.Series.html)
Expand Down Expand Up @@ -219,6 +225,101 @@ impl ChunkSort<BooleanType> for BooleanChunked {
}
}

/// Strategy that decides which value replaces a `None` when filling an array.
///
/// `T` is the type of the explicit replacement carried by
/// [`FillNoneStrategy::Value`].
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum FillNoneStrategy<T> {
    /// Replace `None` with the next value in the array.
    Backward,
    /// Replace `None` with the previous value in the array.
    Forward,
    /// Replace `None` with the mean of the whole array.
    Mean,
    /// Replace `None` with the minimum of the whole array.
    Min,
    /// Replace `None` with the maximum of the whole array.
    Max,
    /// Replace `None` with the given value.
    Value(T),
}

/// Replace `None` values in an array using one of several strategies.
///
/// The type parameter `T` is the type of the explicit fill value carried by
/// `FillNoneStrategy::Value`.
pub trait ChunkFillNone<T> {
/// Replace `None` values with one of the following strategies:
/// * Forward fill (replace None with the previous value)
/// * Backward fill (replace None with the next value)
/// * Mean fill (replace None with the mean of the whole array)
/// * Min fill (replace None with the minimum of the whole array)
/// * Max fill (replace None with the maximum of the whole array)
/// * Value fill (replace None with a given value)
///
/// Takes `self` by reference and returns the filled array as a new value
/// wrapped in `Result`.
fn fill_none(&self, strategy: FillNoneStrategy<T>) -> Result<Self>
where
Self: Sized;
}

/// Forward fill: replace every `None` with the most recent non-null value
/// that precedes it.
///
/// Leading nulls (those with no non-null value before them) stay `None`.
/// A run of consecutive nulls is filled entirely with the value that
/// precedes the run.
fn fill_forward<T>(ca: &ChunkedArray<T>) -> ChunkedArray<T>
where
    T: PolarsNumericType,
{
    ca.into_iter()
        .scan(None, |last_valid, opt_v| {
            // Only advance the carried value on a non-null element. Resetting
            // it on a null would leave every null after the first one in a
            // run unfilled.
            if opt_v.is_some() {
                *last_valid = opt_v;
            }
            Some(*last_valid)
        })
        .collect()
}

/// Backward fill: replace every `None` with the nearest non-null value that
/// follows it.
///
/// Trailing nulls (those with no non-null value after them) stay `None`.
/// A run of consecutive nulls is filled entirely with the value that
/// follows the run.
fn fill_backward<T>(ca: &ChunkedArray<T>) -> ChunkedArray<T>
where
    T: PolarsNumericType,
{
    let mut values: Vec<Option<T::Native>> = ca.into_iter().collect();

    // Walk from the end so that the next non-null value is always known.
    // Peeking a single element ahead would miss it whenever two or more
    // nulls are adjacent.
    let mut next_valid = None;
    for slot in values.iter_mut().rev() {
        match *slot {
            Some(_) => next_valid = *slot,
            None => *slot = next_valid,
        }
    }

    let mut builder = PrimitiveChunkedBuilder::<T>::new(ca.name(), ca.len());
    for opt_v in values {
        builder.append_option(opt_v);
    }
    builder.finish()
}

/// Replace every `None` in `ca` with `value`, leaving non-null elements as
/// they are.
///
/// When `value` is itself `None`, null elements stay null.
fn fill_value<T>(ca: &ChunkedArray<T>, value: Option<T::Native>) -> ChunkedArray<T>
where
    T: PolarsNumericType,
{
    ca.into_iter().map(|slot| slot.or(value)).collect()
}

impl<T> ChunkFillNone<T::Native> for ChunkedArray<T>
where
    T: PolarsNumericType,
    T::Native: Add<Output = T::Native> + PartialOrd + Div<Output = T::Native> + Num + NumCast,
{
    /// Fill the null values of this array according to `strategy`.
    ///
    /// An array without nulls is returned as a clone. The directional
    /// strategies delegate to `fill_forward` / `fill_backward`; every other
    /// strategy boils down to a single replacement value that is handed to
    /// `fill_value`.
    fn fill_none(&self, strategy: FillNoneStrategy<T::Native>) -> Result<Self> {
        // nothing to fill
        if self.null_count() == 0 {
            return Ok(self.clone());
        }

        let replacement = match strategy {
            FillNoneStrategy::Forward => return Ok(fill_forward(self)),
            FillNoneStrategy::Backward => return Ok(fill_backward(self)),
            FillNoneStrategy::Min => self.min(),
            FillNoneStrategy::Max => self.max(),
            FillNoneStrategy::Mean => self.mean(),
            FillNoneStrategy::Value(val) => Some(val),
        };
        Ok(fill_value(self, replacement))
    }
}

/// Fill a ChunkedArray with one value.
pub trait ChunkFull<T> {
/// Create a ChunkedArray with a single value.
Expand Down Expand Up @@ -504,4 +605,36 @@ mod test {
assert_eq!(Vec::from(&shifted), &[Some(1), Some(2), None]);
assert!(ca.shift(3, None).is_err());
}

// Exercises every `FillNoneStrategy` on one array that has a leading null,
// an interior null and a trailing null.
#[test]
fn test_fill_none() {
let ca =
Int32Chunked::new_from_opt_slice("", &[None, Some(2), Some(3), None, Some(4), None]);
// Forward: the leading null has no previous value and stays null.
let filled = ca.fill_none(FillNoneStrategy::Forward).unwrap();
assert_eq!(
Vec::from(&filled),
&[None, Some(2), Some(3), Some(3), Some(4), Some(4)]
);
// Backward: the trailing null has no next value and stays null.
let filled = ca.fill_none(FillNoneStrategy::Backward).unwrap();
assert_eq!(
Vec::from(&filled),
&[Some(2), Some(2), Some(3), Some(4), Some(4), None]
);
// Min: every null becomes 2, the smallest non-null value.
let filled = ca.fill_none(FillNoneStrategy::Min).unwrap();
assert_eq!(
Vec::from(&filled),
&[Some(2), Some(2), Some(3), Some(2), Some(4), Some(2)]
);
// Value: every null becomes the supplied constant.
let filled = ca.fill_none(FillNoneStrategy::Value(10)).unwrap();
assert_eq!(
Vec::from(&filled),
&[Some(10), Some(2), Some(3), Some(10), Some(4), Some(10)]
);
// Mean: the non-null values 2, 3 and 4 sum to 9 over 3 elements, giving 3.
let filled = ca.fill_none(FillNoneStrategy::Mean).unwrap();
assert_eq!(
Vec::from(&filled),
&[Some(3), Some(2), Some(3), Some(3), Some(4), Some(3)]
);
println!("{:?}", filled);
}
}
4 changes: 2 additions & 2 deletions polars/src/prelude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ pub use crate::{
comparison::NumComp,
iterator::{IntoNoNullIterator, NumericChunkIterDispatch},
ops::{
ChunkAgg, ChunkApply, ChunkCast, ChunkCompare, ChunkFilter, ChunkFull, ChunkReverse,
ChunkShift, ChunkSort, ChunkUnique,
ChunkAgg, ChunkApply, ChunkCast, ChunkCompare, ChunkFillNone, ChunkFilter, ChunkFull,
ChunkReverse, ChunkShift, ChunkSort, ChunkUnique, FillNoneStrategy,
},
take::{
AsTakeIndex, IntoTakeRandom, NumTakeRandomChunked, NumTakeRandomCont, Take, TakeRandom,
Expand Down

0 comments on commit ef54fca

Please sign in to comment.