From 00a37e8ff533a09ef7d8b730f8e0bb333224a17f Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Fri, 17 Oct 2025 15:41:23 -0700 Subject: [PATCH] Differentiate between null count 0 + none --- parquet/src/file/statistics.rs | 59 ++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 889e5ea66bbe..5c64c697b236 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -125,19 +125,17 @@ pub(crate) fn from_thrift_page_stats( ) -> Result> { Ok(match thrift_stats { Some(stats) => { - // Number of nulls recorded, when it is not available, we just mark it as 0. - // TODO this should be `None` if there is no information about NULLS. - // see https://github.com/apache/arrow-rs/pull/6216/files - let null_count = stats.null_count.unwrap_or(0); - - if null_count < 0 { - return Err(ParquetError::General(format!( - "Statistics null count is negative {null_count}", - ))); - } - - // Generic null count. - let null_count = Some(null_count as u64); + let null_count = match stats.null_count { + Some(null_count) => { + if null_count < 0 { + return Err(ParquetError::General(format!( + "Statistics null count is negative {null_count}", + ))); + } + Some(null_count as u64) + } + None => None, + }; // Generic distinct count (count of distinct values occurring) let distinct_count = stats.distinct_count.map(|value| value as u64); // Whether or not statistics use deprecated min/max fields. @@ -1062,21 +1060,7 @@ mod tests { let round_tripped = from_thrift_page_stats(Type::BOOLEAN, Some(thrift_stats)) .unwrap() .unwrap(); - // TODO: remove branch when we no longer support assuming null_count==None in the thrift - // means null_count = Some(0) - if null_count.is_none() { - assert_ne!(round_tripped, statistics); - assert!(round_tripped.null_count_opt().is_some()); - assert_eq!(round_tripped.null_count_opt(), Some(0)); - assert_eq!(round_tripped.min_bytes_opt(), statistics.min_bytes_opt()); - assert_eq!(round_tripped.max_bytes_opt(), statistics.max_bytes_opt()); - assert_eq!( - round_tripped.distinct_count_opt(), - statistics.distinct_count_opt() - ); - } else { - assert_eq!(round_tripped, statistics); - } + assert_eq!(round_tripped, statistics); } fn make_bool_stats(distinct_count: Option, null_count: Option) -> Statistics { @@ -1094,6 +1078,25 @@ mod tests { )) } + #[test] + fn test_from_thrift_null_count() { + let thrift_stats = PageStatistics { + max: None, + min: None, + null_count: None, + distinct_count: None, + max_value: None, + min_value: None, + is_max_value_exact: None, + is_min_value_exact: None, + }; + + let stats = from_thrift_page_stats(Type::BOOLEAN, Some(thrift_stats)) + .unwrap() + .unwrap(); + assert_eq!(stats.null_count_opt(), None); + } + #[test] fn test_int96_invalid_statistics() { let mut thrift_stats = PageStatistics {