From e52afc73278dac56becb2daf2809e1bd94b09762 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 21:41:40 +1100 Subject: [PATCH 01/24] c --- crates/polars-core/src/frame/horizontal.rs | 114 +++++++++------------ crates/polars-core/src/frame/mod.rs | 20 ++-- crates/polars-core/src/frame/validation.rs | 59 +++++++++++ crates/polars-schema/src/schema.rs | 22 ++-- 4 files changed, 125 insertions(+), 90 deletions(-) create mode 100644 crates/polars-core/src/frame/validation.rs diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index 6d2500b03568..0dc75d2dcfc1 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -1,28 +1,8 @@ -use polars_error::{polars_ensure, polars_err, PolarsResult}; -use polars_utils::aliases::PlHashSet; +use polars_error::{polars_err, PolarsResult}; use super::Column; use crate::datatypes::AnyValue; use crate::frame::DataFrame; -use crate::prelude::PlSmallStr; - -fn check_hstack( - col: &Column, - names: &mut PlHashSet, - height: usize, - is_empty: bool, -) -> PolarsResult<()> { - polars_ensure!( - col.len() == height || is_empty, - ShapeMismatch: "unable to hstack Series of length {} and DataFrame of height {}", - col.len(), height, - ); - polars_ensure!( - names.insert(col.name().clone()), - Duplicate: "unable to hstack, column with name {:?} already exists", col.name().as_str(), - ); - Ok(()) -} impl DataFrame { /// Add columns horizontally. @@ -31,6 +11,8 @@ impl DataFrame { /// The caller must ensure: /// - the length of all [`Column`] is equal to the height of this [`DataFrame`] /// - the columns names are unique + /// + /// Note that on a debug build this will panic on duplicates / height mismatch pub unsafe fn hstack_mut_unchecked(&mut self, columns: &[Column]) -> &mut Self { // If we don't have any columns yet, copy the height from the given columns. if let Some(fst) = columns.first() { @@ -41,13 +23,13 @@ impl DataFrame { } } + self.clear_schema(); + self.columns.extend_from_slice(columns); + if cfg!(debug_assertions) { - // It is an impl error if this fails. - self._validate_hstack(columns).unwrap(); + Self::validate_columns_slice(&self.columns).unwrap(); } - self.clear_schema(); - self.columns.extend_from_slice(columns); self } @@ -63,28 +45,15 @@ impl DataFrame { /// } /// ``` pub fn hstack_mut(&mut self, columns: &[Column]) -> PolarsResult<&mut Self> { - self._validate_hstack(columns)?; - Ok(unsafe { self.hstack_mut_unchecked(columns) }) - } + // Validate first - on a debug build `hstack_mut_unchecked` will panic on invalid columns. + Self::validate_columns_iter(self.get_columns().iter().chain(columns))?; - fn _validate_hstack(&self, columns: &[Column]) -> PolarsResult<()> { - let mut names = self - .columns - .iter() - .map(|c| c.name().clone()) - .collect::>(); - - let height = self.height(); - let is_empty = self.is_empty(); - // first loop check validity. We don't do this in a single pass otherwise - // this DataFrame is already modified when an error occurs. - for col in columns { - check_hstack(col, &mut names, height, is_empty)?; - } + unsafe { self.hstack_mut_unchecked(columns) }; - Ok(()) + Ok(self) } } + /// Concat [`DataFrame`]s horizontally. /// Concat horizontally and extend with null values if lengths don't match pub fn concat_df_horizontal(dfs: &[DataFrame], check_duplicates: bool) -> PolarsResult { @@ -96,8 +65,15 @@ pub fn concat_df_horizontal(dfs: &[DataFrame], check_duplicates: bool) -> Polars let owned_df; + let mut out_width = 0; + + let all_equal_height = dfs.iter().all(|df| { + out_width += df.width(); + df.height() == output_height + }); + // if not all equal length, extend the DataFrame with nulls - let dfs = if !dfs.iter().all(|df| df.height() == output_height) { + let dfs = if !all_equal_height { owned_df = dfs .iter() .cloned() @@ -123,30 +99,38 @@ pub fn concat_df_horizontal(dfs: &[DataFrame], check_duplicates: bool) -> Polars dfs }; - let mut first_df = dfs[0].clone(); - let height = first_df.height(); - let is_empty = first_df.is_empty(); + let mut acc_df = DataFrame::empty(); + unsafe { acc_df.get_columns_mut() }.reserve(out_width); - let mut names = if check_duplicates { - first_df - .columns - .iter() - .map(|s| s.name().clone()) - .collect::>() - } else { - Default::default() - }; + for df in dfs { + unsafe { acc_df.get_columns_mut() }.extend(df.get_columns().iter().cloned()); + } - for df in &dfs[1..] { - let cols = df.get_columns(); + if check_duplicates { + DataFrame::validate_columns_slice(acc_df.get_columns()) + .map_err(|e| e.context("unable to hstack".into()))?; + } - if check_duplicates { - for col in cols { - check_hstack(col, &mut names, height, is_empty)?; - } - } + Ok(acc_df) +} - unsafe { first_df.hstack_mut_unchecked(cols) }; +#[cfg(test)] +mod tests { + use polars_error::PolarsError; + + #[test] + fn test_hstack_mut_empty_frame_height_validation() { + use crate::frame::DataFrame; + use crate::prelude::{Column, DataType}; + let mut df = DataFrame::empty(); + let result = df.hstack_mut(&[ + Column::full_null("a".into(), 1, &DataType::Null), + Column::full_null("b".into(), 3, &DataType::Null), + ]); + + assert!( + matches!(result, Err(PolarsError::ShapeMismatch(_))), + "expected shape mismatch error" + ); } - Ok(first_df) } diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index e1babf6ed0aa..bd50951adb38 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -4,7 +4,7 @@ use std::{mem, ops}; use arrow::datatypes::ArrowSchemaRef; use polars_row::ArrayRef; -use polars_schema::schema::debug_ensure_matching_schema_names; +use polars_schema::schema::ensure_matching_schema_names; use polars_utils::itertools::Itertools; use rayon::prelude::*; @@ -31,6 +31,7 @@ pub(crate) mod horizontal; pub mod row; mod top_k; mod upstream_traits; +mod validation; use arrow::record_batch::{RecordBatch, RecordBatchT}; use polars_utils::pl_str::PlSmallStr; @@ -271,17 +272,8 @@ impl DataFrame { /// # Ok::<(), PolarsError>(()) /// ``` pub fn new(columns: Vec) -> PolarsResult { - ensure_names_unique(&columns, |s| s.name().as_str())?; - - let Some(fst) = columns.first() else { - return Ok(DataFrame { - height: 0, - columns, - cached_schema: OnceLock::new(), - }); - }; - - Self::new_with_height(fst.len(), columns) + DataFrame::validate_columns_slice(&columns)?; + Ok(unsafe { Self::new_no_checks_height_from_first(columns) }) } pub fn new_with_height(height: usize, columns: Vec) -> PolarsResult { @@ -1845,7 +1837,9 @@ impl DataFrame { cols: &[PlSmallStr], schema: &Schema, ) -> PolarsResult> { - debug_ensure_matching_schema_names(schema, self.schema())?; + if cfg!(debug_assertions) { + ensure_matching_schema_names(schema, self.schema())?; + } cols.iter() .map(|name| { diff --git a/crates/polars-core/src/frame/validation.rs b/crates/polars-core/src/frame/validation.rs new file mode 100644 index 000000000000..fc686cc20634 --- /dev/null +++ b/crates/polars-core/src/frame/validation.rs @@ -0,0 +1,59 @@ +use polars_error::{polars_bail, PolarsResult}; +use polars_utils::aliases::{InitHashMaps, PlHashSet}; + +use super::column::Column; +use super::DataFrame; + +impl DataFrame { + /// Ensure all equal height and names are unique. + /// + /// An Ok() result indicates `columns` is a valid state for a DataFrame. + pub fn validate_columns_slice(columns: &[Column]) -> PolarsResult<()> { + Self::validate_columns_iter(columns.iter()) + } + + pub fn validate_columns_iter<'a, I: IntoIterator>( + columns_iter: I, + ) -> PolarsResult<()> { + fn _validate_columns_iter_impl( + columns_iter: &mut dyn Iterator, + ) -> PolarsResult<()> { + let Some(first) = columns_iter.next() else { + return Ok(()); + }; + + let first_len = first.len(); + let first_name = first.name(); + + let mut names = PlHashSet::with_capacity({ + let (_, exact) = columns_iter.size_hint(); + exact.unwrap_or(16) + }); + + names.insert(first_name); + + for col in columns_iter { + let col_name = col.name(); + let col_len = col.len(); + + if col_len != first_len { + polars_bail!( + ShapeMismatch: + "height of '{}' ({}) does not match height of '{}' ({})", + col_name, col_len, first_name, first_len + ) + } + + if names.contains(col_name) { + polars_bail!(Duplicate: "column '{}' is duplicate", col_name) + } + + names.insert(col_name); + } + + Ok(()) + } + + _validate_columns_iter_impl(&mut columns_iter.into_iter()) + } +} diff --git a/crates/polars-schema/src/schema.rs b/crates/polars-schema/src/schema.rs index 38dec96ccbb5..bd65cdad9a12 100644 --- a/crates/polars-schema/src/schema.rs +++ b/crates/polars-schema/src/schema.rs @@ -457,18 +457,16 @@ where } } -pub fn debug_ensure_matching_schema_names(lhs: &Schema, rhs: &Schema) -> PolarsResult<()> { - if cfg!(debug_assertions) { - let lhs = lhs.iter_names().collect::>(); - let rhs = rhs.iter_names().collect::>(); - - if lhs != rhs { - polars_bail!( - SchemaMismatch: - "lhs: {:?} rhs: {:?}", - lhs, rhs - ) - } +pub fn ensure_matching_schema_names(lhs: &Schema, rhs: &Schema) -> PolarsResult<()> { + let lhs = lhs.iter_names().collect::>(); + let rhs = rhs.iter_names().collect::>(); + + if lhs != rhs { + polars_bail!( + SchemaMismatch: + "lhs: {:?} rhs: {:?}", + lhs, rhs + ) } Ok(()) From ae4c90ca45d5279278efd8fd49508d524b21bb2d Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 21:49:14 +1100 Subject: [PATCH 02/24] c --- crates/polars-core/src/frame/horizontal.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index 0dc75d2dcfc1..bc2403f6e510 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -74,10 +74,14 @@ pub fn concat_df_horizontal(dfs: &[DataFrame], check_duplicates: bool) -> Polars // if not all equal length, extend the DataFrame with nulls let dfs = if !all_equal_height { + out_width = 0; + owned_df = dfs .iter() .cloned() .map(|mut df| { + out_width += df.width(); + if df.height() != output_height { let diff = output_height - df.height(); From 59ab64d8af4d50ef1efd2112a429a9bc838634b5 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 21:51:01 +1100 Subject: [PATCH 03/24] c --- crates/polars-core/src/frame/horizontal.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index bc2403f6e510..beeda8ee78e9 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -12,7 +12,7 @@ impl DataFrame { /// - the length of all [`Column`] is equal to the height of this [`DataFrame`] /// - the columns names are unique /// - /// Note that on a debug build this will panic on duplicates / height mismatch + /// Note that on a debug build this will panic on duplicates / height mismatch. pub unsafe fn hstack_mut_unchecked(&mut self, columns: &[Column]) -> &mut Self { // If we don't have any columns yet, copy the height from the given columns. if let Some(fst) = columns.first() { From 82243224dfa698a27ec64522d26dafcb825e9f45 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 21:51:35 +1100 Subject: [PATCH 04/24] c --- crates/polars-core/src/frame/validation.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crates/polars-core/src/frame/validation.rs b/crates/polars-core/src/frame/validation.rs index fc686cc20634..a322bf20ce5f 100644 --- a/crates/polars-core/src/frame/validation.rs +++ b/crates/polars-core/src/frame/validation.rs @@ -12,6 +12,9 @@ impl DataFrame { Self::validate_columns_iter(columns.iter()) } + /// Ensure all equal height and names are unique. + /// + /// An Ok() result indicates `columns` is a valid state for a DataFrame. pub fn validate_columns_iter<'a, I: IntoIterator>( columns_iter: I, ) -> PolarsResult<()> { From 49500eb9f810099087c33ec62f2fd3055910a4ed Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 21:51:58 +1100 Subject: [PATCH 05/24] c --- crates/polars-core/src/frame/validation.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/polars-core/src/frame/validation.rs b/crates/polars-core/src/frame/validation.rs index a322bf20ce5f..37a7b89c1a43 100644 --- a/crates/polars-core/src/frame/validation.rs +++ b/crates/polars-core/src/frame/validation.rs @@ -18,6 +18,8 @@ impl DataFrame { pub fn validate_columns_iter<'a, I: IntoIterator>( columns_iter: I, ) -> PolarsResult<()> { + return _validate_columns_iter_impl(&mut columns_iter.into_iter()); + fn _validate_columns_iter_impl( columns_iter: &mut dyn Iterator, ) -> PolarsResult<()> { @@ -56,7 +58,5 @@ impl DataFrame { Ok(()) } - - _validate_columns_iter_impl(&mut columns_iter.into_iter()) } } From ef124499f817a9a53383cc8c86900b827cf94873 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 22:07:43 +1100 Subject: [PATCH 06/24] c --- crates/polars-core/src/frame/horizontal.rs | 21 +++++++++++++-------- crates/polars-core/src/frame/mod.rs | 3 ++- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index beeda8ee78e9..edbfe15ce21e 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -103,19 +103,24 @@ pub fn concat_df_horizontal(dfs: &[DataFrame], check_duplicates: bool) -> Polars dfs }; - let mut acc_df = DataFrame::empty(); - unsafe { acc_df.get_columns_mut() }.reserve(out_width); + let mut acc_cols = Vec::with_capacity(out_width); for df in dfs { - unsafe { acc_df.get_columns_mut() }.extend(df.get_columns().iter().cloned()); + acc_cols.extend(df.get_columns().iter().cloned()); } - if check_duplicates { - DataFrame::validate_columns_slice(acc_df.get_columns()) - .map_err(|e| e.context("unable to hstack".into()))?; - } + let df = if check_duplicates { + DataFrame::new(acc_cols).map_err(|e| e.context("unable to hstack".into()))? + } else { + if cfg!(debug_assertions) { + if acc_cols.len() > 2 { + assert!(acc_cols.iter().all(|c| c.len() == acc_cols[0].len())); + } + } + unsafe { DataFrame::new_no_checks_height_from_first(acc_cols) } + }; - Ok(acc_df) + Ok(df) } #[cfg(test)] diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index bd50951adb38..70998c12aa80 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -272,7 +272,8 @@ impl DataFrame { /// # Ok::<(), PolarsError>(()) /// ``` pub fn new(columns: Vec) -> PolarsResult { - DataFrame::validate_columns_slice(&columns)?; + DataFrame::validate_columns_slice(&columns) + .map_err(|e| e.context("could not create a new DataFrame"))?; Ok(unsafe { Self::new_no_checks_height_from_first(columns) }) } From 558ae3bc698dbe8628bff44191d6decdd79d777b Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 22:08:07 +1100 Subject: [PATCH 07/24] c --- crates/polars-core/src/frame/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 70998c12aa80..69f3c31fb556 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -273,7 +273,7 @@ impl DataFrame { /// ``` pub fn new(columns: Vec) -> PolarsResult { DataFrame::validate_columns_slice(&columns) - .map_err(|e| e.context("could not create a new DataFrame"))?; + .map_err(|e| e.context("could not create a new DataFrame".into()))?; Ok(unsafe { Self::new_no_checks_height_from_first(columns) }) } From 390fe9204d43ca7c72380081ed08ffefd7a134d6 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 22:34:20 +1100 Subject: [PATCH 08/24] c --- crates/polars-core/src/frame/horizontal.rs | 7 +++---- crates/polars-core/src/frame/mod.rs | 2 +- crates/polars-core/src/frame/validation.rs | 2 +- py-polars/tests/unit/dataframe/test_df.py | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index edbfe15ce21e..ecae228b7e44 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -112,11 +112,10 @@ pub fn concat_df_horizontal(dfs: &[DataFrame], check_duplicates: bool) -> Polars let df = if check_duplicates { DataFrame::new(acc_cols).map_err(|e| e.context("unable to hstack".into()))? } else { - if cfg!(debug_assertions) { - if acc_cols.len() > 2 { - assert!(acc_cols.iter().all(|c| c.len() == acc_cols[0].len())); - } + if cfg!(debug_assertions) && acc_cols.len() > 1 { + assert!(acc_cols.iter().all(|c| c.len() == acc_cols[0].len())); } + unsafe { DataFrame::new_no_checks_height_from_first(acc_cols) } }; diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 69f3c31fb556..22d5a7cdfcdf 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -273,7 +273,7 @@ impl DataFrame { /// ``` pub fn new(columns: Vec) -> PolarsResult { DataFrame::validate_columns_slice(&columns) - .map_err(|e| e.context("could not create a new DataFrame".into()))?; + .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {}", e)))?; Ok(unsafe { Self::new_no_checks_height_from_first(columns) }) } diff --git a/crates/polars-core/src/frame/validation.rs b/crates/polars-core/src/frame/validation.rs index 37a7b89c1a43..5655b54a4823 100644 --- a/crates/polars-core/src/frame/validation.rs +++ b/crates/polars-core/src/frame/validation.rs @@ -44,7 +44,7 @@ impl DataFrame { if col_len != first_len { polars_bail!( ShapeMismatch: - "height of '{}' ({}) does not match height of '{}' ({})", + "height of column '{}' ({}) does not match height of column '{}' ({})", col_name, col_len, first_name, first_len ) } diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index 235fb0e44586..9e718b6bd8af 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -3009,7 +3009,7 @@ def test_get_column_index() -> None: def test_dataframe_creation_with_different_series_lengths_19795() -> None: with pytest.raises( ShapeError, - match='could not create a new DataFrame: series "a" has length 2 while series "b" has length 1', + match=r"could not create a new DataFrame: height of column 'b' \(1\) does not match height of column 'a' \(2\)", ): pl.DataFrame({"a": [1, 2], "b": [1]}) From adbad18e403299fce74194a22abb69b88d8e18ed Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 22:35:54 +1100 Subject: [PATCH 09/24] c --- crates/polars-core/src/frame/mod.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 22d5a7cdfcdf..357514ea933c 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -515,11 +515,7 @@ impl DataFrame { /// having an equal length and a unique name, if not this may panic down the line. pub unsafe fn new_no_checks(height: usize, columns: Vec) -> DataFrame { if cfg!(debug_assertions) { - ensure_names_unique(&columns, |s| s.name().as_str()).unwrap(); - - for col in &columns { - assert_eq!(col.len(), height); - } + DataFrame::validate_columns_slice(&columns).unwrap(); } unsafe { Self::_new_no_checks_impl(height, columns) } From 2e31326e2b9df3dc1d3d8cec85c476262e41f460 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 23:24:41 +1100 Subject: [PATCH 10/24] c --- crates/polars-core/src/frame/mod.rs | 24 ---------------- crates/polars-core/src/frame/validation.rs | 28 +++++++++++++++++-- crates/polars-ops/src/frame/pivot/mod.rs | 3 +- .../polars-ops/src/series/ops/to_dummies.rs | 3 +- 4 files changed, 28 insertions(+), 30 deletions(-) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 357514ea933c..5234837c9c14 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -533,30 +533,6 @@ impl DataFrame { } } - /// Create a new `DataFrame` but does not check the length of the `Series`, - /// only check for duplicates. - /// - /// It is advised to use [DataFrame::new] in favor of this method. - /// - /// # Safety - /// - /// It is the callers responsibility to uphold the contract of all `Series` - /// having an equal length, if not this may panic down the line. - pub unsafe fn new_no_length_checks(columns: Vec) -> PolarsResult { - ensure_names_unique(&columns, |s| s.name().as_str())?; - - Ok(if cfg!(debug_assertions) { - Self::new(columns).unwrap() - } else { - let height = Self::infer_height(&columns); - DataFrame { - height, - columns, - cached_schema: OnceLock::new(), - } - }) - } - /// Shrink the capacity of this DataFrame to fit its length. pub fn shrink_to_fit(&mut self) { // Don't parallelize this. Memory overhead diff --git a/crates/polars-core/src/frame/validation.rs b/crates/polars-core/src/frame/validation.rs index 5655b54a4823..7da1186d6007 100644 --- a/crates/polars-core/src/frame/validation.rs +++ b/crates/polars-core/src/frame/validation.rs @@ -9,7 +9,31 @@ impl DataFrame { /// /// An Ok() result indicates `columns` is a valid state for a DataFrame. pub fn validate_columns_slice(columns: &[Column]) -> PolarsResult<()> { - Self::validate_columns_iter(columns.iter()) + if columns.len() <= 4 { + // Too small to be worth spawning a hashmap for, this is at most 6 comparisons. + for i in 0..columns.len() - 1 { + let name = columns[i].name(); + let height = columns[i].len(); + + for other in columns.iter().skip(i + 1) { + if other.name() == name { + polars_bail!(duplicate = name); + } + + if other.len() != height { + polars_bail!( + ShapeMismatch: + "height of column '{}' ({}) does not match height of column '{}' ({})", + other.name(), other.len(), name, height + ) + } + } + } + + Ok(()) + } else { + Self::validate_columns_iter(columns.iter()) + } } /// Ensure all equal height and names are unique. @@ -50,7 +74,7 @@ impl DataFrame { } if names.contains(col_name) { - polars_bail!(Duplicate: "column '{}' is duplicate", col_name) + polars_bail!(duplicate = col_name) } names.insert(col_name); diff --git a/crates/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs index 34d553a65db0..a65c06c7454a 100644 --- a/crates/polars-ops/src/frame/pivot/mod.rs +++ b/crates/polars-ops/src/frame/pivot/mod.rs @@ -379,6 +379,5 @@ fn pivot_impl_single_column( }); out?; - // SAFETY: length has already been checked. - unsafe { DataFrame::new_no_length_checks(final_cols) } + DataFrame::new(final_cols) } diff --git a/crates/polars-ops/src/series/ops/to_dummies.rs b/crates/polars-ops/src/series/ops/to_dummies.rs index eb2cf3a228c1..5737d35b0eae 100644 --- a/crates/polars-ops/src/series/ops/to_dummies.rs +++ b/crates/polars-ops/src/series/ops/to_dummies.rs @@ -46,8 +46,7 @@ impl ToDummies for Series { }) .collect::>(); - // SAFETY: `dummies_helper` functions preserve `self.len()` length - unsafe { DataFrame::new_no_length_checks(sort_columns(columns)) } + DataFrame::new(sort_columns(columns)) } } From 4b06c3ee0b3eef682b2eda76081e670bf2ae1fb2 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 23:27:06 +1100 Subject: [PATCH 11/24] c --- crates/polars-core/src/frame/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 5234837c9c14..44fdf6c68f34 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -261,6 +261,8 @@ impl DataFrame { /// Create a DataFrame from a Vector of Series. /// + /// Errors if a column names are not unique, or if heights are not all equal. + /// /// # Example /// /// ``` From 6aa6cf2dd5c1fc7ad7b5e71ccfe9a987498e449e Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 23:27:54 +1100 Subject: [PATCH 12/24] c --- crates/polars-core/src/frame/validation.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/polars-core/src/frame/validation.rs b/crates/polars-core/src/frame/validation.rs index 7da1186d6007..6429f1280a26 100644 --- a/crates/polars-core/src/frame/validation.rs +++ b/crates/polars-core/src/frame/validation.rs @@ -9,7 +9,9 @@ impl DataFrame { /// /// An Ok() result indicates `columns` is a valid state for a DataFrame. pub fn validate_columns_slice(columns: &[Column]) -> PolarsResult<()> { - if columns.len() <= 4 { + if columns.len() <= 1 { + Ok(()) + } else if columns.len() <= 4 { // Too small to be worth spawning a hashmap for, this is at most 6 comparisons. for i in 0..columns.len() - 1 { let name = columns[i].name(); From d9d82115b4aa2589df513a23b3547103ee927881 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 23:31:16 +1100 Subject: [PATCH 13/24] c --- crates/polars-core/src/frame/horizontal.rs | 23 +++++++++++----------- crates/polars-core/src/frame/validation.rs | 2 +- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index ecae228b7e44..07943e09287b 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -14,15 +14,6 @@ impl DataFrame { /// /// Note that on a debug build this will panic on duplicates / height mismatch. pub unsafe fn hstack_mut_unchecked(&mut self, columns: &[Column]) -> &mut Self { - // If we don't have any columns yet, copy the height from the given columns. - if let Some(fst) = columns.first() { - if self.width() == 0 { - // SAFETY: The functions invariants asks for all columns to be the same length so - // that makes that a valid height. - unsafe { self.set_height(fst.len()) }; - } - } - self.clear_schema(); self.columns.extend_from_slice(columns); @@ -30,6 +21,10 @@ impl DataFrame { Self::validate_columns_slice(&self.columns).unwrap(); } + if let Some(c) = self.columns.first() { + unsafe { self.set_height(c.len()) }; + } + self } @@ -45,10 +40,14 @@ impl DataFrame { /// } /// ``` pub fn hstack_mut(&mut self, columns: &[Column]) -> PolarsResult<&mut Self> { - // Validate first - on a debug build `hstack_mut_unchecked` will panic on invalid columns. - Self::validate_columns_iter(self.get_columns().iter().chain(columns))?; + self.clear_schema(); + self.columns.extend_from_slice(columns); - unsafe { self.hstack_mut_unchecked(columns) }; + Self::validate_columns_slice(&self.columns)?; + + if let Some(c) = self.columns.first() { + unsafe { self.set_height(c.len()) }; + } Ok(self) } diff --git a/crates/polars-core/src/frame/validation.rs b/crates/polars-core/src/frame/validation.rs index 6429f1280a26..8c48091a0eea 100644 --- a/crates/polars-core/src/frame/validation.rs +++ b/crates/polars-core/src/frame/validation.rs @@ -34,7 +34,7 @@ impl DataFrame { Ok(()) } else { - Self::validate_columns_iter(columns.iter()) + DataFrame::validate_columns_iter(columns.iter()) } } From 1c132cc5fd42f78e6a21bee6ee73d525e0bbaa3f Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 23:33:29 +1100 Subject: [PATCH 14/24] c --- crates/polars-core/src/frame/horizontal.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index 07943e09287b..8ff4d839464d 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -29,7 +29,7 @@ impl DataFrame { } /// Add multiple [`Column`] to a [`DataFrame`]. - /// The added `Series` are required to have the same length. + /// Errors if the resulting DataFrame columns have duplicate names or unequal heights. /// /// # Example /// From 68b32a5877fb4217365f5f024b51da57854f4d13 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 23:34:42 +1100 Subject: [PATCH 15/24] c --- crates/polars-core/src/frame/horizontal.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index 8ff4d839464d..fc3ad2bb2d37 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -18,7 +18,7 @@ impl DataFrame { self.columns.extend_from_slice(columns); if cfg!(debug_assertions) { - Self::validate_columns_slice(&self.columns).unwrap(); + DataFrame::validate_columns_slice(&self.columns).unwrap(); } if let Some(c) = self.columns.first() { @@ -43,7 +43,7 @@ impl DataFrame { self.clear_schema(); self.columns.extend_from_slice(columns); - Self::validate_columns_slice(&self.columns)?; + DataFrame::validate_columns_slice(&self.columns)?; if let Some(c) = self.columns.first() { unsafe { self.set_height(c.len()) }; From 7bc9c33db4373b0351f20b9dfa1db44e7f8f9189 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 23:36:57 +1100 Subject: [PATCH 16/24] c --- crates/polars-core/src/frame/horizontal.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index fc3ad2bb2d37..3c7efa7c636e 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -108,15 +108,11 @@ pub fn concat_df_horizontal(dfs: &[DataFrame], check_duplicates: bool) -> Polars acc_cols.extend(df.get_columns().iter().cloned()); } - let df = if check_duplicates { - DataFrame::new(acc_cols).map_err(|e| e.context("unable to hstack".into()))? - } else { - if cfg!(debug_assertions) && acc_cols.len() > 1 { - assert!(acc_cols.iter().all(|c| c.len() == acc_cols[0].len())); - } + if check_duplicates { + DataFrame::validate_columns_slice(&acc_cols)?; + } - unsafe { DataFrame::new_no_checks_height_from_first(acc_cols) } - }; + let df = unsafe { DataFrame::new_no_checks_height_from_first(acc_cols) }; Ok(df) } From e3ae1f9b1d3c0322ab93d3b60162bac133269392 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 23:41:08 +1100 Subject: [PATCH 17/24] c --- crates/polars-core/src/frame/validation.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/crates/polars-core/src/frame/validation.rs b/crates/polars-core/src/frame/validation.rs index 8c48091a0eea..ad5d5681ab62 100644 --- a/crates/polars-core/src/frame/validation.rs +++ b/crates/polars-core/src/frame/validation.rs @@ -34,14 +34,11 @@ impl DataFrame { Ok(()) } else { - DataFrame::validate_columns_iter(columns.iter()) + DataFrame::_validate_columns_iter(columns.iter()) } } - /// Ensure all equal height and names are unique. - /// - /// An Ok() result indicates `columns` is a valid state for a DataFrame. - pub fn validate_columns_iter<'a, I: IntoIterator>( + fn _validate_columns_iter<'a, I: IntoIterator>( columns_iter: I, ) -> PolarsResult<()> { return _validate_columns_iter_impl(&mut columns_iter.into_iter()); From d2942228506d556d6a09a9f6c46567be1d855833 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 19 Feb 2025 23:47:18 +1100 Subject: [PATCH 18/24] c --- crates/polars-core/src/frame/validation.rs | 36 ++++++---------------- 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/crates/polars-core/src/frame/validation.rs b/crates/polars-core/src/frame/validation.rs index ad5d5681ab62..bb54f6dfa257 100644 --- a/crates/polars-core/src/frame/validation.rs +++ b/crates/polars-core/src/frame/validation.rs @@ -10,8 +10,10 @@ impl DataFrame { /// An Ok() result indicates `columns` is a valid state for a DataFrame. pub fn validate_columns_slice(columns: &[Column]) -> PolarsResult<()> { if columns.len() <= 1 { - Ok(()) - } else if columns.len() <= 4 { + return Ok(()); + } + + if columns.len() <= 4 { // Too small to be worth spawning a hashmap for, this is at most 6 comparisons. for i in 0..columns.len() - 1 { let name = columns[i].name(); @@ -31,36 +33,16 @@ impl DataFrame { } } } - - Ok(()) } else { - DataFrame::_validate_columns_iter(columns.iter()) - } - } - - fn _validate_columns_iter<'a, I: IntoIterator>( - columns_iter: I, - ) -> PolarsResult<()> { - return _validate_columns_iter_impl(&mut columns_iter.into_iter()); - - fn _validate_columns_iter_impl( - columns_iter: &mut dyn Iterator, - ) -> PolarsResult<()> { - let Some(first) = columns_iter.next() else { - return Ok(()); - }; + let first = &columns[0]; let first_len = first.len(); let first_name = first.name(); - let mut names = PlHashSet::with_capacity({ - let (_, exact) = columns_iter.size_hint(); - exact.unwrap_or(16) - }); - + let mut names = PlHashSet::with_capacity(columns.len()); names.insert(first_name); - for col in columns_iter { + for col in &columns[1..] { let col_name = col.name(); let col_len = col.len(); @@ -78,8 +60,8 @@ impl DataFrame { names.insert(col_name); } - - Ok(()) } + + Ok(()) } } From 84cf9b00f90a510edbe9aab7de79a4ab57134adc Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Thu, 20 Feb 2025 00:06:59 +1100 Subject: [PATCH 19/24] c --- crates/polars-schema/src/schema.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/polars-schema/src/schema.rs b/crates/polars-schema/src/schema.rs index bd65cdad9a12..21abfc0ddbf3 100644 --- a/crates/polars-schema/src/schema.rs +++ b/crates/polars-schema/src/schema.rs @@ -458,14 +458,14 @@ where } pub fn ensure_matching_schema_names(lhs: &Schema, rhs: &Schema) -> PolarsResult<()> { - let lhs = lhs.iter_names().collect::>(); - let rhs = rhs.iter_names().collect::>(); + let lhs_names = lhs.iter_names(); + let rhs_names = rhs.iter_names(); - if lhs != rhs { + if !(lhs_names.len() == rhs_names.len() && lhs_names.zip(rhs_names).all(|(l, r)| l == r)) { polars_bail!( SchemaMismatch: "lhs: {:?} rhs: {:?}", - lhs, rhs + lhs.iter_names().collect::>(), rhs.iter_names().collect::>() ) } From 40330c53e041028a3b5cf4ec9053bd5f6fde724e Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Thu, 20 Feb 2025 02:01:08 +1100 Subject: [PATCH 20/24] base @ hstack-validation From b8159510748dd0a6a5829103b772942128d40a5c Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Thu, 20 Feb 2025 19:33:22 +1100 Subject: [PATCH 21/24] undo mut before raising err --- crates/polars-core/src/frame/horizontal.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index 3c7efa7c636e..141dcab367fa 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -18,7 +18,11 @@ impl DataFrame { self.columns.extend_from_slice(columns); if cfg!(debug_assertions) { - DataFrame::validate_columns_slice(&self.columns).unwrap(); + if let err @ Err(_) = DataFrame::validate_columns_slice(&self.columns) { + // Reset DataFrame state to before extend. + self.columns.truncate(self.columns.len() - columns.len()); + err.unwrap(); + } } if let Some(c) = self.columns.first() { @@ -43,7 +47,11 @@ impl DataFrame { self.clear_schema(); self.columns.extend_from_slice(columns); - DataFrame::validate_columns_slice(&self.columns)?; + if let err @ Err(_) = DataFrame::validate_columns_slice(&self.columns) { + // Reset DataFrame state to before extend. + self.columns.truncate(self.columns.len() - columns.len()); + err?; + } if let Some(c) = self.columns.first() { unsafe { self.set_height(c.len()) }; From 132447179a63f8670aa659b66b2cb1d9190c276a Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Thu, 20 Feb 2025 19:54:09 +1100 Subject: [PATCH 22/24] add test --- crates/polars-core/src/frame/horizontal.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index 141dcab367fa..667f4c71a81b 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -143,5 +143,8 @@ mod tests { matches!(result, Err(PolarsError::ShapeMismatch(_))), "expected shape mismatch error" ); + + // Ensure the DataFrame is not mutated in the error case. + assert_eq!(df.width(), 0); } } From 278ac609ef1eb53989a84f6ce6f40a45f3a1825d Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Thu, 20 Feb 2025 20:26:08 +1100 Subject: [PATCH 23/24] comment --- crates/polars-core/src/frame/horizontal.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index 667f4c71a81b..c139484d8e7a 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -35,6 +35,9 @@ impl DataFrame { /// Add multiple [`Column`] to a [`DataFrame`]. /// Errors if the resulting DataFrame columns have duplicate names or unequal heights. /// + /// Note: If `self` is empty, `self.height` will always be overridden by the height of the first + /// column in `columns`. + /// /// # Example /// /// ```rust From becfb02492f19262e630fd57c32ae24f5decbf11 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Thu, 20 Feb 2025 20:26:43 +1100 Subject: [PATCH 24/24] comment --- crates/polars-core/src/frame/horizontal.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index c139484d8e7a..932d37708092 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -12,6 +12,9 @@ impl DataFrame { /// - the length of all [`Column`] is equal to the height of this [`DataFrame`] /// - the columns names are unique /// + /// Note: If `self` is empty, `self.height` will always be overridden by the height of the first + /// column in `columns`. + /// /// Note that on a debug build this will panic on duplicates / height mismatch. pub unsafe fn hstack_mut_unchecked(&mut self, columns: &[Column]) -> &mut Self { self.clear_schema();