From 14ed394019d9e2ee77af6de8c82402f1f2c16e89 Mon Sep 17 00:00:00 2001 From: Philip Ndikum Date: Thu, 2 Jan 2025 00:36:55 -0500 Subject: [PATCH] docs(machine_learning_patterns.md): add ml patterns tutorial add tutorial covering: data validation with eager/lazy evaluation time series operations and validation feature engineering with backend-agnostic transformations environment management for production/testing closes #1696 --- .../machine_learning_patterns.md | 421 ++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 422 insertions(+) create mode 100644 docs/pandas_like_concepts/machine_learning_patterns.md diff --git a/docs/pandas_like_concepts/machine_learning_patterns.md b/docs/pandas_like_concepts/machine_learning_patterns.md new file mode 100644 index 000000000..0c75bf0c4 --- /dev/null +++ b/docs/pandas_like_concepts/machine_learning_patterns.md @@ -0,0 +1,421 @@ +# Machine Learning Patterns in Narwhals + +There are two types of ML developers: +- Those who write different code for different DataFrame backends +- Those who want their code to work consistently everywhere + +Narwhals aims to help the second group! Let's learn about the patterns that make this possible. + +## 1. Backward Compatibility + +When building ML pipelines, you want your code to remain stable across updates. 
Narwhals provides this through its stable API: + +```python exec="1" source="material-block" session="ex1" +import narwhals.stable.v1 as nw +from narwhals.typing import IntoFrameT +import pandas as pd + +# Example dataset +data = {"feature1": [1, 2, 3], "feature2": [4, 5, 6]} + + +def backward_compatible_workflow(df: IntoFrameT) -> IntoFrameT: + """Use Narwhals stable.v1 API to process data.""" + # Wrap the native frame in a Narwhals frame (eager for pandas input) + df_nw = nw.from_native(df) + + # Perform transformations + df_transformed = df_nw.select( + [ + nw.col("feature1").mean().alias("mean_feature1"), + nw.col("feature2").sum().alias("sum_feature2"), + ] + ) + + # Convert back to native format (e.g., Pandas) + return df_transformed.to_native() +``` + +What makes this work? The stable API uses semantic versioning with additional guarantees: +1. `narwhals.stable.v1` is frozen - no breaking changes ever +2. New features only appear in new major versions (v2, v3, etc.) +3. Multiple versions can coexist, so you can migrate gradually: + +```python exec="true" source="material-block" result="python" session="ex1" +# Testing the backward-compatible workflow +import pandas as pd + +df_pd = pd.DataFrame(data) +result = backward_compatible_workflow(df_pd) + +print("Result using Narwhals stable API v1:") +print(result) +``` + +## 2. The Collect-Then-Item Pattern + +If you've ever seen errors like: +> AttributeError: 'LazyFrame' object has no attribute 'item' + +or + +> AttributeError: 'DataFrame' object has no attribute 'collect' + +This pattern is for you. 
The challenge comes from how different backends handle materialization: + +- Pandas (eager): Values are always materialized, just use `item()` +- Polars (mixed): Some operations are lazy, needs `collect()` then `item()` +- Dask (lazy): Everything is lazy until explicitly materialized + +Here's how to handle all cases consistently: + +=== "Incorrect Approach" +    ```python exec="1" source="material-block" session="ex2" +    import narwhals as nw +    from narwhals.typing import FrameT +    import pandas as pd +    import polars as pl +    import dask.dataframe as dd + +    # Create sample ML dataset +    data = { +        "numeric_feature": [1.5, 2.0, None, 4.0, 5.5], # Has missing value +        "categorical_feature": ["A", "B", "A", "C", "B"], # Needs encoding +    } + + +    @nw.narwhalify +    def get_mean_wrong(df: FrameT, column: str) -> float: +        # This fails with Dask - no collect() +        return df.select([nw.col(column).mean()]).item() +    ``` + +=== "Also Incorrect" + ```python exec="1" source="above" session="ex2" + @nw.narwhalify + def get_mean_also_wrong(df: FrameT, column: str) -> float: + # This fails with Pandas - no collect() method + return df.select([nw.col(column).mean()]).collect().item() + ``` + +=== "Correct Approach" + ```python exec="1" source="above" session="ex2" + @nw.narwhalify + def get_mean(df: FrameT, column: str) -> float: + result = df.select([nw.col(column).mean()]) + # Check if we need collect() + return result.item() if not hasattr(result, "collect") else result.collect().item() + ``` + +Under the hood: +1. `hasattr(result, 'collect')` checks if we're dealing with a lazy frame +2. For eager frames (Pandas): Just use `item()` directly +3. For lazy frames (Dask): First `collect()` to materialize, then `item()` +4. 
For mixed frames (Polars): Same as lazy frames + +Let's see it work across all backends: + +```python exec="true" source="material-block" result="python" session="ex2" +# Build the same dataset for an eager (pandas) and a lazy (Dask) backend +df_pd = pd.DataFrame(data) +df_dask = dd.from_pandas(df_pd, npartitions=2) + +# Test across backends +backends = { + "Pandas (eager)": df_pd, + "Polars (mixed)": pl.DataFrame(data), + "Dask (lazy)": df_dask, +} + +for name, df in backends.items(): + df_nw = nw.from_native(df) + try: + result = get_mean(df_nw, "numeric_feature") + print(f"{name}: {result}") + except Exception as e: + print(f"{name}: Failed - {str(e)}") +``` + +## 3. Data Validation + +When validating data for ML, you often need to handle custom objects or mixed types. This is especially tricky because: +- Different backends handle non-standard types differently +- You need different behavior in development vs production +- Error messages should be helpful for debugging + +Narwhals solves this with two modes: + +=== "Development Mode" + ```python exec="1" source="material-block" session="ex3" + import narwhals as nw + from narwhals.typing import FrameT + import pandas as pd + + # Create sample data for demonstration + class CustomObject: + def __init__(self, value): + self.value = value + + def __str__(self): + return f"Custom({self.value})" + + + data = { + "feature": [1, 2, 3], + "custom": [CustomObject(1), CustomObject(2), CustomObject(3)], + } + df = pd.DataFrame(data) + + # Allow inspection of problematic data + df_nw = nw.from_native(df, pass_through=True) + ``` + +=== "Production Mode" + ```python exec="1" source="above" session="ex3" + # Strict validation for production + df_nw = nw.from_native(df, pass_through=False) + ``` + +What's happening under the hood? +1. Development mode (`pass_through=True`): + - Returns objects Narwhals cannot wrap unchanged instead of raising + - Allows inspection of problematic data + - Operations fail only when actually using bad columns + +2. 
Production mode (`pass_through=False`): + - Validates all columns immediately + - Fails fast if any column has unsupported types + - Prevents bad data from entering your pipeline + +Let's see the difference: + +```python exec="true" source="material-block" result="python" session="ex3" +class CustomObject: + def __init__(self, value): + self.value = value + + def __str__(self): + return f"Custom({self.value})" + + +data = { + "feature": [1, 2, 3], + "custom": [CustomObject(1), CustomObject(2), CustomObject(3)], +} +df = pd.DataFrame(data) + +print("Development Mode:") +print("-" * 50) +df_dev = nw.from_native(df, pass_through=True) +print("1. Load data (succeeds):") +print(df_dev.to_native()) + +print("\n2. Use good column (succeeds):") +result = df_dev.select([nw.col("feature").mean()]) +print(result.to_native()) + +print("\n3. Use bad column (fails helpfully):") +try: + result = df_dev.select([nw.col("custom").mean()]) +except Exception as e: + print(f"Error: {str(e)}") + +print("\nProduction Mode:") +print("-" * 50) +print("1. Load data (fails fast):") +try: + df_prod = nw.from_native(df, pass_through=False) +except Exception as e: + print(f"Error: {str(e)}") +``` + +## 4. Feature Engineering + +When preprocessing features for ML, you need to handle both eager operations (like computing statistics) and lazy operations (like transformations). The challenge is doing this efficiently across backends. 
+ +Here's what can go wrong: + +=== "Memory Inefficient" + ```python exec="1" source="material-block" session="ex4" + import narwhals as nw + from narwhals.typing import FrameT + import pandas as pd + + + @nw.narwhalify + def process_feature_inefficient(df: FrameT, column: str) -> FrameT: + # Bad: Materializes entire column just to compute mean + mean_val = df.select([nw.col(column)]).collect().to_numpy().mean() + return df.select([nw.col(column).fill_null(mean_val)]) + ``` + +=== "Type Unsafe" + ```python exec="1" source="above" session="ex4" + @nw.narwhalify + def process_feature_unsafe(df: FrameT, column: str) -> FrameT: + # Bad: No type casting before computing mean + mean_val = df.select([nw.col(column).mean()]).item() + return df.select([nw.col(column).fill_null(mean_val)]) + ``` + +=== "Correct Approach" + ```python exec="1" source="above" session="ex4" + @nw.narwhalify(eager_only=True) + def process_feature(df: FrameT, column: str) -> FrameT: + # 1. Cast to correct type first + # 2. Use collect-then-item pattern + # 3. Keep final transformation lazy + result = df.select([nw.col(column).cast(nw.Float64()).mean()]) + mean_val = ( + result.item() if not hasattr(result, "collect") else result.collect().item() + ) + + return df.select( + [nw.col(column).cast(nw.Float64()).fill_null(mean_val).alias(column)] + ) + ``` + +The correct approach: +1. Uses `eager_only=True` to signal immediate value needs +2. Casts to proper type before computing statistics +3. Uses collect-then-item pattern for materialization +4. 
Keeps final transformation lazy for optimization + +Let's see it handle tricky data: + +```python exec="true" source="material-block" result="python" session="ex4" +data = { + "clean": [1.0, None, 3.0], + "mixed": ["1.0", None, "3.0"], + "invalid": ["1.0", "bad", "3.0"], +} +df = pd.DataFrame(data) +df_nw = nw.from_native(df) + +print("Clean numeric data:") +print(process_feature(df_nw, "clean")) + +print("\nMixed string/numeric:") +print(process_feature(df_nw, "mixed")) + +print("\nInvalid data (fails safely):") +try: + print(process_feature(df_nw, "invalid")) +except Exception as e: + print(f"Error: {str(e)}") +``` + +## 5. Time Series Validation + +Time series data adds extra complexity: +- Timestamps must be unique within groups +- Different backends handle timestamps differently +- Operations must stay lazy for large datasets + +Here's how to handle it: + +=== "Memory Inefficient" + ```python exec="1" source="material-block" session="ex5" + import narwhals as nw + from narwhals.typing import FrameT + import pandas as pd + + + @nw.narwhalify + def check_duplicates_inefficient(df: FrameT, id_col: str, time_col: str) -> FrameT: + # Bad: Materializes entire frame to check duplicates + return ( + df.collect() + .group_by([id_col, time_col]) + .agg([nw.col(time_col).count().alias("count")]) + .filter(nw.col("count") > 1) + ) + ``` + +=== "Correct Approach" + ```python exec="1" source="above" session="ex5" + @nw.narwhalify + def check_duplicates(df: FrameT, id_col: str, time_col: str) -> FrameT: + # 1. Group by stays lazy + # 2. Count stays lazy + # 3. Filter stays lazy + counts = df.group_by([id_col, time_col]).agg( + [nw.col(time_col).count().alias("count")] + ) + return counts.filter(nw.col("count") > 1) + ``` + +Under the hood: +1. `group_by` creates a lazy grouped frame +2. `agg` defines the computation but doesn't execute +3. `filter` adds to the computation plan +4. 
Final result stays lazy until needed + +Let's test with real data: + +```python exec="true" source="material-block" result="python" session="ex5" +import pandas as pd + +# Create time series with known issues +dates = pd.date_range("2023-01-01", periods=3, freq="h") +data = { + "id": [1, 1, 1, 2, 2], + "timestamp": [ + dates[0], # First timestamp + dates[0], # Duplicate for id=1 + dates[1], + dates[0], # Same timestamp as id=1, but unique within id=2 + dates[2], + ], +} +df = pd.DataFrame(data) +df_nw = nw.from_native(df) + +print("Found duplicates:") +print(check_duplicates(df_nw, "id", "timestamp")) +``` + +## 6. Environment Management + +Production ML pipelines need different environments for different stages: + +=== "Development Environment" + ```toml + # Lean dependencies for development (mirrors production) + [tool.hatch.envs.default] + dependencies = [ + "narwhals", + "pandas" + ] + ``` + +=== "Testing Environment" + ```toml + # Full dependencies for testing + [tool.hatch.envs.test] + dependencies = [ + "narwhals", + "pandas", + "polars", + "dask", + "pytest", + "hypothesis" + ] + ``` + +Why two environments? +1. Development: + - Minimal dependencies + - Faster builds + - Smaller containers + - Matches production + +2. Testing: + - All backends + - Testing tools + - Validation tools + - Performance profiling + +This ensures your code works everywhere while keeping production deployments lean. + +**Happy coding!** 🚀 diff --git a/mkdocs.yml b/mkdocs.yml index 307f8a6aa..d7af2ac0e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -17,6 +17,7 @@ nav: - pandas_like_concepts/column_names.md - pandas_like_concepts/boolean.md - pandas_like_concepts/null_handling.md + - pandas_like_concepts/machine_learning_patterns.md - Overhead: overhead.md - Perfect backwards compatibility policy: backcompat.md - Supported libraries and extending Narwhals: extending.md