diff --git a/CHANGELOG.md b/CHANGELOG.md index d9bacaafa6..42efb25dcb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -60,6 +60,7 @@ - support for adding or subtracting timestamps and `Timedelta`. - support for binary arithmetic between two `Timedelta` values. - support for lazy `TimedeltaIndex`. + - support for `pd.to_timedelta`. - Added support for index's arithmetic and comparison operators. - Added support for `Series.dt.round`. - Added documentation pages for `DatetimeIndex`. diff --git a/docs/source/modin/supported/general_supported.rst b/docs/source/modin/supported/general_supported.rst index 8f21e7431d..b055ed9dc6 100644 --- a/docs/source/modin/supported/general_supported.rst +++ b/docs/source/modin/supported/general_supported.rst @@ -163,7 +163,8 @@ Top-level dealing with datetime-like data | | | | - or ``arg`` is DataFrame and data type is not int | | | | | - or ``arg`` is Series and data type is string | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``to_timedelta`` | N | | | +| ``to_timedelta`` | P | ``errors`` | ``N`` if ``errors`` is given or converting from | +| | | | string type | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ Top-level dealing with Interval data diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 3a9198a5fe..8d933cd6a1 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -23,7 +23,7 @@ from __future__ import annotations from collections.abc import Hashable, Iterable, Mapping, Sequence -from datetime import date, datetime, tzinfo +from datetime import date, datetime, timedelta, tzinfo from logging import getLogger from typing import TYPE_CHECKING, Any, Literal, Union @@ -35,6 +35,7 @@ from pandas._libs.tslibs import to_offset from pandas._typing import ( AnyArrayLike, + ArrayLike, Axis, DateTimeErrorChoices, IndexLabel, @@ -2096,21 +2097,116 @@ def get_names(obj): return None -@_inherit_docstrings(pandas.to_datetime, apilink="pandas.to_timedelta") @snowpark_pandas_telemetry_standalone_function_decorator -@pandas_module_level_function_not_implemented() -def to_timedelta(arg, unit=None, errors="raise"): # noqa: PR01, RT01, D200 +def to_timedelta( + arg: str + | int + | float + | timedelta + | list + | tuple + | range + | ArrayLike + | pd.Index + | pd.Series + | pandas.Index + | pandas.Series, + unit: str = None, + errors: DateTimeErrorChoices = "raise", +): """ Convert argument to timedelta. - Accepts str, timedelta, list-like or Series for arg parameter. - Returns a Series if and only if arg is provided as a Series. + Timedeltas are absolute differences in times, expressed in difference + units (e.g. days, hours, minutes, seconds). This method converts + an argument from a recognized timedelta format / value into + a Timedelta type. + + Parameters + ---------- + arg : str, timedelta, list-like or Series + The data to be converted to timedelta. + unit : str, optional + Denotes the unit of the arg for numeric `arg`. Defaults to ``"ns"``. + + Possible values: + * 'W' + * 'D' / 'days' / 'day' + * 'hours' / 'hour' / 'hr' / 'h' / 'H' + * 'm' / 'minute' / 'min' / 'minutes' / 'T' + * 's' / 'seconds' / 'sec' / 'second' / 'S' + * 'ms' / 'milliseconds' / 'millisecond' / 'milli' / 'millis' / 'L' + * 'us' / 'microseconds' / 'microsecond' / 'micro' / 'micros' / 'U' + * 'ns' / 'nanoseconds' / 'nano' / 'nanos' / 'nanosecond' / 'N' + + Must not be specified when `arg` contains strings and ``errors="raise"``. + errors : {'ignore', 'raise', 'coerce'}, default 'raise' + - If 'raise', then invalid parsing will raise an exception. + - If 'coerce', then invalid parsing will be set as NaT. + - If 'ignore', then invalid parsing will return the input. + + Returns + ------- + timedelta + If parsing succeeded. + Return type depends on input: + - list-like: TimedeltaIndex of timedelta64 dtype + - Series: Series of timedelta64 dtype + - scalar: Timedelta + + See Also + -------- + DataFrame.astype : Cast argument to a specified dtype. + to_datetime : Convert argument to datetime. + convert_dtypes : Convert dtypes. + + Notes + ----- + If the precision is higher than nanoseconds, the precision of the duration is + truncated to nanoseconds for string inputs. + + Examples + -------- + Parsing a single string to a Timedelta: + + >>> pd.to_timedelta('1 days 06:05:01.00003') + Timedelta('1 days 06:05:01.000030') + >>> pd.to_timedelta('15.5us') + Timedelta('0 days 00:00:00.000015500') + + Parsing a list or array of strings: + + >>> pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) + TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015500', NaT], dtype='timedelta64[ns]', freq=None) + + Converting numbers by specifying the `unit` keyword argument: + + >>> pd.to_timedelta(np.arange(5), unit='s') + TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02', + '0 days 00:00:03', '0 days 00:00:04'], + dtype='timedelta64[ns]', freq=None) + >>> pd.to_timedelta(np.arange(5), unit='d') + TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) """ # TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py - if isinstance(arg, Series): - query_compiler = arg._query_compiler.to_timedelta(unit=unit, errors=errors) - return Series(query_compiler=query_compiler) - return pandas.to_timedelta(arg, unit=unit, errors=errors) + # If arg is snowpark pandas lazy object call to_timedelta on the query compiler. + if isinstance(arg, (Series, pd.Index)): + query_compiler = arg._query_compiler.to_timedelta( + unit=unit if unit else "ns", + errors=errors, + include_index=isinstance(arg, pd.Index), + ) + return arg.__constructor__(query_compiler=query_compiler) + + # Use native pandas to_timedelta for scalar values and list-like objects. + result = pandas.to_timedelta(arg, unit=unit, errors=errors) + # Convert to lazy if result is a native pandas Series or Index. + if isinstance(result, pandas.Index): + return pd.Index(result) + if isinstance(result, pandas.Series): + return pd.Series(result) + # Return the result as is for scaler. + return result @snowpark_pandas_telemetry_standalone_function_decorator diff --git a/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py index f71fbd14bd..4860baf4ac 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py @@ -21,6 +21,7 @@ cast, convert_timezone, date_part, + floor, iff, to_decimal, ) @@ -39,6 +40,57 @@ _FractionalType, ) +# Reference: https://github.com/pandas-dev/pandas/blob/ef3368a8046f3c2e98c773be179f0a49a51d4bdc/pandas/_libs/tslibs/timedeltas.pyx#L109 +# Note: this does not include deprecated units 'M' and 'Y'. +VALID_PANDAS_TIMEDELTA_ABBREVS = { + "W": "W", + "w": "W", + "D": "D", + "d": "D", + "days": "D", + "day": "D", + "hours": "h", + "hour": "h", + "hr": "h", + "h": "h", + "m": "m", + "minute": "m", + "min": "m", + "minutes": "m", + "s": "s", + "seconds": "s", + "sec": "s", + "second": "s", + "ms": "ms", + "milliseconds": "ms", + "millisecond": "ms", + "milli": "ms", + "millis": "ms", + "us": "us", + "microseconds": "us", + "microsecond": "us", + "µs": "us", + "micro": "us", + "micros": "us", + "ns": "ns", + "nanoseconds": "ns", + "nano": "ns", + "nanos": "ns", + "nanosecond": "ns", +} + +# multipliers to convert the timedelta unit to nanoseconds +TIMEDELTA_UNIT_MULTIPLIER = { + "W": 7 * 24 * 3600 * (10**9), + "D": 24 * 3600 * (10**9), + "h": 3600 * (10**9), + "m": 60 * (10**9), + "s": (10**9), + "ms": (10**6), + "us": (10**3), + "ns": 1, +} + VALID_TO_DATETIME_DF_KEYS = { "year": "year", "years": "year", @@ -111,6 +163,17 @@ def col_to_s(col: Column, unit: Literal["D", "s", "ms", "us", "ns"]) -> Column: return col / 10**9 +def col_to_timedelta(col: Column, unit: str) -> Column: + """ + Converts ``col`` (stored in the specified units) to timedelta nanoseconds. + """ + td_unit = VALID_PANDAS_TIMEDELTA_ABBREVS.get(unit) + if not td_unit: + # Same error as native pandas. + raise ValueError(f"invalid unit abbreviation: {unit}") + return cast(floor(col * TIMEDELTA_UNIT_MULTIPLIER[td_unit]), LongType()) + + PANDAS_DATETIME_FORMAT_TO_SNOWFLAKE_MAPPING = { "%Y": "YYYY", "%y": "YY", diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index e13c77f8ec..e77cb99fa8 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -270,6 +270,7 @@ from snowflake.snowpark.modin.plugin._internal.timestamp_utils import ( VALID_TO_DATETIME_DF_KEYS, DateTimeOrigin, + col_to_timedelta, generate_timestamp_col, raise_if_to_datetime_not_supported, to_snowflake_timestamp_format, @@ -6223,6 +6224,86 @@ def dataframe_to_datetime( ) ) + def to_timedelta( + self, + unit: str = "ns", + errors: DateTimeErrorChoices = "raise", + include_index: bool = False, + ) -> "SnowflakeQueryCompiler": + """ + Convert data to timedelta. + + Args: + unit: Denotes unit of the input data. Defaults to 'ns'. + Possible values: + * 'W' + * 'D' / 'days' / 'day' + * 'hours' / 'hour' / 'hr' / 'h' / 'H' + * 'm' / 'minute' / 'min' / 'minutes' / 'T' + * 's' / 'seconds' / 'sec' / 'second' / 'S' + * 'ms' / 'milliseconds' / 'millisecond' / 'milli' / 'millis' / 'L' + * 'us' / 'microseconds' / 'microsecond' / 'micro' / 'micros' / 'U' + * 'ns' / 'nanoseconds' / 'nano' / 'nanos' / 'nanosecond' / 'N' + errors : {'ignore', 'raise', 'coerce'}, default 'raise' + - If 'raise', then invalid parsing will raise an exception. + - If 'coerce', then invalid parsing will be set as NaT. + - If 'ignore', then invalid parsing will return the input. + include_index: If true, also convert index columns to timedelta. + + Returns: + A new query compiler with the data converted to timedelta. + """ + if errors != "raise": + ErrorMessage.parameter_not_implemented_error("errors", "pd.to_timedelta") + internal_frame = self._modin_frame + col_ids = internal_frame.data_column_snowflake_quoted_identifiers + data_column_types = [TimedeltaType()] * len(col_ids) + + index_column_types = internal_frame.cached_index_column_snowpark_pandas_types + if include_index: + col_ids.extend(internal_frame.index_column_snowflake_quoted_identifiers) + index_column_types = [TimedeltaType()] * len( + internal_frame.index_column_snowflake_quoted_identifiers + ) + + # Raise error if the original data type is not numeric. + id_to_type = internal_frame.quoted_identifier_to_snowflake_type(col_ids) + for id, sf_type in id_to_type.items(): + if isinstance(sf_type, TimedeltaType): + # already timedelta + col_ids.remove(id) + elif isinstance(sf_type, StringType): + ErrorMessage.not_implemented( + "Snowpark pandas method pd.to_timedelta does not yet support conversion from string type" + ) + elif not isinstance(sf_type, _NumericType): + raise TypeError( + f"dtype {TypeMapper.to_pandas(sf_type)} cannot be converted to timedelta64[ns]" + ) + + # If all columns are already timedelta. No conversion is needed. + if not col_ids: + return self + + internal_frame = ( + internal_frame.update_snowflake_quoted_identifiers_with_expressions( + {col_id: col_to_timedelta(col(col_id), unit) for col_id in col_ids} + ).frame + ) + + return SnowflakeQueryCompiler( + internal_frame.create( + ordered_dataframe=internal_frame.ordered_dataframe, + data_column_pandas_index_names=internal_frame.data_column_pandas_index_names, + data_column_pandas_labels=internal_frame.data_column_pandas_labels, + index_column_pandas_labels=internal_frame.index_column_pandas_labels, + data_column_snowflake_quoted_identifiers=internal_frame.data_column_snowflake_quoted_identifiers, + index_column_snowflake_quoted_identifiers=internal_frame.index_column_snowflake_quoted_identifiers, + data_column_types=data_column_types, + index_column_types=index_column_types, + ) + ) + def series_to_datetime( self, errors: DateTimeErrorChoices = "raise", diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py index bbd415536a..95fcf68492 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py @@ -2624,9 +2624,11 @@ def __repr__(self) -> str: name_repr = f", name='{self.name}'" if self.name else "" # Length is displayed only when the number of elements is greater than the number of elements to display. length_repr = f", length={length_of_index}" if too_many_elem else "" - # The frequency is displayed only for DatetimeIndex. + # The frequency is displayed for DatetimeIndex and TimedeltaIndex # TODO: SNOW-1625233 update freq_repr; replace None with the correct value. - freq_repr = ", freq=None" if "DatetimeIndex" in class_name else "" + freq_repr = ( + ", freq=None" if class_name in ("DatetimeIndex", "TimedeltaIndex") else "" + ) repr = ( class_name diff --git a/src/snowflake/snowpark/modin/plugin/extensions/timedelta_index.py b/src/snowflake/snowpark/modin/plugin/extensions/timedelta_index.py index 7facf4acef..86ed2a5ded 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/timedelta_index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/timedelta_index.py @@ -107,12 +107,12 @@ def __init__( Examples -------- >>> pd.TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days']) - TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]') + TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) We can also let pandas infer the frequency when possible. >>> pd.TimedeltaIndex(np.arange(5) * 24 * 3600 * 1e9, freq='infer') - TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]') + TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) """ if query_compiler: # Raise error if underlying type is not a Timedelta type. diff --git a/tests/integ/modin/tools/test_to_timedelta.py b/tests/integ/modin/tools/test_to_timedelta.py new file mode 100644 index 0000000000..6f33702702 --- /dev/null +++ b/tests/integ/modin/tools/test_to_timedelta.py @@ -0,0 +1,191 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# + +""" test to_timedelta function""" +import re +from datetime import timedelta + +import modin.pandas as pd +import numpy as np +import pandas as native_pd +import pytest + +from snowflake.snowpark.modin.plugin._internal.timestamp_utils import ( + VALID_PANDAS_TIMEDELTA_ABBREVS, +) +from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.utils import ( + assert_index_equal, + assert_series_equal, + eval_snowpark_pandas_result, +) + +TIMEDELTA_DATA = [ + "1W", + "1 W", + "1d", + "2 D", + "3day", + "4 day", + "5days", + "6 days 06:05:01.00003", + "1h", + "2 H", + "3hr", + "4 hour", + "5hours", + "1m", + "2 T", + "3min", + "4 minute", + "5minutes", + "1s", + "2 S", + "3sec", + "4 second", + "5seconds", + "1ms", + "2 L", + "3milli", + "4 millisecond", + "5milliseconds", + "1us", + "2 U", + "3micro", + "4 microsecond", + "5microseconds", + "1ns", + "2 N", + "3nano", + "4 nanosecond", + "5nanoseconds", + "1 day 3 millis", + "6 hours 4 nanos", + "4 days 00:01:02.000000009", + "02:01:03", + 567, + 123.456, + timedelta(days=1, hours=2, minutes=3, seconds=4, milliseconds=5, microseconds=6), +] + + +@sql_count_checker(query_count=0) +@pytest.mark.parametrize("arg", TIMEDELTA_DATA) +def test_to_timedelta_scalar(arg): + assert native_pd.to_timedelta(arg) == pd.to_timedelta(arg) + + +@sql_count_checker(query_count=0) +@pytest.mark.parametrize("arg", [None, np.nan, "nan", "", "NaT", np.timedelta64("NaT")]) +def test_to_timedelta_na(arg): + assert pd.isna(native_pd.to_timedelta(arg)) + assert pd.isna(pd.to_timedelta(arg)) + + +@sql_count_checker(query_count=1) +@pytest.mark.parametrize( + "data", [TIMEDELTA_DATA, np.array(TIMEDELTA_DATA), tuple(TIMEDELTA_DATA)] +) +def test_to_timedelta_listlike(data): + assert_index_equal(pd.to_timedelta(data), native_pd.to_timedelta(data)) + + +@sql_count_checker(query_count=2) +@pytest.mark.parametrize("unit", VALID_PANDAS_TIMEDELTA_ABBREVS.keys()) +def test_to_timedelta_series(unit): + native_series = native_pd.Series([1, 2, 3]) + snow_series = pd.Series([1, 2, 3]) + expected = native_pd.to_timedelta(native_series, unit=unit) + + # native series + assert_series_equal(pd.to_timedelta(native_series, unit=unit), expected) + # lazy series + assert_series_equal(pd.to_timedelta(snow_series, unit=unit), expected) + + +@sql_count_checker(query_count=2) +@pytest.mark.parametrize("unit", VALID_PANDAS_TIMEDELTA_ABBREVS.keys()) +def test_to_timedelta_units(unit): + native_index = native_pd.Index([1, 2, 3]) + snow_index = pd.Index([1, 2, 3]) + expected = native_pd.to_timedelta(native_index, unit=unit) + + # native index + assert_index_equal(pd.to_timedelta(native_index, unit=unit), expected) + # lazy index + assert_index_equal(pd.to_timedelta(snow_index, unit=unit), expected) + + +@sql_count_checker(query_count=0) +@pytest.mark.parametrize("unit", ["year", "month", "week"]) +def test_to_timedelta_invalid_unit(unit): + data = [1, 2, 3] + eval_snowpark_pandas_result( + "snow", + "native", + lambda x: native_pd.to_timedelta(data, unit=unit) + if x == "native" + else pd.to_timedelta(data, unit=unit), + expect_exception=True, + expect_exception_type=ValueError, + expect_exception_match=f"invalid unit abbreviation: {unit}", + ) + + +@sql_count_checker(query_count=1) +@pytest.mark.parametrize( + "native_index", + [ + native_pd.Index([1, 3, 4]), + native_pd.Index([1.4, 3.0, 45.9]), + native_pd.TimedeltaIndex(["1 min", "2 seconds", "3 millis"]), + ], +) +def test_to_timedelta_dtypes(native_index): + snow_index = pd.Index(native_index) + eval_snowpark_pandas_result( + snow_index, + native_index, + lambda x: native_pd.to_timedelta(x, unit="nanos") + if isinstance(x, native_pd.Index) + else pd.to_timedelta(x, unit="nanos"), + ) + + +@sql_count_checker(query_count=0) +@pytest.mark.parametrize( + "native_index", + [native_pd.Index([True, False]), native_pd.date_range("20210101", periods=3)], +) +def test_to_timedelta_invalid_dtype(native_index): + snow_index = pd.Index(native_index) + eval_snowpark_pandas_result( + snow_index, + native_index, + lambda x: native_pd.to_timedelta(x) + if isinstance(x, native_pd.Index) + else pd.to_timedelta(x), + expect_exception=True, + expect_exception_type=TypeError, + expect_exception_match=re.escape( + f"dtype {native_index.dtype} cannot be converted to timedelta64[ns]" + ), + ) + + +@sql_count_checker(query_count=0) +def test_to_timedelta_string_dtype_not_implemented(): + data = pd.Index(["1 days", "2 days", "3 days"]) + msg = "Snowpark pandas method pd.to_timedelta does not yet support conversion from string type" + with pytest.raises(NotImplementedError, match=msg): + pd.to_timedelta(data) + + +@sql_count_checker(query_count=0) +@pytest.mark.parametrize("errors", ["ignore", "coerce"]) +def test_to_timedelta_errors_not_implemented(errors): + data = pd.Index([1, 2, 3]) + msg = "Snowpark pandas method pd.to_timedelta does not yet support the 'errors' parameter" + with pytest.raises(NotImplementedError, match=msg): + pd.to_timedelta(data, errors=errors) diff --git a/tests/integ/modin/utils.py b/tests/integ/modin/utils.py index 9aecdf81d4..0b2a94a20b 100644 --- a/tests/integ/modin/utils.py +++ b/tests/integ/modin/utils.py @@ -249,10 +249,10 @@ def assert_snowpark_pandas_equal_to_pandas( Raises: AssertionError if the converted dataframe does not match with the original one """ - assert isinstance(snow, (DataFrame, Series, Index)) + assert isinstance(snow, (DataFrame, Series, Index)), f"Got type: {type(snow)}" assert isinstance( expected_pandas, (native_pd.DataFrame, native_pd.Series, native_pd.Index) - ) + ), f"Got type: {type(expected_pandas)}" # Due to server-side compression, only check that index values are equivalent and ignore the # index types. Snowpark pandas will use the smallest possible dtype (typically int8), while # native pandas will default to int64. diff --git a/tests/unit/modin/test_unsupported.py b/tests/unit/modin/test_unsupported.py index ad0ae0aa5d..1e72dbd43c 100644 --- a/tests/unit/modin/test_unsupported.py +++ b/tests/unit/modin/test_unsupported.py @@ -48,7 +48,6 @@ def test_unsupported_io(io_method, kwargs): ["crosstab", {"index": "", "columns": ""}], ["lreshape", {"data": "", "groups": ""}], ["wide_to_long", {"df": "", "stubnames": "", "i": "", "j": ""}], - ["to_timedelta", {"arg": ""}], ], ) def test_unsupported_general(general_method, kwargs):