SNOW-1620436 Improved to_datetime to handle all local input cases and improved time series notebook tests #2184

Merged · 5 commits · Aug 29, 2024

Changes from 2 commits
1 change: 1 addition & 0 deletions CHANGELOG.md

```diff
@@ -78,6 +78,7 @@
 #### Improvements
 
 - Refactored `quoted_identifier_to_snowflake_type` to avoid making metadata queries if the types have been cached locally.
+- Improved `pd.to_datetime` to handle all local input cases.
 
 #### Bug Fixes
```
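For context, a quick sketch of what this entry means in practice (a hedged example, assuming a configured Snowpark pandas session; the import style follows the Snowpark pandas docs of this release):

```python
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401 -- registers the Snowpark pandas backend

# Local (non-Snowpark) inputs are now converted eagerly by native pandas,
# so no SQL query is issued for any of these calls:
pd.to_datetime("2018-10-26 12:00:00")        # scalar -> native pandas Timestamp
pd.to_datetime(["2018-10-26 12:00:00"])      # list-like -> Snowpark pandas Index
pd.to_datetime({"year": [2018], "month": [10], "day": [26]})  # dict -> Snowpark pandas Series
```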
55 changes: 29 additions & 26 deletions src/snowflake/snowpark/modin/pandas/general.py

```diff
@@ -1742,16 +1742,13 @@ def to_datetime(
 
     The default behaviour (``utc=False``) is as follows:
 
-    - Timezone-naive inputs are converted to timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series`:
+    - Timezone-naive inputs are kept as timezone-naive :class:`~snowflake.snowpark.modin.pandas.DatetimeIndex`:
 
-    >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15'])
+    >>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15'])
     DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None)
 
-    - Timezone-aware inputs *with constant time offset* are still converted to
-      timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series` by default.
-
     >>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500'])
-    DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:00'], dtype='datetime64[ns]', freq=None)
+    DatetimeIndex(['2018-10-26 10:00:00-07:00', '2018-10-26 11:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
 
     - Use right format to convert to timezone-aware type (Note that when call Snowpark
       pandas API to_pandas() the timezone-aware output will always be converted to session timezone):
```
```diff
@@ -1763,17 +1760,17 @@
       issued from a timezone with daylight savings, such as Europe/Paris):
 
     >>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'])
-    DatetimeIndex(['2020-10-25 02:00:00', '2020-10-25 04:00:00'], dtype='datetime64[ns]', freq=None)
+    Index(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns]')
 
     >>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'], format="%Y-%m-%d %H:%M:%S %z")
-    DatetimeIndex(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
+    Index(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns]')
 
     Setting ``utc=True`` makes sure always convert to timezone-aware outputs:
 
     - Timezone-naive inputs are *localized* based on the session timezone
 
     >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True)
-    DatetimeIndex(['2018-10-26 12:00:00-07:00', '2018-10-26 13:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
+    DatetimeIndex(['2018-10-26 05:00:00-07:00', '2018-10-26 06:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
 
     - Timezone-aware inputs are *converted* to session timezone
```
```diff
@@ -1784,8 +1781,28 @@ def to_datetime(
     # TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py
     raise_if_native_pandas_objects(arg)
 
-    if arg is None:
-        return None  # same as pandas
+    if not isinstance(arg, (DataFrame, Series, pd.Index)):
+        # use pandas.to_datetime to convert local data to datetime
+        res = pandas.to_datetime(
+            arg,
+            errors,
+            dayfirst,
+            yearfirst,
+            utc,
+            format,
+            exact,
+            unit,
+            infer_datetime_format,
+            origin,
+            cache,
+        )
+        if isinstance(res, pandas.Series):
+            res = pd.Series(res)
+        elif not is_scalar(res):
+            res = pd.Index(res)
+        return res
 
+    # handle modin objs
     if unit and unit not in VALID_TO_DATETIME_UNIT:
         raise ValueError(f"Unrecognized unit {unit}")
```

Review comment (Collaborator Author), on the `if not isinstance(...)` line: match with what we do today for DatetimeIndex and timedelta.

```diff
@@ -1795,15 +1812,8 @@ def to_datetime(
             argument="cache",
             message="cache parameter is ignored with Snowflake backend, i.e., no caching will be applied",
         )
-    arg_is_scalar = is_scalar(arg)
-
-    if not isinstance(arg, (DataFrame, Series, pd.Index)):
-        # Turn dictionary like arg into pd.DataFrame and list-like or scalar to
-        # pd.Index.
-        arg = [arg] if arg_is_scalar else arg
-        arg = DataFrame(arg) if isinstance(arg, dict) else pd.Index(arg)
 
-    series_or_index = arg._to_datetime(
+    return arg._to_datetime(
         errors=errors,
         dayfirst=dayfirst,
         yearfirst=yearfirst,
```

```diff
@@ -1814,13 +1824,6 @@ def to_datetime(
         infer_datetime_format=infer_datetime_format,
         origin=origin,
     )
-    if arg_is_scalar:
-        # Calling squeeze directly on Snowpark pandas Series makes an unnecessary
-        # count sql call. To avoid that we convert Snowpark pandas Series to Native
-        # pandas series first.
-        # Note: When arg_is_scalar is True 'series_or_index' is always an Index.
-        return series_or_index.to_series().to_pandas().squeeze()
-    return series_or_index
 
 
 @snowpark_pandas_telemetry_standalone_function_decorator
```
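One detail worth noting in the new local branch: the options are passed to native `pandas.to_datetime` positionally, so the code relies on the parameter order of the pandas signature (`errors, dayfirst, yearfirst, utc, format, exact, unit, infer_datetime_format, origin, cache`). A keyword-argument equivalent, written as a self-contained sketch rather than the PR's code:

```python
import pandas


def _local_to_datetime(arg, errors="raise", dayfirst=False, yearfirst=False,
                       utc=False, format=None, exact=True, unit=None,
                       infer_datetime_format=False, origin="unix", cache=True):
    # Same call as the positional version in the diff, spelled with keywords;
    # the defaults here mirror the documented pandas defaults (an assumption).
    return pandas.to_datetime(
        arg,
        errors=errors,
        dayfirst=dayfirst,
        yearfirst=yearfirst,
        utc=utc,
        format=format,
        exact=exact,
        unit=unit,
        infer_datetime_format=infer_datetime_format,
        origin=origin,
        cache=cache,
    )
```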
```diff
@@ -123,6 +123,11 @@
     the specified time units.
     """
 
+AUTO_FORMAT_WARNING_MSG = """Snowflake automatic format detection is used when a format is not provided.
+In this case Snowflake's auto format may yield different result values compared to pandas.
+See https://docs.snowflake.com/en/sql-reference/date-time-input-output#supported-formats-for-auto-detection for details
+"""
+
 # TODO: SNOW-1127160: support other units
 VALID_TO_DATETIME_UNIT = ["D", "s", "ms", "us", "ns"]
```
```diff
@@ -304,9 +309,7 @@ def generate_timestamp_col(
     if isinstance(datatype, (StringType, VariantType)):
         WarningMessage.mismatch_with_pandas(
             "to_datetime",
-            "Snowpark pandas to_datetime uses Snowflake's automatic format "
-            "detection to convert string to datetime when a format is not provided. "
-            "In this case Snowflake's auto format may yield different result values compared to pandas.",
+            AUTO_FORMAT_WARNING_MSG.replace("\n", ""),
         )
 
     from snowflake.snowpark.modin.plugin._internal.type_utils import (
```
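Since `AUTO_FORMAT_WARNING_MSG` is a multi-line constant, the call site above flattens it with `.replace("\n", "")` before emitting the warning. A quick, runnable illustration of what that join produces (joining on the empty string fuses sentence boundaries such as "provided.In"; joining on a space keeps them separated):

```python
AUTO_FORMAT_WARNING_MSG = """Snowflake automatic format detection is used when a format is not provided.
In this case Snowflake's auto format may yield different result values compared to pandas.
See https://docs.snowflake.com/en/sql-reference/date-time-input-output#supported-formats-for-auto-detection for details
"""

print(AUTO_FORMAT_WARNING_MSG.replace("\n", ""))   # single line, sentences fused together
print(AUTO_FORMAT_WARNING_MSG.replace("\n", " "))  # single line, sentences kept apart
```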
```diff
@@ -51,7 +51,7 @@ def ignored_argument(cls, operation: str, argument: str, message: str) -> None:
     @classmethod
     def mismatch_with_pandas(cls, operation: str, message: str) -> None:
         cls.single_warning(
-            f"`{operation}` implementation has mismatches with pandas:\n{message}."
+            f"`{operation}` implementation may have mismatches with pandas:\n{message}."
         )
 
     @classmethod
```
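For readers outside the codebase: `mismatch_with_pandas` delegates to `single_warning`, whose job is to emit each distinct message at most once per process. A minimal, hypothetical reimplementation of that pattern (the real `WarningMessage` class lives in the Snowpark pandas plugin and may differ in details):

```python
import warnings


class WarningMessageSketch:
    """Hypothetical stand-in for the plugin's WarningMessage helper."""

    printed_warnings: set = set()

    @classmethod
    def single_warning(cls, message: str) -> None:
        # Deduplicate: emit each distinct message at most once per process.
        if message not in cls.printed_warnings:
            warnings.warn(message, stacklevel=2)
            cls.printed_warnings.add(message)

    @classmethod
    def mismatch_with_pandas(cls, operation: str, message: str) -> None:
        cls.single_warning(
            f"`{operation}` implementation may have mismatches with pandas:\n{message}."
        )
```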
2 changes: 1 addition & 1 deletion tests/integ/modin/frame/test_astype.py

```diff
@@ -126,7 +126,7 @@ def test_astype_to_timedelta(dtype):
     eval_snowpark_pandas_result(snow_df, native_df, lambda df: df.astype(dtype))
 
 
-@sql_count_checker(query_count=2)
+@sql_count_checker(query_count=0)
 def test_astype_to_timedelta_negative():
     native_datetime_df = native_pd.DataFrame(
         data={"col1": [pd.to_datetime("2000-01-01"), pd.to_datetime("2001-01-01")]}
```
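The expected query count drops to zero here because the scalar `pd.to_datetime(...)` calls in the test's setup are now evaluated locally by native pandas instead of going through Snowflake. A hedged sketch of the resulting pattern (the `sql_count_checker` import path is assumed from this repo's test utilities):

```python
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

from tests.integ.modin.sql_counter import sql_count_checker  # assumed path


@sql_count_checker(query_count=0)
def test_scalar_to_datetime_is_local():
    # Scalar input no longer reaches Snowflake, so no SQL is issued.
    assert pd.to_datetime("2000-01-01").year == 2000
```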
45 changes: 25 additions & 20 deletions tests/integ/modin/tools/test_to_datetime.py

```diff
@@ -104,7 +104,7 @@ def test_to_datetime_format(self, cache, box, format, expected):
             ["1/3/2000", "20000103", "%m/%d/%Y"],
         ],
     )
-    @sql_count_checker(query_count=1)
+    @sql_count_checker(query_count=0)
     def test_to_datetime_format_scalar(self, cache, arg, expected, format):
         result = to_datetime(arg, format=format, cache=cache)
         expected = Timestamp(expected)
```

```diff
@@ -120,7 +120,7 @@ def test_to_datetime_format_scalar(self, cache, arg, expected, format):
     def test_to_datetime_format_unimplemented(self, cache, arg, format):
         with pytest.raises(NotImplementedError):
             assert to_datetime(
-                arg, format=format, cache=cache
+                pd.Index([arg]), format=format, cache=cache
             ) == native_pd.to_datetime(arg, format=format, cache=cache)
 
     @pytest.mark.parametrize(
```
```diff
@@ -135,7 +135,7 @@ def test_to_datetime_format_not_match(self, cache, arg, format):
             SnowparkSQLException,
             match=f"Can't parse '{arg}' as timestamp with format 'DD/MM/YYYY'",
         ):
-            to_datetime(arg, format=format, cache=cache)
+            to_datetime(pd.Index([arg]), format=format, cache=cache).to_pandas()
 
     @sql_count_checker(query_count=2, udf_count=0)
     def test_to_datetime_format_YYYYMMDD(self, cache):
```
```diff
@@ -302,7 +302,7 @@ def test_to_datetime_format_YYYYMMDD_overflow(self, input, expected):
     @sql_count_checker(query_count=2)
     def test_to_datetime_with_NA(self, data, format, expected):
         # GH#42957
-        result = to_datetime(data, format=format)
+        result = to_datetime(pd.Index(data), format=format)
         assert_index_equal(result, pd.DatetimeIndex(expected))
 
     @sql_count_checker(query_count=1, udf_count=0)
```
```diff
@@ -328,7 +328,7 @@ def test_to_datetime_format_integer_year_month(self, cache):
         result = to_datetime(ser, format="%Y%m", cache=cache)
         assert_series_equal(result, expected, check_index_type=False)
 
-    @sql_count_checker(query_count=1)
+    @sql_count_checker(query_count=0)
     def test_to_datetime_format_microsecond(self, cache):
         month_abbr = calendar.month_abbr[4]
         val = f"01-{month_abbr}-2011 00:00:01.978"
```
```diff
@@ -384,7 +384,9 @@ def test_to_datetime_format_microsecond(self, cache):
     )
     @sql_count_checker(query_count=1)
     def test_to_datetime_format_time(self, cache, value, format, dt):
-        assert to_datetime(value, format=format, cache=cache) == dt
+        assert (
+            to_datetime(pd.Index([value]), format=format, cache=cache).to_pandas() == dt
+        )
 
     @sql_count_checker(query_count=0)
     def test_to_datetime_with_non_exact_unimplemented(self, cache):
```
```diff
@@ -407,9 +409,9 @@ def test_to_datetime_with_non_exact_unimplemented(self, cache):
             "2012-01-01 09:00:00.001000000",
         ],
     )
-    @sql_count_checker(query_count=2)
+    @sql_count_checker(query_count=1, join_count=1)
     def test_parse_nanoseconds_with_formula(self, cache, arg):
-
+        arg = pd.Index([arg])
         # GH8989
         # truncating the nanoseconds when a format was provided
         expected = to_datetime(arg, cache=cache)
```
```diff
@@ -426,7 +428,10 @@ def test_to_datetime_format_weeks(self, value, fmt, expected, cache):
     @sql_count_checker(query_count=0)
     def test_to_datetime_format_weeks(self, value, fmt, expected, cache):
         with pytest.raises(NotImplementedError):
-            assert to_datetime(value, format=fmt, cache=cache) == expected
+            assert (
+                to_datetime(pd.Index([value]), format=fmt, cache=cache).to_pandas()[0]
+                == expected
+            )
 
     @pytest.mark.parametrize(
         "fmt,dates,expected_dates",
```
```diff
@@ -497,7 +502,7 @@ def test_to_datetime_parse_tzname_or_tzoffset_fallback(
     ):
         # GH 13486
         with pytest.raises(NotImplementedError):
-            to_datetime(dates, format=fmt).to_list()
+            to_datetime(pd.Index(dates), format=fmt).to_list()
 
     @sql_count_checker(query_count=4)
     def test_to_datetime_parse_tzname_or_tzoffset_different_tz_to_utc(self):
```
```diff
@@ -535,7 +540,7 @@ def test_to_datetime_parse_timezone_malformed(self, offset):
             SnowparkSQLException,
             match="Can't parse|as timestamp with format 'YYYY-MM-DD HH24:MI:SS TZHTZM'",
         ):
-            to_datetime([date], format=fmt).to_pandas()
+            to_datetime(pd.Index([date]), format=fmt).to_pandas()
 
     @sql_count_checker(query_count=0)
     def test_to_datetime_parse_timezone_keeps_name(self):
```
```diff
@@ -551,15 +556,15 @@ class TestToDatetime:
     def test_to_datetime_mixed_datetime_and_string(self):
         d1 = datetime(2020, 1, 1, 17, tzinfo=timezone(-timedelta(hours=1)))
         d2 = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1)))
-        res = to_datetime(["2020-01-01 17:00:00 -0100", d2])
+        res = to_datetime(pd.Index(["2020-01-01 17:00:00 -0100", d2]))
         # The input will become a series with variant type and the timezone is unaware by the Snowflake engine, so the
         # result ignores the timezone by default
         expected = native_pd.DatetimeIndex(
             [datetime(2020, 1, 1, 17), datetime(2020, 1, 1, 18)]
         )
         assert_index_equal(res, expected)
         # Set utc=True to make sure timezone aware in to_datetime
-        res = to_datetime(["2020-01-01 17:00:00 -0100", d2], utc=True)
+        res = to_datetime(pd.Index(["2020-01-01 17:00:00 -0100", d2]), utc=True)
         expected = pd.DatetimeIndex([d1, d2])
         assert_index_equal(res, expected)
```
```diff
@@ -584,15 +589,15 @@ def test_to_datetime_dtarr(self, tz):
 
     @sql_count_checker(query_count=1)
     def test_to_datetime_pydatetime(self):
-        actual = to_datetime(datetime(2008, 1, 15))
+        actual = to_datetime(pd.Index([datetime(2008, 1, 15)]))
         assert actual == np.datetime64(datetime(2008, 1, 15))
 
     @pytest.mark.parametrize(
         "dt", [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")]
     )
-    @sql_count_checker(query_count=1)
+    @sql_count_checker(query_count=1, join_count=2)
     def test_to_datetime_dt64s(self, cache, dt):
-        assert to_datetime(dt, cache=cache) == Timestamp(dt)
+        assert to_datetime(pd.Index([dt]), cache=cache)[0] == Timestamp(dt)
 
     @pytest.mark.parametrize(
         "sample",
```
```diff
@@ -831,11 +836,11 @@ def test_to_datetime_df_negative(self):
             {"arg": 1490195805433502912, "unit": "ns"},
         ],
     )
-    @sql_count_checker(query_count=1)
+    @sql_count_checker(query_count=1, join_count=2)
     def test_to_datetime_unit(self, sample):
-        assert pd.to_datetime(
-            sample["arg"], unit=sample["unit"]
-        ) == native_pd.to_datetime(sample["arg"], unit=sample["unit"])
+        assert pd.to_datetime(pd.Index([sample["arg"]]), unit=sample["unit"])[
+            0
+        ] == native_pd.to_datetime(sample["arg"], unit=sample["unit"])
 
     @sql_count_checker(query_count=0)
     def test_to_datetime_unit_negative(self):
```
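A pattern repeated throughout these test updates: arguments are wrapped in `pd.Index(...)` so the call still exercises the Snowflake (distributed) code path, since bare scalars and lists are now converted eagerly by native pandas. A hedged sketch of the two paths side by side (assumes a configured Snowpark pandas session):

```python
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

arg = "01/10/2010 13:56:01"
fmt = "%m/%d/%Y %H:%M:%S"

# Local path: evaluated eagerly by native pandas; no SQL query is issued.
local_result = pd.to_datetime(arg, format=fmt)

# Distributed path: wrapping in pd.Index forces the Snowflake translation,
# which is what these tests now do explicitly.
snowflake_result = pd.to_datetime(pd.Index([arg]), format=fmt).to_pandas()[0]

assert local_result == snowflake_result
```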