@@ -4,18 +4,24 @@
 import logging
 import operator
 from abc import ABCMeta, abstractmethod
+from collections.abc import Iterable
 from functools import cached_property
-from typing import Any, Callable, List, Optional, Sequence, TypeVar, Union
+from typing import Any, Callable, List, Optional, Sequence, TypeVar, Union, Mapping
 from typing import Collection, Dict, cast, overload

-import pyspark.sql.functions as sfn
+from datetime import datetime as dt, timedelta as td
+
 from IPython.core.display import HTML
 from IPython.display import display as ipydisplay
-from pyspark.sql import GroupedData
-from pyspark.sql import SparkSession
+import pandas as pd
+from pandas.core.frame import DataFrame as PandasDataFrame
+
+from pyspark import RDD
+import pyspark.sql.functions as sfn
+from pyspark.sql import SparkSession, GroupedData
 from pyspark.sql.column import Column
 from pyspark.sql.dataframe import DataFrame
-from pyspark.sql.types import DataType, StructType
+from pyspark.sql.types import AtomicType, DataType, StructType
 from pyspark.sql.window import Window, WindowSpec

 import tempo.interpol as t_interpolation
@@ -97,7 +103,9 @@ def time_str_to_double(df: DataFrame,

 class TSDF(WindowBuilder):
     """
-    This object is the main wrapper over a Spark data frame which allows a user to parallelize time series computations on a Spark data frame by various dimensions. The two dimensions required are partition_cols (list of columns by which to summarize) and ts_col (timestamp column, which can be epoch or TimestampType).
+    This class represents a time series DataFrame (TSDF) - a DataFrame with a
+    time series index. It can represent multiple logical time series,
+    each identified by a unique set of series IDs.
    """

    def __init__(
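
For context, a minimal construction sketch matching the model in the new docstring. The input DataFrame, column names, and values are hypothetical; the `ts_col`/`series_ids` keywords are the ones this diff itself passes to the constructor in `buildEmptyLattice` below.

    from datetime import datetime

    # hypothetical input: one row per (symbol, event_ts) observation
    df = spark.createDataFrame(
        [("AAPL", datetime(2024, 1, 1, 9, 30), 185.0),
         ("MSFT", datetime(2024, 1, 1, 9, 30), 375.2)],
        "symbol string, event_ts timestamp, price double",
    )

    # `event_ts` is the time series index; each distinct `symbol`
    # identifies one logical series within the TSDF
    tsdf = TSDF(df, ts_col="event_ts", series_ids=["symbol"])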
@@ -142,7 +150,8 @@ def __withStandardizedColOrder(self) -> TSDF:
         * ts_index,
         * observation columns

-        :return: a :class:`TSDF` with the columns reordered into "standard order" (as described above)
+        :return: a :class:`TSDF` with the columns reordered into
+            "standard order" (as described above)
         """
         std_ordered_cols = (
             list(self.series_ids)
@@ -155,6 +164,89 @@ def __withStandardizedColOrder(self) -> TSDF:
     # default column name for constructed timeseries index struct columns
     __DEFAULT_TS_IDX_COL = "ts_idx"

+    @classmethod
+    def buildEmptyLattice(
+        cls,
+        spark: SparkSession,
+        start_time: dt,
+        end_time: Optional[dt] = None,
+        step_size: Optional[td] = None,
+        num_intervals: Optional[int] = None,
+        ts_col: Optional[str] = None,
+        series_ids: Optional[Any] = None,
+        series_schema: Optional[Union[AtomicType, StructType, str]] = None,
+        observation_cols: Optional[Union[Mapping[str, str], Iterable[str]]] = None,
+        num_partitions: Optional[int] = None) -> TSDF:
+        """
+        Construct an empty "lattice", i.e. a :class:`TSDF` with a time range
+        for each unique series and a set of observation columns (initialized to nulls).
+
+        :param spark: the Spark session to use
+        :param start_time: the start time of the lattice
+        :param end_time: the end time of the lattice (optional)
+        :param step_size: the step size between each time interval (optional)
+        :param num_intervals: the number of intervals to create (optional)
+        :param ts_col: the name of the timestamp column (optional)
+        :param series_ids: the unique series identifiers (optional)
+        :param series_schema: the schema of the series identifiers (optional)
+        :param observation_cols: the observation columns to include (optional)
+        :param num_partitions: the number of partitions to create (optional)
+
+        :return: a :class:`TSDF` representing the empty lattice
+        """
+
+        # set a default timestamp column if not provided
+        if ts_col is None:
+            ts_col = cls.__DEFAULT_TS_IDX_COL
+
+        # initialize the lattice as a time range
+        lattice_df = t_utils.time_range(spark,
+                                        start_time,
+                                        end_time,
+                                        step_size,
+                                        num_intervals,
+                                        ts_colname=ts_col)
+        select_exprs = [sfn.col(ts_col)]
+
+        # handle construction of the series_ids DataFrame
+        # (use an explicit None check: pandas DataFrames raise on truth-testing)
+        series_df = None
+        if series_ids is not None:
+            if isinstance(series_ids, DataFrame):
+                series_df = series_ids
+            elif isinstance(series_ids, (RDD, PandasDataFrame)):
+                series_df = spark.createDataFrame(series_ids)
+            elif isinstance(series_ids, dict):
+                series_df = spark.createDataFrame(pd.DataFrame(series_ids))
+            else:
+                series_df = spark.createDataFrame(data=series_ids, schema=series_schema)
+            # add the series columns to the select expressions
+            select_exprs += [sfn.col(c) for c in series_df.columns]
+            # lattice is the cross join of the time range and the series identifiers
+            lattice_df = lattice_df.crossJoin(series_df)
+
+        # set up select expressions for the observation columns
+        if observation_cols:
+            # convert to a mapping if not already, defaulting all columns to "double" type
+            if not isinstance(observation_cols, Mapping):
+                observation_cols = {col: "double" for col in observation_cols}
+            select_exprs += [sfn.lit(None).cast(coltype).alias(colname)
+                             for colname, coltype in observation_cols.items()]
+        lattice_df = lattice_df.select(*select_exprs)
+
+        # repartition the lattice in a more optimal way
+        if num_partitions is None:
+            num_partitions = lattice_df.rdd.getNumPartitions()
+        if series_df is not None:
+            sort_cols = series_df.columns + [ts_col]
+            lattice_df = (lattice_df.repartition(num_partitions, *series_df.columns)
+                          .sortWithinPartitions(*sort_cols))
+        else:
+            lattice_df = lattice_df.repartitionByRange(num_partitions, ts_col)
+
+        # construct the appropriate TSDF
+        return TSDF(lattice_df, ts_col=ts_col,
+                    series_ids=series_df.columns if series_df is not None else None)
+
     @classmethod
     def fromSubsequenceCol(
         cls,
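
A usage sketch of `buildEmptyLattice` as added above. The series values and the `price` column are hypothetical; `t_utils.time_range` is assumed to produce one row per step between the start and end times.

    from datetime import datetime, timedelta

    # an hourly, null-initialized lattice over one day for two series
    lattice = TSDF.buildEmptyLattice(
        spark,
        start_time=datetime(2024, 1, 1),
        end_time=datetime(2024, 1, 2),
        step_size=timedelta(hours=1),
        ts_col="event_ts",
        series_ids={"symbol": ["AAPL", "MSFT"]},   # dict -> built via pandas
        observation_cols={"price": "double"},      # null-cast observation column
    )

Passing `series_ids` cross-joins the series identifiers against the generated time range, so the result contains one row per (series, timestamp) pair, partitioned by series and sorted by series then time.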