From 2273cb84254b340910c9e87f9dc8f9f7686d6ff8 Mon Sep 17 00:00:00 2001 From: Stefano Date: Thu, 19 Dec 2019 11:17:34 +0100 Subject: [PATCH 1/5] Added examples folder with README Signed-off-by: Stefano --- examples/README.rst | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 examples/README.rst diff --git a/examples/README.rst b/examples/README.rst new file mode 100644 index 0000000..51189c4 --- /dev/null +++ b/examples/README.rst @@ -0,0 +1,27 @@ +.. image:: https://www.giotto.ai/static/vector/logo.svg + :width: 850 + +Examples and Tutorials +====================== + +In this folder you can find basic tutorials and examples: you can read through them to +understand how `giotto-time` works. + +Quick start +----------- + +This tutorial gives an overview of the basic features of `giotto-time`. +You will learn how to train a simple time series model with custom features. +Some considerations on input and output are also presented. + +Details and advanced features +----------------------------- + +This tutorial details more advanced features of `giotto-time`. +You will learn more about feature generation and about the custom linear regressor +model for time series forecasting. + +Causality Tests +--------------- + +This tutorial details the causality tests built into `giotto-time`. 
From b311189399bc2686047df6cc486680706cc7d7d4 Mon Sep 17 00:00:00 2001 From: Stefano Date: Thu, 19 Dec 2019 15:59:46 +0100 Subject: [PATCH 2/5] Bug fix in time series preparation Signed-off-by: Stefano --- .../tests/test_time_series_preparation.py | 33 +++++++++++++++++++ .../time_series_preparation.py | 12 ++++--- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/giottotime/time_series_preparation/tests/test_time_series_preparation.py b/giottotime/time_series_preparation/tests/test_time_series_preparation.py index 33e8821..c455320 100644 --- a/giottotime/time_series_preparation/tests/test_time_series_preparation.py +++ b/giottotime/time_series_preparation/tests/test_time_series_preparation.py @@ -128,6 +128,39 @@ def test_wrong_input_type(self, wrong_input: Tuple): with pytest.raises(TypeError): time_series_preparation._to_time_index_series(wrong_input) + @given(series_with_period_index(), st.datetimes(), available_freqs()) + def test_period_index_dataframe_unchanged( + self, period_index_series: pd.Series, start: pd.datetime, freq: pd.Timedelta, + ): + period_index_dataframe = pd.DataFrame(period_index_series) + time_series_preparation = TimeSeriesPreparation(start=start, freq=freq) + computed_time_series = time_series_preparation._to_time_index_series( + period_index_dataframe + ) + assert_series_equal(computed_time_series, period_index_series) + + @given(series_with_datetime_index(), st.datetimes(), available_freqs()) + def test_datetime_index_dataframe_unchanged( + self, datetime_index_series: pd.Series, start: pd.datetime, freq: pd.Timedelta, + ): + datetime_index_dataframe = pd.DataFrame(datetime_index_series) + time_series_preparation = TimeSeriesPreparation(start=start, freq=freq) + computed_time_series = time_series_preparation._to_time_index_series( + datetime_index_dataframe + ) + assert_series_equal(computed_time_series, datetime_index_series) + + @given(series_with_timedelta_index(), st.datetimes(), available_freqs()) + def 
test_timedelta_index_dataframe_unchanged( + self, timedelta_index_series: pd.Series, start: pd.datetime, freq: pd.Timedelta, + ): + timedelta_index_dataframe = pd.DataFrame(timedelta_index_series) + time_series_preparation = TimeSeriesPreparation(start=start, freq=freq) + computed_time_series = time_series_preparation._to_time_index_series( + timedelta_index_dataframe + ) + assert_series_equal(computed_time_series, timedelta_index_series) + class TestToEquispacedTimeSeries: @given( diff --git a/giottotime/time_series_preparation/time_series_preparation.py b/giottotime/time_series_preparation/time_series_preparation.py index 2e909e6..7b63914 100644 --- a/giottotime/time_series_preparation/time_series_preparation.py +++ b/giottotime/time_series_preparation/time_series_preparation.py @@ -78,13 +78,15 @@ def __init__( self.freq ) - def transform(self, time_series: Union[List, np.array, pd.Series]) -> pd.DataFrame: + def transform( + self, time_series: Union[List, np.array, pd.Series, pd.DataFrame] + ) -> pd.DataFrame: """Transforms an array-like sequence in a period-index DataFrame with a single column. Parameters ---------- - time_series : Union[List, np.array, pd.Series], required + time_series : Union[List, np.array, pd.Series, pd.DataFrame], required The input time series. 
Returns @@ -104,9 +106,11 @@ def transform(self, time_series: Union[List, np.array, pd.Series]) -> pd.DataFra return period_index_dataframe def _to_time_index_series( - self, array_like_object: Union[List, np.array, pd.Series] + self, array_like_object: Union[List, np.array, pd.Series, pd.DataFrame] ) -> pd.Series: - if isinstance(array_like_object, pd.Series): + if isinstance(array_like_object, pd.DataFrame): + return self.pandas_converter.transform(array_like_object.iloc[:, 0]) + elif isinstance(array_like_object, pd.Series): return self.pandas_converter.transform(array_like_object) elif any( isinstance(array_like_object, type_) for type_ in SUPPORTED_SEQUENCE_TYPES From 323269fd25791b2b21ad83a302e84a743d14ea20 Mon Sep 17 00:00:00 2001 From: Stefano Date: Thu, 19 Dec 2019 17:19:43 +0100 Subject: [PATCH 3/5] Added quick-start notebook Signed-off-by: Stefano --- examples/quick-start.ipynb | 577 +++++++++++++++++++++++++++++++++++++ 1 file changed, 577 insertions(+) create mode 100644 examples/quick-start.ipynb diff --git a/examples/quick-start.ipynb b/examples/quick-start.ipynb new file mode 100644 index 0000000..c24d038 --- /dev/null +++ b/examples/quick-start.ipynb @@ -0,0 +1,577 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T10:34:45.079040Z", + "start_time": "2019-12-19T10:34:45.075853Z" + } + }, + "source": [ + "# Giotto-Time\n", + "\n", + "Welcome to `giotto-time`, our new library for time series forecasting!\n", + "\n", + "Let's start with an example." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T10:37:13.829605Z", + "start_time": "2019-12-19T10:37:13.827033Z" + } + }, + "source": [ + "## First example" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T10:51:37.701263Z", + "start_time": "2019-12-19T10:51:37.698686Z" + } + }, + "source": [ + "### Ingredients" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T10:43:03.249232Z", + "start_time": "2019-12-19T10:43:03.244743Z" + } + }, + "source": [ + "These are the main ingredients of `giotto-time`:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:12:59.958832Z", + "start_time": "2019-12-19T14:12:59.307286Z" + } + }, + "outputs": [], + "source": [ + "from giottotime.time_series_preparation import TimeSeriesPreparation\n", + "from giottotime.feature_creation import FeatureCreation, ShiftFeature, MovingAverageFeature\n", + "from giottotime.model_selection import FeatureSplitter\n", + "from giottotime.models import GAR" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T10:43:23.300668Z", + "start_time": "2019-12-19T10:43:23.100775Z" + } + }, + "source": [ + "- `TimeSeriesPreparation`: checks the input format of the time series and converts it to the expected format.\n", + "- `FeatureCreation`, `ShiftFeature`, `MovingAverageFeature`: create the desired features on the time series that are \n", + " used for the forecasting.\n", + "- `FeatureSplitter`: prepares the custom `giotto-time` train-test matrices that are used in the model\n", + "- `GAR`: generalized-auto-regressive model. This is the only time series model that we provide for the first release." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T10:50:42.797962Z", + "start_time": "2019-12-19T10:50:42.792529Z" + } + }, + "source": [ + "We also need a `scikit-learn`-model. We go for a standard linear regressor for this example" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:13:00.294607Z", + "start_time": "2019-12-19T14:13:00.291612Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T10:51:15.298065Z", + "start_time": "2019-12-19T10:51:15.295733Z" + } + }, + "source": [ + "### Data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T10:55:55.362286Z", + "start_time": "2019-12-19T10:55:55.358045Z" + } + }, + "source": [ + "We use the `pandas.testing` module to create a testing time series" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:13:00.964858Z", + "start_time": "2019-12-19T14:13:00.961460Z" + } + }, + "outputs": [], + "source": [ + "def test_time_series():\n", + " from pandas.util import testing as testing\n", + " \n", + " testing.N, testing.K = 500, 1\n", + " df = testing.makeTimeDataFrame( freq=\"D\" )\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:13:01.303804Z", + "start_time": "2019-12-19T14:13:01.299245Z" + } + }, + "outputs": [], + "source": [ + "time_series = test_time_series()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T13:37:09.941132Z", + "start_time": "2019-12-19T13:37:09.938476Z" + } + }, + "source": [ + "### Time Series Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + 
"ExecuteTime": { + "end_time": "2019-12-19T13:37:33.357619Z", + "start_time": "2019-12-19T13:37:33.347192Z" + } + }, + "source": [ + "The input time series has to be a `pandas.DataFrame` with a `PeriodIndex`. Use the provided class `TimeSeriesPreparation` to convert the time series into this format." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:13:02.320075Z", + "start_time": "2019-12-19T14:13:02.317384Z" + } + }, + "outputs": [], + "source": [ + "time_series_preparation = TimeSeriesPreparation()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:13:04.685763Z", + "start_time": "2019-12-19T14:13:04.681195Z" + } + }, + "outputs": [], + "source": [ + "period_index_time_series = time_series_preparation.transform(time_series)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:15:31.332440Z", + "start_time": "2019-12-19T14:15:31.322583Z" + } + }, + "source": [ + "### Feature Creation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:20:25.312078Z", + "start_time": "2019-12-19T14:20:25.307741Z" + } + }, + "source": [ + "The feature creation part is one of the core parts of our library and the bridge between traditional time series forecasting techniques and machine learning.\n", + "\n", + "Starting with a time series in a `pandas.DataFrame`, we create two matrices `X` and `y` which can be used for training and testing." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:49:59.729021Z", + "start_time": "2019-12-19T14:49:59.718573Z" + } + }, + "source": [ + "We provide 12 different features. For simplicity we train a model using only `ShiftFeature` and `MovingAverageFeature`. \n", + "\n", + "`ShiftFeature` provides a temporal shift of the time series. 
Adding two `ShiftFeature` with shifts 1 and 2 is equivalent to an `AR(2)` model. \n", + "\n", + "The possibility to add the features that you want allows you to choose the model that best fits your data." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:17:42.856996Z", + "start_time": "2019-12-19T14:17:42.853237Z" + } + }, + "outputs": [], + "source": [ + "features = [\n", + " ShiftFeature(1, output_name='shift_1'),\n", + " ShiftFeature(2, output_name='shift_2'),\n", + " MovingAverageFeature(3, output_name='moving_average_3'),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:18:17.215100Z", + "start_time": "2019-12-19T14:18:17.211908Z" + } + }, + "outputs": [], + "source": [ + "feature_creation = FeatureCreation(time_series_features=features, horizon=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:21:07.990558Z", + "start_time": "2019-12-19T14:21:07.970206Z" + } + }, + "outputs": [], + "source": [ + "features_X, features_y = feature_creation.fit_transform(period_index_time_series)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:42:32.549572Z", + "start_time": "2019-12-19T14:42:32.547124Z" + } + }, + "source": [ + "### Train-Test split" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:43:02.820280Z", + "start_time": "2019-12-19T14:43:02.817384Z" + } + }, + "outputs": [], + "source": [ + "feature_splitter = FeatureSplitter()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:43:40.401560Z", + "start_time": "2019-12-19T14:43:40.380814Z" + } + }, + "outputs": [], + "source": [ + "X_train, y_train, X_test, y_test = 
feature_splitter.transform(features_X, features_y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:44:02.820817Z", + "start_time": "2019-12-19T14:44:02.818276Z" + } + }, + "source": [ + "### Training" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:45:09.230395Z", + "start_time": "2019-12-19T14:45:09.227402Z" + } + }, + "outputs": [], + "source": [ + "model = GAR(base_model=LinearRegression())" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:45:09.467974Z", + "start_time": "2019-12-19T14:45:09.458956Z" + } + }, + "outputs": [], + "source": [ + "model = model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:45:20.428649Z", + "start_time": "2019-12-19T14:45:20.414290Z" + } + }, + "outputs": [], + "source": [ + "predictions = model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T15:49:35.236013Z", + "start_time": "2019-12-19T15:49:35.225037Z" + }, + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
y_0y_1y_2
2001-05-130.498604-1.11394NaN
2001-05-14-1.113940NaNNaN
\n", + "
" + ], + "text/plain": [ + " y_0 y_1 y_2\n", + "2001-05-13 0.498604 -1.11394 NaN\n", + "2001-05-14 -1.113940 NaN NaN" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_test" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-19T14:45:22.160170Z", + "start_time": "2019-12-19T14:45:22.152303Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
y_0y_1y_2
2001-05-130.1689570.0241960.083588
2001-05-140.0192400.0757770.110530
\n", + "
" + ], + "text/plain": [ + " y_0 y_1 y_2\n", + "2001-05-13 0.168957 0.024196 0.083588\n", + "2001-05-14 0.019240 0.075777 0.110530" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From bee8c3f440f8fe56413c175f8a453b912dd6f465 Mon Sep 17 00:00:00 2001 From: Stefano Date: Fri, 20 Dec 2019 12:13:50 +0100 Subject: [PATCH 4/5] Basic example done Signed-off-by: Stefano --- examples/quick-start.ipynb | 233 +++++++++++++++++++------------------ 1 file changed, 118 insertions(+), 115 deletions(-) diff --git a/examples/quick-start.ipynb b/examples/quick-start.ipynb index c24d038..ad099c0 100644 --- a/examples/quick-start.ipynb +++ b/examples/quick-start.ipynb @@ -57,8 +57,8 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2019-12-19T14:12:59.958832Z", - "start_time": "2019-12-19T14:12:59.307286Z" + "end_time": "2019-12-20T11:08:40.433188Z", + "start_time": "2019-12-20T11:08:39.863805Z" } }, "outputs": [], @@ -102,8 +102,8 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2019-12-19T14:13:00.294607Z", - "start_time": "2019-12-19T14:13:00.291612Z" + "end_time": "2019-12-20T11:08:41.268423Z", + "start_time": "2019-12-20T11:08:41.265378Z" } }, 
"outputs": [], @@ -140,8 +140,8 @@ "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2019-12-19T14:13:00.964858Z", - "start_time": "2019-12-19T14:13:00.961460Z" + "end_time": "2019-12-20T11:08:42.074384Z", + "start_time": "2019-12-20T11:08:42.070697Z" } }, "outputs": [], @@ -159,8 +159,8 @@ "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2019-12-19T14:13:01.303804Z", - "start_time": "2019-12-19T14:13:01.299245Z" + "end_time": "2019-12-20T11:08:42.366492Z", + "start_time": "2019-12-20T11:08:42.361791Z" } }, "outputs": [], @@ -197,8 +197,8 @@ "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2019-12-19T14:13:02.320075Z", - "start_time": "2019-12-19T14:13:02.317384Z" + "end_time": "2019-12-20T11:08:43.161252Z", + "start_time": "2019-12-20T11:08:43.158360Z" } }, "outputs": [], @@ -208,11 +208,11 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2019-12-19T14:13:04.685763Z", - "start_time": "2019-12-19T14:13:04.681195Z" + "end_time": "2019-12-20T11:08:43.428293Z", + "start_time": "2019-12-20T11:08:43.421929Z" } }, "outputs": [], @@ -264,11 +264,11 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2019-12-19T14:17:42.856996Z", - "start_time": "2019-12-19T14:17:42.853237Z" + "end_time": "2019-12-20T11:08:44.450001Z", + "start_time": "2019-12-20T11:08:44.446647Z" } }, "outputs": [], @@ -282,11 +282,11 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2019-12-19T14:18:17.215100Z", - "start_time": "2019-12-19T14:18:17.211908Z" + "end_time": "2019-12-20T11:08:44.737915Z", + "start_time": "2019-12-20T11:08:44.734648Z" } }, "outputs": [], @@ -296,11 +296,11 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": 
"2019-12-19T14:21:07.990558Z", - "start_time": "2019-12-19T14:21:07.970206Z" + "end_time": "2019-12-20T11:08:45.045070Z", + "start_time": "2019-12-20T11:08:45.022402Z" } }, "outputs": [], @@ -320,13 +320,25 @@ "### Train-Test split" ] }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-20T10:59:18.112521Z", + "start_time": "2019-12-20T10:59:18.108823Z" + } + }, + "source": [ + "We use `FeatureSplitter` to split the matrices X and y into train and test. " + ] + }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 10, "metadata": { "ExecuteTime": { - "end_time": "2019-12-19T14:43:02.820280Z", - "start_time": "2019-12-19T14:43:02.817384Z" + "end_time": "2019-12-20T11:08:45.885739Z", + "start_time": "2019-12-20T11:08:45.882557Z" } }, "outputs": [], @@ -336,11 +348,11 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2019-12-19T14:43:40.401560Z", - "start_time": "2019-12-19T14:43:40.380814Z" + "end_time": "2019-12-20T11:08:46.240108Z", + "start_time": "2019-12-20T11:08:46.221414Z" } }, "outputs": [], @@ -360,13 +372,31 @@ "### Training" ] }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-20T11:01:12.922844Z", + "start_time": "2019-12-20T11:01:12.919591Z" + } + }, + "source": [ + "We provide a `GAR` (Generalized Auto Regressive) model to forecast the time series.\n", + "\n", + "The traditional `AR` model is equivalent to our `GAR` model that uses only `ShiftFeature` columns in the `X` matrix.\n", + "`GAR` supports all the features compatible with the feature creation step.\n", + "\n", + "Moreover, `GAR` internally uses a `scikit-learn` compatible model for the internal time series regression. \n", + "In this example we use `LinearRegression`. A priori all the `fit-transform-predict` models are compatible (e.g. ridge regression, random forest, boosting, etc.). 
" + ] + }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 12, "metadata": { "ExecuteTime": { - "end_time": "2019-12-19T14:45:09.230395Z", - "start_time": "2019-12-19T14:45:09.227402Z" + "end_time": "2019-12-20T11:08:47.555831Z", + "start_time": "2019-12-20T11:08:47.553017Z" } }, "outputs": [], @@ -376,11 +406,11 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 13, "metadata": { "ExecuteTime": { - "end_time": "2019-12-19T14:45:09.467974Z", - "start_time": "2019-12-19T14:45:09.458956Z" + "end_time": "2019-12-20T11:08:48.059122Z", + "start_time": "2019-12-20T11:08:48.050062Z" } }, "outputs": [], @@ -389,95 +419,54 @@ ] }, { - "cell_type": "code", - "execution_count": 32, + "cell_type": "markdown", "metadata": { "ExecuteTime": { - "end_time": "2019-12-19T14:45:20.428649Z", - "start_time": "2019-12-19T14:45:20.414290Z" + "end_time": "2019-12-20T11:01:01.280526Z", + "start_time": "2019-12-20T11:01:01.278125Z" } }, - "outputs": [], "source": [ - "predictions = model.predict(X_test)" + "### Forecasting" ] }, { - "cell_type": "code", - "execution_count": 36, + "cell_type": "markdown", "metadata": { "ExecuteTime": { - "end_time": "2019-12-19T15:49:35.236013Z", - "start_time": "2019-12-19T15:49:35.225037Z" - }, - "scrolled": true + "end_time": "2019-12-20T11:10:02.544672Z", + "start_time": "2019-12-20T11:10:02.540859Z" + } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
y_0y_1y_2
2001-05-130.498604-1.11394NaN
2001-05-14-1.113940NaNNaN
\n", - "
" - ], - "text/plain": [ - " y_0 y_1 y_2\n", - "2001-05-13 0.498604 -1.11394 NaN\n", - "2001-05-14 -1.113940 NaN NaN" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" + "source": [ + "We forecast 3 time steps of the time series (we set this parameter in `FeatureCreation`).\n", + "\n", + "The format of the output is the following:\n", + "- the index is the step at which the prediction is made.\n", + "- the column `y_1` is the prediction one time step after and so on for `y_2` and `y_3`" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2019-12-20T11:08:48.939181Z", + "start_time": "2019-12-20T11:08:48.931145Z" } - ], + }, + "outputs": [], "source": [ - "y_test" + "predictions = model.predict(X_test)" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 15, "metadata": { "ExecuteTime": { - "end_time": "2019-12-19T14:45:22.160170Z", - "start_time": "2019-12-19T14:45:22.152303Z" + "end_time": "2019-12-20T11:08:50.014625Z", + "start_time": "2019-12-20T11:08:49.989948Z" } }, "outputs": [ @@ -502,35 +491,42 @@ " \n", " \n", " \n", - " y_0\n", " y_1\n", " y_2\n", + " y_3\n", " \n", " \n", " \n", " \n", + " 2001-05-12\n", + " -0.149298\n", + " -0.164899\n", + " -0.092473\n", + " \n", + " \n", " 2001-05-13\n", - " 0.168957\n", - " 0.024196\n", - " 0.083588\n", + " -0.150681\n", + " -0.085710\n", + " -0.063871\n", " \n", " \n", " 2001-05-14\n", - " 0.019240\n", - " 0.075777\n", - " 0.110530\n", + " -0.066199\n", + " -0.134353\n", + " -0.095745\n", " \n", " \n", "\n", "" ], "text/plain": [ - " y_0 y_1 y_2\n", - "2001-05-13 0.168957 0.024196 0.083588\n", - "2001-05-14 0.019240 0.075777 0.110530" + " y_1 y_2 y_3\n", + "2001-05-12 -0.149298 -0.164899 -0.092473\n", + "2001-05-13 -0.150681 -0.085710 -0.063871\n", + "2001-05-14 -0.066199 -0.134353 -0.095745" ] }, - "execution_count": 33, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ 
-538,6 +534,13 @@ "source": [ "predictions" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 8484f61b86b96bf9487ad99bff5789750873bba4 Mon Sep 17 00:00:00 2001 From: Stefano Date: Fri, 20 Dec 2019 17:11:52 +0100 Subject: [PATCH 5/5] Added examples Signed-off-by: Stefano --- .gitignore | 1 + .../time_series_preparation/__init__.py | 2 +- .../tests/test_time_series_conversion.py | 14 ++++ .../time_series_conversion.py | 73 ++++++++++++++++++- .../time_series_preparation.py | 49 +++++++++++++ 5 files changed, 136 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 632e72b..0ea875b 100644 --- a/.gitignore +++ b/.gitignore @@ -78,6 +78,7 @@ target/ # Jupyter NB Checkpoints .ipynb_checkpoints/ +Untitled* # exclude data from source control by default /data/ diff --git a/giottotime/time_series_preparation/__init__.py b/giottotime/time_series_preparation/__init__.py index de21fa1..d59471a 100755 --- a/giottotime/time_series_preparation/__init__.py +++ b/giottotime/time_series_preparation/__init__.py @@ -1,6 +1,6 @@ """ The :mod:`giottotime.feature_creation` module deals with the preparation of time series -data, such as index conversions and resampling. +data, such as conversion to `pandas.DataFrame` with a `PeriodIndex`. 
""" from .time_series_conversion import ( diff --git a/giottotime/time_series_preparation/tests/test_time_series_conversion.py b/giottotime/time_series_preparation/tests/test_time_series_conversion.py index f23265a..884eb5d 100644 --- a/giottotime/time_series_preparation/tests/test_time_series_conversion.py +++ b/giottotime/time_series_preparation/tests/test_time_series_conversion.py @@ -332,6 +332,20 @@ def test_only_timedelta_index_as_input(self, timedelta_index_series: pd.Series): ) assert_series_equal(computed_series, expected_series) + def test_basic_timedelta_index_as_input(self): + timedelta_index_series = pd.Series( + index=pd.timedelta_range(start=pd.Timedelta(days=1), freq="10D", periods=3), + data=[1, 2, 3], + ) + expected_series = pd.Series( + index=pd.PeriodIndex(["1970-01-02", "1970-01-12", "1970-01-22"], freq="D"), + data=[1, 2, 3], + ) + computed_series = transform_time_index_series_into_period_index_series( + timedelta_index_series + ) + assert_series_equal(computed_series, expected_series) + @given(series_with_timedelta_index(), available_freqs()) def test_timedelta_index_and_freq_as_input( self, timedelta_index_series: pd.Series, freq: pd.Timedelta diff --git a/giottotime/time_series_preparation/time_series_conversion.py b/giottotime/time_series_preparation/time_series_conversion.py index bf0c2f1..cd04546 100644 --- a/giottotime/time_series_preparation/time_series_conversion.py +++ b/giottotime/time_series_preparation/time_series_conversion.py @@ -186,6 +186,19 @@ class SequenceToTimeIndexSeries(TimeSeriesConversion): frequency of the output time series. Not mandatory for all time series conversion. 
+ Examples + -------- + >>> from giottotime.time_series_preparation import SequenceToTimeIndexSeries + >>> time_series = [1,2,3,5,5,7] + >>> sequence_to_time_index = SequenceToTimeIndexSeries(start='01-01-2010', freq='10D') + >>> sequence_to_time_index.transform(time_series) + 2010-01-01 1 + 2010-01-11 2 + 2010-01-21 3 + 2010-01-31 5 + 2010-02-10 5 + 2010-02-20 7 + Freq: 10D, dtype: int64 """ def __init__( @@ -209,7 +222,7 @@ def _get_values_from( class PandasSeriesToTimeIndexSeries(TimeSeriesConversion): """Returns a Pandas Series with time index (DatetimeIndex, TimedeltaIndex or - PeriodIndex from a standard Pandas Series + PeriodIndex) from a standard Pandas Series Parameters ---------- @@ -222,6 +235,20 @@ class PandasSeriesToTimeIndexSeries(TimeSeriesConversion): freq : pd.Timedelta``, optional, default: ``None`` The frequency of the time series. + Examples + -------- + >>> import pandas as pd + >>> from giottotime.time_series_preparation import PandasSeriesToTimeIndexSeries + >>> time_series = pd.Series([1,2,3,5,5,7]) + >>> sequence_to_time_index = PandasSeriesToTimeIndexSeries(start='01-01-2010', freq='10D') + >>> sequence_to_time_index.transform(time_series) + 2010-01-01 1 + 2010-01-11 2 + 2010-01-21 3 + 2010-01-31 5 + 2010-02-10 5 + 2010-02-20 7 + Freq: 10D, dtype: int64 """ def __init__( @@ -255,7 +282,8 @@ def _has_time_index(self, time_series: pd.Series) -> bool: class TimeIndexSeriesToPeriodIndexSeries(TimeSeriesConversion): - """Converts a series with a time index to a series with a PeriodIndex. + """Converts a series with a time index (DatetimeIndex, TimedeltaIndex or + PeriodIndex) to a series with a PeriodIndex. It may be necessary to specify a `freq` if not already provided. @@ -264,6 +292,47 @@ class TimeIndexSeriesToPeriodIndexSeries(TimeSeriesConversion): freq : pd.Timedelta, optional, default: ``None`` The frequency of the time series. 
+ Examples + -------- + >>> import pandas as pd + >>> from giottotime.time_series_preparation import TimeIndexSeriesToPeriodIndexSeries + >>> period_index_time_series = pd.Series( + ... index = pd.period_range(start='01-01-2010', freq='10D', periods=6), + ... data=[1,2,3,5,5,7] + ... ) + >>> datetime_index_time_series = pd.Series( + ... index = pd.date_range(start='01-01-2010', freq='10D', periods=6), + ... data=[1,2,3,5,5,7] + ... ) + >>> timedelta_index_time_series = pd.Series( + ... index = pd.timedelta_range(start=pd.Timedelta(days=1), freq='10D', periods=6), + ... data=[1,2,3,5,5,7] + ... ) + >>> sequence_to_time_index = TimeIndexSeriesToPeriodIndexSeries() + >>> sequence_to_time_index.transform(period_index_time_series) + 2010-01-01 1 + 2010-01-11 2 + 2010-01-21 3 + 2010-01-31 5 + 2010-02-10 5 + 2010-02-20 7 + Freq: 10D, dtype: int64 + >>> sequence_to_time_index.transform(datetime_index_time_series) + 2010-01-01 1 + 2010-01-11 2 + 2010-01-21 3 + 2010-01-31 5 + 2010-02-10 5 + 2010-02-20 7 + Freq: 10D, dtype: int64 + >>> sequence_to_time_index.transform(timedelta_index_time_series) + 1970-01-02 1 + 1970-01-12 2 + 1970-01-22 3 + 1970-02-01 5 + 1970-02-11 5 + 1970-02-21 7 + Freq: D, dtype: int64 """ def __init__(self, freq: Optional[pd.Timedelta] = None): diff --git a/giottotime/time_series_preparation/time_series_preparation.py b/giottotime/time_series_preparation/time_series_preparation.py index 7b63914..d5685fe 100644 --- a/giottotime/time_series_preparation/time_series_preparation.py +++ b/giottotime/time_series_preparation/time_series_preparation.py @@ -51,6 +51,55 @@ class TimeSeriesPreparation: ValueError Of the three parameters: start, end, and periods, exactly two must be specified. + Examples + -------- + >>> time_series = [1,2,3,5,5,7] + >>> period_index_time_series = pd.Series( + ... index = pd.period_range(start='01-01-2010', freq='10D', periods=6), + ... data=[1,2,3,5,5,7] + ... ) + >>> datetime_index_time_series = pd.Series( + ... 
index = pd.date_range(start='01-01-2010', freq='10D', periods=6), + ... data=[1,2,3,5,5,7] + ... ) + >>> timedelta_index_time_series = pd.Series( + ... index = pd.timedelta_range(start=pd.Timedelta(days=1), freq='10D', periods=6), + ... data=[1,2,3,5,5,7] + ... ) + >>> time_series_preparation = TimeSeriesPreparation() + >>> time_series_preparation.transform(time_series) + time_series + 1970-01-01 1 + 1970-01-02 2 + 1970-01-03 3 + 1970-01-04 5 + 1970-01-05 5 + 1970-01-06 7 + >>> time_series_preparation.transform(period_index_time_series) + time_series + 2010-01-01 1 + 2010-01-11 2 + 2010-01-21 3 + 2010-01-31 5 + 2010-02-10 5 + 2010-02-20 7 + >>> time_series_preparation.transform(datetime_index_time_series) + time_series + 2010-01-01 1 + 2010-01-11 2 + 2010-01-21 3 + 2010-01-31 5 + 2010-02-10 5 + 2010-02-20 7 + >>> time_series_preparation.transform(timedelta_index_time_series) + time_series + 1970-01-02 1 + 1970-01-12 2 + 1970-01-22 3 + 1970-02-01 5 + 1970-02-11 5 + 1970-02-21 7 + """ def __init__(