From e7d64dab82864deed4b3a3f16f5007fec0b5202f Mon Sep 17 00:00:00 2001 From: vecxoz Date: Mon, 12 Aug 2019 16:07:25 +0300 Subject: [PATCH 1/3] v0.4.0 --- .travis.yml | 7 +- CHANGELOG.md | 53 ++++++++ LICENSE.txt | 2 +- PY2.md | 22 ++++ README.md | 15 +-- .../04_sklearn_api_regression_pipeline.ipynb | 4 +- setup.py | 13 +- tests/test_func_api_classification_binary.py | 68 ++++++++++ ...test_func_api_classification_multiclass.py | 69 +++++++++++ tests/test_func_api_regression.py | 117 +++++++++++++++++- tests/test_sklearn_api_regression.py | 68 ++++++++++ vecstack/__init__.py | 2 +- vecstack/core.py | 32 ++--- vecstack/coresk.py | 27 ++-- 14 files changed, 454 insertions(+), 45 deletions(-) create mode 100644 CHANGELOG.md create mode 100644 PY2.md diff --git a/.travis.yml b/.travis.yml index 6eeab9e..30d59e9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,10 +2,14 @@ # check if .travis.yml is valid: http://lint.travis-ci.org/ # to skip build for given commit put [ci skip] or [skip ci] in commit message +# required for Python >= 3.7 +dist: xenial + language: python -# versions supported by scikit-learn +# versions supported by scikit-learn and some additional versions python: + - "3.7" - "3.6" - "3.5" - "3.4" @@ -16,6 +20,7 @@ branches: only: - master - dev + - py2 install: - pip install numpy diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..c3afb9b --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,53 @@ +# Changelog + +### v0.4.0 -- August 12, 2019 + +Since v0.4.0 vecstack provides official support for Python 3.5 and higher only, +but still there is unofficial support for Python 2.7 and Python 3.4. +Please see [details](https://github.com/vecxoz/vecstack/blob/master/PY2.md). + +Scikit-learn API: +* Fixed #31. `sklearn.externals.six` deprecation +* Fixed #29. Out-of-memory in `np.random.choice` for very large ranges + +Functional API: +* Feature #18. Added support for N-dimensional input. Useful for convolutional nets. 
+* Added aliases for `mode` parameter values which correspond to respective `variant` parameter values of `StackingTransformer`: + * 'oof_pred_bag' == 'A' + * 'oof_pred' == 'B' + +### v0.3.0 -- April 6, 2018 + +Introducing Scikit-learn API: `StackingTransformer` + +* Standard transformer class with `fit` and `transform` methods +* Compatible with `Pipeline` and `FeatureUnion` + +### v0.2.2 -- February 23, 2018 + +* Fixed #5. Wrong behavior during sparse matrix processing +* Improved input data validation +* Improved sparse matrix processing + +### v0.2.1 -- January 24, 2018 -- Maintenance release + +* Minor modifications + +### v0.2 -- January 23, 2018 + +New features: + +* Classification with probabilities +* Modes: compute only what you need (only OOF, only predictions, both, etc.) +* Save resulting arrays and log with model parameters + +### v0.1 -- November 22, 2016 -- Initial release + +Features: + +* Functional stacking API +* Regression +* Classification with class labels +* Ordinary and stratified k-fold split +* User-defined metric +* User-defined transformations for target and prediction diff --git a/LICENSE.txt b/LICENSE.txt index 5dd5141..38165f0 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,7 +1,7 @@ MIT License Vecstack. Python package for stacking (machine learning technique) -Copyright (c) 2016-2018 Igor Ivanov +Copyright (c) 2016-2019 Igor Ivanov Email: vecxoz@gmail.com Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/PY2.md b/PY2.md new file mode 100644 index 0000000..94c5945 --- /dev/null +++ b/PY2.md @@ -0,0 +1,22 @@ +### Python 3.x + +Since v0.4.0 vecstack provides official support for Python 3.5 and higher only, +but still there is unofficial support for Python 2.7 and Python 3.4. See details below. + +The reason for these changes is global movement in Python 3.x direction. +Vecstack depends on scikit-learn which has already stopped support for Python < 3.5. 
+Scikit-learn v0.20.x is the last version supporting Python 2.7 and Python 3.4. +Vecstack follows this direction as well. +Please see [python3statement.org](https://python3statement.org/) for more details. + +### Unofficial support for Python 2.7 and Python 3.4 + +You can still install and run latest vecstack on Python 2.7 and Python 3.4. +NOTE. It will require legacy versions of the following packages: +* numpy<1.17 +* scipy<1.3 +* scikit-learn>=0.18,<0.21 +There is a dedicated branch on GitHub called `py2` with appropriate requirements in `setup.py`. +Installation: + +`pip install https://github.com/vecxoz/vecstack/archive/py2.zip` \ No newline at end of file diff --git a/README.md b/README.md index c49b925..585a1d5 100644 --- a/README.md +++ b/README.md @@ -22,10 +22,10 @@ Convenient way to automate OOF computation, prediction and bagging using any num * Predict [class labels or probabilities](https://github.com/vecxoz/vecstack/blob/master/vecstack/coresk.py#L119) in classification task * Apply any [user-defined metric](https://github.com/vecxoz/vecstack/blob/master/vecstack/coresk.py#L124) * Apply any [user-defined transformations](https://github.com/vecxoz/vecstack/blob/master/vecstack/coresk.py#L87) for target and prediction - * Python 2, Python 3 + * Python 3.5 and higher, [unofficial support for Python 2.7 and 3.4](https://github.com/vecxoz/vecstack/blob/master/PY2.md) * Win, Linux, Mac * [MIT license](https://github.com/vecxoz/vecstack/blob/master/LICENSE.txt) - * Depends on **numpy**, **scipy**, **scikit-learn>=18.0** + * Depends on **numpy**, **scipy**, **scikit-learn>=0.18** # Get started * [FAQ](https://github.com/vecxoz/vecstack#stacking-faq) @@ -292,14 +292,15 @@ Stacking API comparison: | Estimator implementation restrictions | Must have only `fit` and `predict` (`predict_proba`) methods | Must be fully scikit-learn compatible | | `NaN` and `inf` in input data | Allowed | Not allowed | | Can automatically save OOF and log in files | Yes | No | +| 
Input dimensionality (`X_train`, `X_test`) | Arbitrary | 2-D | ### 21. How do parameters of `stacking` function and `StackingTransformer` correspond? -| **stacking function** | **StackingTransformer** | -|-------------------------|-----------------------------------| -| `models=[Ridge()]` | `estimators=[('ridge', Ridge())]` | -| `mode='oof_pred_bag'` | `variant='A'` | -| `mode='oof_pred'` | `variant='B'` | +| **stacking function** | **StackingTransformer** | +|---------------------------------------|-----------------------------------| +| `models=[Ridge()]` | `estimators=[('ridge', Ridge())]` | +| `mode='oof_pred_bag'` (alias `'A'`) | `variant='A'` | +| `mode='oof_pred'` (alias `'B'`) | `variant='B'` | ### 22. Why Scikit-learn API was implemented as transformer and not predictor? diff --git a/examples/04_sklearn_api_regression_pipeline.ipynb b/examples/04_sklearn_api_regression_pipeline.ipynb index 9a63df8..f29f146 100644 --- a/examples/04_sklearn_api_regression_pipeline.ipynb +++ b/examples/04_sklearn_api_regression_pipeline.ipynb @@ -512,7 +512,7 @@ "source": [ "# 2. Pipeline\n", "\n", - "StackingTransformer is fully scikit-learn compatible so we can easily implement **arbitrary number of stacking layers** using Pipeline\n" + "StackingTransformer is fully scikit-learn compatible so we can easily implement **arbitrary number of stacking levels** using Pipeline\n" ] }, { @@ -535,7 +535,7 @@ "metadata": {}, "outputs": [], "source": [ - "# If we have several stacking layers our Pipeline steps would be:\n", + "# If we have several stacking levels our Pipeline steps would be:\n", "# steps = [('stack_L1', stack_L1),\n", "# ('stack_L2', stack_L2),\n", "# ('stack_L99', stack_L99), # :-)\n", diff --git a/setup.py b/setup.py index 7358454..96289c1 100644 --- a/setup.py +++ b/setup.py @@ -2,10 +2,15 @@ from setuptools import setup +long_desc = ''' +Python package for stacking (stacked generalization) featuring lightweight functional API and fully compatible scikit-learn API. 
+Convenient way to automate OOF computation, prediction and bagging using any number of models. +''' + setup(name='vecstack', - version='0.3.0', + version='0.4.0', description='Python package for stacking (machine learning technique)', - long_description='Convenient way to automate OOF computation, prediction and bagging using any number of models', + long_description=long_desc, classifiers=[ 'License :: OSI Approved :: MIT License', 'Operating System :: MacOS', @@ -13,12 +18,10 @@ 'Operating System :: POSIX', 'Operating System :: Unix', 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', 'Topic :: Scientific/Engineering', 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: Scientific/Engineering :: Information Analysis', diff --git a/tests/test_func_api_classification_binary.py b/tests/test_func_api_classification_binary.py index 5cc1a0e..6bc6419 100644 --- a/tests/test_func_api_classification_binary.py +++ b/tests/test_func_api_classification_binary.py @@ -63,6 +63,33 @@ y_test = y[ind_test] +# Create 4-dim data +np.random.seed(42) +X_train_4d = np.random.normal(size=(400, 8, 8, 3)) +X_test_4d = np.random.normal(size=(100, 8, 8, 3)) +y_train_4d = np.random.randint(n_classes, size=400) + +# Reshape 4-dim to 2-dim +X_train_4d_unrolled = X_train_4d.reshape(X_train_4d.shape[0], -1) +X_test_4d_unrolled = X_test_4d.reshape(X_test_4d.shape[0], -1) + +#------------------------------------------------------------------------------ +#------------------------------------------------------------------------------ + +class LogisticRegressionUnrolled(LogisticRegression): + """ + For tests related to N-dim input. 
+ Estimator accepts N-dim array and reshapes it to 2-dim array + """ + def fit(self, X, y): + return super(LogisticRegressionUnrolled, self).fit(X.reshape(X.shape[0], -1), y) + + def predict(self, X): + return super(LogisticRegressionUnrolled, self).predict(X.reshape(X.shape[0], -1)) + + def predict_proba(self, X): + return super(LogisticRegressionUnrolled, self).predict_proba(X.reshape(X.shape[0], -1)) + #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- @@ -775,7 +802,48 @@ def test_oof_pred_bag_mode_proba_2_models(self): assert_array_equal(S_train_1, S_train_3) assert_array_equal(S_test_1, S_test_3) + + def test_N_dim_input(self): + """ + This is `test_oof_pred_bag_mode` function with `LogisticRegressionUnrolled` estimator + """ + S_test_temp = np.zeros((X_test_4d_unrolled.shape[0], n_folds)) + # Using StratifiedKFold because by default cross_val_predict uses StratifiedKFold + kf = StratifiedKFold(n_splits = n_folds, shuffle = False, random_state = 0) + for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train_4d_unrolled, y_train_4d)): + # Split data and target + X_tr = X_train_4d_unrolled[tr_index] + y_tr = y_train_4d[tr_index] + X_te = X_train_4d_unrolled[te_index] + y_te = y_train_4d[te_index] + model = LogisticRegression(random_state=0, solver='liblinear', multi_class='ovr') + _ = model.fit(X_tr, y_tr) + S_test_temp[:, fold_counter] = model.predict(X_test_4d_unrolled) + S_test_1 = st.mode(S_test_temp, axis = 1)[0] + model = LogisticRegression(random_state=0, solver='liblinear', multi_class='ovr') + S_train_1 = cross_val_predict(model, X_train_4d_unrolled, y = y_train_4d, cv = n_folds, + n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1) + + models = [LogisticRegressionUnrolled(random_state=0, solver='liblinear', multi_class='ovr')] + S_train_2, S_test_2 = stacking(models, X_train_4d, y_train_4d, X_test_4d, + regression = False, 
n_folds = n_folds, shuffle = False, save_dir=temp_dir, + mode = 'oof_pred_bag', random_state = 0, verbose = 0, stratified = True) + + # Load OOF from file + # Normally if cleaning is performed there is only one .npy file at given moment + # But if we have no cleaning there may be more then one file so we take the latest + file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file + S = np.load(file_name) + S_train_3 = S[0] + S_test_3 = S[1] + + assert_array_equal(S_train_1, S_train_2) + assert_array_equal(S_test_1, S_test_2) + + assert_array_equal(S_train_1, S_train_3) + assert_array_equal(S_test_1, S_test_3) + #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- diff --git a/tests/test_func_api_classification_multiclass.py b/tests/test_func_api_classification_multiclass.py index f9e5fa5..8e3c3a5 100644 --- a/tests/test_func_api_classification_multiclass.py +++ b/tests/test_func_api_classification_multiclass.py @@ -60,6 +60,33 @@ y_test = y[ind_test] +# Create 4-dim data +np.random.seed(42) +X_train_4d = np.random.normal(size=(400, 8, 8, 3)) +X_test_4d = np.random.normal(size=(100, 8, 8, 3)) +y_train_4d = np.random.randint(n_classes, size=400) + +# Reshape 4-dim to 2-dim +X_train_4d_unrolled = X_train_4d.reshape(X_train_4d.shape[0], -1) +X_test_4d_unrolled = X_test_4d.reshape(X_test_4d.shape[0], -1) + +#------------------------------------------------------------------------------ +#------------------------------------------------------------------------------ + +class LogisticRegressionUnrolled(LogisticRegression): + """ + For tests related to N-dim input. 
+ Estimator accepts N-dim array and reshapes it to 2-dim array + """ + def fit(self, X, y): + return super(LogisticRegressionUnrolled, self).fit(X.reshape(X.shape[0], -1), y) + + def predict(self, X): + return super(LogisticRegressionUnrolled, self).predict(X.reshape(X.shape[0], -1)) + + def predict_proba(self, X): + return super(LogisticRegressionUnrolled, self).predict_proba(X.reshape(X.shape[0], -1)) + #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- @@ -772,7 +799,49 @@ def test_oof_pred_bag_mode_proba_2_models(self): assert_array_equal(S_train_1, S_train_3) assert_array_equal(S_test_1, S_test_3) + + + def test_N_dim_input(self): + """ + This is `test_oof_pred_bag_mode` function with `LogisticRegressionUnrolled` estimator + """ + S_test_temp = np.zeros((X_test_4d_unrolled.shape[0], n_folds)) + # Using StratifiedKFold because by default cross_val_predict uses StratifiedKFold + kf = StratifiedKFold(n_splits = n_folds, shuffle = False, random_state = 0) + for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train_4d_unrolled, y_train_4d)): + # Split data and target + X_tr = X_train_4d_unrolled[tr_index] + y_tr = y_train_4d[tr_index] + X_te = X_train_4d_unrolled[te_index] + y_te = y_train_4d[te_index] + model = LogisticRegression(random_state=0, solver='liblinear', multi_class='ovr') + _ = model.fit(X_tr, y_tr) + S_test_temp[:, fold_counter] = model.predict(X_test_4d_unrolled) + S_test_1 = st.mode(S_test_temp, axis = 1)[0] + model = LogisticRegression(random_state=0, solver='liblinear', multi_class='ovr') + S_train_1 = cross_val_predict(model, X_train_4d_unrolled, y = y_train_4d, cv = n_folds, + n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1) + + models = [LogisticRegressionUnrolled(random_state=0, solver='liblinear', multi_class='ovr')] + S_train_2, S_test_2 = stacking(models, X_train_4d, y_train_4d, X_test_4d, + regression = False, 
n_folds = n_folds, shuffle = False, save_dir=temp_dir, + mode = 'oof_pred_bag', random_state = 0, verbose = 0, stratified = True) + + # Load OOF from file + # Normally if cleaning is performed there is only one .npy file at given moment + # But if we have no cleaning there may be more then one file so we take the latest + file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file + S = np.load(file_name) + S_train_3 = S[0] + S_test_3 = S[1] + + assert_array_equal(S_train_1, S_train_2) + assert_array_equal(S_test_1, S_test_2) + + assert_array_equal(S_train_1, S_train_3) + assert_array_equal(S_test_1, S_test_3) + #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- diff --git a/tests/test_func_api_regression.py b/tests/test_func_api_regression.py index ed6c047..ca4432c 100644 --- a/tests/test_func_api_regression.py +++ b/tests/test_func_api_regression.py @@ -136,7 +136,34 @@ def test_oof_pred_mode(self): assert_array_equal(S_train_1, S_train_3) assert_array_equal(S_test_1, S_test_3) - + + def test_B_mode(self): + """ 'B' is alias for 'oof_pred' """ + model = LinearRegression() + S_train_1 = cross_val_predict(model, X_train, y = y_train, cv = n_folds, + n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1) + _ = model.fit(X_train, y_train) + S_test_1 = model.predict(X_test).reshape(-1, 1) + + models = [LinearRegression()] + S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test, + regression = True, n_folds = n_folds, shuffle = False, save_dir=temp_dir, + mode = 'B', random_state = 0, verbose = 0) + + # Load OOF from file + # Normally if cleaning is performed there is only one .npy file at given moment + # But if we have no cleaning there may be more then one file so we take the latest + file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file + S = np.load(file_name) + S_train_3 = 
S[0] + S_test_3 = S[1] + + assert_array_equal(S_train_1, S_train_2) + assert_array_equal(S_test_1, S_test_2) + + assert_array_equal(S_train_1, S_train_3) + assert_array_equal(S_test_1, S_test_3) + def test_oof_mode(self): model = LinearRegression() @@ -226,6 +253,44 @@ def test_oof_pred_bag_mode(self): assert_array_equal(S_train_1, S_train_3) assert_array_equal(S_test_1, S_test_3) + + def test_A_mode(self): + """ 'A' is alias for 'oof_pred_bag' """ + S_test_temp = np.zeros((X_test.shape[0], n_folds)) + kf = KFold(n_splits = n_folds, shuffle = False, random_state = 0) + for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train, y_train)): + # Split data and target + X_tr = X_train[tr_index] + y_tr = y_train[tr_index] + X_te = X_train[te_index] + y_te = y_train[te_index] + model = LinearRegression() + _ = model.fit(X_tr, y_tr) + S_test_temp[:, fold_counter] = model.predict(X_test) + S_test_1 = np.mean(S_test_temp, axis = 1).reshape(-1, 1) + + model = LinearRegression() + S_train_1 = cross_val_predict(model, X_train, y = y_train, cv = n_folds, + n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1) + + models = [LinearRegression()] + S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test, + regression = True, n_folds = n_folds, shuffle = False, save_dir=temp_dir, + mode = 'A', random_state = 0, verbose = 0) + + # Load OOF from file + # Normally if cleaning is performed there is only one .npy file at given moment + # But if we have no cleaning there may be more then one file so we take the latest + file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file + S = np.load(file_name) + S_train_3 = S[0] + S_test_3 = S[1] + + assert_array_equal(S_train_1, S_train_2) + assert_array_equal(S_test_1, S_test_2) + + assert_array_equal(S_train_1, S_train_3) + assert_array_equal(S_test_1, S_test_3) def test_pred_bag_mode(self): @@ -811,9 +876,9 @@ def test_oof_pred_mode_no_get_params(self): assert_array_equal(S_train_1, 
S_train_3) assert_array_equal(S_test_1, S_test_3) - #------------------------------------------------------------------------------- + #-------------------------------------------------------------------------- # Test inconsistent data shape or type - #------------------------------------------------------------------------------- + #-------------------------------------------------------------------------- def test_inconsistent_data(self): # nan or inf in y y_train_nan = y_train.copy() @@ -828,7 +893,53 @@ def test_inconsistent_data(self): # X_train and y_train shape nismatch assert_raises(ValueError, stacking, [LinearRegression()], X_train, y_train[:10], X_test) + + #--------------------------------------------------------------------------- + # Test small input + #--------------------------------------------------------------------------- + + def test_small_input(self): + """ + This is `test_oof_pred_bag_mode` with small input data + Train: 20 examples + Test: 10 examples + """ + S_test_temp = np.zeros((X_test[:10].shape[0], n_folds)) + kf = KFold(n_splits = n_folds, shuffle = False, random_state = 0) + for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train[:20], y_train[:20])): + # Split data and target + X_tr = X_train[:20][tr_index] + y_tr = y_train[:20][tr_index] + X_te = X_train[:20][te_index] + y_te = y_train[:20][te_index] + model = LinearRegression() + _ = model.fit(X_tr, y_tr) + S_test_temp[:, fold_counter] = model.predict(X_test[:10]) + S_test_1 = np.mean(S_test_temp, axis = 1).reshape(-1, 1) + model = LinearRegression() + S_train_1 = cross_val_predict(model, X_train[:20], y = y_train[:20], cv = n_folds, + n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1) + + models = [LinearRegression()] + S_train_2, S_test_2 = stacking(models, X_train[:20], y_train[:20], X_test[:10], + regression = True, n_folds = n_folds, shuffle = False, save_dir=temp_dir, + mode = 'oof_pred_bag', random_state = 0, verbose = 0) + + # Load OOF from file + # 
Normally if cleaning is performed there is only one .npy file at given moment + # But if we have no cleaning there may be more then one file so we take the latest + file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file + S = np.load(file_name) + S_train_3 = S[0] + S_test_3 = S[1] + + assert_array_equal(S_train_1, S_train_2) + assert_array_equal(S_test_1, S_test_2) + + assert_array_equal(S_train_1, S_train_3) + assert_array_equal(S_test_1, S_test_3) + #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- diff --git a/tests/test_sklearn_api_regression.py b/tests/test_sklearn_api_regression.py index 9b12184..d7e9bad 100644 --- a/tests/test_sklearn_api_regression.py +++ b/tests/test_sklearn_api_regression.py @@ -1640,6 +1640,24 @@ def test_check_identity(self): assert_raises(AssertionError, assert_raises, ValueError, stack._check_identity, X_train) # ``X`` argument is INcorrect - MUST raise assert_raises(ValueError, stack._check_identity, 5) + + # ------------------------------------------------------------------------- + # Test ``_random_choice`` method + # ------------------------------------------------------------------------- + def test_random_choice(self): + # fit then transform + estimators = [('lr', LinearRegression())] + stack = StackingTransformer(estimators, regression=True, + n_folds=n_folds, shuffle=False, + variant='B', random_state=0, + verbose=0) + stack = stack.fit(X_train, y_train) + # very large range - must NOT raise + assert_raises(AssertionError, assert_raises, ValueError, stack._random_choice, 19999999999, 1000) + # ``size`` is less than ``n`` - must NOT raise + assert_raises(AssertionError, assert_raises, ValueError, stack._random_choice, 200, 20) + # ``size`` is greater than ``n`` - MUST raise + assert_raises(ValueError, stack._random_choice, 20, 200) # 
------------------------------------------------------------------------- # Test case where X_test has the same shape as X_train @@ -1678,6 +1696,56 @@ def test_x_test_has_same_shape(self): assert_array_equal(S_train_1, S_train_3) assert_array_equal(S_test_1, S_test_3) + # ------------------------------------------------------------------------- + # Test small input + # ------------------------------------------------------------------------- + + def test_small_input(self): + """ + This is `test_variant_A` with small input data + Train: 20 examples + Test: 10 examples + """ + S_test_temp = np.zeros((X_test[:10].shape[0], n_folds)) + kf = KFold(n_splits=n_folds, shuffle=False, random_state=0) + for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train[:20], y_train[:20])): + # Split data and target + X_tr = X_train[:20][tr_index] + y_tr = y_train[:20][tr_index] + # X_te = X_train[:20][te_index] + # y_te = y_train[:20][te_index] + model = LinearRegression() + model = model.fit(X_tr, y_tr) + S_test_temp[:, fold_counter] = model.predict(X_test[:10]) + S_test_1 = np.mean(S_test_temp, axis=1).reshape(-1, 1) + + model = LinearRegression() + S_train_1 = cross_val_predict(model, X_train[:20], y=y_train[:20], + cv=n_folds, n_jobs=1, verbose=0, + method='predict').reshape(-1, 1) + + # fit then transform + estimators = [('lr', LinearRegression())] + stack = StackingTransformer(estimators, regression=True, + n_folds=n_folds, shuffle=False, + variant='A', random_state=0, + verbose=0) + stack = stack.fit(X_train[:20], y_train[:20]) + S_train_2 = stack.transform(X_train[:20]) + S_test_2 = stack.transform(X_test[:10]) + + # fit_transform + # also check refitting already fitted transformer + S_train_3 = stack.fit_transform(X_train[:20], y_train[:20]) + S_test_3 = stack.transform(X_test[:10]) + + # compare + assert_array_equal(S_train_1, S_train_2) + assert_array_equal(S_test_1, S_test_2) + + assert_array_equal(S_train_1, S_train_3) + assert_array_equal(S_test_1, S_test_3) 
+ # ----------------------------------------------------------------------------- # ----------------------------------------------------------------------------- diff --git a/vecstack/__init__.py b/vecstack/__init__.py index 3ecea4e..3ce96cb 100644 --- a/vecstack/__init__.py +++ b/vecstack/__init__.py @@ -38,7 +38,7 @@ __author__ = 'Igor Ivanov > kaggle.com/vecxoz' __license__ = 'MIT' -__version__ = '0.3.0' +__version__ = '0.4.0' __all__ = ['stacking', 'StackingTransformer'] diff --git a/vecstack/core.py b/vecstack/core.py index 8c1bb4b..5deaf67 100644 --- a/vecstack/core.py +++ b/vecstack/core.py @@ -145,13 +145,13 @@ def stacking(models, X_train, y_train, X_test, task with probabilities model must return probabilities for each class (i.e. two columns). - X_train : numpy array or sparse matrix of shape [n_train_samples, n_features] + X_train : numpy array or sparse matrix of N-dim shape, e.g. 2-dim [n_train_samples, n_features] Training data y_train : numpy 1d array Target values - X_test : numpy array or sparse matrix of shape [n_test_samples, n_features] + X_test : numpy array or sparse matrix of N-dim shape, e.g. 2-dim [n_test_samples, n_features] Test data sample_weight : numpy array of shape [n_train_samples] @@ -186,11 +186,11 @@ def stacking(models, X_train, y_train, X_test, respective backward transformation like numpy.expm1. 
Look at description of parameter transform_target - mode: str, default 'oof_pred_bag' + mode: str, default 'oof_pred_bag' (alias 'A') Note: for detailes see terminology below 'oof' - return only oof - 'oof_pred' - return oof and pred - 'oof_pred_bag' - return oof and bagged pred + 'oof_pred' (alias 'B') - return oof and pred + 'oof_pred_bag' (alias 'A') - return oof and bagged pred 'pred' - return pred only 'pred_bag' - return bagged pred only Terminology: @@ -427,7 +427,7 @@ def your_metric(y_true, y_pred): # regression = bool(regression) # If wrong - if mode not in ['pred', 'pred_bag', 'oof', 'oof_pred', 'oof_pred_bag']: + if mode not in ['pred', 'pred_bag', 'oof', 'oof_pred', 'B', 'oof_pred_bag', 'A']: raise ValueError('Parameter must be set properly') # needs_proba = bool(needs_proba) @@ -511,7 +511,7 @@ def your_metric(y_true, y_pred): #--------------------------------------------------------------------------- # Create empty numpy arrays for OOF #--------------------------------------------------------------------------- - if mode in ['oof_pred', 'oof_pred_bag']: + if mode in ['oof_pred', 'B', 'oof_pred_bag', 'A']: S_train = np.zeros(( X_train.shape[0], len(models) * n_classes )) S_test = np.zeros(( X_test.shape[0], len(models) * n_classes )) elif mode in ['oof']: @@ -542,7 +542,7 @@ def your_metric(y_true, y_pred): print(model_str) # Create empty numpy array, which will contain temporary predictions for test set made in each fold - if mode in ['pred_bag', 'oof_pred_bag']: + if mode in ['pred_bag', 'oof_pred_bag', 'A']: S_test_temp = np.zeros((X_test.shape[0], n_folds * n_classes)) # Create empty array to store scores for each fold (to find mean) @@ -551,7 +551,7 @@ def your_metric(y_true, y_pred): #----------------------------------------------------------------------- # Loop across folds #----------------------------------------------------------------------- - if mode in ['pred_bag', 'oof', 'oof_pred', 'oof_pred_bag']: + if mode in ['pred_bag', 'oof', 
'oof_pred', 'B', 'oof_pred_bag', 'A']: for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train, y_train)): # Split data and target X_tr = X_train[tr_index] @@ -572,11 +572,11 @@ def your_metric(y_true, y_pred): model = clone(model, safe=False) # Fit 1-st level model - if mode in ['pred_bag', 'oof', 'oof_pred', 'oof_pred_bag']: + if mode in ['pred_bag', 'oof', 'oof_pred', 'B', 'oof_pred_bag', 'A']: _ = model_action(model, X_tr, y_tr, None, sample_weight = sample_weight_tr, action = 'fit', transform = transform_target) # Predict out-of-fold part of train set - if mode in ['oof', 'oof_pred', 'oof_pred_bag']: + if mode in ['oof', 'oof_pred', 'B', 'oof_pred_bag', 'A']: if 'predict_proba' == action: col_slice_model = slice(model_counter * n_classes, model_counter * n_classes + n_classes) else: @@ -584,7 +584,7 @@ def your_metric(y_true, y_pred): S_train[te_index, col_slice_model] = model_action(model, None, None, X_te, action = action, transform = transform_pred) # Predict full test set in each fold - if mode in ['pred_bag', 'oof_pred_bag']: + if mode in ['pred_bag', 'oof_pred_bag', 'A']: if 'predict_proba' == action: col_slice_fold = slice(fold_counter * n_classes, fold_counter * n_classes + n_classes) else: @@ -592,7 +592,7 @@ def your_metric(y_true, y_pred): S_test_temp[:, col_slice_fold] = model_action(model, None, None, X_test, action = action, transform = transform_pred) # Compute scores - if mode in ['oof', 'oof_pred', 'oof_pred_bag']: + if mode in ['oof', 'oof_pred', 'B', 'oof_pred_bag', 'A']: if save_dir is not None or verbose > 0: score = metric(y_te, S_train[te_index, col_slice_model]) scores = np.append(scores, score) @@ -603,7 +603,7 @@ def your_metric(y_true, y_pred): print(fold_str) # Compute mean or mode of predictions for test set in bag modes - if mode in ['pred_bag', 'oof_pred_bag']: + if mode in ['pred_bag', 'oof_pred_bag', 'A']: if 'predict_proba' == action: # Here we copute means of probabilirties for each class for class_id in 
range(n_classes): @@ -615,7 +615,7 @@ def your_metric(y_true, y_pred): S_test[:, model_counter] = st.mode(S_test_temp, axis = 1)[0].ravel() # Compute scores: mean + std and full - if mode in ['oof', 'oof_pred', 'oof_pred_bag']: + if mode in ['oof', 'oof_pred', 'B', 'oof_pred_bag', 'A']: if save_dir is not None or verbose > 0: sep_str = ' ----' mean_str = ' MEAN: [%.8f] + [%.8f]' % (np.mean(scores), np.std(scores)) @@ -630,7 +630,7 @@ def your_metric(y_true, y_pred): print(full_str) # Fit model on full train set and predict test set - if mode in ['pred', 'oof_pred']: + if mode in ['pred', 'oof_pred', 'B']: if verbose > 0: print(' Fitting on full train set...\n') _ = model_action(model, X_train, y_train, None, sample_weight = sample_weight, action = 'fit', transform = transform_target) diff --git a/vecstack/coresk.py b/vecstack/coresk.py index 2a90e92..712f3c1 100644 --- a/vecstack/coresk.py +++ b/vecstack/coresk.py @@ -54,7 +54,7 @@ from sklearn.metrics import mean_absolute_error from sklearn.metrics import accuracy_score from sklearn.metrics import log_loss -from sklearn.externals import six +from sklearn.metrics import mean_squared_error # ----------------------------------------------------------------------------- # ----------------------------------------------------------------------------- @@ -853,7 +853,7 @@ def _estimator_action(self, estimator, X_train, y_train, X_test, def _random_choice(self, n, size, bound=2**30): """ - Memory efficient (but slower) version of np.random.choice + Memory efficient substitute for np.random.choice without replacement Parameters: =========== @@ -870,12 +870,21 @@ def _random_choice(self, n, size, bound=2**30): ======== ids : 1d numpy array of shape (size, ) dtype=np.int32 """ - ids = [] - while len(ids) < size: - rnd = np.random.randint(min(bound, n)) - if rnd not in ids: - ids.append(rnd) - return np.array(ids, dtype=np.int32) + try: + if n < size: + raise ValueError('Drawing without replacement: ' + '``n`` cannot be less 
than ``size``') + + ids = [] + while len(ids) < size: + rnd = np.random.randint(min(bound, n)) + if rnd not in ids: + ids.append(rnd) + return np.array(ids, dtype=np.int32) + + except Exception: + raise ValueError('Internal error. ' + 'Please save traceback and inform developers.') # ------------------------------------------------------------------------- # ------------------------------------------------------------------------- @@ -946,7 +955,7 @@ def _get_params(self, attr, deep=True): return out out.update(estimators) for name, estimator in estimators: - for key, value in six.iteritems(estimator.get_params(deep=True)): + for key, value in iter(estimator.get_params(deep=True).items()): out['%s__%s' % (name, key)] = value return out From 65937dfa3f1c64da4e0134876c4a9d53766e3d14 Mon Sep 17 00:00:00 2001 From: vecxoz Date: Mon, 12 Aug 2019 17:12:55 +0300 Subject: [PATCH 2/3] allow_pickle=True for np.load in tests --- tests/test_func_api_classification_binary.py | 32 +++++++-------- ...test_func_api_classification_multiclass.py | 32 +++++++-------- tests/test_func_api_regression.py | 40 +++++++++---------- 3 files changed, 52 insertions(+), 52 deletions(-) diff --git a/tests/test_func_api_classification_binary.py b/tests/test_func_api_classification_binary.py index 6bc6419..efabc24 100644 --- a/tests/test_func_api_classification_binary.py +++ b/tests/test_func_api_classification_binary.py @@ -145,7 +145,7 @@ def test_oof_pred_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -171,7 +171,7 @@ def test_oof_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one 
file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -197,7 +197,7 @@ def test_pred_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -236,7 +236,7 @@ def test_oof_pred_bag_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -273,7 +273,7 @@ def test_pred_bag_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -304,7 +304,7 @@ def test_oof_pred_mode_proba(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -330,7 +330,7 @@ def test_oof_mode_proba(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no 
cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -356,7 +356,7 @@ def test_pred_mode_proba(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -398,7 +398,7 @@ def test_oof_pred_bag_mode_proba(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -447,7 +447,7 @@ def test_pred_bag_mode_proba(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -491,7 +491,7 @@ def test_oof_pred_bag_mode_shuffle(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -609,7 +609,7 @@ def test_oof_pred_mode_2_models(self): # Normally if cleaning is performed 
there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -672,7 +672,7 @@ def test_oof_pred_bag_mode_2_models(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -712,7 +712,7 @@ def test_oof_pred_mode_proba_2_models(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -784,7 +784,7 @@ def test_oof_pred_bag_mode_proba_2_models(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -834,7 +834,7 @@ def test_N_dim_input(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] diff --git 
a/tests/test_func_api_classification_multiclass.py b/tests/test_func_api_classification_multiclass.py index 8e3c3a5..f74eb4d 100644 --- a/tests/test_func_api_classification_multiclass.py +++ b/tests/test_func_api_classification_multiclass.py @@ -142,7 +142,7 @@ def test_oof_pred_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -168,7 +168,7 @@ def test_oof_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -194,7 +194,7 @@ def test_pred_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -233,7 +233,7 @@ def test_oof_pred_bag_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -270,7 +270,7 @@ def test_pred_bag_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we 
have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -301,7 +301,7 @@ def test_oof_pred_mode_proba(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -327,7 +327,7 @@ def test_oof_mode_proba(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -353,7 +353,7 @@ def test_pred_mode_proba(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -395,7 +395,7 @@ def test_oof_pred_bag_mode_proba(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -444,7 +444,7 @@ def test_pred_bag_mode_proba(self): # Normally if cleaning is performed there 
is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -488,7 +488,7 @@ def test_oof_pred_bag_mode_shuffle(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -606,7 +606,7 @@ def test_oof_pred_mode_2_models(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -669,7 +669,7 @@ def test_oof_pred_bag_mode_2_models(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -709,7 +709,7 @@ def test_oof_pred_mode_proba_2_models(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -781,7 +781,7 
@@ def test_oof_pred_bag_mode_proba_2_models(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -832,7 +832,7 @@ def test_N_dim_input(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] diff --git a/tests/test_func_api_regression.py b/tests/test_func_api_regression.py index ca4432c..05bf9f3 100644 --- a/tests/test_func_api_regression.py +++ b/tests/test_func_api_regression.py @@ -127,7 +127,7 @@ def test_oof_pred_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -154,7 +154,7 @@ def test_B_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -180,7 +180,7 @@ def test_oof_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we 
take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -206,7 +206,7 @@ def test_pred_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -244,7 +244,7 @@ def test_oof_pred_bag_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -282,7 +282,7 @@ def test_A_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -318,7 +318,7 @@ def test_pred_bag_mode(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -352,7 +352,7 @@ def test_oof_pred_mode_sample_weight_one(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning 
there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -387,7 +387,7 @@ def test_oof_pred_mode_sample_weight_random(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -419,7 +419,7 @@ def test_oof_pred_mode_transformations(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -557,7 +557,7 @@ def test_oof_pred_mode_2_models(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -619,7 +619,7 @@ def test_oof_pred_bag_mode_2_models(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -650,7 +650,7 @@ def test_oof_pred_mode_sparse_csr(self): # Normally if 
cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -677,7 +677,7 @@ def test_oof_pred_mode_sparse_csc(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -704,7 +704,7 @@ def test_oof_pred_mode_sparse_coo(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -735,7 +735,7 @@ def test_oof_pred_mode_sparse_csr_coo(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -766,7 +766,7 @@ def test_oof_pred_mode_sparse_csr_dense(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] 
S_test_3 = S[1] @@ -795,7 +795,7 @@ def test_oof_mode_xtest_is_none(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -866,7 +866,7 @@ def test_oof_pred_mode_no_get_params(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] @@ -930,7 +930,7 @@ def test_small_input(self): # Normally if cleaning is performed there is only one .npy file at given moment # But if we have no cleaning there may be more then one file so we take the latest file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file - S = np.load(file_name) + S = np.load(file_name, allow_pickle=True) S_train_3 = S[0] S_test_3 = S[1] From 55f9cad303076c625cd3e1125b1875ca9f4a3b88 Mon Sep 17 00:00:00 2001 From: Igor Ivanov Date: Mon, 12 Aug 2019 18:31:16 +0300 Subject: [PATCH 3/3] Improve text formatting --- PY2.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PY2.md b/PY2.md index 94c5945..da2c587 100644 --- a/PY2.md +++ b/PY2.md @@ -16,7 +16,8 @@ NOTE. It will require legacy versions of the following packages: * numpy<1.17 * scipy<1.3 * scikit-learn>=0.18,<0.21 + There is a dedicated branch on GitHub called `py2` with appropriate requirements in `setup.py`. Installation: -`pip install https://github.com/vecxoz/vecstack/archive/py2.zip` \ No newline at end of file +`pip install https://github.com/vecxoz/vecstack/archive/py2.zip`