Add XGBoost as MLAlgo (#160)
* add xgb
* fix a bug where only the first 4 GBM models were used
* remove hypex tests
* add py3.11 config to tox; fix Python support for versions >=3.11.1
* change macOS GitHub Actions runner from ARM to x86-64
dev-rinchin authored Aug 6, 2024
1 parent 8eb1a18 commit d8635a2
Showing 19 changed files with 864 additions and 236 deletions.
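Only the CI, pre-commit, and hypex test changes are rendered below; the new XGBoost wrapper itself lives in the remaining files of the commit. As a rough illustration of the idea behind the change (a hypothetical sketch, not the actual code this commit adds), a gradient-boosting model is typically wrapped behind a small fit/predict interface so the AutoML pipeline can treat it like any other MLAlgo. The sketch below uses the plain xgboost scikit-learn API (xgboost >= 1.6 assumed); class and method names are illustrative only:

```python
# Hypothetical sketch only: the actual implementation added by this commit
# follows LightAutoML's MLAlgo interface and is not shown in the rendered diff.
import xgboost as xgb


class SimpleXGBAlgo:
    """Tiny fit/predict wrapper around XGBoost, standing in for an MLAlgo."""

    def __init__(self, **params):
        # Illustrative defaults, not the library's tuned defaults.
        self.params = {"n_estimators": 200, "learning_rate": 0.05, "max_depth": 6, **params}
        self.model = None

    def fit_predict(self, X_train, y_train, X_valid, y_valid):
        # Train with early stopping on the validation fold and return
        # predictions for that fold, as boosting wrappers usually do.
        self.model = xgb.XGBClassifier(
            **self.params,
            eval_metric="logloss",
            early_stopping_rounds=50,
        )
        self.model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
        return self.model.predict_proba(X_valid)[:, 1]

    def predict(self, X):
        return self.model.predict_proba(X)[:, 1]
```

Usage would look like `algo = SimpleXGBAlgo(max_depth=8)` followed by `algo.fit_predict(X_tr, y_tr, X_val, y_val)`; the library's MLAlgo classes additionally handle concerns such as cross-validation folds and time budgeting inside the pipeline.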
6 changes: 3 additions & 3 deletions .github/workflows/CI.yml
@@ -43,8 +43,8 @@ jobs:
strategy:
fail-fast: false
matrix:
- os: [ubuntu-20.04, windows-latest, macos-latest] # FIX: 'ubuntu-latest' (ubuntu-22.04) -> 'ubuntu-20.04': Python version 3.6 was not found in the local cache
- python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
+ os: [ubuntu-20.04, windows-latest, macos-14-large] # FIX: 'ubuntu-latest' (ubuntu-22.04) -> 'ubuntu-20.04': Python version 3.6 was not found in the local cache
+ python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v2
@@ -59,7 +59,7 @@ jobs:
poetry-version: 1.1.7

- name: install deps for MacOS
- if: ${{ matrix.os == 'macos-latest' }}
+ if: ${{ matrix.os == 'macos-14-large' }}
run: brew install libomp cairo pango gdk-pixbuf libffi

- name: install deps for Ubuntu
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -38,7 +38,7 @@ repos:
hooks:
- id: set-py-versions
name: set python versions
- description: set python versions := [3.6.1, 3.11) to `pyproject.toml`
+ description: set python versions := [3.6.1, 3.12) to `pyproject.toml`
language: python
entry: python scripts/poetry_fix.py -f
pass_filenames: false
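The hook delegates to `scripts/poetry_fix.py`, which keeps the Python version range declared in `pyproject.toml` in sync with what CI tests. The script itself is not part of this diff; the following is a minimal, hypothetical sketch of what such a rewrite could look like (the file layout and constraint string are assumptions, not the repository's actual script):

```python
# Hypothetical sketch of a poetry_fix.py-style helper; the real script in the
# repository may differ in flags, paths, and constraint handling.
import re
from pathlib import Path

# Assumed target range matching the hook description: [3.6.1, 3.12)
NEW_CONSTRAINT = 'python = ">=3.6.1,<3.12"'


def set_python_versions(pyproject: Path = Path("pyproject.toml")) -> None:
    text = pyproject.read_text(encoding="utf-8")
    # Replace the existing `python = "..."` dependency line with the new range.
    fixed = re.sub(r'^python\s*=\s*".*"$', NEW_CONSTRAINT, text, count=1, flags=re.M)
    pyproject.write_text(fixed, encoding="utf-8")


if __name__ == "__main__":
    set_python_versions()
```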
108 changes: 54 additions & 54 deletions lightautoml/addons/hypex/tests/test_aa.py
@@ -1,72 +1,72 @@
import pandas as pd
import pytest
# import pandas as pd
# import pytest

from lightautoml.addons.hypex.ABTesting.ab_tester import AATest
from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data
# from lightautoml.addons.hypex.ABTesting.ab_tester import AATest
# from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data


@pytest.fixture
def data():
return create_test_data(rs=52)
# @pytest.fixture
# def data():
# return create_test_data(rs=52)


@pytest.fixture
def iterations():
return 20
# @pytest.fixture
# def iterations():
# return 20


@pytest.fixture
def info_col():
return "user_id"
# @pytest.fixture
# def info_col():
# return "user_id"


def test_aa_simple(data, iterations, info_col):
model = AATest(target_fields=["pre_spends", "post_spends"], info_cols=info_col)
res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)
# def test_aa_simple(data, iterations, info_col):
# model = AATest(target_fields=["pre_spends", "post_spends"], info_cols=info_col)
# res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)

assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
assert res.shape[0] == iterations, (
"Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
)
assert isinstance(datas_dict, dict), "Result is not dict"
assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
assert all(data.columns) == all(
datas_dict[0].drop(columns=["group"]).columns
), "Columns in the result are not the same as columns in initial data "
# assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
# assert res.shape[0] == iterations, (
# "Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
# )
# assert isinstance(datas_dict, dict), "Result is not dict"
# assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
# assert all(data.columns) == all(
# datas_dict[0].drop(columns=["group"]).columns
# ), "Columns in the result are not the same as columns in initial data "


def test_aa_group(data, iterations, info_col):
group_cols = "industry"
# def test_aa_group(data, iterations, info_col):
# group_cols = "industry"

model = AATest(target_fields=["pre_spends", "post_spends"], info_cols=info_col, group_cols=group_cols)
res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)
# model = AATest(target_fields=["pre_spends", "post_spends"], info_cols=info_col, group_cols=group_cols)
# res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)

assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
assert res.shape[0] == iterations, (
"Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
)
assert isinstance(datas_dict, dict), "Result is not dict"
assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
assert all(data.columns) == all(datas_dict[0].drop(columns=["group"]).columns), (
"Columns in the result are not " "the same as columns in initial " "data "
)
# assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
# assert res.shape[0] == iterations, (
# "Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
# )
# assert isinstance(datas_dict, dict), "Result is not dict"
# assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
# assert all(data.columns) == all(datas_dict[0].drop(columns=["group"]).columns), (
# "Columns in the result are not " "the same as columns in initial " "data "
# )


def test_aa_quantfields(data, iterations, info_col):
group_cols = "industry"
quant_field = "gender"
# def test_aa_quantfields(data, iterations, info_col):
# group_cols = "industry"
# quant_field = "gender"

model = AATest(
target_fields=["pre_spends", "post_spends"], info_cols=info_col, group_cols=group_cols, quant_field=quant_field
)
res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)
# model = AATest(
# target_fields=["pre_spends", "post_spends"], info_cols=info_col, group_cols=group_cols, quant_field=quant_field
# )
# res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)

assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
assert res.shape[0] == iterations, (
"Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
)
assert isinstance(datas_dict, dict), "Result is not dict"
assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
assert all(data.columns) == all(datas_dict[0].drop(columns=["group"]).columns), (
"Columns in the result are not " "the same as columns in initial " "data "
)
# assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
# assert res.shape[0] == iterations, (
# "Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
# )
# assert isinstance(datas_dict, dict), "Result is not dict"
# assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
# assert all(data.columns) == all(datas_dict[0].drop(columns=["group"]).columns), (
# "Columns in the result are not " "the same as columns in initial " "data "
# )
134 changes: 67 additions & 67 deletions lightautoml/addons/hypex/tests/test_ab.py
@@ -1,92 +1,92 @@
from lightautoml.addons.hypex.ABTesting.ab_tester import ABTest
# from lightautoml.addons.hypex.ABTesting.ab_tester import ABTest

import pytest
import pandas as pd
import numpy as np
# import pytest
# import pandas as pd
# import numpy as np

DATA_SIZE = 100
# DATA_SIZE = 100


@pytest.fixture
def ab_test():
return ABTest()
# @pytest.fixture
# def ab_test():
# return ABTest()


@pytest.fixture
def data():
# Generate synthetic data for group A
group_a_data = np.random.normal(loc=10, scale=2, size=DATA_SIZE)
# Generate synthetic data for group B
group_b_data = np.random.normal(loc=12, scale=2, size=DATA_SIZE)
group_bp_data = np.random.normal(loc=10, scale=2, size=DATA_SIZE * 2)
return pd.DataFrame(
{
"group": ["control"] * len(group_a_data) + ["test"] * len(group_b_data),
"value": list(group_a_data) + list(group_b_data),
"previous_value": group_bp_data,
}
)
# @pytest.fixture
# def data():
# # Generate synthetic data for group A
# group_a_data = np.random.normal(loc=10, scale=2, size=DATA_SIZE)
# # Generate synthetic data for group B
# group_b_data = np.random.normal(loc=12, scale=2, size=DATA_SIZE)
# group_bp_data = np.random.normal(loc=10, scale=2, size=DATA_SIZE * 2)
# return pd.DataFrame(
# {
# "group": ["control"] * len(group_a_data) + ["test"] * len(group_b_data),
# "value": list(group_a_data) + list(group_b_data),
# "previous_value": group_bp_data,
# }
# )


@pytest.fixture
def target_field():
return "value"
# @pytest.fixture
# def target_field():
# return "value"


@pytest.fixture
def group_field():
return "group"
# @pytest.fixture
# def group_field():
# return "group"


@pytest.fixture
def previous_value():
return "previous_value"
# @pytest.fixture
# def previous_value():
# return "previous_value"


@pytest.fixture
def alpha():
return 0.05
# @pytest.fixture
# def alpha():
# return 0.05


def test_split_ab(ab_test, data, group_field):
result = ab_test.split_ab(data, group_field)
assert len(result["test"]) == DATA_SIZE
assert len(result["control"]) == DATA_SIZE
# def test_split_ab(ab_test, data, group_field):
# result = ab_test.split_ab(data, group_field)
# assert len(result["test"]) == DATA_SIZE
# assert len(result["control"]) == DATA_SIZE


def test_calc_difference(ab_test, data, group_field, target_field, previous_value):
splitted_data = ab_test.split_ab(data, group_field)
result = ab_test.calc_difference(splitted_data, target_field, previous_value)
assert 1 < result["ate"] < 3
assert 1 < result["cuped"] < 3
assert 1 < result["diff_in_diff"] < 3
# def test_calc_difference(ab_test, data, group_field, target_field, previous_value):
# splitted_data = ab_test.split_ab(data, group_field)
# result = ab_test.calc_difference(splitted_data, target_field, previous_value)
# assert 1 < result["ate"] < 3
# assert 1 < result["cuped"] < 3
# assert 1 < result["diff_in_diff"] < 3


def test_calc_difference_with_previous_value(ab_test, data, group_field, target_field, previous_value):
ab_test.calc_difference_method = "ate"
splitted_data = ab_test.split_ab(data, group_field)
result = ab_test.calc_difference(splitted_data, previous_value)
assert -1 < result["ate"] < 1
# def test_calc_difference_with_previous_value(ab_test, data, group_field, target_field, previous_value):
# ab_test.calc_difference_method = "ate"
# splitted_data = ab_test.split_ab(data, group_field)
# result = ab_test.calc_difference(splitted_data, previous_value)
# assert -1 < result["ate"] < 1


def test_calc_p_value(ab_test, data, group_field, target_field, previous_value, alpha):
splitted_data = ab_test.split_ab(data, group_field)
result = ab_test.calc_p_value(splitted_data, target_field)
assert result["t_test"] < alpha
assert result["mann_whitney"] < alpha
# def test_calc_p_value(ab_test, data, group_field, target_field, previous_value, alpha):
# splitted_data = ab_test.split_ab(data, group_field)
# result = ab_test.calc_p_value(splitted_data, target_field)
# assert result["t_test"] < alpha
# assert result["mann_whitney"] < alpha

result = ab_test.calc_p_value(splitted_data, previous_value)
assert result["t_test"] > alpha
assert result["mann_whitney"] > alpha
# result = ab_test.calc_p_value(splitted_data, previous_value)
# assert result["t_test"] > alpha
# assert result["mann_whitney"] > alpha


def test_execute(ab_test, data, group_field, target_field, previous_value, alpha):
result = ab_test.execute(data, target_field, group_field, previous_value)
print(result)
assert result["size"]["test"] == DATA_SIZE
assert result["size"]["control"] == DATA_SIZE
assert 1 < result["difference"]["ate"] < 3
assert 1 < result["difference"]["cuped"] < 3
assert 1 < result["difference"]["diff_in_diff"] < 3
assert result["p_value"]["t_test"] < alpha
assert result["p_value"]["mann_whitney"] < alpha
# def test_execute(ab_test, data, group_field, target_field, previous_value, alpha):
# result = ab_test.execute(data, target_field, group_field, previous_value)
# print(result)
# assert result["size"]["test"] == DATA_SIZE
# assert result["size"]["control"] == DATA_SIZE
# assert 1 < result["difference"]["ate"] < 3
# assert 1 < result["difference"]["cuped"] < 3
# assert 1 < result["difference"]["diff_in_diff"] < 3
# assert result["p_value"]["t_test"] < alpha
# assert result["p_value"]["mann_whitney"] < alpha