Add XGBoost as MLAlgo (#160)
* add xgb
* fix a bug where only the first 4 GBM models were used
* remove hypex tests
* add py3.11 config to tox; fix Python support for versions >=3.11.1
* change macOS GitHub Actions runner from ARM to x86-64
dev-rinchin authored Aug 6, 2024
1 parent 8eb1a18 commit d8635a2
Showing 19 changed files with 864 additions and 236 deletions.
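Only the CI, pre-commit, and hypex test changes are rendered below; the new XGBoost wrapper itself lives in the remaining files of the commit. As a rough illustration of the idea behind the change (a hypothetical sketch, not the actual code this commit adds), a gradient-boosting model is typically wrapped behind a small fit/predict interface so the AutoML pipeline can treat it like any other MLAlgo. The sketch below uses the plain xgboost scikit-learn API (xgboost >= 1.6 assumed); class and method names are illustrative only:

```python
# Hypothetical sketch only: the actual implementation added by this commit
# follows LightAutoML's MLAlgo interface and is not shown in the rendered diff.
import xgboost as xgb


class SimpleXGBAlgo:
    """Tiny fit/predict wrapper around XGBoost, standing in for an MLAlgo."""

    def __init__(self, **params):
        # Illustrative defaults, not the library's tuned defaults.
        self.params = {"n_estimators": 200, "learning_rate": 0.05, "max_depth": 6, **params}
        self.model = None

    def fit_predict(self, X_train, y_train, X_valid, y_valid):
        # Train with early stopping on the validation fold and return
        # predictions for that fold, as boosting wrappers usually do.
        self.model = xgb.XGBClassifier(
            **self.params,
            eval_metric="logloss",
            early_stopping_rounds=50,
        )
        self.model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
        return self.model.predict_proba(X_valid)[:, 1]

    def predict(self, X):
        return self.model.predict_proba(X)[:, 1]
```

Usage would look like `algo = SimpleXGBAlgo(max_depth=8)` followed by `algo.fit_predict(X_tr, y_tr, X_val, y_val)`; the library's MLAlgo classes additionally handle concerns such as cross-validation folds and time budgeting inside the pipeline.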
6 changes: 3 additions & 3 deletions .github/workflows/CI.yml
@@ -43,8 +43,8 @@ jobs:
strategy:
fail-fast: false
matrix:
- os: [ubuntu-20.04, windows-latest, macos-latest] # FIX: 'ubuntu-latest' (ubuntu-22.04) -> 'ubuntu-20.04': Python version 3.6 was not found in the local cache
- python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
+ os: [ubuntu-20.04, windows-latest, macos-14-large] # FIX: 'ubuntu-latest' (ubuntu-22.04) -> 'ubuntu-20.04': Python version 3.6 was not found in the local cache
+ python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v2
@@ -59,7 +59,7 @@ jobs:
poetry-version: 1.1.7

- name: install deps for MacOS
- if: ${{ matrix.os == 'macos-latest' }}
+ if: ${{ matrix.os == 'macos-14-large' }}
run: brew install libomp cairo pango gdk-pixbuf libffi

- name: install deps for Ubuntu
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -38,7 +38,7 @@ repos:
hooks:
- id: set-py-versions
name: set python versions
- description: set python versions := [3.6.1, 3.11) to `pyproject.toml`
+ description: set python versions := [3.6.1, 3.12) to `pyproject.toml`
language: python
entry: python scripts/poetry_fix.py -f
pass_filenames: false
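The hook delegates to `scripts/poetry_fix.py`, which keeps the Python version range declared in `pyproject.toml` in sync with what CI tests. The script itself is not part of this diff; the following is a minimal, hypothetical sketch of what such a rewrite could look like (the file layout and constraint string are assumptions, not the repository's actual script):

```python
# Hypothetical sketch of a poetry_fix.py-style helper; the real script in the
# repository may differ in flags, paths, and constraint handling.
import re
from pathlib import Path

# Assumed target range matching the hook description: [3.6.1, 3.12)
NEW_CONSTRAINT = 'python = ">=3.6.1,<3.12"'


def set_python_versions(pyproject: Path = Path("pyproject.toml")) -> None:
    text = pyproject.read_text(encoding="utf-8")
    # Replace the existing `python = "..."` dependency line with the new range.
    fixed = re.sub(r'^python\s*=\s*".*"$', NEW_CONSTRAINT, text, count=1, flags=re.M)
    pyproject.write_text(fixed, encoding="utf-8")


if __name__ == "__main__":
    set_python_versions()
```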
108 changes: 54 additions & 54 deletions lightautoml/addons/hypex/tests/test_aa.py
@@ -1,72 +1,72 @@
import pandas as pd
import pytest
# import pandas as pd
# import pytest

from lightautoml.addons.hypex.ABTesting.ab_tester import AATest
from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data
# from lightautoml.addons.hypex.ABTesting.ab_tester import AATest
# from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data


@pytest.fixture
def data():
return create_test_data(rs=52)
# @pytest.fixture
# def data():
# return create_test_data(rs=52)


@pytest.fixture
def iterations():
return 20
# @pytest.fixture
# def iterations():
# return 20


@pytest.fixture
def info_col():
return "user_id"
# @pytest.fixture
# def info_col():
# return "user_id"


def test_aa_simple(data, iterations, info_col):
model = AATest(target_fields=["pre_spends", "post_spends"], info_cols=info_col)
res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)
# def test_aa_simple(data, iterations, info_col):
# model = AATest(target_fields=["pre_spends", "post_spends"], info_cols=info_col)
# res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)

assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
assert res.shape[0] == iterations, (
"Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
)
assert isinstance(datas_dict, dict), "Result is not dict"
assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
assert all(data.columns) == all(
datas_dict[0].drop(columns=["group"]).columns
), "Columns in the result are not the same as columns in initial data "
# assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
# assert res.shape[0] == iterations, (
# "Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
# )
# assert isinstance(datas_dict, dict), "Result is not dict"
# assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
# assert all(data.columns) == all(
# datas_dict[0].drop(columns=["group"]).columns
# ), "Columns in the result are not the same as columns in initial data "


def test_aa_group(data, iterations, info_col):
group_cols = "industry"
# def test_aa_group(data, iterations, info_col):
# group_cols = "industry"

model = AATest(target_fields=["pre_spends", "post_spends"], info_cols=info_col, group_cols=group_cols)
res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)
# model = AATest(target_fields=["pre_spends", "post_spends"], info_cols=info_col, group_cols=group_cols)
# res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)

assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
assert res.shape[0] == iterations, (
"Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
)
assert isinstance(datas_dict, dict), "Result is not dict"
assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
assert all(data.columns) == all(datas_dict[0].drop(columns=["group"]).columns), (
"Columns in the result are not " "the same as columns in initial " "data "
)
# assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
# assert res.shape[0] == iterations, (
# "Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
# )
# assert isinstance(datas_dict, dict), "Result is not dict"
# assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
# assert all(data.columns) == all(datas_dict[0].drop(columns=["group"]).columns), (
# "Columns in the result are not " "the same as columns in initial " "data "
# )


def test_aa_quantfields(data, iterations, info_col):
group_cols = "industry"
quant_field = "gender"
# def test_aa_quantfields(data, iterations, info_col):
# group_cols = "industry"
# quant_field = "gender"

model = AATest(
target_fields=["pre_spends", "post_spends"], info_cols=info_col, group_cols=group_cols, quant_field=quant_field
)
res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)
# model = AATest(
# target_fields=["pre_spends", "post_spends"], info_cols=info_col, group_cols=group_cols, quant_field=quant_field
# )
# res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)

assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
assert res.shape[0] == iterations, (
"Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
)
assert isinstance(datas_dict, dict), "Result is not dict"
assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
assert all(data.columns) == all(datas_dict[0].drop(columns=["group"]).columns), (
"Columns in the result are not " "the same as columns in initial " "data "
)
# assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
# assert res.shape[0] == iterations, (
# "Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
# )
# assert isinstance(datas_dict, dict), "Result is not dict"
# assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
# assert all(data.columns) == all(datas_dict[0].drop(columns=["group"]).columns), (
# "Columns in the result are not " "the same as columns in initial " "data "
# )
134 changes: 67 additions & 67 deletions lightautoml/addons/hypex/tests/test_ab.py
@@ -1,92 +1,92 @@
from lightautoml.addons.hypex.ABTesting.ab_tester import ABTest
# from lightautoml.addons.hypex.ABTesting.ab_tester import ABTest

import pytest
import pandas as pd
import numpy as np
# import pytest
# import pandas as pd
# import numpy as np

DATA_SIZE = 100
# DATA_SIZE = 100


@pytest.fixture
def ab_test():
return ABTest()
# @pytest.fixture
# def ab_test():
# return ABTest()


@pytest.fixture
def data():
# Generate synthetic data for group A
group_a_data = np.random.normal(loc=10, scale=2, size=DATA_SIZE)
# Generate synthetic data for group B
group_b_data = np.random.normal(loc=12, scale=2, size=DATA_SIZE)
group_bp_data = np.random.normal(loc=10, scale=2, size=DATA_SIZE * 2)
return pd.DataFrame(
{
"group": ["control"] * len(group_a_data) + ["test"] * len(group_b_data),
"value": list(group_a_data) + list(group_b_data),
"previous_value": group_bp_data,
}
)
# @pytest.fixture
# def data():
# # Generate synthetic data for group A
# group_a_data = np.random.normal(loc=10, scale=2, size=DATA_SIZE)
# # Generate synthetic data for group B
# group_b_data = np.random.normal(loc=12, scale=2, size=DATA_SIZE)
# group_bp_data = np.random.normal(loc=10, scale=2, size=DATA_SIZE * 2)
# return pd.DataFrame(
# {
# "group": ["control"] * len(group_a_data) + ["test"] * len(group_b_data),
# "value": list(group_a_data) + list(group_b_data),
# "previous_value": group_bp_data,
# }
# )


@pytest.fixture
def target_field():
return "value"
# @pytest.fixture
# def target_field():
# return "value"


@pytest.fixture
def group_field():
return "group"
# @pytest.fixture
# def group_field():
# return "group"


@pytest.fixture
def previous_value():
return "previous_value"
# @pytest.fixture
# def previous_value():
# return "previous_value"


@pytest.fixture
def alpha():
return 0.05
# @pytest.fixture
# def alpha():
# return 0.05


def test_split_ab(ab_test, data, group_field):
result = ab_test.split_ab(data, group_field)
assert len(result["test"]) == DATA_SIZE
assert len(result["control"]) == DATA_SIZE
# def test_split_ab(ab_test, data, group_field):
# result = ab_test.split_ab(data, group_field)
# assert len(result["test"]) == DATA_SIZE
# assert len(result["control"]) == DATA_SIZE


def test_calc_difference(ab_test, data, group_field, target_field, previous_value):
splitted_data = ab_test.split_ab(data, group_field)
result = ab_test.calc_difference(splitted_data, target_field, previous_value)
assert 1 < result["ate"] < 3
assert 1 < result["cuped"] < 3
assert 1 < result["diff_in_diff"] < 3
# def test_calc_difference(ab_test, data, group_field, target_field, previous_value):
# splitted_data = ab_test.split_ab(data, group_field)
# result = ab_test.calc_difference(splitted_data, target_field, previous_value)
# assert 1 < result["ate"] < 3
# assert 1 < result["cuped"] < 3
# assert 1 < result["diff_in_diff"] < 3


def test_calc_difference_with_previous_value(ab_test, data, group_field, target_field, previous_value):
ab_test.calc_difference_method = "ate"
splitted_data = ab_test.split_ab(data, group_field)
result = ab_test.calc_difference(splitted_data, previous_value)
assert -1 < result["ate"] < 1
# def test_calc_difference_with_previous_value(ab_test, data, group_field, target_field, previous_value):
# ab_test.calc_difference_method = "ate"
# splitted_data = ab_test.split_ab(data, group_field)
# result = ab_test.calc_difference(splitted_data, previous_value)
# assert -1 < result["ate"] < 1


def test_calc_p_value(ab_test, data, group_field, target_field, previous_value, alpha):
splitted_data = ab_test.split_ab(data, group_field)
result = ab_test.calc_p_value(splitted_data, target_field)
assert result["t_test"] < alpha
assert result["mann_whitney"] < alpha
# def test_calc_p_value(ab_test, data, group_field, target_field, previous_value, alpha):
# splitted_data = ab_test.split_ab(data, group_field)
# result = ab_test.calc_p_value(splitted_data, target_field)
# assert result["t_test"] < alpha
# assert result["mann_whitney"] < alpha

result = ab_test.calc_p_value(splitted_data, previous_value)
assert result["t_test"] > alpha
assert result["mann_whitney"] > alpha
# result = ab_test.calc_p_value(splitted_data, previous_value)
# assert result["t_test"] > alpha
# assert result["mann_whitney"] > alpha


def test_execute(ab_test, data, group_field, target_field, previous_value, alpha):
result = ab_test.execute(data, target_field, group_field, previous_value)
print(result)
assert result["size"]["test"] == DATA_SIZE
assert result["size"]["control"] == DATA_SIZE
assert 1 < result["difference"]["ate"] < 3
assert 1 < result["difference"]["cuped"] < 3
assert 1 < result["difference"]["diff_in_diff"] < 3
assert result["p_value"]["t_test"] < alpha
assert result["p_value"]["mann_whitney"] < alpha
# def test_execute(ab_test, data, group_field, target_field, previous_value, alpha):
# result = ab_test.execute(data, target_field, group_field, previous_value)
# print(result)
# assert result["size"]["test"] == DATA_SIZE
# assert result["size"]["control"] == DATA_SIZE
# assert 1 < result["difference"]["ate"] < 3
# assert 1 < result["difference"]["cuped"] < 3
# assert 1 < result["difference"]["diff_in_diff"] < 3
# assert result["p_value"]["t_test"] < alpha
# assert result["p_value"]["mann_whitney"] < alpha