diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5a5350b..604d972 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -34,13 +34,13 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pandas==1.3.5 numpy==1.22.3 tqdm==4.62.3 psutil==5.9.0 + pip install pytest pip install . - name: Test with pytest run: | cd tests - python global-unit-test.py + python -m pytest . -s --disable-warnings linting: needs: unit-tests diff --git a/Makefile b/Makefile index ab7f41a..168bbb7 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ test: - cd tests && python global-unit-test.py + cd tests && python -m pytest . -s --disable-warnings lint: python -m pylint chefboost/ --fail-under=10 \ No newline at end of file diff --git a/chefboost/Chefboost.py b/chefboost/Chefboost.py index bd9ef1e..9cf0e1a 100644 --- a/chefboost/Chefboost.py +++ b/chefboost/Chefboost.py @@ -24,6 +24,7 @@ def fit( config: Optional[dict] = None, target_label: str = "Decision", validation_df: Optional[pd.DataFrame] = None, + silent: bool = False, ) -> Dict[str, Any]: """ Build (a) decision tree model(s) @@ -55,6 +56,9 @@ def fit( if nothing is passed to validation data frame, then the function validates built trees for training data frame + silent (bool): set this to True if you do not want to see + any informative logs + Returns: chefboost model """ @@ -139,7 +143,8 @@ def fit( if enableParallelism == True: num_cores = config["num_cores"] - logger.info(f"[INFO]: {num_cores} CPU cores will be allocated in parallel running") + if silent is False: + logger.info(f"[INFO]: {num_cores} CPU cores will be allocated in parallel running") from multiprocessing import set_start_method, freeze_support @@ -169,7 +174,8 @@ def fit( config["algorithm"] = "Regression" if enableGBM == True: - logger.info("Gradient Boosting Machines...") + if silent is False: + logger.info("Gradient Boosting Machines...") algorithm = "Regression" 
config["algorithm"] = "Regression" @@ -184,7 +190,8 @@ def fit( # ------------------------- - logger.info(f"{algorithm} tree is going to be built...") + if silent is False: + logger.info(f"{algorithm} tree is going to be built...") # initialize a dictionary. this is going to be used to check features numeric or nominal. # numeric features should be transformed to nominal values based on scales. @@ -212,7 +219,13 @@ def fit( if enableAdaboost == True: trees, alphas = adaboost_clf.apply( - df, config, header, dataset_features, validation_df=validation_df, process_id=process_id + df, + config, + header, + dataset_features, + validation_df=validation_df, + process_id=process_id, + silent=silent, ) elif enableGBM == True: @@ -224,6 +237,7 @@ def fit( dataset_features, validation_df=validation_df, process_id=process_id, + silent=silent, ) # classification = True @@ -235,12 +249,19 @@ def fit( dataset_features, validation_df=validation_df, process_id=process_id, + silent=silent, ) # classification = False elif enableRandomForest == True: trees = randomforest.apply( - df, config, header, dataset_features, validation_df=validation_df, process_id=process_id + df, + config, + header, + dataset_features, + validation_df=validation_df, + process_id=process_id, + silent=silent, ) else: # regular decision tree building root = 1 @@ -264,8 +285,9 @@ def fit( main_process_id=process_id, ) - logger.info("-------------------------") - logger.info(f"finished in {time.time() - begin} seconds") + if silent is False: + logger.info("-------------------------") + logger.info(f"finished in {time.time() - begin} seconds") obj = {"trees": trees, "alphas": alphas, "config": config, "nan_values": nan_values} @@ -273,13 +295,13 @@ def fit( # train set accuracy df = base_df.copy() - evaluate(obj, df, task="train") + trainset_evaluation = evaluate(obj, df, task="train", silent=silent) + obj["evaluation"] = {"train": trainset_evaluation} # validation set accuracy if isinstance(validation_df, 
pd.DataFrame): - evaluate(obj, validation_df, task="validation") - - # ----------------------------------------- + validationset_evaluation = evaluate(obj, validation_df, task="validation", silent=silent) + obj["evaluation"]["validation"] = validationset_evaluation return obj @@ -455,31 +477,38 @@ def restoreTree(module_name) -> Any: return functions.restoreTree(module_name) -def feature_importance(rules: Union[str, list]) -> pd.DataFrame: +def feature_importance(rules: Union[str, list], silent: bool = False) -> pd.DataFrame: """ Show the feature importance values of a built model Args: - rules (str or list): e.g. decision_rules = "outputs/rules/rules.py" + rules (str or list): e.g. decision_rules = "outputs/rules/rules.py" or this could be retrieved from built model as shown below. - decision_rules = [] - for tree in model["trees"]: - rule = .__dict__["__spec__"].origin - decision_rules.append(rule) + ```python + decision_rules = [] + for tree in model["trees"]: + rule = tree.__dict__["__spec__"].origin + decision_rules.append(rule) + ``` + silent (bool): set this to True if you do not want to see + any informative logs. 
Returns: feature importance (pd.DataFrame) """ if not isinstance(rules, list): rules = [rules] - logger.info(f"rules: {rules}") + + if silent is False: + logger.info(f"rules: {rules}") # ----------------------------- dfs = [] for rule in rules: - logger.info("Decision rule: {rule}") + if silent is False: + logger.info(f"Decision rule: {rule}") with open(rule, "r", encoding="UTF-8") as file: lines = file.readlines() @@ -564,8 +593,12 @@ def feature_importance(rules: Union[str, list]) -> pd.DataFrame: def evaluate( - model: dict, df: pd.DataFrame, target_label: str = "Decision", task: str = "test" -) -> None: + model: dict, + df: pd.DataFrame, + target_label: str = "Decision", + task: str = "test", + silent: bool = False, +) -> dict: """ Evaluate the performance of a built model on a data set Args: @@ -573,8 +606,10 @@ def evaluate( df (pandas data frame): data frame you would like to evaluate target_label (str): target label task (string): set this to train, validation or test + silent (bool): set this to True if you do not want to see + any informative logs Returns: - None + evaluation results (dict) """ # -------------------------- @@ -598,4 +633,4 @@ def evaluate( df["Decision"] = df["Decision"].astype(str) df["Prediction"] = df["Prediction"].astype(str) - cb_eval.evaluate(df, task=task) + return cb_eval.evaluate(df, task=task, silent=silent) diff --git a/chefboost/commons/evaluate.py b/chefboost/commons/evaluate.py index 44eba39..2cb480d 100644 --- a/chefboost/commons/evaluate.py +++ b/chefboost/commons/evaluate.py @@ -1,4 +1,5 @@ import math +import pandas as pd from chefboost.commons.logger import Logger # pylint: disable=broad-except @@ -6,25 +7,38 @@ logger = Logger(module="chefboost/commons/evaluate.py") -def evaluate(df, task="train"): +def evaluate(df: pd.DataFrame, task: str = "train", silent: bool = False) -> dict: + """ + Evaluate results + Args: + df (pd.DataFrame): data frame + task (str): train, test + silent (bool): set this to True if you do not 
want to + see any informative logs + Returns: + evaluation results (dict) + """ if df["Decision"].dtypes == "object": problem_type = "classification" else: problem_type = "regression" - # ------------------------------------- - + evaluation_results = {} instances = df.shape[0] - logger.info("-------------------------") - logger.info(f"Evaluate {task} set") - logger.info("-------------------------") + if silent is False: + logger.info("-------------------------") + logger.info(f"Evaluate {task} set") + logger.info("-------------------------") if problem_type == "classification": idx = df[df["Prediction"] == df["Decision"]].index accuracy = 100 * len(idx) / df.shape[0] - logger.info(f"Accuracy: {accuracy}% on {instances} instances") + if silent is False: + logger.info(f"Accuracy: {accuracy}% on {instances} instances") + evaluation_results["Accuracy"] = accuracy + evaluation_results["Instances"] = instances # ----------------------------- predictions = df.Prediction.values @@ -48,8 +62,12 @@ def evaluate(df, task="train"): confusion_row.append(item) confusion_matrix.append(confusion_row) - logger.info(f"Labels: {labels}") - logger.info(f"Confusion matrix: {confusion_matrix}") + if silent is False: + logger.info(f"Labels: {labels}") + logger.info(f"Confusion matrix: {confusion_matrix}") + + evaluation_results["Labels"] = labels + evaluation_results["Confusion matrix"] = confusion_matrix # ----------------------------- # precision and recall @@ -79,11 +97,19 @@ def evaluate(df, task="train"): accuracy = round(100 * (tp + tn) / (tp + tn + fp + fn + epsilon), 4) if len(labels) >= 3: - logger.info(f"Decision {decision_class}") - logger.info(f"Accuray: {accuracy}") + if silent is False: + logger.info(f"Decision {decision_class}") + logger.info(f"Accuracy: {accuracy}") + + evaluation_results[f"Decision {decision_class}'s Accuracy"] = accuracy - logger.info(f"Precision: {precision}%, Recall: {recall}%, F1: {f1_score}%") - logger.debug(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: 
{fn}") + if silent is False: + logger.info(f"Precision: {precision}%, Recall: {recall}%, F1: {f1_score}%") + logger.debug(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}") + + evaluation_results["Precision"] = precision + evaluation_results["Recall"] = recall + evaluation_results["F1"] = f1_score if len(labels) < 3: break @@ -99,13 +125,17 @@ def evaluate(df, task="train"): if instances > 0: mae = df["Absolute_Error"].sum() / instances - logger.info(f"MAE: {mae}") - mse = df["Absolute_Error_Squared"].sum() / instances - logger.info(f"MSE: {mse}") - rmse = math.sqrt(mse) - logger.info(f"RMSE: {rmse}") + + evaluation_results["MAE"] = mae + evaluation_results["MSE"] = mse + evaluation_results["RMSE"] = rmse + + if silent is False: + logger.info(f"MAE: {mae}") + logger.info(f"MSE: {mse}") + logger.info(f"RMSE: {rmse}") rae = 0 rrse = 0 @@ -122,12 +152,26 @@ def evaluate(df, task="train"): except Exception as err: logger.error(str(err)) - logger.info(f"RAE: {rae}") - logger.info(f"RRSE {rrse}") + if silent is False: + logger.info(f"RAE: {rae}") + logger.info(f"RRSE {rrse}") + + evaluation_results["RAE"] = rae + evaluation_results["RRSE"] = rrse mean = df["Decision"].mean() - logger.info(f"Mean: {mean}") + + if silent is False: + logger.info(f"Mean: {mean}") + + evaluation_results["Mean"] = mean if mean > 0: - logger.info(f"MAE / Mean: {100 * mae / mean}%") - logger.info(f"RMSE / Mean: {100 * rmse / mean}%") + if silent is False: + logger.info(f"MAE / Mean: {100 * mae / mean}%") + logger.info(f"RMSE / Mean: {100 * rmse / mean}%") + + evaluation_results["MAE / Mean"] = 100 * mae / mean + evaluation_results["RMSE / Mean"] = 100 * rmse / mean + + return evaluation_results diff --git a/chefboost/training/Training.py b/chefboost/training/Training.py index ee37ca7..b7c6e06 100644 --- a/chefboost/training/Training.py +++ b/chefboost/training/Training.py @@ -510,16 +510,10 @@ def buildDecisionTree( # add else condition in the decision tree if df.Decision.dtypes == "object": # 
classification - pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index() - - if pd.__version__.split(".")[0] == "1": - pivot = pivot.rename(columns={"Decision": "Instances", "index": "Decision"}) - else: # if pd.__version__.split(".")[0] == "2": - pivot = pivot.rename(columns={"Decision": "Instances", "count": "Decision"}) - - pivot = pivot.sort_values(by=["Instances"], ascending=False).reset_index() - - else_decision = f"return '{pivot.iloc[0].Decision}'" + pivot = pd.DataFrame(subdataset.Decision.value_counts()).sort_values( + by=["count"], ascending=False + ) + else_decision = f"return '{str(pivot.iloc[0].name)}'" if enableParallelism != True: functions.storeRule(file, (functions.formatRule(root), "else:")) @@ -669,7 +663,7 @@ def buildDecisionTree( # this is reguler decision tree. find accuracy here. module_name = "outputs/rules/rules" - myrules = load_module(module_name) # rules0 + myrules = load_module(module_name) # rules0 models.append(myrules) return models @@ -682,7 +676,7 @@ def findPrediction(row): params.append(row[j]) module_name = "outputs/rules/rules" - myrules = load_module(module_name) # rules0 + myrules = load_module(module_name) # rules0 prediction = myrules.findDecision(params) return prediction diff --git a/chefboost/tuning/adaboost.py b/chefboost/tuning/adaboost.py index cda488e..3623347 100644 --- a/chefboost/tuning/adaboost.py +++ b/chefboost/tuning/adaboost.py @@ -31,7 +31,9 @@ def findPrediction(row): return prediction -def apply(df, config, header, dataset_features, validation_df=None, process_id=None): +def apply( + df, config, header, dataset_features, validation_df=None, process_id=None, silent: bool = False +): models = [] alphas = [] @@ -53,8 +55,7 @@ def apply(df, config, header, dataset_features, validation_df=None, process_id=N best_epoch_idx = 0 best_epoch_value = 1000000 - # for i in range(0, num_of_weak_classifier): - pbar = tqdm(range(0, num_of_weak_classifier), desc="Adaboosting") + pbar = tqdm(range(0, 
num_of_weak_classifier), desc="Adaboosting", disable=silent) for i in pbar: worksheet["Decision"] = worksheet["Weight"] * worksheet["Decision"] @@ -139,8 +140,8 @@ def apply(df, config, header, dataset_features, validation_df=None, process_id=N pbar.set_description(f"Epoch {i + 1}. Loss: {mae}. Process: ") # ------------------------------ - - logger.info(f"The best epoch is {best_epoch_idx} with the {best_epoch_value} MAE score") + if silent is False: + logger.info(f"The best epoch is {best_epoch_idx} with the {best_epoch_value} MAE score") models = models[0 : best_epoch_idx + 1] alphas = alphas[0 : best_epoch_idx + 1] diff --git a/chefboost/tuning/gbm.py b/chefboost/tuning/gbm.py index 95bb17f..85ac99b 100644 --- a/chefboost/tuning/gbm.py +++ b/chefboost/tuning/gbm.py @@ -1,4 +1,5 @@ import gc +from typing import Optional, Union import pandas as pd import numpy as np @@ -14,7 +15,7 @@ logger = Logger(module="chefboost/tuning/gbm.py") -def findPrediction(row): +def findPrediction(row: pd.Series) -> Union[str, float]: epoch = row["Epoch"] row = row.drop(labels=["Epoch"]) columns = row.shape[0] @@ -32,7 +33,15 @@ def findPrediction(row): return prediction -def regressor(df, config, header, dataset_features, validation_df=None, process_id=None): +def regressor( + df: pd.DataFrame, + config: dict, + header: str, + dataset_features: dict, + validation_df: Optional[pd.DataFrame] = None, + process_id: Optional[int] = None, + silent: bool = False, +) -> list: models = [] # we will update decisions in every epoch, this will be used to restore @@ -69,10 +78,7 @@ def regressor(df, config, header, dataset_features, validation_df=None, process_ best_epoch_idx = 0 best_epoch_loss = 1000000 - pbar = tqdm(range(1, epochs + 1), desc="Boosting") - - # for index in range(1,epochs+1): - # for index in tqdm(range(1,epochs+1), desc='Boosting'): + pbar = tqdm(range(1, epochs + 1), desc="Boosting", disable=silent) for index in pbar: logger.debug(f"epoch {index} - ") loss = 0 @@ -155,22 
+161,33 @@ def regressor(df, config, header, dataset_features, validation_df=None, process_ # --------------------------------- - logger.info(f"The best epoch is {best_epoch_idx} with {best_epoch_loss} loss value") + if silent is False: + logger.info(f"The best epoch is {best_epoch_idx} with {best_epoch_loss} loss value") models = models[0:best_epoch_idx] config["epochs"] = best_epoch_idx - logger.info( - f"MSE of {num_of_instances} instances are boosted from {boosted_from}" - f"to {best_epoch_loss} in {epochs} epochs" - ) + if silent is False: + logger.info( + f"MSE of {num_of_instances} instances are boosted from {boosted_from}" + f"to {best_epoch_loss} in {epochs} epochs" + ) return models -def classifier(df, config, header, dataset_features, validation_df=None, process_id=None): +def classifier( + df: pd.DataFrame, + config: dict, + header: str, + dataset_features: dict, + validation_df: Optional[pd.DataFrame] = None, + process_id: Optional[int] = None, + silent: bool = False, +) -> tuple: models = [] - logger.info("gradient boosting for classification") + if silent is False: + logger.info("gradient boosting for classification") epochs = config["epochs"] enableParallelism = config["enableParallelism"] @@ -182,7 +199,7 @@ def classifier(df, config, header, dataset_features, validation_df=None, process boosted_predictions = np.zeros([df.shape[0], len(classes)]) - pbar = tqdm(range(0, epochs), desc="Boosting") + pbar = tqdm(range(0, epochs), desc="Boosting", disable=silent) # store actual set, we will use this to calculate loss actual_set = pd.DataFrame(np.zeros([df.shape[0], len(classes)]), columns=classes) @@ -317,9 +334,11 @@ def classifier(df, config, header, dataset_features, validation_df=None, process # -------------------------------- - logger.info( - f"The best accuracy got in {best_accuracy_idx} epoch with the score {best_accuracy_value}" - ) + if silent is False: + logger.info( + f"The best accuracy got in {best_accuracy_idx} epoch" + f" with the score 
{best_accuracy_value}" + ) models = models[0 : best_accuracy_idx * len(classes) + len(classes)] diff --git a/chefboost/tuning/randomforest.py b/chefboost/tuning/randomforest.py index d6dfe6e..a8a1c40 100644 --- a/chefboost/tuning/randomforest.py +++ b/chefboost/tuning/randomforest.py @@ -1,7 +1,9 @@ +from typing import Optional import multiprocessing from contextlib import closing from tqdm import tqdm +import pandas as pd from chefboost.commons import functions from chefboost.training import Training @@ -10,7 +12,15 @@ # pylint: disable=unused-argument -def apply(df, config, header, dataset_features, validation_df=None, process_id=None): +def apply( + df: pd.DataFrame, + config: dict, + header: str, + dataset_features: dict, + validation_df: Optional[pd.DataFrame] = None, + process_id: Optional[int] = None, + silent: bool = False, +): models = [] num_of_trees = config["num_of_trees"] @@ -24,9 +34,10 @@ def apply(df, config, header, dataset_features, validation_df=None, process_id=N input_params = [] - pbar = tqdm(range(0, num_of_trees), desc="Bagging") + pbar = tqdm(range(0, num_of_trees), desc="Bagging", disable=silent) for i in pbar: - pbar.set_description(f"Sub decision tree {i + 1} is processing") + if silent is False: + pbar.set_description(f"Sub decision tree {i + 1} is processing") subset = df.sample(frac=1 / num_of_trees) root = 1 @@ -38,7 +49,19 @@ def apply(df, config, header, dataset_features, validation_df=None, process_id=N if parallelism_on: # parallel run input_params.append( - (subset, root, file, config, dataset_features, 0, 0, "root", i, None, process_id) + ( + subset, + root, + file, + config, + dataset_features, + 0, + 0, + "root", + i, + None, + process_id, + ) ) else: # serial run @@ -75,7 +98,7 @@ def apply(df, config, header, dataset_features, validation_df=None, process_id=N # all functions registered here # results = [] - for f in tqdm(funclist): + for f in tqdm(funclist, disable=silent): _ = f.get(timeout=100000) # this was 
branch_results # results.append(branch_results) diff --git a/tests/__pycache__/test_adaboost.cpython-38-pytest-7.1.2.pyc b/tests/__pycache__/test_adaboost.cpython-38-pytest-7.1.2.pyc new file mode 100644 index 0000000..5469e37 Binary files /dev/null and b/tests/__pycache__/test_adaboost.cpython-38-pytest-7.1.2.pyc differ diff --git a/tests/__pycache__/test_c45.cpython-38-pytest-7.1.2.pyc b/tests/__pycache__/test_c45.cpython-38-pytest-7.1.2.pyc new file mode 100644 index 0000000..83ddd63 Binary files /dev/null and b/tests/__pycache__/test_c45.cpython-38-pytest-7.1.2.pyc differ diff --git a/tests/__pycache__/test_cart.cpython-38-pytest-7.1.2.pyc b/tests/__pycache__/test_cart.cpython-38-pytest-7.1.2.pyc new file mode 100644 index 0000000..8a45a50 Binary files /dev/null and b/tests/__pycache__/test_cart.cpython-38-pytest-7.1.2.pyc differ diff --git a/tests/__pycache__/test_chaid.cpython-38-pytest-7.1.2.pyc b/tests/__pycache__/test_chaid.cpython-38-pytest-7.1.2.pyc new file mode 100644 index 0000000..1a9c50c Binary files /dev/null and b/tests/__pycache__/test_chaid.cpython-38-pytest-7.1.2.pyc differ diff --git a/tests/__pycache__/test_gbm.cpython-38-pytest-7.1.2.pyc b/tests/__pycache__/test_gbm.cpython-38-pytest-7.1.2.pyc new file mode 100644 index 0000000..5e9d7c0 Binary files /dev/null and b/tests/__pycache__/test_gbm.cpython-38-pytest-7.1.2.pyc differ diff --git a/tests/__pycache__/test_id3.cpython-38-pytest-7.1.2.pyc b/tests/__pycache__/test_id3.cpython-38-pytest-7.1.2.pyc new file mode 100644 index 0000000..f6454ca Binary files /dev/null and b/tests/__pycache__/test_id3.cpython-38-pytest-7.1.2.pyc differ diff --git a/tests/__pycache__/test_randomforest.cpython-38-pytest-7.1.2.pyc b/tests/__pycache__/test_randomforest.cpython-38-pytest-7.1.2.pyc new file mode 100644 index 0000000..f961ab0 Binary files /dev/null and b/tests/__pycache__/test_randomforest.cpython-38-pytest-7.1.2.pyc differ diff --git a/tests/__pycache__/test_regression.cpython-38-pytest-7.1.2.pyc 
b/tests/__pycache__/test_regression.cpython-38-pytest-7.1.2.pyc new file mode 100644 index 0000000..4991e46 Binary files /dev/null and b/tests/__pycache__/test_regression.cpython-38-pytest-7.1.2.pyc differ diff --git a/tests/global-unit-test.py b/tests/global-unit-test.py deleted file mode 100644 index a519828..0000000 --- a/tests/global-unit-test.py +++ /dev/null @@ -1,339 +0,0 @@ -import gc -import pandas as pd -from chefboost import Chefboost as cb -from chefboost.commons.logger import Logger - -pd.set_option("display.max_rows", 500) -pd.set_option("display.max_columns", 500) -pd.set_option("display.width", 1000) - -logger = Logger(module="tests/global-unit-test.py") - -# ---------------------------------------------- - -parallelism_cases = [True] -# parallelism_cases = [False] -# parallelism_cases = [False, True] - -if __name__ == "__main__": - - for enableParallelism in parallelism_cases: - - logger.info("*************************") - logger.info(f"enableParallelism is set to {enableParallelism}") - logger.info("*************************") - - logger.info("no config passed") - df = pd.read_csv("dataset/golf.txt") - model = cb.fit(df) - - gc.collect() - - logger.info("-------------------------") - - logger.info("Validation set case") - - df = pd.read_csv("dataset/golf.txt") - validation_df = pd.read_csv("dataset/golf.txt") - config = {"algorithm": "ID3", "enableParallelism": enableParallelism} - model = cb.fit(df, config, validation_df=validation_df) - - gc.collect() - - logger.info("-------------------------") - - logger.info("Feature importance") - # decision_rules = model["trees"][0].__dict__["__name__"]+".py" - decision_rules = model["trees"][0].__dict__["__spec__"].origin - logger.info(cb.feature_importance(decision_rules)) - - logger.info("-------------------------") - - logger.info("ID3 for nominal features and nominal target:") - df = pd.read_csv("dataset/golf.txt") - - config = {"algorithm": "ID3", "enableParallelism": enableParallelism} - model = 
cb.fit(df, config) - - validation_df = pd.read_csv("dataset/golf.txt") - - logger.info("External validation") - cb.evaluate(model, validation_df) - - cb.save_model(model) - logger.info("built model is saved to model.pkl") - - restored_model = cb.load_model("model.pkl") - logger.info("built model is restored from model.pkl") - - instance = ["Sunny", "Hot", "High", "Weak"] - prediction = cb.predict(restored_model, instance) - - logger.info(f"prediction for {instance} is {prediction}") - - gc.collect() - - logger.info("-------------------------") - - logger.info("ID3 for nominal/numeric features and nominal target:") - config = {"algorithm": "ID3", "enableParallelism": enableParallelism} - model = cb.fit(pd.read_csv("dataset/golf2.txt"), config) - - instance = ["Sunny", 85, 85, "Weak"] - prediction = cb.predict(model, instance) - logger.info(f"prediction for {instance} is {prediction}") - - gc.collect() - - logger.info("-------------------------") - - logger.info("C4.5 for nominal/numeric features and nominal target:") - config = {"algorithm": "C4.5", "enableParallelism": enableParallelism} - cb.fit(pd.read_csv("dataset/golf2.txt"), config) - - gc.collect() - - logger.info("-------------------------") - - logger.info("CART for nominal/numeric features and nominal target:") - config = {"algorithm": "CART", "enableParallelism": enableParallelism} - cb.fit(pd.read_csv("dataset/golf2.txt"), config) - - gc.collect() - - logger.info("-------------------------") - - logger.info("CHAID for nominal features and nominal target:") - config = {"algorithm": "CHAID", "enableParallelism": enableParallelism} - cb.fit(pd.read_csv("dataset/golf.txt"), config) - - gc.collect() - - logger.info("-------------------------") - - logger.info("CHAID for nominal/numeric features and nominal target:") - config = {"algorithm": "CHAID", "enableParallelism": enableParallelism} - cb.fit(pd.read_csv("dataset/golf2.txt"), config) - - gc.collect() - - logger.info("-------------------------") - - 
logger.info("regression tree for nominal features, numeric target") - config = {"algorithm": "Regression", "enableParallelism": enableParallelism} - cb.fit(pd.read_csv("dataset/golf3.txt"), config) - - gc.collect() - - logger.info("-------------------------") - - logger.info("regression tree for nominal/numeric features, numeric target") - config = {"algorithm": "Regression", "enableParallelism": enableParallelism} - cb.fit(pd.read_csv("dataset/golf4.txt"), config) - - gc.collect() - - logger.info("-------------------------") - - logger.info( - "algorithm must be regression tree for numetic target. set any other algorithm." - ) - config = {"algorithm": "ID3", "enableParallelism": enableParallelism} - cb.fit(pd.read_csv("dataset/golf4.txt"), config) - - gc.collect() - - logger.info("-------------------------") - - logger.info("ID3 for nominal features and target (large data set)") - config = {"algorithm": "ID3", "enableParallelism": enableParallelism} - model = cb.fit(pd.read_csv("dataset/car.data"), config) - - instance = ["vhigh", "vhigh", 2, "2", "small", "low"] - prediction = cb.predict(model, instance) - logger.info(prediction) - - instance = ["high", "high", "4", "more", "big", "high"] - prediction = cb.predict(model, instance) - logger.info(prediction) - - gc.collect() - - logger.info("-------------------------") - - logger.info("C4.5 for nominal features and target (large data set)") - config = {"algorithm": "C4.5", "enableParallelism": enableParallelism} - cb.fit(pd.read_csv("dataset/car.data"), config) - - gc.collect() - - logger.info("-------------------------") - - logger.info("CART for nominal features and target (large data set)") - config = {"algorithm": "CART", "enableParallelism": enableParallelism} - cb.fit(pd.read_csv("dataset/car.data"), config) - - gc.collect() - - logger.info("-------------------------") - - logger.info("CHAID for nominal features and target (large data set)") - config = {"algorithm": "CHAID", "enableParallelism": 
enableParallelism} - df = pd.read_csv("dataset/car.data") - cb.fit(df, config) - - gc.collect() - - logger.info("-------------------------") - - logger.info("Iris with regular decision tree") - config = {"algorithm": "ID3"} - df = pd.read_csv( - "dataset/iris.data", - names=["Sepal length", "Sepal width", "Petal length", "Petal width", "Decision"], - ) - model = cb.fit(df, config) - - gc.collect() - - logger.info("-------------------------") - - logger.info("Adaboost") - config = { - "algorithm": "ID3", - "enableAdaboost": True, - "num_of_weak_classifier": 10, - "enableParallelism": False, - } - df = pd.read_csv("dataset/adaboost.txt") - validation_df = df.copy() - - model = cb.fit(df, config, validation_df=validation_df) - - instance = [4, 3.5] - - gc.collect() - - logger.info("-------------------------") - - logger.info("Regular GBM") - config = { - "algorithm": "CART", - "enableGBM": True, - "epochs": 10, - "learning_rate": 1, - "enableParallelism": enableParallelism, - } - df = pd.read_csv("dataset/golf4.txt") - validation_df = pd.read_csv("dataset/golf4.txt") - model = cb.fit(df, config, validation_df=validation_df) - - instance = ["Sunny", 85, 85, "Weak"] - prediction = cb.predict(model, instance) - logger.info(f"prediction for {instance} is {prediction}") - - gc.collect() - - logger.info("-------------------------") - - logger.info("GBM for classification") - config = { - "algorithm": "ID3", - "enableGBM": True, - "epochs": 10, - "learning_rate": 1, - "enableParallelism": enableParallelism, - } - - df = pd.read_csv( - "dataset/iris.data", - names=["Sepal length", "Sepal width", "Petal length", "Petal width", "Decision"], - ) - validation_df = df.copy() - - model = cb.fit(df, config, validation_df=validation_df) - - instance = [7.0, 3.2, 4.7, 1.4] - prediction = cb.predict(model, instance) - logger.info(f"prediction for {instance} is {prediction}") - - gc.collect() - - logger.info("-------------------------") - - logger.info("Random forest") - config = { - 
"algorithm": "ID3", - "enableRandomForest": True, - "num_of_trees": 3, - "enableParallelism": enableParallelism, - } - df = pd.read_csv("dataset/car.data") - validation_df = pd.read_csv("dataset/car.data") - model = cb.fit( - pd.read_csv("dataset/car.data"), - config - # , validation_df = validation_df - ) - - logger.info("Feature importance of random forest") - decision_rules = [] - for tree in model["trees"]: - - decision_rule = tree.__dict__["__spec__"].origin - decision_rules.append(decision_rule) - - df = cb.feature_importance(decision_rules) - logger.info(df) - - instance = ["vhigh", "vhigh", 2, "2", "small", "low"] - - prediction = cb.predict(model, instance) - logger.info(f"prediction for {instance} is {prediction}") - - instance = ["high", "high", 4, "more", "big", "high"] - - prediction = cb.predict(model, instance) - logger.info(f"prediction for {instance} is {prediction}") - - gc.collect() - - logger.info("-------------------------") - - logger.info("Random forest for regression") - - config = { - "algorithm": "ID3", - "enableRandomForest": True, - "num_of_trees": 5, - "enableMultitasking": False, - "enableParallelism": enableParallelism, - } - - df = pd.read_csv("dataset/car_reg.data") - model = cb.fit(pd.read_csv("dataset/car_reg.data"), config) - - validation_df = pd.read_csv("dataset/car_reg.data") - cb.evaluate(model, validation_df) - - instance = ["high", "high", 4, "more", "big", "high"] - prediction = cb.predict(model, instance) - logger.info(f"prediction for {instance} is {prediction}") - - gc.collect() - - logger.info("-------------------------") - - logger.info("Is there any none predictions?") - config = {"algorithm": "C4.5", "enableParallelism": enableParallelism} - model = cb.fit(pd.read_csv("dataset/none_train.txt"), config) - test_set = pd.read_csv("dataset/none_test.txt") - instance = test_set.iloc[3] - logger.info(f"{instance.values} -> {cb.predict(model, instance)}") - - gc.collect() - - logger.info("-------------------------") - - 
logger.info("-------------------------") - logger.info("unit tests completed successfully...") diff --git a/tests/test_adaboost.py b/tests/test_adaboost.py new file mode 100644 index 0000000..25d928b --- /dev/null +++ b/tests/test_adaboost.py @@ -0,0 +1,27 @@ +import pandas as pd +from chefboost import Chefboost as cb +from chefboost.commons.logger import Logger + +logger = Logger(module="tests/test_adaboost.py") + + +def test_adaboost(): + config = { + "algorithm": "Regression", + "enableAdaboost": True, + "num_of_weak_classifier": 10, + "enableParallelism": False, + } + df = pd.read_csv("dataset/adaboost.txt") + validation_df = df.copy() + + model = cb.fit(df, config, validation_df=validation_df, silent=True) + + instance = [4, 3.5] + + prediction = cb.predict(model, instance) + + assert prediction == -1 + assert len(model["trees"]) > 1 + + logger.info("✅ adaboost model restoration test done") diff --git a/tests/test_c45.py b/tests/test_c45.py new file mode 100644 index 0000000..bec68ea --- /dev/null +++ b/tests/test_c45.py @@ -0,0 +1,24 @@ +import pandas as pd +from chefboost import Chefboost as cb +from chefboost.commons.logger import Logger + +logger = Logger(module="tests/test_c45.py") + + +def test_c45_for_nominal_features_and_nominal_target(): + df = pd.read_csv("dataset/golf.txt") + model = cb.fit(df, config={"algorithm": "C4.5"}, silent=True) + assert model["config"]["algorithm"] == "C4.5" + logger.info("✅ build c4.5 for nominal and numeric features and nominal target test done") + +def test_c45_for_nominal_and_numeric_features_and_nominal_target(): + df = pd.read_csv("dataset/golf2.txt") + model = cb.fit(df, config={"algorithm": "C4.5"}, silent=True) + assert model["config"]["algorithm"] == "C4.5" + logger.info("✅ build c4.5 for nominal and numeric features and nominal target test done") + +def test_large_dataset(): + df = pd.read_csv("dataset/car.data") + model = cb.fit(df, config={"algorithm": "C4.5"}, silent=True) + assert model["config"]["algorithm"] 
== "C4.5" + logger.info("✅ build c4.5 for large dataset test done") \ No newline at end of file diff --git a/tests/test_cart.py b/tests/test_cart.py new file mode 100644 index 0000000..8e1c6d9 --- /dev/null +++ b/tests/test_cart.py @@ -0,0 +1,25 @@ +import pandas as pd +from chefboost import Chefboost as cb +from chefboost.commons.logger import Logger + +logger = Logger(module="tests/test_cart.py") + + +def test_cart_for_nominal_features_and_nominal_target(): + df = pd.read_csv("dataset/golf.txt") + model = cb.fit(df, config={"algorithm": "CART"}, silent=True) + assert model["config"]["algorithm"] == "CART" + logger.info("✅ build cart for nominal features and nominal target test done") + + +def test_cart_for_nominal_and_numeric_features_and_nominal_target(): + df = pd.read_csv("dataset/golf2.txt") + model = cb.fit(df, config={"algorithm": "CART"}, silent=True) + assert model["config"]["algorithm"] == "CART" + logger.info("✅ build cart for nominal and numeric features and nominal target test done") + +def test_large_dataset(): + df = pd.read_csv("dataset/car.data") + model = cb.fit(df, config={"algorithm": "CART"}, silent=True) + assert model["config"]["algorithm"] == "CART" + logger.info("✅ build cart for large dataset test done") \ No newline at end of file diff --git a/tests/test_chaid.py b/tests/test_chaid.py new file mode 100644 index 0000000..45fba69 --- /dev/null +++ b/tests/test_chaid.py @@ -0,0 +1,26 @@ +import pandas as pd +from chefboost import Chefboost as cb +from chefboost.commons.logger import Logger + +logger = Logger(module="tests/test_chaid.py") + + +def test_c45_for_nominal_features_and_nominal_target(): + df = pd.read_csv("dataset/golf.txt") + model = cb.fit(df, config={"algorithm": "CHAID"}, silent=True) + assert model["config"]["algorithm"] == "CHAID" + logger.info("✅ build chaid for nominal features and nominal target test done") + + +def test_c45_for_nominal_and_numeric_features_and_nominal_target(): + df = 
pd.read_csv("dataset/golf2.txt") + model = cb.fit(df, config={"algorithm": "CHAID"}, silent=True) + assert model["config"]["algorithm"] == "CHAID" + logger.info("✅ build chaid for nominal and numeric features and nominal target test done") + + +def test_large_dataset(): + df = pd.read_csv("dataset/car.data") + model = cb.fit(df, config={"algorithm": "CHAID"}, silent=True) + assert model["config"]["algorithm"] == "CHAID" + logger.info("✅ build chaid for large dataset test done") diff --git a/tests/test_gbm.py b/tests/test_gbm.py new file mode 100644 index 0000000..6800f4c --- /dev/null +++ b/tests/test_gbm.py @@ -0,0 +1,48 @@ +import pandas as pd +from chefboost import Chefboost as cb +from chefboost.commons.logger import Logger + +logger = Logger(module="tests/test_gbm.py") + + +def test_gbm_regression(): + config = { + "algorithm": "Regression", + "enableGBM": True, + "epochs": 10, + "learning_rate": 1, + } + + df = pd.read_csv("dataset/golf4.txt") + validation_df = pd.read_csv("dataset/golf4.txt") + + model = cb.fit(df, config, validation_df=validation_df, silent=True) + assert model["config"]["algorithm"] == "Regression" + assert len(model["trees"]) > 1 + + features = ["Sunny", 85, 85, "Weak"] + target = 25 + prediction = cb.predict(model, features) + assert abs(prediction - target) < 1 + + +def test_gbm_classification(): + config = { + "algorithm": "ID3", + "enableGBM": True, + "epochs": 10, + "learning_rate": 1, + } + + df = pd.read_csv( + "dataset/iris.data", + names=["Sepal length", "Sepal width", "Petal length", "Petal width", "Decision"], + ) + validation_df = df.copy() + + model = cb.fit(df, config, validation_df=validation_df, silent=True) + + instance = [7.0, 3.2, 4.7, 1.4] + target = "Iris-versicolor" + prediction = cb.predict(model, instance) + assert prediction == target diff --git a/tests/test_id3.py b/tests/test_id3.py new file mode 100644 index 0000000..d83cbf9 --- /dev/null +++ b/tests/test_id3.py @@ -0,0 +1,114 @@ +import pandas as pd +from 
chefboost import Chefboost as cb +from chefboost.commons.logger import Logger + +logger = Logger(module="tests/test_id3.py") + + +def test_build_id3_with_no_config(): + df = pd.read_csv("dataset/golf.txt") + model = cb.fit(df, silent=True) + assert model["config"]["algorithm"] == "ID3" + logger.info("✅ standard id3 test done") + + +def test_build_id3_with_internal_validation_df(): + df = pd.read_csv("dataset/golf.txt") + validation_df = pd.read_csv("dataset/golf.txt") + + model = cb.fit(df, validation_df=validation_df, silent=True) + + assert model["config"]["algorithm"] == "ID3" + + validation_eval_results = model["evaluation"]["validation"] + + assert validation_eval_results.get("Accuracy", 0) > 99 + assert validation_eval_results.get("Precision", 0) > 99 + assert validation_eval_results.get("Recall", 0) > 99 + assert validation_eval_results.get("F1", 0) > 99 + assert validation_eval_results.get("Instances", 0) == validation_df.shape[0] + assert "Confusion matrix" in validation_eval_results.keys() + assert "Labels" in validation_eval_results.keys() + + # decision_rules = model["trees"][0].__dict__["__name__"]+".py" + decision_rules = model["trees"][0].__dict__["__spec__"].origin + + fi_df = cb.feature_importance(decision_rules, silent=True) + assert fi_df.shape[0] == 4 + + logger.info("✅ id3 test with internal validation data frame done") + + +def test_build_id3_with_external_validation_set(): + df = pd.read_csv("dataset/golf.txt") + model = cb.fit(df, silent=True) + + assert model["config"]["algorithm"] == "ID3" + + validation_df = pd.read_csv("dataset/golf.txt") + results = cb.evaluate(model, validation_df, silent=True) + + assert results.get("Accuracy", 0) > 99 + assert results.get("Precision", 0) > 99 + assert results.get("Recall", 0) > 99 + assert results.get("F1", 0) > 99 + assert results.get("Instances", 0) == validation_df.shape[0] + assert "Confusion matrix" in results.keys() + assert "Labels" in results.keys() + + logger.info("✅ id3 test with external 
validation data frame done") + + +def test_model_restoration(): + df = pd.read_csv("dataset/golf.txt") + model = cb.fit(df, silent=True) + assert model["config"]["algorithm"] == "ID3" + + cb.save_model(model) + + restored_model = cb.load_model("model.pkl") + + assert restored_model["config"]["algorithm"] == "ID3" + + instance = ["Sunny", "Hot", "High", "Weak"] + + prediction = cb.predict(restored_model, instance) + assert prediction == "No" + + logger.info("✅ id3 model restoration test done") + + +def test_build_id3_for_nominal_and_numeric_features_nominal_target(): + df = pd.read_csv("dataset/golf2.txt") + model = cb.fit(df, silent=True) + + assert model["config"]["algorithm"] == "ID3" + + instance = ["Sunny", 85, 85, "Weak"] + prediction = cb.predict(model, instance) + assert prediction == "No" + logger.info("✅ build id3 for nominal and numeric features and nominal target test done") + + +def test_large_data_set(): + df = pd.read_csv("dataset/car.data") + model = cb.fit(df, silent=True) + + assert model["config"]["algorithm"] == "ID3" + + instance = ["vhigh", "vhigh", 2, "2", "small", "low"] + prediction = cb.predict(model, instance) + assert prediction == "unacc" + + instance = ["high", "high", "4", "more", "big", "high"] + prediction = cb.predict(model, instance) + assert prediction == "acc" + + +def test_iris_dataset(): + df = pd.read_csv( + "dataset/iris.data", + names=["Sepal length", "Sepal width", "Petal length", "Petal width", "Decision"], + ) + model = cb.fit(df, silent=True) + assert model["config"]["algorithm"] == "ID3" diff --git a/tests/test_randomforest.py b/tests/test_randomforest.py new file mode 100644 index 0000000..8068ee0 --- /dev/null +++ b/tests/test_randomforest.py @@ -0,0 +1,55 @@ +import pandas as pd +from chefboost import Chefboost as cb +from chefboost.commons.logger import Logger + +logger = Logger(module="tests/test_randomforest.py") + + +def test_randomforest_for_classification(): + config = { + "algorithm": "ID3", + 
"enableRandomForest": True, + "num_of_trees": 3, + } + df = pd.read_csv("dataset/car.data") + + model = cb.fit(df, config, silent=True) + + assert model["config"]["algorithm"] == "ID3" + assert model["evaluation"]["train"]["Accuracy"] > 90 + + # feature importance + decision_rules = [] + for tree in model["trees"]: + decision_rule = tree.__dict__["__spec__"].origin + decision_rules.append(decision_rule) + + df = cb.feature_importance(decision_rules, silent=True) + assert df.shape[0] == 6 + + # this is not in train data + instance = ["high", "high", 4, "more", "big", "high"] + prediction = cb.predict(model, instance) + assert prediction in ["unacc", "acc"] + + instance = ["vhigh", "vhigh", 2, "2", "small", "low"] + prediction = cb.predict(model, instance) + assert prediction in ["unacc", "acc"] + + +def test_randomforest_for_regression(): + config = { + "algorithm": "ID3", + "enableRandomForest": True, + "num_of_trees": 5, + } + df = pd.read_csv("dataset/car_reg.data") + model = cb.fit(df, config, silent=True) + + assert model["evaluation"]["train"]["MAE"] < 30 + assert model["config"]["algorithm"] == "Regression" + + instance = ["high", "high", 4, "more", "big", "high"] + target = 100 + prediction = cb.predict(model, instance) + assert abs(prediction - target) < 30 diff --git a/tests/test_regression.py b/tests/test_regression.py new file mode 100644 index 0000000..35ffea4 --- /dev/null +++ b/tests/test_regression.py @@ -0,0 +1,27 @@ +import pandas as pd +from chefboost import Chefboost as cb +from chefboost.commons.logger import Logger + +logger = Logger(module="tests/test_regression.py") + + +def test_c45_for_nominal_features_and_numeric_target(): + df = pd.read_csv("dataset/golf3.txt") + _ = cb.fit(df, config={"algorithm": "Regression"}, silent=True) + logger.info("✅ build regression for nominal features and numeric target test done") + + +def test_c45_for_nominal_and_numeric_features_and_numeric_target(): + df = pd.read_csv("dataset/golf4.txt") + _ = cb.fit(df, 
config={"algorithm": "Regression"}, silent=True) + logger.info( + "✅ build regression tree for nominal and numeric features and numeric target test done" + ) + + +def test_switching_to_regression_tree(): + df = pd.read_csv("dataset/golf4.txt") + config = {"algorithm": "ID3"} + model = cb.fit(df, config, silent=True) + assert model["config"]["algorithm"] == "Regression" + logger.info("✅ switching to regression tree test done")