Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
jrudar authored May 19, 2023
1 parent 89abd33 commit 93106db
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 28 deletions.
9 changes: 6 additions & 3 deletions LANDMark/LANDMark.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def __init__(
max_features: float = 0.80,
min_gain: float = 0.0,
impurity: str = "gain",
q: float = 1.5,
use_oracle: bool = True,
use_lm_l2: bool = True,
use_lm_l1: bool = True,
Expand All @@ -26,7 +27,7 @@ def __init__(
use_etc: bool = True,
etc_max_depth: int = 5,
etc_max_trees: int = 128,
max_samples_tree: int = -1,
resampler = None,
bootstrap: bool = False,
n_jobs: int = 4,
):
Expand All @@ -37,6 +38,7 @@ def __init__(
self.max_features = max_features
self.min_gain = min_gain
self.impurity = impurity
self.q = q
self.use_oracle = use_oracle
self.use_lm_l2 = use_lm_l2
self.use_lm_l1 = use_lm_l1
Expand All @@ -45,7 +47,7 @@ def __init__(
self.use_etc = use_etc
self.etc_max_depth = etc_max_depth
self.etc_max_trees = etc_max_trees
self.max_samples_tree = max_samples_tree
self.resampler = resampler
self.bootstrap = bootstrap

self.n_jobs = n_jobs
Expand All @@ -72,6 +74,7 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> LANDMarkClassifier:
max_features=self.max_features,
min_gain=self.min_gain,
impurity=self.impurity,
q = self.q,
use_oracle=self.use_oracle,
bootstrap=self.bootstrap,
use_lm_l2=self.use_lm_l2,
Expand All @@ -82,7 +85,7 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> LANDMarkClassifier:
etc_max_depth=self.etc_max_depth,
etc_max_trees=self.etc_max_trees,
),
max_samples_tree=self.max_samples_tree,
resampler=self.resampler,
n_estimators=self.n_estimators,
class_names=self.classes_,
n_jobs=self.n_jobs,
Expand Down
95 changes: 79 additions & 16 deletions LANDMark/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,23 +16,78 @@
)


def purity_function(N, N_lab, L, R, y, purity_fun="gain"):
# Calculate Information Gain
if purity_fun == "gain":
L_outcome, L_counts = np.unique(y[L], return_counts=True)
L_prob = L_counts / L_counts.sum()
H_L = entropy(L_prob) * (L_counts.sum() / N)
def tsallis_fun(N, N_lab, L, R, y, mode, q):
    """Score a binary split using Tsallis entropy of order ``q``.

    The Tsallis entropy of a probability vector ``p`` is computed here as
    ``(1 / (1 - q)) * (sum(p ** q) - 1)``.  The score is the parent entropy
    minus the size-weighted entropies of the two children; in ratio mode that
    gain is divided by ``1 + entropy`` of the split proportions themselves.

    Parameters
    ----------
    N : number
        Denominator used to turn child sizes into proportions
        (presumably the number of samples at the node -- confirm with caller).
    N_lab : array-like
        Class probabilities of the parent node.
    L, R : index or boolean mask
        Selectors applied to ``y`` to obtain the left / right child labels.
    y : numpy.ndarray
        Labels of the samples at the node.
    mode : str
        ``"tsallis"`` returns the plain gain; any other value returns the
        gain ratio.
    q : float
        Tsallis order.

    Returns
    -------
    float
        The gain, or the gain ratio, of the split.
    """
    # 1 / (1 - q) is undefined at q == 1, so that case is handed to the
    # Shannon-entropy scorer, keeping the gain vs. gain-ratio choice.
    if q == 1:
        if "ratio" in mode.split("-"):
            return entropy_fun(N, N_lab, L, R, y, "gain-ratio")

        return entropy_fun(N, N_lab, L, R, y, "gain")

    scaler = 1 / (1 - q)

    def _tsallis(prob):
        # Tsallis entropy of one probability vector.
        return scaler * (np.power(prob, q).sum() - 1)

    # Left child: class proportions and share of the node.
    _, L_counts = np.unique(y[L], return_counts=True)
    L_frac = L_counts.sum() / N
    H_L = L_frac * _tsallis(L_counts / L_counts.sum())

    # Right child: class proportions and share of the node.
    _, R_counts = np.unique(y[R], return_counts=True)
    R_frac = R_counts.sum() / N
    H_R = R_frac * _tsallis(R_counts / R_counts.sum())

    H_parent = _tsallis(N_lab)

    IG = H_parent - H_R - H_L

    if mode == "tsallis":
        return IG

    # The "1 +" keeps the denominator away from zero when one child holds
    # (almost) all of the samples.
    norm_factor = 1 + _tsallis(np.asarray([L_frac, R_frac]))

    return IG / norm_factor


def entropy_fun(N, N_lab, L, R, y, mode):
    """Score a binary split with entropy-based gain or gain ratio.

    The gain is ``entropy(N_lab)`` minus the size-weighted entropies of the
    labels selected by ``L`` and ``R``.  When ``mode`` is ``"gain"`` that value
    is returned as is; for any other ``mode`` it is divided by
    ``1 + entropy([left share, right share])``.

    ``entropy`` is the module-level helper already in scope
    (presumably ``scipy.stats.entropy`` -- confirm against the file imports).

    Parameters
    ----------
    N : number
        Denominator used to turn child sizes into proportions.
    N_lab : array-like
        Class distribution of the parent node.
    L, R : index or boolean mask
        Selectors applied to ``y`` for the left / right child.
    y : numpy.ndarray
        Labels of the samples at the node.
    mode : str
        ``"gain"`` for plain gain, anything else for gain ratio.

    Returns
    -------
    float
        The gain, or the gain ratio, of the split.
    """

    def _child_term(selector):
        # Weighted entropy of one child, plus that child's share of the node.
        _, class_counts = np.unique(y[selector], return_counts=True)
        child_size = class_counts.sum()
        share = child_size / N
        return entropy(class_counts / child_size) * share, share

    left_term, left_share = _child_term(L)
    right_term, right_share = _child_term(R)

    info_gain = entropy(N_lab) - left_term - right_term

    if mode == "gain":
        return info_gain

    # Gain ratio: normalise by one plus the entropy of the split proportions.
    split_info = entropy(np.asarray([left_share, right_share]))

    return info_gain / (1 + split_info)


def purity_function(N, N_lab, L, R, y, purity_fun, q):
    """Dispatch a split-quality computation to the requested criterion.

    Parameters
    ----------
    N, N_lab, L, R, y
        Forwarded unchanged to the selected scoring function.
    purity_fun : str
        ``"gain"`` or ``"gain-ratio"`` select ``entropy_fun``;
        ``"tsallis"`` or ``"tsallis-gain-ratio"`` select ``tsallis_fun``.
    q : float
        Tsallis order; only used by the Tsallis criteria.

    Returns
    -------
    float
        The score returned by the selected scoring function.

    Raises
    ------
    ValueError
        If ``purity_fun`` is not one of the four supported names.  Without
        this check the function would return ``None`` and the failure would
        surface later, far from the misconfigured parameter.
    """
    if purity_fun in ("gain", "gain-ratio"):
        return entropy_fun(N, N_lab, L, R, y, purity_fun)

    if purity_fun in ("tsallis", "tsallis-gain-ratio"):
        return tsallis_fun(N, N_lab, L, R, y, purity_fun, q)

    raise ValueError(
        "Unknown impurity criterion %r; expected one of 'gain', 'gain-ratio', "
        "'tsallis' or 'tsallis-gain-ratio'." % (purity_fun,)
    )


class PredictData:
def __init__(self, node_lab):
Expand Down Expand Up @@ -72,6 +127,7 @@ def get_split(self,
max_features,
min_gain,
impurity,
q,
use_lm_l2,
use_lm_l1,
use_nnet,
Expand Down Expand Up @@ -123,7 +179,7 @@ def get_split(self,
L = np.where(D > 0, True, False)
R = np.where(D <= 0, True, False)

IG = purity_function(counts_sum, counts_prob, L, R, y, impurity)
IG = purity_function(counts_sum, counts_prob, L, R, y, impurity, q)

self.gain = IG

Expand All @@ -135,6 +191,7 @@ def get_split(self,
max_features = max_features,
min_gain = min_gain,
impurity = impurity,
q = q,
use_lm_l2 = use_lm_l2,
use_lm_l1 = use_lm_l1,
use_nnet = use_nnet,
Expand All @@ -153,6 +210,7 @@ def get_split(self,
max_features = max_features,
min_gain = min_gain,
impurity = impurity,
q = q,
use_lm_l2 = use_lm_l2,
use_lm_l1 = use_lm_l1,
use_nnet = use_nnet,
Expand Down Expand Up @@ -192,7 +250,7 @@ def get_split(self,
# Calculate Information Gain
if X_L_n > 0 and X_R_n > 0:
IG = purity_function(
counts_sum, counts_prob, L, R, y, impurity
counts_sum, counts_prob, L, R, y, impurity, q
)

gains.append(IG)
Expand All @@ -219,7 +277,7 @@ def get_split(self,
# Calculate Information Gain
if X_L_n > 0 and X_R_n > 0:
IG = purity_function(
counts_sum, counts_prob, L, R, y, impurity
counts_sum, counts_prob, L, R, y, impurity, q
)

gains.append(IG)
Expand All @@ -244,7 +302,7 @@ def get_split(self,
# Calculate Information Gain
if X_L_n > 0 and X_R_n > 0:
IG = purity_function(
counts_sum, counts_prob, L, R, y, impurity
counts_sum, counts_prob, L, R, y, impurity, q
)

gains.append(IG)
Expand Down Expand Up @@ -274,7 +332,7 @@ def get_split(self,
# Calculate Information Gain
if X_L_n > 0 and X_R_n > 0:
IG = purity_function(
counts_sum, counts_prob, L, R, y, impurity
counts_sum, counts_prob, L, R, y, impurity, q
)

gains.append(IG)
Expand Down Expand Up @@ -312,6 +370,7 @@ def get_split(self,
max_features = max_features,
min_gain = min_gain,
impurity = impurity,
q = q,
use_lm_l2 = use_lm_l2,
use_lm_l1 = use_lm_l1,
use_nnet = use_nnet,
Expand All @@ -330,6 +389,7 @@ def get_split(self,
max_features = max_features,
min_gain = min_gain,
impurity = impurity,
q = q,
use_lm_l2 = use_lm_l2,
use_lm_l1 = use_lm_l1,
use_nnet = use_nnet,
Expand Down Expand Up @@ -361,6 +421,7 @@ def __init__(
max_features,
min_gain,
impurity,
q,
use_oracle,
bootstrap,
use_lm_l2,
Expand All @@ -376,6 +437,7 @@ def __init__(
self.max_features = max_features
self.min_gain = min_gain
self.impurity = impurity
self.q = q
self.use_oracle = use_oracle
self.bootstrap = bootstrap
self.use_lm_l2 = use_lm_l2
Expand Down Expand Up @@ -408,6 +470,7 @@ def fit(self, X, y):
max_features = self.max_features,
min_gain = self.min_gain,
impurity = self.impurity,
q = self.q,
use_lm_l2 = self.use_lm_l2,
use_lm_l1 = self.use_lm_l1,
use_nnet = self.use_nnet,
Expand Down
16 changes: 7 additions & 9 deletions LANDMark/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,17 @@
##########################################################################################
# For Bagging Classifier
from sklearn.base import ClassifierMixin, BaseEstimator, clone
from sklearn.utils import resample
from scipy.special import softmax
from joblib import Parallel, delayed, parallel_backend


def _parallel_build(estimator, X, y, max_samples_tree):
if X.shape[0] <= max_samples_tree or max_samples_tree == -1:
def _parallel_build(estimator, X, y, resampler):
if isinstance(resampler, type(None)):
X_trf = X
y_trf = y

else:
X_trf, y_trf = resample(
X, y, replace=True, n_samples=max_samples_tree, stratify=y
)
X_trf, y_trf = clone(resampler).fit_resample(X, y)

trained_estimator = estimator.fit(X_trf, y_trf)

Expand All @@ -25,10 +22,10 @@ def _parallel_build(estimator, X, y, max_samples_tree):

class Ensemble(ClassifierMixin, BaseEstimator):
def __init__(
self, base_estimator, max_samples_tree, n_estimators, class_names, n_jobs
self, base_estimator, resampler, n_estimators, class_names, n_jobs
):
self.base_estimator = base_estimator
self.max_samples_tree = max_samples_tree
self.resampler = resampler
self.n_estimators = n_estimators
self.classes_ = class_names
self.n_jobs = n_jobs
Expand All @@ -37,7 +34,7 @@ def fit(self, X, y):

self.estimators_ = Parallel(n_jobs=self.n_jobs)(
delayed(_parallel_build)(
clone(self.base_estimator), X, y, self.max_samples_tree
clone(self.base_estimator), X, y, self.resampler
)
for i in range(self.n_estimators)
)
Expand Down Expand Up @@ -98,6 +95,7 @@ def predict_proba(self, X):


##########################################################################################
# For Neural Network Models
import tensorflow as tf
import keras.backend as K

Expand Down

0 comments on commit 93106db

Please sign in to comment.