Skip to content

Commit 8119d93

Browse files
committed
Update 0.1.6
- ANAI Open Source Build 6 - Updated Documentation - df_loader can now take kwargs related to pandas; they shall be given while we are creating ANAI objects in the form of the df_kwargs argument - Added option to show graphs while explaining ANAI Models - Fit method will run automatically if ANAI is run through anai.run(); if Regression or Classification are used separately, fit shall be called - Explain method now returns the result in the form of a dataframe - Removed unnecessary import from Predictor - Added more error handling - Now Preprocessor can be called without target var - Stats Summary will drop columns which have 100% missing values - Added support for legacy data loading: while calling anai.load(), if the legacy arg is True then it will give a pandas-loaded dataframe instead of modin.pandas Signed-off-by: Arsh <lucifer78908@gmail.com>
1 parent 5de8127 commit 8119d93

File tree

6 files changed

+57
-33
lines changed

6 files changed

+57
-33
lines changed

anai/__init__.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -293,11 +293,14 @@ def __task(df, target):
293293
return False
294294

295295

296-
def load(df_filepath, **df_kwargs):
296+
def load(df_filepath, legacy =False, **df_kwargs):
297297
"""Loads a dataframe from a filepath.
298298
299299
Args:
300300
df_filepath (str): Filepath of the dataframe to be loaded.
301+
legacy (bool, optional): If True, loads the dataframe using pandas.read_csv.
302+
If False, loads the dataframe using modin.pandas.read_csv.
303+
Defaults to False.
301304
df_kwargs (dict): Keyword arguments to be passed to df_loader function.
302305
303306
Returns:
@@ -307,12 +310,13 @@ def load(df_filepath, **df_kwargs):
307310

308311
suppress = False
309312
if type(df_filepath) is str:
310-
df = __df_loader_single(df_filepath, suppress=False, **df_kwargs)
313+
df = __df_loader_single(df_filepath, suppress=False, legacy = legacy, **df_kwargs)
311314
elif type(df_filepath) is list:
312315
print(Fore.YELLOW + "Loading Data [*]\n")
313316
df = pd.concat(
314317
[
315-
__df_loader_single(df_filepath[i], suppress=True, **df_kwargs)
318+
__df_loader_single(
319+
df_filepath[i], suppress=True, legacy=legacy, **df_kwargs)
316320
for i in range(len(df_filepath))
317321
]
318322
)

anai/preprocessing/__init__.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import category_encoders as ce
88
import matplotlib.pyplot as plt
99
import modin
10-
import modin.pandas as pd
10+
import pandas as pd
1111
import numpy as np
1212
import seaborn as sns
1313
from anai.preprocessing import *
@@ -28,7 +28,7 @@ class Preprocessor:
2828
def __init__(
2929
self,
3030
dataset,
31-
target: str,
31+
target: str = None,
3232
except_columns: list = [],
3333
):
3434
""" Initialize the Preprocessor class.
@@ -56,8 +56,9 @@ def __init__(
5656

5757
self.encoder = Encoder()
5858
self.scaler = Scaler()
59-
self.features = self.__dataset.drop(self.target, axis=1)
60-
self.labels = self.__dataset[self.target]
59+
if self.target:
60+
self.features = self.__dataset.drop(self.target, axis=1)
61+
self.labels = self.__dataset[self.target]
6162

6263
def prepare(self, features, labels, test_size, random_state, smote, k_neighbors):
6364
"""

anai/preprocessing/statistics/__init__.py

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import numpy as np
2-
import modin.pandas as pd
32
from scipy.stats import shapiro
43
from dateutil.parser import parse
54
from fuzzywuzzy import fuzz
@@ -31,41 +30,55 @@ def is_date(string, fuzzy=False):
3130
return True
3231
except ValueError:
3332
return False
33+
except OverflowError:
34+
return False
35+
except Exception as e:
36+
return False
3437

3538

3639
def dtype(df, col):
37-
if df[col].dtype == "O":
38-
if not is_date(df[col].iloc[0]):
39-
return "Categorical"
40-
elif is_date(df[col].iloc[0]):
41-
return "Time Series"
42-
elif df[col].dtype == "int64" or df[col].dtype == "float64":
43-
return "Numeric"
44-
else:
40+
try:
41+
if df[col].dtype == "O":
42+
if not is_date(df[col].iloc[0]):
43+
return "Categorical"
44+
elif is_date(df[col].iloc[0]):
45+
return "Time Series"
46+
elif df[col].dtype == "int64" or df[col].dtype == "float64":
47+
return "Numeric"
48+
else:
49+
return "Unknown"
50+
except:
4551
return "Unknown"
4652

4753

4854
def dtype_ver(df, col):
49-
if df[col].dtype == "O":
50-
if not is_date(df[col].iloc[0]):
51-
return "Categorical", ""
52-
elif is_date(df[col].iloc[0]):
53-
return "Categorical", "Time Series"
54-
elif df[col].dtype == "int64" or df[col].dtype == "float64":
55-
return "Numeric", ""
56-
else:
55+
try:
56+
if df[col].dtype == "O":
57+
if not is_date(df[col].iloc[0]):
58+
return "Categorical", ""
59+
elif is_date(df[col].iloc[0]):
60+
return "Categorical", "Time Series"
61+
elif df[col].dtype == "int64" or df[col].dtype == "float64":
62+
return "Numeric", ""
63+
else:
64+
return "Unknown", ""
65+
except Exception as e:
5766
return "Unknown", ""
5867

68+
5969
def shap(df, col):
6070
return "{:0.2f}".format(
6171
float(shapiro(df[col])[0]) if df[col].dtype != "O" else "NA"
6272
)
6373

74+
6475
def most_frequent_values(df, col):
6576
return (
66-
df[col].value_counts()[:1].index.tolist()[0] if df[col].dtype == "O" else "NA"
77+
df[col].value_counts()[:1].index.tolist()[
78+
0] if df[col].dtype == "O" else "NA"
6779
)
6880

81+
6982
def column_stats_summary(df, col):
7083
if "identi" in col.lower():
7184
return {
@@ -159,7 +172,8 @@ def column_stats_summary(df, col):
159172

160173
def data_stats_summary(df):
161174
anom = AnomalyDetector()
162-
df2 = df.fillna(df.mean())
175+
df2 = df.dropna(axis=1, how='all')
176+
df2 = df2.fillna(df2.mean())
163177
X = []
164178
for i in df2.columns:
165179
if dtype_ver(df2, i)[0] == "Numeric":

anai/utils/connectors/data_handler.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@ def __df_loader_single(
88
obj=None,
99
objfilepath=None,
1010
suppress=False,
11-
df_kwargs = {}
11+
legacy=False,
12+
**df_kwargs
1213
):
13-
kwargs = df_kwargs
1414
df = None
1515
flag = 0
1616
if obj is None:
@@ -62,17 +62,17 @@ def __df_loader_single(
6262
print(
6363
Fore.RED + "Data Loading Failed [", "\u2717", "]\n"
6464
) if not suppress else None
65-
return df
65+
return df._to_pandas() if legacy else df
6666

6767

68-
def df_loader(df_filepath, obj=None, objfilepath=None, suppress=False, df_kwargs={}):
68+
def df_loader(df_filepath, obj=None, objfilepath=None, suppress=False, df_kwargs={}, legacy=False):
6969
if type(df_filepath) is str:
70-
df = __df_loader_single(df_filepath, obj, objfilepath, suppress, **df_kwargs)
70+
df = __df_loader_single(df_filepath, obj, objfilepath, suppress, legacy, **df_kwargs)
7171
elif type(df_filepath) is list:
7272
print(Fore.YELLOW + "Loading Data [*]\n")
7373
df = pd.concat(
7474
[
75-
__df_loader_single(df_filepath[i], obj, objfilepath, True, **df_kwargs)
75+
__df_loader_single(df_filepath[i], obj, objfilepath, True,legacy, **df_kwargs)
7676
for i in range(len(df_filepath))
7777
]
7878
)

docs/Features.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@
1313
df = anai.load("data/bodyPerformance.csv", df_kwargs={"header": None})
1414
prep = Preprocessor(dataset=df, target="class", except_columns=['weight_kg'])
1515

16+
### Data Loading
17+
Load data from a file
18+
df = anai.load("data/bodyPerformance.csv", df_kwargs={"header": None}, legacy=False)
19+
Returns a modin.pandas dataframe (pass legacy=True to get a plain pandas dataframe instead)
20+
1621
### Available Preprocessing Methods
1722
#### Data Summary
1823
Gives a summary of the data.

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
],
2222
include=["anai.*", "anai"],
2323
),
24-
version="0.1.6-alpha-1",
24+
version="0.1.6",
2525
license="Apache License 2.0",
2626
description="Automated ML",
2727
url="https://github.com/Revca-ANAI/ANAI",

0 commit comments

Comments
 (0)