Skip to content

Commit

Permalink
Updates
Browse files Browse the repository at this point in the history
minor updates (loading bars etc)
  • Loading branch information
Metalkiler committed Jul 16, 2020
1 parent 7bf7f3a commit b63b784
Show file tree
Hide file tree
Showing 10 changed files with 59 additions and 65 deletions.
26 changes: 7 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,36 +23,24 @@ This method results in 689 binary inputs, which is much less than the 10690 bina

It is possible to apply these transformations to specific columns only instead of the full dataset (follow the example).

New Feature :

# Installation

## Stable Version
To install this package please run the following command

``` cmd
pip install cane
```
## Beta Version

This version contains pre-release builds of Cane with new functions that the stable version does not yet have, so that users can try them and give feedback.

BETA Version

[x] - New function called multicolumn (for PCP and IDF only). This function will aggregate 2 or more columns into a single one and apply the transformation to it. Afterwards it will map the transformation obtained into the disaggregated columns.

More to come!


# Installation

## Stable Version
To install this package please run the following command

``` cmd
pip install cane==0.0.1.7.7b1
pip install cane
```


# Suggestions and feedback

Any feedback will be appreciated.
Expand Down Expand Up @@ -104,7 +92,7 @@ dataH4 = cane.one_hot(df, column_prefix='column', n_coresJob=2



#specific example with multicolumn BETA ONLY!
#specific example with multicolumn
x2 = [k for s in ([k] * n for k, n in [('a', 50),
('b', 10),
('c', 20),
Expand Down
14 changes: 9 additions & 5 deletions cane/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,22 @@ This method results in 689 binary inputs, which is much less than the 10690 bina

It is possible to apply these transformations to specific columns only instead of the full dataset (follow the example).

New Feature :


[x] - New function called multicolumn (for PCP and IDF only). This function will aggregate 2 or more columns into a single one and apply the transformation to it. Afterwards it will map the transformation obtained into the disaggregated columns.



# Installation

## Stable Version
To install this package please run the following command

``` cmd
pip install cane
```

# Suggestions and feedback
Expand Down Expand Up @@ -84,8 +92,7 @@ dataH4 = cane.one_hot(df, column_prefix='column', n_coresJob=2




#specific example with multicolumn
#specific example with multicolumn
x2 = [k for s in ([k] * n for k, n in [('a', 50),
('b', 10),
('c', 20),
Expand All @@ -110,9 +117,6 @@ print("multicolumn idf \n",dataIDF2)






#Time Measurement in 10 runs
print("Time Measurement in 10 runs (unicore)")
OT = timeit.timeit(lambda:cane.one_hot(df, column_prefix='column', n_coresJob=1),number = 10)
Expand Down
16 changes: 7 additions & 9 deletions cane/build/lib/cane/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from pqdm.processes import pqdm
from functools import partial
import itertools
from tqdm import tqdm


def __pcp_single__(f, perc_inner=0.05, mergeCategoryinner="Others"):
Expand Down Expand Up @@ -90,11 +91,12 @@ def pcp(dataset=pd.DataFrame(), perc=0.05, mergeCategory="Others", n_coresJob=1,


def pcp_multicolumn(dataset=pd.DataFrame(), perc=0.05, mergeCategory="Others",
columns_use=None):
columns_use=None, disableLoadBar=True):
"""
Similarly to the normal PCP, this function merges the given columns into a single one and applies the PCP transformation to it.
Next, it applies the transformation obtained to each of the original (disaggregated) columns, so that they all share it.
:param disableLoadBar: If True, the progress (loading) bar is hidden; set to False to show it (default = True)
:param columns_use: Specific columns to apply transformation.
:param mergeCategory: Category for merging the data (by default "Others")
:param dataset: dataset to transform
Expand Down Expand Up @@ -126,25 +128,21 @@ def pcp_multicolumn(dataset=pd.DataFrame(), perc=0.05, mergeCategory="Others",

d = __pcp_single__(dfTesting, perc_inner=perc, mergeCategoryinner=mergeCategory)
dic = {v: [i for i in np.unique(v)][0] for _, v in d.items()}
for column in columns_use:
for column in tqdm(columns_use, desc="Transformation", total=len(columns_use), disable=disableLoadBar):
TransformedData[column] = TransformedData[column].map(dic)
TransformedData[column] = TransformedData[column].fillna(mergeCategory) # because of others
# dfFinal = pd.concat([i for i in d], axis=1)
# dfFinal.columns = columns_use
# dfFinal = pd.concat([dfFinal, TransformedData[TransformedData.columns.difference(columns_use, sort=False)]],
# axis=1,
# sort=True)

return TransformedData


def idf_multicolumn(dataset, columns_use=None):
def idf_multicolumn(dataset, columns_use=None, disableLoadBar=True):
"""
The Inverse Document Frequency (IDF) uses f(x)= log(n/f_x),
where n is the length of x and f_x is the frequency of x.
Next it will apply the transformation into the disaggregated columns sharing
the transformation obtained previously
:param disableLoadBar: If True, the progress (loading) bar is hidden; set to False to show it (default = True)
:param columns_use: List of columns to use
:param dataset: dataset to transform
Expand All @@ -170,7 +168,7 @@ def idf_multicolumn(dataset, columns_use=None):
dfTesting = pd.Series([y for x in mergedColumn for y in x], name="X")

d = __idf_single_dic__(dfTesting)
for column in columns_use:
for column in tqdm(columns_use, desc="Transformation", total=len(columns_use), disable=disableLoadBar):
TransformedData[column] = TransformedData[column].replace(d)
return TransformedData

Expand Down
16 changes: 10 additions & 6 deletions cane/cane.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: cane
Version: 0.0.1.7.7
Version: 2.0.1
Summary: Cane - Categorical Attribute traNsformation Environment
Home-page: https://github.com/Metalkiler/Cane-Categorical-Attribute-traNsformation-Environment
Author: Luís Miguel Matos, Paulo Cortez, Rui Mendes
Expand Down Expand Up @@ -31,14 +31,22 @@ Description: # Cane - Categorical Attribute traNsformation Environment

It is possible to apply these transformations to specific columns only instead of the full dataset (follow the example).

New Feature :


[x] - New function called multicolumn (for PCP and IDF only). This function will aggregate 2 or more columns into a single one and apply the transformation to it. Afterwards it will map the transformation obtained into the disaggregated columns.



# Installation

## Stable Version
To install this package please run the following command

``` cmd
pip install cane


```

# Suggestions and feedback
Expand Down Expand Up @@ -92,8 +100,7 @@ Description: # Cane - Categorical Attribute traNsformation Environment




#specific example with multicolumn
#specific example with multicolumn
x2 = [k for s in ([k] * n for k, n in [('a', 50),
('b', 10),
('c', 20),
Expand All @@ -118,9 +125,6 @@ Description: # Cane - Categorical Attribute traNsformation Environment






#Time Measurement in 10 runs
print("Time Measurement in 10 runs (unicore)")
OT = timeit.timeit(lambda:cane.one_hot(df, column_prefix='column', n_coresJob=1),number = 10)
Expand Down
17 changes: 9 additions & 8 deletions cane/cane.egg-info/requires.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
bounded-pool-executor
numpy
pandas
pqdm
python-dateutil
pytz
tqdm
typing-extensions
bounded-pool-executor==0.0.3
numpy==1.18.4
pandas==1.0.4
pqdm==0.1.0
python-dateutil==2.8.1
pytz==2020.1
tqdm==4.46.0
typing-extensions==3.7.4.2
pqdm==0.1.0
16 changes: 7 additions & 9 deletions cane/cane/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from pqdm.processes import pqdm
from functools import partial
import itertools
from tqdm import tqdm


def __pcp_single__(f, perc_inner=0.05, mergeCategoryinner="Others"):
Expand Down Expand Up @@ -90,11 +91,12 @@ def pcp(dataset=pd.DataFrame(), perc=0.05, mergeCategory="Others", n_coresJob=1,


def pcp_multicolumn(dataset=pd.DataFrame(), perc=0.05, mergeCategory="Others",
columns_use=None):
columns_use=None, disableLoadBar=True):
"""
Similarly to the normal PCP, this function merges the given columns into a single one and applies the PCP transformation to it.
Next, it applies the transformation obtained to each of the original (disaggregated) columns, so that they all share it.
:param disableLoadBar: If True, the progress (loading) bar is hidden; set to False to show it (default = True)
:param columns_use: Specific columns to apply transformation.
:param mergeCategory: Category for merging the data (by default "Others")
:param dataset: dataset to transform
Expand Down Expand Up @@ -126,25 +128,21 @@ def pcp_multicolumn(dataset=pd.DataFrame(), perc=0.05, mergeCategory="Others",

d = __pcp_single__(dfTesting, perc_inner=perc, mergeCategoryinner=mergeCategory)
dic = {v: [i for i in np.unique(v)][0] for _, v in d.items()}
for column in columns_use:
for column in tqdm(columns_use, desc="Transformation", total=len(columns_use), disable=disableLoadBar):
TransformedData[column] = TransformedData[column].map(dic)
TransformedData[column] = TransformedData[column].fillna(mergeCategory) # because of others
# dfFinal = pd.concat([i for i in d], axis=1)
# dfFinal.columns = columns_use
# dfFinal = pd.concat([dfFinal, TransformedData[TransformedData.columns.difference(columns_use, sort=False)]],
# axis=1,
# sort=True)

return TransformedData


def idf_multicolumn(dataset, columns_use=None):
def idf_multicolumn(dataset, columns_use=None, disableLoadBar=True):
"""
The Inverse Document Frequency (IDF) uses f(x)= log(n/f_x),
where n is the length of x and f_x is the frequency of x.
Next it will apply the transformation into the disaggregated columns sharing
the transformation obtained previously
:param disableLoadBar: If True, the progress (loading) bar is hidden; set to False to show it (default = True)
:param columns_use: List of columns to use
:param dataset: dataset to transform
Expand All @@ -170,7 +168,7 @@ def idf_multicolumn(dataset, columns_use=None):
dfTesting = pd.Series([y for x in mergedColumn for y in x], name="X")

d = __idf_single_dic__(dfTesting)
for column in columns_use:
for column in tqdm(columns_use, desc="Transformation", total=len(columns_use), disable=disableLoadBar):
TransformedData[column] = TransformedData[column].replace(d)
return TransformedData

Expand Down
Binary file added cane/dist/cane-2.0.1-py3-none-any.whl
Binary file not shown.
Binary file added cane/dist/cane-2.0.1.tar.gz
Binary file not shown.
17 changes: 9 additions & 8 deletions cane/req.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
bounded-pool-executor
numpy
pandas
pqdm
python-dateutil
pytz
tqdm
typing-extensions
bounded-pool-executor==0.0.3
numpy==1.18.4
pandas==1.0.4
pqdm==0.1.0
python-dateutil==2.8.1
pytz==2020.1
tqdm==4.46.0
typing-extensions==3.7.4.2
pqdm==0.1.0
2 changes: 1 addition & 1 deletion cane/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


setuptools.setup(name='cane',
version='0.0.1.7.7',
version='2.0.1',
description='Cane - Categorical Attribute traNsformation Environment',
author='Luís Miguel Matos, Paulo Cortez, Rui Mendes',
license='MIT',
Expand Down

0 comments on commit b63b784

Please sign in to comment.