-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathclusterbycolumn.py
49 lines (33 loc) · 1.21 KB
/
clusterbycolumn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import pandas as pd
def group(df, column='endpoint'):
    """Order the rows of a DataFrame by ``column`` and group them on it.

    ``df`` is not modified; sorting produces a new frame, which is then
    grouped. The resulting DataFrameGroupBy object is returned.
    """
    ordered = df.sort_values(by=column)
    return ordered.groupby(column)
def aggregate(dfgroupby, value, function):
    """Aggregate one selection of a grouped frame.

    Selects ``value`` from the DataFrameGroupBy object ``dfgroupby`` and
    applies ``function`` (e.g. a list of aggregation functions) to it via
    ``agg``. The aggregated result is returned.
    """
    selected = dfgroupby[value]
    return selected.agg(function)
def merge(df1, df2, column='endpoint'):
    """Join two DataFrames and return the merged result.

    The values in ``column`` of ``df1`` are matched against the index of
    ``df2``.
    """
    return pd.merge(
        left=df1,
        right=df2,
        left_on=column,
        right_index=True,
    )
def reducedim(df, cols):
    """Delete the given columns from a DataFrame in place and return it.

    ``df`` is the DataFrame to reduce; it is modified in place and the
    same object is returned. ``cols`` is an iterable of column labels to
    remove.

    Raises KeyError if any label in ``cols`` is not a column of ``df``.
    In that case ``df`` is left completely untouched, rather than having
    only the labels that preceded the missing one removed.
    """
    # A single drop call validates every label before anything is removed,
    # so a bad label can no longer leave the caller's frame half-reduced.
    # list() keeps the original "iterate over cols" semantics for any
    # iterable input.
    df.drop(columns=list(cols), inplace=True)
    return df
def uniq(df):
    """Return the DataFrame produced by ``df.drop_duplicates()``.

    The call is made without ``inplace``, so the result is a new frame
    with duplicate rows dropped.
    """
    return df.drop_duplicates()