-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathData Wrangling - Create Binning
31 lines (23 loc) · 1.44 KB
/
Data Wrangling - Create Binning
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
# Column names for the automobile table. The raw file is read without a header
# row (header=None below), so the names are attached after loading.
headers = [
    "symboling",
    "normalized-losses",
    "make",
    "fuel-type",
    "aspiration",
    "num-of-doors",
    "body-style",
    "drive-wheels",
    "engine-location",
    "wheel-base",
    "length",
    "width",
    "height",
    "curb-weight",
    "engine-type",
    "num-of-cylinders",
    "engine-size",
    "fuel-system",
    "bore",
    "stroke",
    "compression-ratio",
    "horsepower",
    "peak-rpm",
    "city-mpg",
    "highway-mpg",
    "price",
]

# Fetch the raw data from the UCI repository and label its columns.
url = r"https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df = pd.read_csv(url, header=None)
df.columns = headers
# The file marks missing values with "?"; turn them into NaN so pandas treats
# them as nulls.
df.replace("?", np.nan, inplace=True)

# Convert horsepower to float once, then fill the gaps with the column mean
# (rounded to 2 decimals).
df["horsepower"] = df["horsepower"].astype("float")
avg_horsepower = round(df["horsepower"].mean(), 2)
df["horsepower"] = df["horsepower"].fillna(avg_horsepower)
# Create 3 bins of equal width using NumPy's
# linspace(start_value, end_value, numbers_generated) function.
# start_value is the minimum horsepower and end_value is the maximum.
# Three bins of equal width need 4 dividers, so numbers_generated = 4.
bins = np.linspace(df["horsepower"].min(), df["horsepower"].max(), 4)
group_names = ['Low', 'Medium', 'High']
# pd.cut builds right-closed intervals and, by default, leaves the first
# interval open on the left. Because the first edge IS the column minimum,
# the row(s) with the minimum horsepower would fall outside every bin and be
# labelled NaN. include_lowest=True closes the first interval on the left so
# those rows land in 'Low'.
df['horsepower-binned'] = pd.cut(
    df['horsepower'], bins, labels=group_names, include_lowest=True
)
print(df[['horsepower', 'horsepower-binned']].head(20))
# NOTE(review): this prints counts of the raw horsepower values, not of the
# bins; if per-bin counts were intended, use df["horsepower-binned"] -- confirm.
print(df["horsepower"].value_counts())