# Data cleaning
from pandas import read_csv
from numpy import loadtxt, unique
from urllib.request import urlopen
# define the location of the dataset
path = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/oil-spill.csv'
# load the dataset
data = loadtxt(urlopen(path), delimiter=',')
# summarize the number of unique values in each column
for i in range(data.shape[1]):
    print(i, len(unique(data[:, i])))
# We can see that column 22 has only one unique value
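# A minimal sketch (same `data` as above) that finds such columns
# programmatically instead of reading the printout by eye:
single_value_cols = [i for i in range(data.shape[1]) if len(unique(data[:, i])) == 1]
print(single_value_cols)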
# We can do the same using pandas
data = read_csv(path, header=None)
print(data.nunique())
# Now we delete the column(s) which have only one unique value
counts = data.nunique()
columns_to_delete = [i for i, v in enumerate(counts) if v == 1]
print(columns_to_delete)
new = data.drop(columns_to_delete, axis=1, inplace=False)
# With inplace=True, drop() modifies `data` and returns None, so there is
# nothing to store in `new`; with inplace=False (the default) it returns a
# modified copy and leaves `data` untouched.
print(data.shape)
print(new.shape)
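# A sketch of the in-place variant (left commented out so `data` stays
# intact for the steps below):
# data.drop(columns_to_delete, axis=1, inplace=True)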
# We can also remove columns whose unique values cover less than 1% of the rows
columns_to_delete = [i for i, v in enumerate(counts) if (float(v) / data.shape[0] * 100) < 1]
print(columns_to_delete)
new = data.drop(columns_to_delete, axis=1, inplace=False)
print(new.shape)
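# A short sketch (reusing `counts` from above) that reports, for each
# low-uniqueness column, its unique count and the percentage of rows it
# covers, which helps when choosing the 1% cutoff:
for i, v in enumerate(counts):
    percentage = float(v) / data.shape[0] * 100
    if percentage < 1:
        print('%d, %d, %.1f%%' % (i, v, percentage))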
# Remove columns with low variance
from sklearn.feature_selection import VarianceThreshold
df = read_csv(path, header=None)
# split data into inputs and outputs
data = df.values
X = data[:, :-1]
y = data[:, -1]
print(X.shape, y.shape)
# define the transform
transform = VarianceThreshold()
# The default value of the `threshold` parameter in `VarianceThreshold` is 0
# transform the input data
X_sel = transform.fit_transform(X)
print(X_sel.shape)
# As the threshold is 0, only the zero-variance column (column 22) is removed
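# A hedged sketch: sweeping larger thresholds to see how many features
# survive each one (assumes the same X as above; the grid is illustrative):
from numpy import arange
for t in arange(0.0, 0.55, 0.05):
    n_features = VarianceThreshold(threshold=t).fit_transform(X).shape[1]
    print('threshold=%.2f, features=%d' % (t, n_features))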
# Removal of duplicate rows
path = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv'
# load the dataset
df = read_csv(path, header=None)
# calculate duplicates
dups = df.duplicated()
# report if there are any duplicates
print(dups.any())
# list all duplicate rows
print(df[dups])
print(df.shape)
# delete duplicate rows
df.drop_duplicates(inplace=True)
print(df.shape)
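# A hedged sketch of the `keep` parameter: duplicated(keep=False) marks every
# copy of a duplicated row, not just the later ones, which is handy when you
# want to inspect all copies before deleting (reloads the data first):
df = read_csv(path, header=None)
print(df[df.duplicated(keep=False)])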