-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleaning.py
74 lines (59 loc) · 2.17 KB
/
cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import numpy as np
import time
import pandas as pd
import pickle
class Cleaner:
    """Load pickled records into one DataFrame and offer basic cleaning helpers.

    Every file in ``list_of_paths`` is unpickled, converted with
    ``pd.DataFrame(...)`` and the results are concatenated into ``self.df``
    with a fresh 0..n-1 index.

    Attributes:
        list_of_paths: The paths handed to the constructor, stored as given.
        df: The combined DataFrame; replaced by ``drop_redundant_data``.
    """

    def __init__(self, list_of_paths):
        """Store the paths and immediately load them into ``self.df``.

        Args:
            list_of_paths: Iterable of paths to pickle files.

        Raises:
            ValueError: If ``list_of_paths`` yields no paths.
        """
        super().__init__()
        self.list_of_paths = list_of_paths
        self.df = None
        self.init()

    def init(self):
        """(Re)build ``self.df`` from the pickle files in ``self.list_of_paths``.

        Raises:
            ValueError: If there are no paths to load.
        """
        frames = []
        for path in self.list_of_paths:
            with open(path, 'rb') as f:
                # SECURITY: pickle.load executes arbitrary code embedded in
                # the file. Only feed this class files from a trusted source.
                data = pickle.load(f)
            frames.append(pd.DataFrame(data))
        if not frames:
            # pd.concat([]) raises ValueError as well, but with an opaque
            # message; keep the exception type and say what went wrong.
            raise ValueError(
                'list_of_paths must contain at least one pickle file path')
        self.df = pd.concat(frames, ignore_index=True)

    def save_df_as_image(self, path_to_save_the_image):
        """Save a heatmap of missing values in ``self.df`` as a PNG file.

        The heatmap is drawn from ``self.df.isnull()`` using a two-colour
        palette, on a dedicated 20x20 inch figure that is closed afterwards.

        Args:
            path_to_save_the_image: Target path WITHOUT extension; ``.png``
                is appended. May be a ``str`` or a ``pathlib.Path``.
        """
        import matplotlib
        # Non-interactive backend so this also works without a display.
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        matplotlib.style.use('ggplot')
        import seaborn as sns

        print('visualizing...')
        colors = ['#b00b69', '#2b2b2b']
        # Use an explicit figure instead of mutating the global
        # rcParams['figure.figsize'] and drawing on the implicit current axes.
        fig, ax = plt.subplots(figsize=(20, 20))
        try:
            sns.heatmap(
                self.df.isnull(), cmap=sns.color_palette(colors), ax=ax)
            print('...going to save the image...')
            fig.savefig(f'{path_to_save_the_image}.png')
            print('...saved')
        finally:
            # Release the figure; otherwise pyplot keeps it alive and a
            # second call would draw on top of the previous heatmap.
            plt.close(fig)

    def save_df_as_csv(self, path_to_save_the_df):
        """Write ``self.df`` to a CSV file (the index is included).

        Args:
            path_to_save_the_df: Target path WITHOUT extension; ``.csv`` is
                appended. May be a ``str`` or a ``pathlib.Path``.
        """
        self.df.to_csv(f'{path_to_save_the_df}.csv')

    def drop_redundant_data(self):
        """Drop rows with a repeated ``timestamp`` or a null ``article_text``.

        For each ``timestamp`` value only the first row is kept; afterwards
        rows whose ``article_text`` is null are removed. The shape is printed
        after every step and the result replaces ``self.df``. The surviving
        rows keep their original index labels.
        """
        df = self.df
        print('original shape:')
        print(df.shape)

        print('deleted all duplicates:')
        df = df.loc[~df['timestamp'].duplicated(keep='first')]
        print(df.shape)

        print('deleted all items without an article:')
        df = df[df['article_text'].notnull()]
        print(df.shape)

        self.df = df

    def show_basic_info(self):
        """Print shape, dtypes, and the numeric / non-numeric column names."""
        df = self.df
        print(df.shape)
        print(df.dtypes)

        numeric_cols = df.select_dtypes(include=[np.number]).columns.values
        print(numeric_cols)

        non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns.values
        print(non_numeric_cols)