-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpre-process.py
132 lines (108 loc) · 4.71 KB
/
pre-process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/python3
import numpy as np
import pandas as pd
from shutil import get_terminal_size
pd.set_option('display.width', get_terminal_size()[0])
# ## LOOK AT DATA
df1 = pd.read_csv("details.csv"); #print(df1)
df2 = pd.read_csv("details_fixed.csv"); #print(df2)
#print(df2.describe())
df3 = pd.read_csv("payments.csv"); print(df3)
df4 = pd.read_csv("profiling.csv"); print(df4)
## MERGE BY ID
df23 = df2.merge(df3,on = 'ID')
df234 = df23.merge(df4,on = 'ID'); print(df234.describe())
## CHECK FOR MISSING VALUES
def missing(data):
cols = data.columns
for (i,col) in enumerate(cols):
print("Column %s - missing values = " %(col), data[col].isnull().sum())
missing(df234)
## SUSPECT PLAYERS
df5 = pd.read_csv("suspect.csv"); print(df5); print(df5.describe())
print(df234.loc[df234['ID'] == 100095464]) # LOOKS LIKE THESE ALREADY IN OTHER DATAFRAME
df2345 = pd.merge(df234, df5, how='outer', on='ID');
df2345['Target_ml'] = df2345['Target_ml'].replace(np.nan, 0)
df2345['Target_ml'] = df2345['Target_ml'].astype(int)
print(df2345); print(df2345.describe())
## SAVE DATA
#df2345.to_csv('all_data.csv', index = False)
## PLOTS HISTOGRAMS OF FEATURES
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.ticker as mticker
import os
import sys
df = df2345.copy()
#df = pd.read_csv("all_data.csv");
#print(df.columns)
#paras = ['e_23456'] # TO TEST
paras = ['Life_Time', 'Age', 'CountDeposit', 'CountWithdrawal', 'TotalDeposits', 'TotalWithdrawal', 'CountPaymentMethod', 'DifferentMethodWithdrawals', 'IP_Counts', 'e_11234', 'e_23456', 'e_34454', 'e_43568','e_64645']
for (i, para) in enumerate(paras):
xmin = min(df[para]);
xmax = max(df[para]);
if para == 'CountPaymentMethod':
desired_bin_size = 1
elif para == 'IP_Counts':
desired_bin_size = 1
elif para == 'e_64645':
desired_bin_size = 1
else:
desired_bin_size = int((xmax-xmin)/10)
data = df[para]
min_val = np.min(data); max_val = np.max(data)
min_boundary = -1.0 * (min_val % desired_bin_size - min_val)
max_boundary = max_val - max_val % desired_bin_size + desired_bin_size
n_bins = int((max_boundary - min_boundary) / desired_bin_size) + 1
bins = np.linspace(min_boundary, max_boundary, n_bins)
plt.rcParams.update({'font.size': 14})
plt.figure(figsize=(6,4))
ax = plt.gca()
plt.setp(ax.spines.values(),linewidth=2)
text = '%s' %(para)
plt.ylabel('Number', size=14); plt.xlabel(text, size=14)
sus = df.loc[df['Target_ml'] == 1]
non = df.loc[df['Target_ml'] == 0]; #print(len(sus),len(non))
ax.hist(non[para], bins=bins, color="w", edgecolor='grey', linewidth=3);
mean = np.mean(non[para]); std = np.std(non[para])
ymin, ymax = plt.ylim(); #print(ymax)
if xmax < 10:
text = "\u03BC = %1.2f, \u03C3 = %1.2f" %(mean,std); xoff = 3.9
elif xmax >10 and mean <= 100:
text = "\u03BC = %1.1f, \u03C3 = %1.1f" %(mean,std); xoff = 2.9
elif xmax >100 and mean <= 1000:
text = "\u03BC = %1.1f, \u03C3 = %1.0f" %(mean,std);xoff = 2.5
elif xmax > 1000 and mean <= 10000:
text = "\u03BC = %1.0f, \u03C3 = %1.0f" %(mean,std); xoff = 3.5
else:
text = "\u03BC = %1.0f, \u03C3 = %1.0f" %(mean,std); xoff = 2.5
x_pos = xmax-((xmax-xmin)/xoff); y_pos = ymax; y_skip = ymax-((ymax-ymin)/3);
plt.text(x_pos,y_pos, text, fontsize = 12, c = 'k', horizontalalignment='left',verticalalignment='top')
ax.hist(sus[para], bins=bins, color="w", edgecolor='r', linewidth=3);
mean = np.mean(sus[para]); std = np.std(sus[para])
if xmax < 10:
text = "\u03BC = %1.2f, \u03C3 = %1.2f" %(mean,std); xoff = 3.9
elif xmax >10 and mean < 100:
text = "\u03BC = %1.1f, \u03C3 = %1.1f" %(mean,std); xoff = 2.9
elif xmax >100 and mean < 1000:
text = "\u03BC = %1.1f, \u03C3 = %1.0f" %(mean,std);xoff = 2.5
else:
text = "\u03BC = %1.0f, \u03C3 = %1.0f" %(mean,std);xoff = 2.2
plt.text(x_pos,y_pos-y_skip, text, fontsize = 12, c = 'r', horizontalalignment='left',verticalalignment='top')
ax.set_yscale('log');
def update_ticks(z, pos):
if z ==1:
return '1 '
elif z >1 and z <1000:
return '%d' %(z)
elif z < 1 and z > 0.001:
return z
else:
return '10$^{%1.0f}$' %(np.log10(z))
ax.yaxis.set_major_formatter(mticker.FuncFormatter(update_ticks))
plt.tight_layout(pad=0.1)
plot = "histo_%s" %(para); png = "%s.png" %(plot);
eps = "convert %s %s.eps; mv %s.eps media/." % (png, plot,plot);
plt.savefig(png); os.system(eps);print("Plot written to", png);
#plt.show()
plt.clf(); plt.cla(); plt.close()