-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
229 lines (173 loc) · 5.97 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#####################################################################
################ numpy http://www.numpy.org/ ########################
#####################################################################
import numpy as np
# NumPy: vectorized operations and the related scientific tools built on arrays,
# e.g. np.mean(), np.std(), np.sin(), np.log(), the np.random module, etc.
# It is the Python standard for storing numerical data -> efficient, fast and clean.
help(np) # show the documentation of the whole package
help(np.mean) # show the documentation of a single function
# Matrices in Python are just numpy arrays with 2 dimensions.
# Create a matrix by passing the numpy array function a list of lists.
# Each inner list will be taken to be one row of the matrix.
x = [1, 2, 3]
y = [4, 6, 8]
# 1D array
a = np.array(x)
a.shape # (3,)
type(a)
a
# 2D array: several equivalent ways to add a second dimension
b = np.array([x]) # wrap the list in another list -> one row
b.shape # (1, 3)
b # two square brackets in the printed output mark the 2D array
c = np.array(x, ndmin=2) # ask for at least 2 dimensions -> one row
c.shape # (1, 3)
c
d = np.array([x]).T # transpose the row -> one column
d.shape # (3, 1)
d
e = np.array(x)[:, None] # index with None to append a new axis -> one column
e.shape # (3, 1)
# explicit shape
f = np.array(x).reshape(1, 3) # row vector
f.shape
f
g = np.array(y).reshape(3, 1) # column vector
g.shape
g
g-f # pair difference: (3, 1) and (1, 3) broadcast to a 3x3 array
g*f # pair product: element-wise with broadcasting, 3x3 array
np.dot(g, f) # matrix product (3, 1) x (1, 3) -> 3x3 matrix
f*g # element-wise again: same 3x3 result as g*f
np.dot(f, g) # matrix product (1, 3) x (3, 1) -> 1x1 array holding 1*4 + 2*6 + 3*8 = 40
# dot vs inner
# With 1D arrays both functions compute the same scalar product.
r = np.array(x)
s = np.array(y)
np.dot(r, s)
np.inner(r, s) # no difference with 1D array
p = np.array([[1, 2], [3, 4]])
q = np.array([[5, 3], [1, 6]])
# difference with matrix
np.dot(p, q) # rows of p times columns of q (7 = 1*5 + 2*1)
np.inner(p, q) # rows of p times rows of q (11 = 1*5 + 2*3)
# inner sums over the last axis of both operands, so transposing the second
# one recovers the matrix product. The check uses the 2x2 matrices p and q:
# the (3,) array a and the (1, 3) array b are not aligned for np.dot and
# would raise ValueError.
np.dot(p, q) == np.inner(p, q.T)
# Matrix multiplication: the inner dimensions of the two operands must match.
mat3x2 = np.array([[1, 2], [3, 4], [5, 6]])
print(mat3x2)
mat2x3 = np.array([[1, 2, 3], [4, 5, 6]])
print(mat2x3)
print(mat3x2 @ mat2x3) # matrix multiplication (3, 2) x (2, 3) -> 3x3
mat3x2.dot(mat2x3) # dot product: same result through the method form
print(mat2x3 @ mat3x2) # (2, 3) x (3, 2) -> 2x2
mat2x3.dot(mat3x2)
# The next two lines fail on purpose: (2, 3) x (2, 3) is not conformable,
# so each of them raises ValueError.
print(mat2x3 @ mat2x3) # not conformable
mat2x3.dot(mat2x3)
# Same values stored as np.matrix and as a plain 2D array: '*' means matrix
# product for np.matrix but element-wise product for np.ndarray.
# NOTE(review): the numpy documentation discourages np.matrix in favour of regular arrays.
M = np.matrix([[5, 5], [-1, 7]])
M.T*M # matrix product
V = np.array([[5, 5], [-1, 7]])
V.T*V # KO: object type affects result (element-wise product here)
np.dot(V.T, V) # OK: explicit matrix product, independent of the object type
# Sequences
t = np.linspace(0, 20, 50) # 50 evenly spaced values from 0 to 20, both ends included
t
s = np.arange(0, 24, 3) # values from 0 up to (not including) 24 in steps of 3
s
# import: read a comma-separated text file into an array
# NOTE(review): expects a file named data.txt in the working directory
dt = np.loadtxt('data.txt', delimiter=',') # mixed data types not supported
dt
# export: write the array to a text file
# NOTE(review): presumably the data/ directory must already exist - confirm before running
np.savetxt('data/array.txt', s, fmt='%i')
# fmt='%i' integer
# fmt='%10.5f' float rounded to five decimals
#######################################################################
############## pandas https://pandas.pydata.org/ ######################
#######################################################################
import pandas as pd
# data management and analysis
# two main data structures: Series and DataFrame
# Series: one-dimensional indexed array holding a single data type
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s
# DataFrame: two-dimensional table whose columns can hold different data types
index = pd.date_range('1/1/2020', periods=10) # ten consecutive days used as row labels
df = pd.DataFrame({'A': 1., # scalar: repeated on every row
'B': pd.Timestamp('20130102'), # scalar timestamp: repeated on every row
'C': pd.Series(np.random.randn(10), dtype='float32', index=index),
'D': np.array(np.random.randn(10), dtype='int32'), # random normals cast to int32
'E': pd.Categorical(['red', 'green', 'yellow', 'red', 'yellow', 'yellow', 'white', 'red', 'red', 'green']),
'F': np.float64([27.25, 70.35, 68.25, 6.30, 29.65, 8.35, 7.85, 84.25, 32.5, 26.35])},
index=index)
df
df.shape # nrows ncols
# cols
df.columns
df['E'] # get series using column index (header)
df[['E']] # get column as a one-column DataFrame
# rows
df.index
df.loc['2020-01-10'] # localize using row index (label)
df.loc[:'2020-01-05'] # label slice: the end label is included
df.loc[df['E'] == 'red'] # boolean indexing
df[df['E'] == 'red'] # same selection without .loc
# df.set_index('column')
df.loc['2020-01-02': '2020-01-05'] # slice indexing (row/col values)
df.loc[['2020-01-02', '2020-01-05'], ['C', 'E']] # explicit lists of row and column labels
df.loc['2020-01-02': '2020-01-05', ['C', 'E']] # row label slice, list of column labels
df['E'].value_counts() # frequency of each category
df.loc[3:6] # error! .loc is label-based and the row labels are dates, not integers
df[3:6] # position indexing (row/col integer indexes)
df.iloc[3:6] # same rows through the explicit position-based indexer
df.iloc['2020-01-02': '2020-01-05'] # error! .iloc is position-based and does not accept labels
df.iloc[0, 2] # get cell
df.iloc[1:4, 3:5] # get rows and cols interval
df.iloc[:, [1, 3]] # all rows, some cols
# Lists: pandas reads lists as strings
# The favorite_fruits column is built from str(list) values to reproduce
# that situation; each cell is then parsed back into a real list.
import ast

fd = pd.DataFrame({
    'name': ['tammy', 'bonny', 'helga', 'tom', 'jerome', 'hans', 'lisa', 'tyla', 'steve', 'antonie'],
    'age': [6, 8, 8, 10, 7, 9, 10, 6, 8, 5],
    'favorite_fruits': [
        str(['banana', 'mango', 'orange']),
        str(['apple', 'pear', 'banana', 'watermelon']),
        str([]),
        str(['banana', 'maracuja', 'watermelon', 'apple', 'pineapple']),
        str(['strawberry', 'raspberry']),
        str(['strawberry', 'apple', 'watermelon']),
        str(['mango', 'pineapple', 'orange']),
        str(['apple', 'pineapple']),
        str(['apple', 'banana', 'orange', 'pear']),
        str(['peach', 'strawberry']),
    ],
})
fd
# convert to lists: ast.literal_eval only accepts Python literals, so unlike
# eval it cannot execute arbitrary code hidden in a cell
fd['favorite_fruits'] = fd['favorite_fruits'].apply(ast.literal_eval)
def get_all_fruits(series) -> pd.Series:
    """Flatten a collection of item lists into a single Series of items.

    Args:
        series: Iterable whose elements are themselves iterables of items,
            e.g. a Series column where every cell holds a list.

    Returns:
        A Series with one entry per item, in encounter order and with
        duplicates kept, ready for ``unique``, ``nunique`` or
        ``value_counts``.
    """
    flat = [item for items in series for item in items]
    # An empty result gets an explicit object dtype so it does not depend on
    # the pandas version's default dtype for empty Series.
    return pd.Series(flat, dtype=None if flat else object)
# Summaries over the flattened list of every fruit mentioned in the column
get_all_fruits(fd['favorite_fruits']).nunique() # number of distinct fruits
get_all_fruits(fd['favorite_fruits']).unique() # array of the distinct fruits
get_all_fruits(fd['favorite_fruits']).value_counts() # occurrences of each fruit
# Turn each list into a row of its own: one column per list position,
# with missing values where a list is shorter than the longest one
fruits_expanded = fd['favorite_fruits'].apply(pd.Series)
fruits_expanded
def dummify_fruits(item_lists: pd.Series, unique_items) -> pd.DataFrame:
    """Build a boolean indicator table from a Series of item lists.

    Args:
        item_lists: Series whose cells are containers (e.g. lists) of items.
        unique_items: Iterable of the items to create a column for.

    Returns:
        A DataFrame with one column per entry of ``unique_items``; a cell is
        True when the corresponding row's container includes that item.
    """
    return pd.DataFrame({
        # item is bound as a default argument so each lambda keeps its own value
        item: item_lists.apply(lambda items, item=item: item in items)
        for item in unique_items
    })
# One boolean column per distinct fruit: True where the row's list contains that fruit
fruits_bool = dummify_fruits(fd['favorite_fruits'], get_all_fruits(fd['favorite_fruits']).unique())
fruits_bool
# import: read the Titanic dataset from a remote CSV file (needs network access);
# this rebinds df, replacing the DataFrame used in the indexing examples
df = pd.read_csv('https://raw.githubusercontent.com/rpalloni/dataset/master/titanic.csv')
# pd.set_option('display.max_columns', None) # full cols list
df.head() # first rows of the table
df.loc[0] # first row (the row labelled 0)
df.rename(columns={'pclass': 'class'}, inplace=True) # inplace: substitute in df itself instead of creating a copy
df.describe() # summary statistics
# mean age and mean fare for every class/sex combination
st = df.groupby(['class', 'sex']).agg({'age': 'mean', 'fare': 'mean'})
print(st)
# the same means computed through a pivot table
pv = df.pivot_table(values=['age', 'fare'], index=['class', 'sex'], aggfunc='mean')
# class by sex frequency table; margins=True adds the row/column totals
cr = pd.crosstab(df['class'], df['sex'], margins=True)
# the ten rows with the highest fare
sr = df.sort_values(['fare'], ascending=False).head(10)
# export: write the three result tables to separate sheets of one workbook.
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0; the
# context manager saves and closes the file on exit, even if a to_excel
# call raises.
with pd.ExcelWriter(path='stats.xlsx') as ExcelObject:
    pv.to_excel(ExcelObject, sheet_name='pivoted', merge_cells=False) # repeat dimension level
    cr.to_excel(ExcelObject, sheet_name='crossed')
    sr.to_excel(ExcelObject, sheet_name='sorted')