Calculating counts based on common spellings/pronunciations of similar names is very slow; look at optimizing/parallelizing it.
- see: https://stackoverflow.com/questions/26187759/parallelize-apply-after-pandas-groupby (a rough sketch of that pattern follows below)
- potentially look at https://github.com/jmcarpenter2/swifter
- see related blog post: https://medium.com/@jmcarpenter2/swiftapply-automatically-efficient-pandas-apply-operations-50e1058909f9
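A minimal sketch of the parallelize-groupby-apply idea from that Stack Overflow thread, assuming the per-group work can be factored into a top-level function; process_group and apply_parallel here are hypothetical placeholders, not code from this notebook:

from multiprocessing import Pool

import pandas as pd

def process_group(args):
    # hypothetical per-group computation; replace with the real work
    name, group = args
    return group

def apply_parallel(grouped, func):
    # fan the (name, group) pairs out to a worker pool, then reassemble
    with Pool() as pool:
        results = pool.map(func, list(grouped))
    return pd.concat(results)

# usage (sketch): apply_parallel(df_out.groupby(['year', 'sex']), process_group)

Note that multiprocessing pickles each group on the way to the workers (and the worker function must be importable, not defined interactively on some platforms), so this only pays off when the per-group work dominates the copying cost.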
import pandas as pd
import numpy as np
import os
import pickle
from timeit import default_timer as timer
%load_ext line_profiler
with open('babynames.pickle', 'rb') as f:
    orig_df = pickle.load(f)
orig_df.shape
with open('names_beidermorse.pickle', 'rb') as f:
    names = pickle.load(f)
names.shape
# this takes about 35 seconds
start = timer()
def f(name, bmset):
    return pd.DataFrame(zip([name] * len(bmset), list(bmset)), columns=('name', 'beidermorse'))

kv_names = pd.concat([f(n, b) for n, b in zip(names['name'], names['bmset'])])
end = timer()
print(end - start, 'seconds')
print(kv_names.shape)
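As an aside, on pandas 0.25+ the same name/code pair table could likely be built without the Python-level loop via DataFrame.explode; an untested sketch, assuming bmset holds list-like sets of codes (kv_names_alt is a hypothetical name):

kv_names_alt = (names[['name', 'bmset']]
                .explode('bmset')
                .rename(columns={'bmset': 'beidermorse'})
                .reset_index(drop=True))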
# setup test dataset
df = orig_df[orig_df.year == 1990].sample(frac=.01, random_state=2213).copy()
#df = orig_df[orig_df.year == 1990].copy()
df.sort_values(['year', 'sex', 'name'], inplace=True)
df.shape
start = timer()
calc_sound_totals(alt)
end = timer()
print(end - start, 'seconds')
%lprun -f create_df_out_n calc_sound_totals(alt)
# this is very slow; look at optimizing/parallelizing
# see: https://stackoverflow.com/questions/26187759/parallelize-apply-after-pandas-groupby
# potentially look at https://github.com/jmcarpenter2/swifter
# see related blog post: https://medium.com/@jmcarpenter2/swiftapply-automatically-efficient-pandas-apply-operations-50e1058909f9
# process each row of dataframe
def create_df_out_n(row):
    # skip rows that have already been counted
    if row['counted']:
        return
    # find matching names
    #checklist = names[names.name == row['name']].beidermorse.values[0].split()
    #find = lambda i: any(x in i for x in checklist)
    #found = names[names.bmset.map(find)].name
    # this new method takes about half the time - 15 seconds for 250 rows
    #checklist = names[names.name == row['name']].bmset.values[0]
    #found = kv_names[kv_names.beidermorse.isin(checklist)]['name'].unique()
    checklist = filt_names[filt_names.name == row['name']].beidermorse.values
    found = filt_names[filt_names.beidermorse.isin(checklist)]['name'].unique()
    # aggregate count, excluding already-counted names, for all found names into alt_n
    df_out.loc[(df_out.name == row['name']) &
               (df_out.year == row.year) &
               (df_out.sex == row.sex),
               'alt_n'] = df_out[(df_out.name.isin(found)) &
                                 (df_out.year == row.year) &
                                 (df_out.sex == row.sex) &
                                 (df_out.counted == False)]['n'].sum()
    # set counted flag for found names in group
    # ? how to update just the group ?
    df_out.loc[(df_out.name.isin(found)) & (df_out.year == row.year) & (df_out.sex == row.sex), 'counted'] = True
# create df_out_prop
def create_df_out_prop(row, gsum):
    df_out.loc[(df_out.name == row['name']) &
               (df_out.year == row.year) &
               (df_out.sex == row.sex),
               'alt_prop'] = row['alt_n'] / gsum
def calc_sound_totals():
    gdf = df_out.groupby(['year', 'sex'])
    for name, group in gdf:
        print('processing group:', name)
        g = group.sort_values('n', ascending=False).copy()
        g.apply(create_df_out_n, axis=1)
    for name, group in gdf:
        gsum = group['alt_n'].sum()
        group.apply(create_df_out_prop, axis=1, args=(gsum,))
    return df_out
df_out = df.copy()
df_out['counted'] = False
df_out['alt_n'] = 0
df_out['alt_prop'] = 0.0
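# restrict the pair table to names present in the test slice (see the timing experiments below)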
filt_names = kv_names.merge(df, on='name')[['name', 'beidermorse']]
#%lprun -f create_df_out_n calc_sound_totals()
start = timer()
out = calc_sound_totals()
end = timer()
print('took:', end - start, 'seconds')
- potentially look at https://github.com/jmcarpenter2/swifter (a hedged usage sketch follows below)
- see related blog post: https://medium.com/@jmcarpenter2/swiftapply-automatically-efficient-pandas-apply-operations-50e1058909f9
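If swifter pans out, the change would be small; a sketch of its documented accessor, assuming the package is installed:

import swifter  # registers the .swifter accessor on pandas objects

# inside calc_sound_totals, the per-row apply would become:
#   g.swifter.apply(create_df_out_n, axis=1)
# caveat: for a side-effecting, non-vectorizable function like create_df_out_n,
# swifter is likely to fall back to plain apply, so gains are not guaranteed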
def calc_sound_totals(df_in):
    df_out = df_in.copy()
    df_out['counted'] = False
    df_out['alt_n'] = 0
    df_out['alt_prop'] = 0.0

    # process each row of dataframe
    def create_df_out_n(row):
        # skip rows that have already been counted
        if row['counted']:
            return
        # find matching names
        checklist = names[names.name == row['name']].beidermorse.values[0].split()
        find = lambda i: any(x in i for x in checklist)
        found = names[names.bmset.map(find)].name
        # aggregate count, excluding already-counted names, for all found names into alt_n
        df_out.loc[(df_out.name == row['name']) &
                   (df_out.year == row.year) &
                   (df_out.sex == row.sex),
                   'alt_n'] = df_out[(df_out.name.isin(found)) &
                                     (df_out.year == row.year) &
                                     (df_out.sex == row.sex) &
                                     (df_out.counted == False)]['n'].sum()
        # set counted flag for found names in group
        # ? how to update just the group ?
        df_out.loc[(df_out.name.isin(found)) & (df_out.year == row.year) & (df_out.sex == row.sex), 'counted'] = True

    start = timer()
    gdf = df_out.groupby(['year', 'sex'])
    for name, group in gdf:
        print('processing group:', name)
        g = group.sort_values('n', ascending=False).copy()
        g.apply(create_df_out_n, axis=1)
    end = timer()
    print('create out_n', end - start, 'seconds')

    # create df_out_prop
    def create_df_out_prop(row):
        df_out.loc[(df_out.name == row['name']) &
                   (df_out.year == row.year) &
                   (df_out.sex == row.sex),
                   'alt_prop'] = row['alt_n'] / gsum

    start = timer()
    for name, group in gdf:
        gsum = group['alt_n'].sum()
        group.apply(create_df_out_prop, axis=1)
    end = timer()
    print('create out_prop', end - start, 'seconds')
    return df_out
%%timeit
rn = 'Michael'
checklist = names[names.name == rn].bmset.values[0]
found = kv_names[kv_names.beidermorse.isin(checklist)]['name'].unique()
%%timeit
rn = 'Michael'
checklist = names[names.name == rn].beidermorse.values[0].split()
find = lambda i: any(x in i for x in checklist)
found = names[names.bmset.map(find)].name
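The map(find) variant has to run the any(...) membership test over every row of names on each lookup, while the kv_names/isin variant does one hash-based membership pass over the flat pair table; that difference is presumably behind the roughly 2x speedup noted in the comments above.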
%%timeit
rn = 'Michael'
checklist = kv_names[kv_names.name == rn].beidermorse
found = kv_names[kv_names.beidermorse.isin(checklist)]['name'].unique()
kn = kv_names.values
%%timeit
rn = 'Michael'
bm = kn[np.where(kn == rn)[0]][:,1]
out = [kn[np.where(kn == x)[0]][:,0] for x in bm]
found = np.unique(np.concatenate(out, axis=0))
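Note that kn == rn compares every cell of the object array, so this numpy variant still does a full scan per lookup (plus one per code in bm); it mainly tests whether dropping pandas overhead helps at all.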
ni = kv_names.set_index('name')
bi = kv_names.set_index('beidermorse')
%%timeit
rn = 'Michael'
found = bi.loc[ni.loc[rn].beidermorse].name.unique()
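The index-based variant swaps boolean masks over whole columns for hash-based index lookups. Wrapped as a helper for reuse (find_alternates is a hypothetical name; the logic is identical to the cell above):

def find_alternates(rn):
    # ni maps name -> beidermorse codes, bi maps beidermorse code -> names;
    # both .loc calls are hash probes on an index rather than column scans
    return bi.loc[ni.loc[rn].beidermorse].name.unique()

# find_alternates('Michael')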
%%timeit
rn = 'Michael'
checklist = kv_names[kv_names.name == rn].beidermorse
found = kv_names.merge(checklist, on='beidermorse')['name'].unique()
# if I filter the list to just those in this year/group, performance is much improved.
sub = kv_names.merge(df, on='name')[['name', 'beidermorse']]
%%timeit
rn = 'Michael'
checklist = sub[sub.name == rn].beidermorse.values
found = sub[sub.beidermorse.isin(checklist)]['name'].unique()
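The two ideas should compose: filter kv_names down to the names present in the current slice, then index the filtered frame so each per-row lookup is a hash probe. An untested sketch, with hypothetical variable names:

sub_ni = sub.set_index('name')
sub_bi = sub.set_index('beidermorse')
# found = sub_bi.loc[sub_ni.loc['Michael'].beidermorse].name.unique()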
%prun -l 10 kv_names[kv_names.beidermorse.isin(['zYsDki'])]['name'].unique()
checklist = kv_names[kv_names.name == 'Michael'].beidermorse
%prun -l 10 kv_names[kv_names.beidermorse.isin(checklist)]['name'].unique()
%%timeit
sub = kv_names.merge(df, on='name')[['name', 'beidermorse']]