Patch for report/qa.qmd (Quarto QA report with Python chunks).

Changes carried by this patch:
  * load the users dataset with pd.read_parquet instead of pd.read_csv
  * use `data` instead of `tmp_data` in the per-column null-count checks
  * pass an explicit bin count to the ip / application / transaction histograms
  * replace the transaction_hash null count with a grouped sum of
    transaction_amount by (transaction_hash present, transaction_payment_method)
  * shorten the device-hash description
  * point the second zero-payment-method lookup at nunique_paymentmethod_per_trans

Review notes:
  * The application-hash chunk now counts nulls in `application_hash`; the
    added line previously checked `ip_hash`, unlike every sibling chunk, which
    checks its own column. Because an added line changed, the post-image blob
    id on the `index` line presumably no longer matches the patched file.
  * Blank context lines were restored so that every hunk matches the line
    counts in its `@@` header; their exact positions are assumed - confirm
    against the target file before applying.
  * NOTE(review): the transaction-hash chunk keeps the label
    `nunique-ips-per-uid` (untouched context), duplicating the ip chunk's label.
  * NOTE(review): in the last hunk the added line is identical to the context
    line directly above it.

diff --git a/report/qa.qmd b/report/qa.qmd
index e7b5515..0801b7a 100644
--- a/report/qa.qmd
+++ b/report/qa.qmd
@@ -28,7 +28,7 @@ pd.set_option('display.max_columns', None)
 parse_dates = ['registration_date', 'transaction_date']
 date_parser = lambda x: datetime.strptime(x, '%Y-%m-%d')
 data = pd.read_csv(cons.fpath_randomtelecomtransdata, parse_dates=parse_dates)
-userdata = pd.read_csv(cons.fpath_randomtelecomusersdata)
+userdata = pd.read_parquet(cons.fpath_randomtelecomusersdata)
 
 # show head of data
 data.head()
@@ -46,7 +46,7 @@ There shoule be exactly one unique userid for every UID.
 #| label: nunique-userids-per-uid
 nunique_userids_per_uid = data.groupby(by='uid').agg({'userid':'nunique'})
 sns.histplot(data=nunique_userids_per_uid,x='userid', bins = 20)
-tmp_data['userid'].notnull().value_counts()
+data['userid'].notnull().value_counts()
 ```
 
 ### Unique Fullnames per UID
@@ -97,7 +97,7 @@ data['email_domain'].notnull().value_counts()
 
 ### Unique Device Hash per UID
 
-A UID should have 1 to 3 devices. Note, device hash can be missing if the user uses an unregistered device.
+A UID should have 1 to 3 devices.
 
 ```{python}
 #| label: nunique-devices-per-uid
@@ -114,7 +114,7 @@ A UID should have 1 to 2 cards, with an overall distribution less than the corre
 #| label: nunique-cards-per-uid
 nunique_cards_per_uid = data.groupby(by='uid').agg({'card_hash':'nunique'})
 sns.histplot(data=nunique_cards_per_uid,x='card_hash', bins = 20)
-tmp_data['card_hash'].notnull().value_counts()
+data['card_hash'].notnull().value_counts()
 ```
 
 ### Unique IP Hash per UID
@@ -124,8 +124,8 @@ A UID should have between 1 and 10 ips.
 ```{python}
 #| label: nunique-ips-per-uid
 nunique_ips_per_uid = data.groupby(by='uid').agg({'ip_hash':'nunique'})
-sns.histplot(data=nunique_ips_per_uid,x='ip_hash')
-tmp_data['ip_hash'].notnull().value_counts()
+sns.histplot(data=nunique_ips_per_uid,x='ip_hash', bins = 10)
+data['ip_hash'].notnull().value_counts()
 ```
 
 ### Unique Application Hash per UID
@@ -133,8 +133,8 @@ tmp_data['ip_hash'].notnull().value_counts()
 ```{python}
 #| label: nunique-apps-per-uid
 nunique_apps_per_uid = data.groupby(by='uid').agg({'application_hash':'nunique'})
-sns.histplot(data=nunique_apps_per_uid,x='application_hash')
-tmp_data['ip_hash'].notnull().value_counts()
+sns.histplot(data=nunique_apps_per_uid,x='application_hash', bins = 10)
+data['application_hash'].notnull().value_counts()
 ```
 
 ### Unique Transaction Hash per UID
@@ -142,8 +142,8 @@ tmp_data['ip_hash'].notnull().value_counts()
 ```{python}
 #| label: nunique-ips-per-uid
 nunique_ips_per_uid = data.groupby(by='uid').agg({'transaction_hash':'nunique'})
-sns.histplot(data=nunique_ips_per_uid,x='transaction_hash')
-tmp_data['transaction_hash'].notnull().value_counts()
+sns.histplot(data=nunique_ips_per_uid,x='transaction_hash', bins=10)
+data.assign(transaction_hash=data['transaction_hash'].notnull().astype(int)).groupby(by=['transaction_hash', 'transaction_payment_method'], as_index=False, dropna=False).agg({'transaction_amount':'sum'})
 ```
 
 ## Transaction
@@ -177,7 +177,7 @@ Each transaction hash should have a single transaction payment method associated
 nunique_paymentmethod_per_trans = data.groupby(by=['transaction_hash']).agg({'transaction_payment_method':'nunique'}).sort_values('transaction_payment_method')
 sns.histplot(data=nunique_paymentmethod_per_trans,x='transaction_payment_method', bins = 20)
 data.loc[data['transaction_hash'].isin(nunique_paymentmethod_per_trans[(nunique_paymentmethod_per_trans == 0)].index[:5]), :]
-data.loc[data['transaction_hash'].isin(nunique_paymentchannel_per_trans[(nunique_paymentchannel_per_trans == 0)].index[:5]), :]
+data.loc[data['transaction_hash'].isin(nunique_paymentmethod_per_trans[(nunique_paymentmethod_per_trans == 0)].index[:5]), :]
 data.groupby(by=['card_type', 'transaction_payment_method'], as_index=False, dropna=False).agg({'transaction_hash':'size', 'transaction_amount':'sum'})
 ```
 