Skip to content

Commit

Permalink
#5 Loading users data as parquet file. Removed tmp_data references wh…
Browse files Browse the repository at this point in the history
…ere not required
  • Loading branch information
oislen committed Sep 22, 2024
1 parent f10ade2 commit 755f047
Showing 1 changed file with 11 additions and 11 deletions.
22 changes: 11 additions & 11 deletions report/qa.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ pd.set_option('display.max_columns', None)
# Columns that pd.read_csv is asked to parse as dates.
parse_dates = ['registration_date', 'transaction_date']
# NOTE(review): date_parser is not passed to any call in the visible lines — confirm whether it is still needed.
date_parser = lambda x: datetime.strptime(x, '%Y-%m-%d')
# Load the CSV at cons.fpath_randomtelecomtransdata, parsing the date columns listed above.
data = pd.read_csv(cons.fpath_randomtelecomtransdata, parse_dates=parse_dates)
# NOTE(review): userdata is assigned twice from the same path, first as CSV and then as parquet.
# These look like the before/after sides of a diff hunk; the parquet read is the assignment that takes effect — confirm.
userdata = pd.read_csv(cons.fpath_randomtelecomusersdata)
userdata = pd.read_parquet(cons.fpath_randomtelecomusersdata)
# show head of data
data.head()
Expand All @@ -46,7 +46,7 @@ There should be exactly one unique userid for every UID.
#| label: nunique-userids-per-uid
# Count the distinct userid values observed for each uid.
nunique_userids_per_uid = data.groupby(by='uid').agg({'userid':'nunique'})
# Plot the distribution of those per-uid counts as a 20-bin histogram.
sns.histplot(data=nunique_userids_per_uid,x='userid', bins = 20)
# NOTE(review): tmp_data is not defined in the visible source; this looks like the pre-change
# side of a diff hunk that the `data` line below replaces — confirm before running.
tmp_data['userid'].notnull().value_counts()
# Tally rows where userid is present (True) versus missing (False).
data['userid'].notnull().value_counts()
```

### Unique Fullnames per UID
Expand Down Expand Up @@ -97,7 +97,7 @@ data['email_domain'].notnull().value_counts()

### Unique Device Hash per UID

A UID should have 1 to 3 devices. Note, device hash can be missing if the user uses an unregistered device.
A UID should have 1 to 3 devices.

```{python}
#| label: nunique-devices-per-uid
Expand All @@ -114,7 +114,7 @@ A UID should have 1 to 2 cards, with an overall distribution less than the corre
#| label: nunique-cards-per-uid
# Count the distinct card_hash values observed for each uid.
nunique_cards_per_uid = data.groupby(by='uid').agg({'card_hash':'nunique'})
# Plot the distribution of those per-uid counts as a 20-bin histogram.
sns.histplot(data=nunique_cards_per_uid,x='card_hash', bins = 20)
# NOTE(review): tmp_data is not defined in the visible source; this looks like the pre-change
# side of a diff hunk that the `data` line below replaces — confirm before running.
tmp_data['card_hash'].notnull().value_counts()
# Tally rows where card_hash is present (True) versus missing (False).
data['card_hash'].notnull().value_counts()
```

### Unique IP Hash per UID
Expand All @@ -124,26 +124,26 @@ A UID should have between 1 and 10 ips.
```{python}
#| label: nunique-ips-per-uid
# Count the distinct ip_hash values observed for each uid.
nunique_ips_per_uid = data.groupby(by='uid').agg({'ip_hash':'nunique'})
# NOTE(review): the histogram is drawn twice (default bins, then bins = 10) and the null tally
# appears once on tmp_data and once on data. These look like the before/after sides of a diff
# hunk; tmp_data is not defined in the visible source — confirm only the later pair is intended.
sns.histplot(data=nunique_ips_per_uid,x='ip_hash')
tmp_data['ip_hash'].notnull().value_counts()
# Plot the distribution of the per-uid counts as a 10-bin histogram.
sns.histplot(data=nunique_ips_per_uid,x='ip_hash', bins = 10)
# Tally rows where ip_hash is present (True) versus missing (False).
data['ip_hash'].notnull().value_counts()
```

### Unique Application Hash per UID

```{python}
#| label: nunique-apps-per-uid
# Count the distinct application_hash values observed for each uid.
nunique_apps_per_uid = data.groupby(by='uid').agg({'application_hash':'nunique'})
# NOTE(review): the next two lines look like the pre-change side of a diff hunk (default bins,
# tmp_data not defined in the visible source); they are left untouched — confirm they are obsolete.
sns.histplot(data=nunique_apps_per_uid,x='application_hash')
tmp_data['ip_hash'].notnull().value_counts()
# Plot the distribution of the per-uid counts as a 10-bin histogram.
sns.histplot(data=nunique_apps_per_uid,x='application_hash', bins = 10)
# Tally rows where application_hash is present (True) versus missing (False).
# This chunk reports on application_hash, so the null check must inspect that column
# rather than ip_hash, which the IP-hash chunk already covers.
data['application_hash'].notnull().value_counts()
```

### Unique Transaction Hash per UID

```{python}
#| label: nunique-transactions-per-uid
# The label above is specific to this chunk: `nunique-ips-per-uid` is already used by the
# IP-hash chunk, and chunk labels need to be unique within a document.
# Count the distinct transaction_hash values observed for each uid.
# NOTE(review): the result is stored under the name nunique_ips_per_uid although it holds
# transaction_hash counts; the name is kept in case code outside this view reads it.
nunique_ips_per_uid = data.groupby(by='uid').agg({'transaction_hash':'nunique'})
# NOTE(review): the next two lines look like the pre-change side of a diff hunk (default bins,
# tmp_data not defined in the visible source); they are left untouched — confirm they are obsolete.
sns.histplot(data=nunique_ips_per_uid,x='transaction_hash')
tmp_data['transaction_hash'].notnull().value_counts()
# Plot the distribution of the per-uid counts as a 10-bin histogram.
sns.histplot(data=nunique_ips_per_uid,x='transaction_hash', bins=10)
# Replace transaction_hash with a 1/0 presence flag, then sum transaction_amount for each
# (flag, transaction_payment_method) pair; dropna=False keeps groups whose key is null.
data.assign(transaction_hash=data['transaction_hash'].notnull().astype(int)).groupby(by=['transaction_hash', 'transaction_payment_method'], as_index=False, dropna=False).agg({'transaction_amount':'sum'})
```

## Transaction
Expand Down Expand Up @@ -177,7 +177,7 @@ Each transaction hash should have a single transaction payment method associated
# Count the distinct transaction_payment_method values per transaction_hash, sorted ascending.
nunique_paymentmethod_per_trans = data.groupby(by=['transaction_hash']).agg({'transaction_payment_method':'nunique'}).sort_values('transaction_payment_method')
# Plot the distribution of those per-transaction counts as a 20-bin histogram.
sns.histplot(data=nunique_paymentmethod_per_trans,x='transaction_payment_method', bins = 20)
# Pick up to five transaction hashes whose payment-method count is exactly zero.
# The rows are filtered on the column itself: indexing the frame with a boolean frame
# (df[df == 0]) only masks values to NaN and keeps every row, so taking .index[:5] from it
# would return the first five hashes regardless of their count.
zero_method_hashes = nunique_paymentmethod_per_trans.loc[nunique_paymentmethod_per_trans['transaction_payment_method'] == 0].index[:5]
# Show the transactions belonging to those hashes.
data.loc[data['transaction_hash'].isin(zero_method_hashes), :]
# NOTE(review): nunique_paymentchannel_per_trans is not defined in the visible source and this
# line looks like the pre-change side of a diff hunk; it is left untouched. It uses the same
# df[df == 0].index pattern, so it likely has the same selection issue — confirm.
data.loc[data['transaction_hash'].isin(nunique_paymentchannel_per_trans[(nunique_paymentchannel_per_trans == 0)].index[:5]), :]
data.loc[data['transaction_hash'].isin(zero_method_hashes), :]
# Row count and total transaction_amount per (card_type, transaction_payment_method) pair;
# dropna=False keeps groups whose key is null.
data.groupby(by=['card_type', 'transaction_payment_method'], as_index=False, dropna=False).agg({'transaction_hash':'size', 'transaction_amount':'sum'})
```

Expand Down

0 comments on commit 755f047

Please sign in to comment.