Skip to content

Commit

Permalink
#5 added value frequency plots and investigated most frequent users
Browse files Browse the repository at this point in the history
  • Loading branch information
oislen committed Jun 10, 2023
1 parent 3e9aaf8 commit 0d91a33
Showing 1 changed file with 56 additions and 2 deletions.
58 changes: 56 additions & 2 deletions report/qa.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,70 @@
```{python}
import os
import sys
from time import time
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# make <parent of the working directory>/scripts importable before importing cons
root_dir = os.path.dirname(os.getcwd())
sys.path.append(os.path.join(root_dir, 'scripts'))
import cons
# do not truncate columns when a dataframe is displayed
pd.set_option('display.max_columns', None)
# load the csv located at the path configured in cons
data = pd.read_csv(cons.fpath_randomtelecomdata)
```


```{python}
# preview the first rows of the loaded data
data.head()
```

```{python}
def valfreqplot(data, col, bins = 50):
    """Plot a histogram of how often the distinct values of a column occur.

    The rows of ``data`` are counted per distinct value of ``col``; the
    histogram is drawn over those per-value row counts. The x axis is the
    number of rows ``n`` belonging to a value and the y axis is the number
    of distinct values falling into each bin of ``n``.

    Parameters
    ----------
    data : pandas.DataFrame
        Data containing the column ``col``.
    col : str
        Name of the column to analyse; also used as the plot title.
    bins : int, default 50
        Forwarded to ``seaborn.histplot`` as its ``bins`` argument.

    Returns
    -------
    int
        Always 0.
    """
    # number of rows (n) for every distinct value of col
    val_freq = data.groupby(by=col).size()
    fig, ax = plt.subplots()
    # draw the histogram over n itself so the plotted variable agrees with
    # the 'n' / 'count' axis labels set below
    sns.histplot(x=val_freq, color='steelblue', ax=ax, bins=bins)
    ax.set(xlabel='n', ylabel='count', title=col)
    plt.show()
    return 0
```

# Frequency Plots

```{python}
# columns to plot, in the order the plots appear in the report
freq_cols = [
    'uid',
    'firstname',
    'lastname',
    'registration_date',
    'registration_country_code',
    'email_domain',
    'device_hash',
    'card_hash',
    'ip_hash',
    'transaction_hash',
    'application_hash',
    'device_type',
    'card_type',
    'payment_channel',
    'transaction_date',
    'ip_country_code',
    'card_country_code',
    'transaction_status',
    'transaction_error_code',
]
# one value frequency plot per column, all with the same bin count
for freq_col in freq_cols:
    valfreqplot(data = data, col = freq_col, bins = 50)
```

# UID Investigation

```{python}
# the 20 most frequent uid values together with their row counts
data['uid'].value_counts().head(20)
```


```{python}
# rebind data to the csv at cons.fpath_arch_randomtelecomdata
# (presumably an archived copy of the dataset -- confirm against cons)
data = pd.read_csv(cons.fpath_arch_randomtelecomdata)
# show every row, with all columns, recorded for one specific uid
data[data['uid'] == 5601785694208571]
```

0 comments on commit 0d91a33

Please sign in to comment.