13 June 2016
The last process before analyze and visualize the results is to integrate all the data sets.
In this phase we query, transform, group, summarize and merge information in several ways to create two datasets. The first one will hold user data and the second withdrawl data.
R package data.table
was used again for high resource consuming tasks over a big data set with more than 7MM observations.
Reading data
In the first step we read the original dataset and the results obtained from running the clustering algorithm
knitr::opts_chunk$set(echo = TRUE,fig.align='center')
list.of.packages <- c("data.table", "ggplot2","ggplot2","knitr","viridis","ggthemes","knitr")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
A quick plot to check how many users where assigned to each segment.
# Number of users by cluster
barplot(table(DT.r$cluster), main="Number of customers by segment.", col=viridis(3), border = "white")
# Load not scaled variables
DT.c <- readRDS("./data/DTc.rds")
# Add the cluster variable to scaled data frame
DT.clients <- merge(
DT.c,
DT.r[,c("PER_ID_PERSONA", "cluster"), with = FALSE],
by=c('PER_ID_PERSONA')
)
# Calculate the median for each variable by cluster
cluster_medians.df <- aggregate(DT.clients[, 2:7, with = FALSE], by = list(DT.clients$cluster), median)
write.table(cluster_medians.df, file.path('data/cluster_medians.csv'), row.names = F, col.names = TRUE, sep=",")
# Calculate the mean for each variable by cluster
setDT(DT.clients)
cluster_means.df <- DT.clients[, lapply(.SD, mean, na.rm=TRUE), by=list(DT.clients$cluster), .SDcols=c(2:7) ][order(DT.clients)]
write.table(cluster_means.df, file.path('data/cluster_means.csv'), row.names = F, col.names = TRUE, sep=",")
Mean and median values for each cluster:
Group.1 | F1 | F2 | F3 | F4 | F5 | F6 |
---|---|---|---|---|---|---|
0 | 60 | 0 | 0 | 2 | 0 | 0 |
1 | 0 | 70 | 0 | 0 | 2 | 0 |
2 | 0 | 0 | 100 | 0 | 0 | 1 |
3 | 400 | 0 | 0 | 2 | 0 | 0 |
DT.clients | F1 | F2 | F3 | F4 | F5 | F6 |
---|---|---|---|---|---|---|
0 | 87.42946 | 1.688919 | 2.525917 | 1.7822123 | 0.0470389 | 0.0531626 |
1 | 26.86719 | 116.261664 | 32.485884 | 0.2112874 | 1.5308704 | 0.3021373 |
2 | 28.92897 | 9.828363 | 149.377057 | 0.2467102 | 0.2318550 | 1.4915080 |
3 | 467.07228 | 1.556532 | 4.388723 | 1.7393149 | 0.0227448 | 0.0355559 |
A sample visualization to check the differences among clusters:
## Warning: Removed 1198 rows containing non-finite values (stat_density).
New features
To gain more insights form data we created some new variables to apply a RFM analysis of the users and get better knowledge from them.
# RFM
# Create a new varible with the days from the previous withdrawal for each user
setorder(DT, FECHA)
DT[, DIAS_DESDE_ULT_OPER := as.numeric(difftime(time1 = FECHA, time2 = min(FECHA), units = "days")),by=.(PER_ID_PERSONA)]
# Users requests withdraw funds every 34 days, and 50% of them made it between 10 and 55 days
summary(DT$DIAS_DESDE_ULT_OPER)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 10.00 32.00 34.16 56.00 90.00
# Frequency chart of days since last withdrawal
barplot(table(DT[DIAS_DESDE_ULT_OPER > 0,]$DIAS_DESDE_ULT_OPER), main="# number of withdrawals by days from last withdrawal" , col=viridis(1), border = "white")
Visualizing recency, frequency and average withdrawal amount
# New variables with recency, frequency and monetary value for each user
DT.rfm <- DT[, list( RECENCIA = mean(DIAS_DESDE_ULT_OPER, na.rm = T),
FRECUENCIA = unique(.N),
VALOR_MEDIO = median(IMPOPER, na.rm = T)
)
,by=.(PER_ID_PERSONA)]
# Merge the new variables with our users data frame
DT.clients <- merge(
DT.clients,
DT.rfm,
by=c('PER_ID_PERSONA')
)
p3 <- ggplot(DT.clients[,mean(RECENCIA, na.rm = T),by=.(cluster)], aes(x= as.factor(cluster), fill=as.factor(cluster))) + geom_bar() + stat_summary_bin(aes(y = V1), fun.y = "mean", geom = "bar") + scale_fill_viridis(discrete=T) + labs(x=NULL, y=NULL, title="Recency") + theme_tufte(base_family="Helvetica") + theme(axis.ticks=element_blank()) + theme(axis.text=element_text(size=10)) + theme(legend.title=element_text(size=8)) + theme(legend.text=element_text(size=6)) + theme(plot.title=element_text(hjust=0))
p4 <- ggplot(DT.clients[,mean(FRECUENCIA, na.rm = T),by=.(cluster)], aes(x= as.factor(cluster), fill=as.factor(cluster))) + geom_bar() + stat_summary_bin(aes(y = V1), fun.y = "mean", geom = "bar") + scale_fill_viridis(discrete=T) + labs(x=NULL, y=NULL, title="Frequency") + theme_tufte(base_family="Helvetica") + theme(axis.ticks=element_blank()) + theme(axis.text=element_text(size=10)) + theme(legend.title=element_text(size=8)) + theme(legend.text=element_text(size=6)) + theme(plot.title=element_text(hjust=0))
p5 <- ggplot(DT.clients[,mean(VALOR_MEDIO, na.rm = T),by=.(cluster)], aes(x= as.factor(cluster), fill=as.factor(cluster))) + geom_bar() + stat_summary_bin(aes(y = V1), fun.y = "mean", geom = "bar") + scale_fill_viridis(discrete=T) + labs(x=NULL, y=NULL, title="Average withdrawal amount") + theme_tufte(base_family="Helvetica") + theme(axis.ticks=element_blank()) + theme(axis.text=element_text(size=10)) + theme(legend.title=element_text(size=8)) + theme(legend.text=element_text(size=6)) + theme(plot.title=element_text(hjust=0))
The next three plot let us easily inspect the different behaviour among user in each segment.
New features
# Calculating user age and account duration
DT.clients <- merge(
DT.clients,
DT[!duplicated(DT$PER_ID_PERSONA), list(
PER_ANTIGUEDAD = round(julian(as.Date(Sys.Date()), as.Date(PER_FECHA_ALTA))/365.25, 0),
PER_EDAD = round(julian(as.Date(Sys.Date()), as.Date(PER_FECHA_NAC))/365.25,0)
), by = .(PER_ID_PERSONA)],
by=c('PER_ID_PERSONA')
)
p6 <- ggplot(DT.clients[,mean(PER_EDAD, na.rm = T),by=.(cluster)], aes(x= as.factor(cluster), y=V1, fill=as.factor(cluster))) + geom_bar(stat = "identity") + scale_fill_viridis(discrete=T) + labs(x=NULL, y=NULL, title="Average user age by segment") + theme_tufte(base_family="Helvetica") + theme(axis.ticks=element_blank()) + theme(axis.text=element_text(size=10)) + theme(legend.title=element_text(size=8)) + theme(legend.text=element_text(size=6)) + theme(plot.title=element_text(hjust=0))
# Adding user information
DT.clients <- merge(
DT.clients,
DT[!duplicated(DT$PER_ID_PERSONA),c("PER_ID_PERSONA", "PER_ID_SEXO", "PER_EST_CIVIL", "PER_COD_PAIS_NAC"), with = FALSE],
by=c('PER_ID_PERSONA')
)
# Derive new variables from the original ones for better undestanding of each cluster
client_means.df <- as.data.frame(DT.clients[, lapply(.SD, mean, na.rm=TRUE), by=list(DT.clients$cluster), .SDcols=c(9:11,13) ][order(DT.clients)])
# Export the result for data visualization in Tableu
write.table(client_means.df, file.path('data/tableau_rfm.csv'), row.names = F, col.names = TRUE, sep=",")
# Export client data for visualization in Tableu
write.table(DT.clients, file.path('data/tableau_clients.csv'), row.names = F, col.names = TRUE, sep=",")
Mean recency, frequency, amount and user age by cluster
DT.clients | RECENCIA | FRECUENCIA | VALOR_MEDIO | PER_EDAD |
---|---|---|---|---|
0 | 26 | 9 | 88 | 46 |
1 | 18 | 6 | 110 | 41 |
2 | 21 | 7 | 137 | 43 |
3 | 21 | 4 | 459 | 55 |
Users dataset
The final users dataset with relevant variables:
PER_ID_PERSONA | F1 | F2 | F3 | F4 | F5 | F6 | cluster | RECENCIA | FRECUENCIA | VALOR_MEDIO | PER_ANTIGUEDAD | PER_EDAD | PER_ID_SEXO | PER_EST_CIVIL | PER_COD_PAIS_NAC |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
00005E5DRZUPAFFA7H0C | 60 | 0 | 0 | 2 | 0 | 0 | 0 | 21.50000 | 4 | 60 | 5 | 57 | F | S | MEX |
0003QV0VUJPM1GO20U4O | 0 | 0 | 80 | 0 | 0 | 2 | 2 | 43.00000 | 13 | 80 | 2 | 33 | F | S | ESP |
0004ZGKKY4XVRU311WJK | 0 | 60 | 0 | 0 | 1 | 0 | 1 | 0.00000 | 1 | 60 | NA | NA | F | X | NA |
0008STWH9PQL3KZYB2MK | 60 | 0 | 0 | 2 | 0 | 0 | 0 | 41.30435 | 23 | 60 | 35 | 35 | M | S | ESP |
# Add user cluster assignment to each operation
DT.transactons <- merge(
DT,
DT.r[,c("PER_ID_PERSONA", "cluster"), with = FALSE],
by=c('PER_ID_PERSONA')
)
barplot(table(DT.transactons$cluster), main= "Number of successfull withdrowals requests by segment ",col=viridis(4), border = "white")
# Export withdrawals data for visualization in Tableu
write.table(DT.transactons, file.path('data/tableau_operaciones.csv'), row.names = F, col.names = TRUE, sep=",")
The final withdrawal dataset:
PER_ID_PERSONA | FECHA | ANO | MES | DIA | OP_ADQUIRENTE | ADQUIERENTE | DES_TIPO_ADQUIRENTE | OP_EMISOR | EMISOR | DES_TIPO_EMISOR | DES_AMBITO | OP_IDENT_TERMINAL | OP_COD_POST_COMERCIO | DES_PROVINCIA | LOCALIDAD | OP_COD_PAIS_COMERCIO | DES_MARCA | DES_GAMA | DES_PRODUCTO | TIPO_TARJETA | DES_CREDEB | DES_CLASE_OPERACION | DES_PAGO | DES_RESULTADO | PER_TIPO_PERS | PER_FECHA_ALTA | OF_COD_POST | PER_COD_PAIS_NAC | OF_COD_PAIS_RES | PER_ID_SEXO | PER_EST_CIVIL | PER_MARCA_EMP | PER_MARCA_FALL | PER_FECHA_NAC | NOPER | IMPOPER | DIAS_DESDE_ULT_OPER | cluster |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
00005E5DRZUPAFFA7H0C | 2016-01-08 | 2016 | 01 | 08 | 1066 | Entidad 1016 | EURO 6000 | CM8GMN7BQOF9JJ1XXCPE | KSPHEXET1G2LNR4OXAGU | EURO 6000 | On us | K4T6IAI4DWU3C8YF26M5 | 99999 | NA | NA | ESP | MasterCard | Estándar | MasterCard | P | Crédito | Reintegros | Diferido | OK | E | 2011-03-28 | 23200 | MEX | ESP | F | S | 0 | 0 | 1959-12-05 | 1 | 60 | 0 | 0 |
00005E5DRZUPAFFA7H0C | 2016-01-14 | 2016 | 01 | 14 | 1066 | Entidad 1016 | EURO 6000 | CM8GMN7BQOF9JJ1XXCPE | KSPHEXET1G2LNR4OXAGU | EURO 6000 | On us | K4T6IAI4DWU3C8YF26M5 | 99999 | NA | NA | ESP | MasterCard | Estándar | MasterCard | P | Crédito | Reintegros | Diferido | OK | E | 2011-03-28 | 23200 | MEX | ESP | F | S | 0 | 0 | 1959-12-05 | 1 | 10 | 6 | 0 |