Skip to content

Latest commit

 

History

History
542 lines (484 loc) · 17.4 KB

data_integration.md

File metadata and controls

542 lines (484 loc) · 17.4 KB

Data integration

13 June 2016

The last process before analyze and visualize the results is to integrate all the data sets.

In this phase we query, transform, group, summarize and merge information in several ways to create two datasets. The first one will hold user data and the second withdrawl data.

R package data.table was used again for high resource consuming tasks over a big data set with more than 7MM observations.

Reading data

In the first step we read the original dataset and the results obtained from running the clustering algorithm

knitr::opts_chunk$set(echo = TRUE,fig.align='center')

list.of.packages <- c("data.table", "ggplot2","ggplot2","knitr","viridis","ggthemes","knitr")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

A quick plot to check how many users where assigned to each segment.

# Number of users by cluster
barplot(table(DT.r$cluster), main="Number of customers by segment.", col=viridis(3), border = "white")

# Load not scaled variables
DT.c <- readRDS("./data/DTc.rds")

# Add the cluster variable to scaled data frame
DT.clients <- merge(
  DT.c, 
  DT.r[,c("PER_ID_PERSONA", "cluster"), with = FALSE], 
  by=c('PER_ID_PERSONA')
)

# Calculate the median for each variable by cluster
cluster_medians.df <- aggregate(DT.clients[, 2:7, with = FALSE], by = list(DT.clients$cluster), median)

write.table(cluster_medians.df, file.path('data/cluster_medians.csv'), row.names = F, col.names = TRUE, sep=",")

# Calculate the mean for each variable by cluster
setDT(DT.clients)
cluster_means.df <- DT.clients[, lapply(.SD, mean, na.rm=TRUE), by=list(DT.clients$cluster), .SDcols=c(2:7) ][order(DT.clients)]
write.table(cluster_means.df, file.path('data/cluster_means.csv'), row.names = F, col.names = TRUE, sep=",")

Mean and median values for each cluster:

Group.1 F1 F2 F3 F4 F5 F6
0 60 0 0 2 0 0
1 0 70 0 0 2 0
2 0 0 100 0 0 1
3 400 0 0 2 0 0
DT.clients F1 F2 F3 F4 F5 F6
0 87.42946 1.688919 2.525917 1.7822123 0.0470389 0.0531626
1 26.86719 116.261664 32.485884 0.2112874 1.5308704 0.3021373
2 28.92897 9.828363 149.377057 0.2467102 0.2318550 1.4915080
3 467.07228 1.556532 4.388723 1.7393149 0.0227448 0.0355559

A sample visualization to check the differences among clusters:

## Warning: Removed 1198 rows containing non-finite values (stat_density).

New features

To gain more insights form data we created some new variables to apply a RFM analysis of the users and get better knowledge from them.

# RFM

# Create a new varible with the days from the previous withdrawal for each user
setorder(DT, FECHA)
DT[, DIAS_DESDE_ULT_OPER := as.numeric(difftime(time1 = FECHA, time2 = min(FECHA), units = "days")),by=.(PER_ID_PERSONA)]

# Users requests withdraw funds every 34 days, and 50% of them made it between 10 and 55 days
summary(DT$DIAS_DESDE_ULT_OPER)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   10.00   32.00   34.16   56.00   90.00
# Frequency chart of days since last withdrawal
barplot(table(DT[DIAS_DESDE_ULT_OPER > 0,]$DIAS_DESDE_ULT_OPER), main="# number of withdrawals by days from last withdrawal" , col=viridis(1), border = "white")

Visualizing recency, frequency and average withdrawal amount

# New variables with recency, frequency and monetary value for each user
DT.rfm <- DT[, list( RECENCIA = mean(DIAS_DESDE_ULT_OPER, na.rm = T),
                     FRECUENCIA = unique(.N),
                     VALOR_MEDIO = median(IMPOPER, na.rm = T)
)
,by=.(PER_ID_PERSONA)]

# Merge the new variables with our users data frame
DT.clients <- merge(
  DT.clients, 
  DT.rfm, 
  by=c('PER_ID_PERSONA')
)

p3 <- ggplot(DT.clients[,mean(RECENCIA, na.rm = T),by=.(cluster)], aes(x= as.factor(cluster), fill=as.factor(cluster))) + geom_bar() + stat_summary_bin(aes(y = V1), fun.y = "mean", geom = "bar") + scale_fill_viridis(discrete=T) + labs(x=NULL, y=NULL, title="Recency") + theme_tufte(base_family="Helvetica") + theme(axis.ticks=element_blank()) + theme(axis.text=element_text(size=10)) + theme(legend.title=element_text(size=8)) + theme(legend.text=element_text(size=6)) + theme(plot.title=element_text(hjust=0))

p4 <- ggplot(DT.clients[,mean(FRECUENCIA, na.rm = T),by=.(cluster)], aes(x= as.factor(cluster), fill=as.factor(cluster))) + geom_bar() + stat_summary_bin(aes(y = V1), fun.y = "mean", geom = "bar") + scale_fill_viridis(discrete=T) + labs(x=NULL, y=NULL, title="Frequency") + theme_tufte(base_family="Helvetica") + theme(axis.ticks=element_blank()) + theme(axis.text=element_text(size=10)) + theme(legend.title=element_text(size=8)) + theme(legend.text=element_text(size=6)) + theme(plot.title=element_text(hjust=0))

p5 <- ggplot(DT.clients[,mean(VALOR_MEDIO, na.rm = T),by=.(cluster)], aes(x= as.factor(cluster), fill=as.factor(cluster))) + geom_bar() + stat_summary_bin(aes(y = V1), fun.y = "mean", geom = "bar") + scale_fill_viridis(discrete=T) + labs(x=NULL, y=NULL, title="Average withdrawal amount") + theme_tufte(base_family="Helvetica") + theme(axis.ticks=element_blank()) + theme(axis.text=element_text(size=10)) + theme(legend.title=element_text(size=8)) + theme(legend.text=element_text(size=6)) + theme(plot.title=element_text(hjust=0))

The next three plot let us easily inspect the different behaviour among user in each segment.

New features

# Calculating user age and account duration
DT.clients <- merge(
  DT.clients,
  DT[!duplicated(DT$PER_ID_PERSONA), list(
    PER_ANTIGUEDAD = round(julian(as.Date(Sys.Date()), as.Date(PER_FECHA_ALTA))/365.25, 0),
    PER_EDAD = round(julian(as.Date(Sys.Date()), as.Date(PER_FECHA_NAC))/365.25,0) 
  ), by = .(PER_ID_PERSONA)],
  by=c('PER_ID_PERSONA')
)

p6 <- ggplot(DT.clients[,mean(PER_EDAD, na.rm = T),by=.(cluster)], aes(x= as.factor(cluster), y=V1, fill=as.factor(cluster))) + geom_bar(stat = "identity") + scale_fill_viridis(discrete=T) + labs(x=NULL, y=NULL, title="Average user age by segment") + theme_tufte(base_family="Helvetica") + theme(axis.ticks=element_blank()) + theme(axis.text=element_text(size=10)) + theme(legend.title=element_text(size=8)) + theme(legend.text=element_text(size=6)) + theme(plot.title=element_text(hjust=0))

# Adding user information
DT.clients <- merge(
  DT.clients, 
  DT[!duplicated(DT$PER_ID_PERSONA),c("PER_ID_PERSONA", "PER_ID_SEXO", "PER_EST_CIVIL", "PER_COD_PAIS_NAC"), with = FALSE], 
  by=c('PER_ID_PERSONA')
)
# Derive new variables from the original ones for better undestanding of each cluster

client_means.df <- as.data.frame(DT.clients[, lapply(.SD, mean, na.rm=TRUE), by=list(DT.clients$cluster), .SDcols=c(9:11,13) ][order(DT.clients)])

# Export the result for data visualization in Tableu
write.table(client_means.df, file.path('data/tableau_rfm.csv'), row.names = F, col.names = TRUE, sep=",")

# Export client data for visualization in Tableu
write.table(DT.clients, file.path('data/tableau_clients.csv'), row.names = F, col.names = TRUE, sep=",")

Mean recency, frequency, amount and user age by cluster

DT.clients RECENCIA FRECUENCIA VALOR_MEDIO PER_EDAD
0 26 9 88 46
1 18 6 110 41
2 21 7 137 43
3 21 4 459 55

Users dataset

The final users dataset with relevant variables:

PER_ID_PERSONA F1 F2 F3 F4 F5 F6 cluster RECENCIA FRECUENCIA VALOR_MEDIO PER_ANTIGUEDAD PER_EDAD PER_ID_SEXO PER_EST_CIVIL PER_COD_PAIS_NAC
00005E5DRZUPAFFA7H0C 60 0 0 2 0 0 0 21.50000 4 60 5 57 F S MEX
0003QV0VUJPM1GO20U4O 0 0 80 0 0 2 2 43.00000 13 80 2 33 F S ESP
0004ZGKKY4XVRU311WJK 0 60 0 0 1 0 1 0.00000 1 60 NA NA F X NA
0008STWH9PQL3KZYB2MK 60 0 0 2 0 0 0 41.30435 23 60 35 35 M S ESP

Withdrawal data

# Add user cluster assignment to each operation
DT.transactons <- merge(
  DT, 
  DT.r[,c("PER_ID_PERSONA", "cluster"), with = FALSE], 
  by=c('PER_ID_PERSONA')
)

barplot(table(DT.transactons$cluster), main= "Number of successfull withdrowals requests by segment ",col=viridis(4), border = "white")

# Export withdrawals data for visualization in Tableu
write.table(DT.transactons, file.path('data/tableau_operaciones.csv'), row.names = F, col.names = TRUE, sep=",")

The final withdrawal dataset:

PER_ID_PERSONA FECHA ANO MES DIA OP_ADQUIRENTE ADQUIERENTE DES_TIPO_ADQUIRENTE OP_EMISOR EMISOR DES_TIPO_EMISOR DES_AMBITO OP_IDENT_TERMINAL OP_COD_POST_COMERCIO DES_PROVINCIA LOCALIDAD OP_COD_PAIS_COMERCIO DES_MARCA DES_GAMA DES_PRODUCTO TIPO_TARJETA DES_CREDEB DES_CLASE_OPERACION DES_PAGO DES_RESULTADO PER_TIPO_PERS PER_FECHA_ALTA OF_COD_POST PER_COD_PAIS_NAC OF_COD_PAIS_RES PER_ID_SEXO PER_EST_CIVIL PER_MARCA_EMP PER_MARCA_FALL PER_FECHA_NAC NOPER IMPOPER DIAS_DESDE_ULT_OPER cluster
00005E5DRZUPAFFA7H0C 2016-01-08 2016 01 08 1066 Entidad 1016 EURO 6000 CM8GMN7BQOF9JJ1XXCPE KSPHEXET1G2LNR4OXAGU EURO 6000 On us K4T6IAI4DWU3C8YF26M5 99999 NA NA ESP MasterCard Estándar MasterCard P Crédito Reintegros Diferido OK E 2011-03-28 23200 MEX ESP F S 0 0 1959-12-05 1 60 0 0
00005E5DRZUPAFFA7H0C 2016-01-14 2016 01 14 1066 Entidad 1016 EURO 6000 CM8GMN7BQOF9JJ1XXCPE KSPHEXET1G2LNR4OXAGU EURO 6000 On us K4T6IAI4DWU3C8YF26M5 99999 NA NA ESP MasterCard Estándar MasterCard P Crédito Reintegros Diferido OK E 2011-03-28 23200 MEX ESP F S 0 0 1959-12-05 1 10 6 0