Data integration

13 June 2016

The last process before analyze and visualize the results is to integrate all the data sets.

In this phase we query, transform, group, summarize and merge information in several ways to create two datasets. The first one will hold user data and the second withdrawl data.

R package data.table was used again for high resource consuming tasks over a big data set with more than 7MM observations.

Reading data

In the first step we read the original dataset and the results obtained from running the clustering algorithm

knitr::opts_chunk$set(echo = TRUE,fig.align='center')

list.of.packages <- c("data.table", "ggplot2","ggplot2","knitr","viridis","ggthemes","knitr")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

A quick plot to check how many users where assigned to each segment.

# Number of users by cluster
barplot(table(DT.r$cluster), main="Number of customers by segment.", col=viridis(3), border = "white")

# Load not scaled variables
DT.c <- readRDS("./data/DTc.rds")

# Add the cluster variable to scaled data frame
DT.clients <- merge(
  DT.c, 
  DT.r[,c("PER_ID_PERSONA", "cluster"), with = FALSE], 
  by=c('PER_ID_PERSONA')
)

# Calculate the median for each variable by cluster
cluster_medians.df <- aggregate(DT.clients[, 2:7, with = FALSE], by = list(DT.clients$cluster), median)

write.table(cluster_medians.df, file.path('data/cluster_medians.csv'), row.names = F, col.names = TRUE, sep=",")

# Calculate the mean for each variable by cluster
setDT(DT.clients)
cluster_means.df <- DT.clients[, lapply(.SD, mean, na.rm=TRUE), by=list(DT.clients$cluster), .SDcols=c(2:7) ][order(DT.clients)]
write.table(cluster_means.df, file.path('data/cluster_means.csv'), row.names = F, col.names = TRUE, sep=",")

Mean and median values for each cluster:

Group.1	F1	F2	F3	F4	F5	F6
0	60	0	0	2	0	0
1	0	70	0	0	2	0
2	0	0	100	0	0	1
3	400	0	0	2	0	0

DT.clients	F1	F2	F3	F4	F5	F6
0	87.42946	1.688919	2.525917	1.7822123	0.0470389	0.0531626
1	26.86719	116.261664	32.485884	0.2112874	1.5308704	0.3021373
2	28.92897	9.828363	149.377057	0.2467102	0.2318550	1.4915080
3	467.07228	1.556532	4.388723	1.7393149	0.0227448	0.0355559

A sample visualization to check the differences among clusters:

## Warning: Removed 1198 rows containing non-finite values (stat_density).

New features

To gain more insights form data we created some new variables to apply a RFM analysis of the users and get better knowledge from them.

# RFM

# Create a new varible with the days from the previous withdrawal for each user
setorder(DT, FECHA)
DT[, DIAS_DESDE_ULT_OPER := as.numeric(difftime(time1 = FECHA, time2 = min(FECHA), units = "days")),by=.(PER_ID_PERSONA)]

# Users requests withdraw funds every 34 days, and 50% of them made it between 10 and 55 days
summary(DT$DIAS_DESDE_ULT_OPER)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   10.00   32.00   34.16   56.00   90.00

# Frequency chart of days since last withdrawal
barplot(table(DT[DIAS_DESDE_ULT_OPER > 0,]$DIAS_DESDE_ULT_OPER), main="# number of withdrawals by days from last withdrawal" , col=viridis(1), border = "white")

Visualizing recency, frequency and average withdrawal amount

# New variables with recency, frequency and monetary value for each user
DT.rfm <- DT[, list( RECENCIA = mean(DIAS_DESDE_ULT_OPER, na.rm = T),
                     FRECUENCIA = unique(.N),
                     VALOR_MEDIO = median(IMPOPER, na.rm = T)
)
,by=.(PER_ID_PERSONA)]

# Merge the new variables with our users data frame
DT.clients <- merge(
  DT.clients, 
  DT.rfm, 
  by=c('PER_ID_PERSONA')
)

p3 <- ggplot(DT.clients[,mean(RECENCIA, na.rm = T),by=.(cluster)], aes(x= as.factor(cluster), fill=as.factor(cluster))) + geom_bar() + stat_summary_bin(aes(y = V1), fun.y = "mean", geom = "bar") + scale_fill_viridis(discrete=T) + labs(x=NULL, y=NULL, title="Recency") + theme_tufte(base_family="Helvetica") + theme(axis.ticks=element_blank()) + theme(axis.text=element_text(size=10)) + theme(legend.title=element_text(size=8)) + theme(legend.text=element_text(size=6)) + theme(plot.title=element_text(hjust=0))

p4 <- ggplot(DT.clients[,mean(FRECUENCIA, na.rm = T),by=.(cluster)], aes(x= as.factor(cluster), fill=as.factor(cluster))) + geom_bar() + stat_summary_bin(aes(y = V1), fun.y = "mean", geom = "bar") + scale_fill_viridis(discrete=T) + labs(x=NULL, y=NULL, title="Frequency") + theme_tufte(base_family="Helvetica") + theme(axis.ticks=element_blank()) + theme(axis.text=element_text(size=10)) + theme(legend.title=element_text(size=8)) + theme(legend.text=element_text(size=6)) + theme(plot.title=element_text(hjust=0))

p5 <- ggplot(DT.clients[,mean(VALOR_MEDIO, na.rm = T),by=.(cluster)], aes(x= as.factor(cluster), fill=as.factor(cluster))) + geom_bar() + stat_summary_bin(aes(y = V1), fun.y = "mean", geom = "bar") + scale_fill_viridis(discrete=T) + labs(x=NULL, y=NULL, title="Average withdrawal amount") + theme_tufte(base_family="Helvetica") + theme(axis.ticks=element_blank()) + theme(axis.text=element_text(size=10)) + theme(legend.title=element_text(size=8)) + theme(legend.text=element_text(size=6)) + theme(plot.title=element_text(hjust=0))

The next three plot let us easily inspect the different behaviour among user in each segment.

New features

# Calculating user age and account duration
DT.clients <- merge(
  DT.clients,
  DT[!duplicated(DT$PER_ID_PERSONA), list(
    PER_ANTIGUEDAD = round(julian(as.Date(Sys.Date()), as.Date(PER_FECHA_ALTA))/365.25, 0),
    PER_EDAD = round(julian(as.Date(Sys.Date()), as.Date(PER_FECHA_NAC))/365.25,0) 
  ), by = .(PER_ID_PERSONA)],
  by=c('PER_ID_PERSONA')
)

p6 <- ggplot(DT.clients[,mean(PER_EDAD, na.rm = T),by=.(cluster)], aes(x= as.factor(cluster), y=V1, fill=as.factor(cluster))) + geom_bar(stat = "identity") + scale_fill_viridis(discrete=T) + labs(x=NULL, y=NULL, title="Average user age by segment") + theme_tufte(base_family="Helvetica") + theme(axis.ticks=element_blank()) + theme(axis.text=element_text(size=10)) + theme(legend.title=element_text(size=8)) + theme(legend.text=element_text(size=6)) + theme(plot.title=element_text(hjust=0))

# Adding user information
DT.clients <- merge(
  DT.clients, 
  DT[!duplicated(DT$PER_ID_PERSONA),c("PER_ID_PERSONA", "PER_ID_SEXO", "PER_EST_CIVIL", "PER_COD_PAIS_NAC"), with = FALSE], 
  by=c('PER_ID_PERSONA')
)

# Derive new variables from the original ones for better undestanding of each cluster

client_means.df <- as.data.frame(DT.clients[, lapply(.SD, mean, na.rm=TRUE), by=list(DT.clients$cluster), .SDcols=c(9:11,13) ][order(DT.clients)])

# Export the result for data visualization in Tableu
write.table(client_means.df, file.path('data/tableau_rfm.csv'), row.names = F, col.names = TRUE, sep=",")

# Export client data for visualization in Tableu
write.table(DT.clients, file.path('data/tableau_clients.csv'), row.names = F, col.names = TRUE, sep=",")

Mean recency, frequency, amount and user age by cluster

DT.clients	RECENCIA	FRECUENCIA	VALOR_MEDIO	PER_EDAD
0	26	9	88	46
1	18	6	110	41
2	21	7	137	43
3	21	4	459	55

Users dataset

The final users dataset with relevant variables:

PER_ID_PERSONA	F1	F2	F3	F4	F5	F6	cluster	RECENCIA	FRECUENCIA	VALOR_MEDIO	PER_ANTIGUEDAD	PER_EDAD	PER_ID_SEXO	PER_EST_CIVIL	PER_COD_PAIS_NAC
00005E5DRZUPAFFA7H0C	60	0	0	2	0	0	0	21.50000	4	60	5	57	F	S	MEX
0003QV0VUJPM1GO20U4O	0	0	80	0	0	2	2	43.00000	13	80	2	33	F	S	ESP
0004ZGKKY4XVRU311WJK	0	60	0	0	1	0	1	0.00000	1	60	NA	NA	F	X	NA
0008STWH9PQL3KZYB2MK	60	0	0	2	0	0	0	41.30435	23	60	35	35	M	S	ESP

Withdrawal data

# Add user cluster assignment to each operation
DT.transactons <- merge(
  DT, 
  DT.r[,c("PER_ID_PERSONA", "cluster"), with = FALSE], 
  by=c('PER_ID_PERSONA')
)

barplot(table(DT.transactons$cluster), main= "Number of successfull withdrowals requests by segment ",col=viridis(4), border = "white")

# Export withdrawals data for visualization in Tableu
write.table(DT.transactons, file.path('data/tableau_operaciones.csv'), row.names = F, col.names = TRUE, sep=",")

The final withdrawal dataset:

PER_ID_PERSONA	FECHA	ANO	MES	DIA	OP_ADQUIRENTE	ADQUIERENTE	DES_TIPO_ADQUIRENTE	OP_EMISOR	EMISOR	DES_TIPO_EMISOR	DES_AMBITO	OP_IDENT_TERMINAL	OP_COD_POST_COMERCIO	DES_PROVINCIA	LOCALIDAD	OP_COD_PAIS_COMERCIO	DES_MARCA	DES_GAMA	DES_PRODUCTO	TIPO_TARJETA	DES_CREDEB	DES_CLASE_OPERACION	DES_PAGO	DES_RESULTADO	PER_TIPO_PERS	PER_FECHA_ALTA	OF_COD_POST	PER_COD_PAIS_NAC	OF_COD_PAIS_RES	PER_ID_SEXO	PER_EST_CIVIL	PER_MARCA_EMP	PER_MARCA_FALL	PER_FECHA_NAC	NOPER	IMPOPER	DIAS_DESDE_ULT_OPER	cluster
00005E5DRZUPAFFA7H0C	2016-01-08	2016	01	08	1066	Entidad 1016	EURO 6000	CM8GMN7BQOF9JJ1XXCPE	KSPHEXET1G2LNR4OXAGU	EURO 6000	On us	K4T6IAI4DWU3C8YF26M5	99999	NA	NA	ESP	MasterCard	Estándar	MasterCard	P	Crédito	Reintegros	Diferido	OK	E	2011-03-28	23200	MEX	ESP	F	S	0	0	1959-12-05	1	60	0	0
00005E5DRZUPAFFA7H0C	2016-01-14	2016	01	14	1066	Entidad 1016	EURO 6000	CM8GMN7BQOF9JJ1XXCPE	KSPHEXET1G2LNR4OXAGU	EURO 6000	On us	K4T6IAI4DWU3C8YF26M5	99999	NA	NA	ESP	MasterCard	Estándar	MasterCard	P	Crédito	Reintegros	Diferido	OK	E	2011-03-28	23200	MEX	ESP	F	S	0	0	1959-12-05	1	10	6	0

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

data_integration.md

data_integration.md

Data integration

Withdrawal data

Files

data_integration.md

Latest commit

History

data_integration.md

File metadata and controls

Data integration

Withdrawal data