diff --git a/aggregate_tables.R b/aggregate_tables.R index bed2d0c..ea593f1 100644 --- a/aggregate_tables.R +++ b/aggregate_tables.R @@ -1,6 +1,7 @@ library(dplyr) library(DBI) library(rjson) +library(parallel) source('s_dplyr.R') source('aggregate_tables/indicator_functions.R') @@ -17,6 +18,11 @@ get_db_connection <- function(system_config_path='config_system.json') { return(db) } +get_cores <- function(system_config_path='config_system.json') { + config <- fromJSON(file=system_config_path)[['data_platform']] + if ('parallel' %in% names(config)) {return(config[['parallel']][['cores']])} else {return (1)} +} + drop_tables <- function(file) { config <- fromJSON(file=file) @@ -32,26 +38,38 @@ write_tables <- function(file, debug) { db <- get_db_connection() for (table.info in config) { print(paste('Computing indicators for ', table.info$table, 'indicator table.')) - df <- compute_indicators(table.info, db, debug) + df <- compute_indicators(table.info, debug) print(paste('Writing ', table.info$table, 'indicator table.')) dbRemoveTable(db$con, name=table.info$table) copy_to(db, df=df, name=table.info$table, temporary=FALSE) } } -compute_indicators <- function(info, db, debug) { +compute_indicators <- function(info, debug) { debug <- as.logical(debug) if (debug == T) {limit = 5000} else {limit = -1} - dfs <- lapply(info$components, function(component) { + cores <- get_cores() + cl <- makeCluster(cores, outfile='/tmp/aggregation.log', type="FORK") + clusterExport(cl,varlist=c('info','limit'),envir=environment()) + clusterExport(cl,varlist=c('get_data_source','s_group_by','aggregate','get_db_connection','fromJSON')) + + dfs <- parLapply(cl,info$components, function(component) { print(paste('Getting data source ', component$table)) + source('s_dplyr.R') + source('aggregate_tables/indicator_functions.R') + source('data_sources.R') + db <- get_db_connection() source.data <- get_data_source(db, component$table, limit) group.by.str <- paste(info$by, collapse=', ') print(paste('Grouping and aggregating', component$table)) df <- source.data %.% s_group_by(group.by.str) %.% aggregate(component$columns) + print(paste('Returning aggregated data from ', component$table)) return(df) }) + print('merging...') merged <- Reduce(function(...) merge(..., all.x=TRUE, all.y=TRUE, by=info$by), dfs) + stopCluster(cl) return(merged) } diff --git a/aggregate_tables.json b/aggregate_tables.json deleted file mode 100644 index be295b4..0000000 --- a/aggregate_tables.json +++ /dev/null @@ -1,117 +0,0 @@ -[ - { - "table":"aggregate_lifetime_interactions", - "by":[ - "domain", - "user_id", - "user_pk" - ], - "components":[ - { - "table":"visit_detail", - "columns":{ - "date_first_visit":"date_first_visit", - "date_last_visit":"date_last_visit", - "nvisits":"nvisits", - "calendar_month_on_cc":"calendar_month_on_cc", - "nforms":"nforms", - "median_visit_duration":"median_visit_duration", - "median_visits_per_day":"median_visits_per_day", - "median_visits_per_month":"median_visits_per_month", - "morning":"morning", - "afternoon":"afternoon", - "evening":"evening", - "night":"night", - "time_using_cc":"time_using_cc", - "nvisits_travel":"nvisits_travel", - "nvisits_travel_batch":"nvisits_travel_batch" - } - }, - { - "table":"interactions", - "columns":{ - "ninteractions":"ninteractions", - "ncases_registered":"ncases_registered", - "register_followup":"register_followup", - "case_register_followup_rate":"case_register_followup_rate", - "ncases_touched":"ncases_touched", - "nunique_followups":"nunique_followups" - } - }, - { - "table":"device_type", - "columns":{ - "summary_device_type":"summary_device_type", - "nforms": "nforms", - "active_days":"active_days", - "active_months":"active_months" - - } - }, - { - "table":"device_log_types_by_user", - "columns":{ - "total_logs":"total_logs", - "audio_plays":"audio_plays", - "network_warnings":"network_warnings" - } - } - ] - }, - { - "table":"aggregate_monthly_interactions", - "by":[ - "domain", - "user_id", - "user_pk", - "month.index" - ], - "components":[ - { - "table":"visit_detail", - "columns":{ - "date_first_visit":"date_first_visit", - "date_last_visit":"date_last_visit", - "nvisits":"nvisits", - "median_visit_duration":"median_visit_duration", - "median_visits_per_day":"median_visits_per_day", - "morning":"morning", - "afternoon":"afternoon", - "evening":"evening", - "night":"night", - "numeric_index":"numeric_index", - "time_using_cc":"time_using_cc", - "nvisits_travel":"nvisits_travel", - "nvisits_travel_batch":"nvisits_travel_batch" - } - }, - { - "table":"interactions", - "columns":{ - "ninteractions":"ninteractions", - "ncases_registered":"ncases_registered", - "register_followup":"register_followup", - "case_register_followup_rate":"case_register_followup_rate", - "ncases_touched":"ncases_touched", - "nunique_followups":"nunique_followups" - } - }, - { - "table":"device_type", - "columns":{ - "summary_device_type":"summary_device_type", - "nforms": "nforms", - "active_days":"active_days" - } - }, - { - "table":"device_log_types_by_user", - "columns":{ - "total_logs":"total_logs", - "audio_plays":"audio_plays", - "network_warnings":"network_warnings" - } - } - ] - } -] \ No newline at end of file diff --git a/aggregate_tables/indicator_functions.R b/aggregate_tables/indicator_functions.R deleted file mode 100644 index b15edfe..0000000 --- a/aggregate_tables/indicator_functions.R +++ /dev/null @@ -1,92 +0,0 @@ -## TODO: I think it would be good to switch over to lubridate to do -## all our date manipulations. We have some legacy code that uses -## zoo. -library(lubridate) -library(zoo) - -# VISIT TABLE INDICATORS: -date_first_visit <- function(x) min(x$visit_date, na.rm=TRUE) -date_last_visit <- function(x) max(x$visit_date, na.rm=TRUE) -nvisits <- function(x) NROW(x) - -get_visit_duration <- function(start, end) { - dur <- as.numeric(end - start, units="mins") - if (is.na(dur)) return(NA) - if (dur < 0) return(0) - if (dur > 30) return(30) - return(dur) -} -durations <- function(x) na.omit(mapply(get_visit_duration, x$time_start, x$time_end)) -median_visit_duration <- function(x) round(as.numeric(median(durations(x))), digits = 1) -time_using_cc <- function(x) sum(durations(x), na.rm = T) - -median_visits_per_day <- function(x) median(as.numeric(table(x$visit_date)), na.rm=TRUE) -nvisits_travel <- function(x) sum(x$home_visit, na.rm=T) -nvisits_travel_batch <- function(x) sum(x$time_since_previous_hv/60<10, na.rm = T) - -# Proportion of visits by time of day -morning <- function(x) mean(x$visit_time == 'morning')*100 -afternoon <- function(x) mean(x$visit_time == 'afternoon')*100 -evening <- function(x) mean(x$visit_time == 'night')*100 -night <- function(x) mean(x$visit_time == 'after midnight')*100 - -# User's first, second, third... etc. month on CC -numeric_index <- function (x) { - first_possible_visit_date <- as.POSIXct(strptime("2010-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")) - - this_month <- as.POSIXct(format(min(x$time_start),"%Y-%m-01"), tz = "UTC") - if (this_month < first_possible_visit_date) { return (1) } - - start_month <- as.POSIXct(format(min(x$user_start_date),"%Y-%m-01"), tz = "UTC") - if (start_month < first_possible_visit_date) {start_month <- first_possible_visit_date} - - total_months <- length(seq(from=start_month, to=this_month, by='month')) - return (total_months) -} - -## The next indicators are only applicable for the lifetime table. -calendar_month_on_cc <- function(x) { - first.month <- as.yearmon(date_first_visit(x)) - last.month <- as.yearmon(date_last_visit(x)) - nmonths <- 12 * as.numeric(last.month - first.month) + 1 - return(nmonths) -} -median_visits_per_month <- function(x) median(as.numeric(table(as.yearmon(x$visit_date))), na.rm=TRUE) - -# INTERACTION TABLE INDICATORS: -ninteractions <- function(x) NROW(x) -ncases_registered <- function(x) sum(x$created, na.rm=TRUE) -register_followup <- function(x) sum(!x$created) -case_register_followup_rate <- function(x) mean(!x$created)*100 - -ncases_touched <- function(x) length(unique(x$case_id)) -n_followups <- function(x) { - stopifnot(!any(is.na(x$created))) - stopifnot(all(x$created == 0 | x$created == 1)) - return(length(x$case_id[x$created == 0])) -} -nunique_followups <- function(x) { - stopifnot(!any(is.na(x$created))) - stopifnot(all(x$created == 0 | x$created == 1)) - return(length(unique(x$case_id[x$created == 0]))) -} -median_days_btw_followup <- function(x) median(x$days_elapsed_case) - -# DEVICE TYPE TABLE INDICATORS: -summary_device_type <- function (x) { - if (length(unique(x$device)) == 1) { - s <- paste(toupper(substring(x$device[1], 1,1)), substring(x$device[1], 2), - sep="", collapse=" ") - return (s) - } else { - return ('Multi') - } -} -nforms <- function(x) NROW(x) -active_days <- function(x) length(unique(as.Date(x$time_start))) -active_months <- function(x) length(unique(x$month.index)) - -# DEVICE LOG TABLE INDICATORS: -total_logs <-function(x) sum(x$num_logs) -audio_plays <-function(x) if (NROW(x[x$log_type=='audio',]) > 0) sum(x[x$log_type=='audio',c('num_logs')]) else 0 -network_warnings <-function(x) if (NROW(x[x$log_type=='warning-network',]) > 0) sum(x[x$log_type=='warning-network',c('num_logs')]) else 0 diff --git a/analysis_scripts/mchen/blog_3/user_consistency.R b/analysis_scripts/mchen/blog_3/user_consistency.R new file mode 100644 index 0000000..0e8e975 --- /dev/null +++ b/analysis_scripts/mchen/blog_3/user_consistency.R @@ -0,0 +1,210 @@ +##################################################################### +# visualization code developed for Dimagi's data blog series +# Dec 17, 2014 +##################################################################### + +library(grid) +library(gridExtra) +library(ggplot2) +library(sorvi) +library(dplyr) +library(RColorBrewer) + + + + + +############################################ +# domain consistency comparison plot +############################################ + +# side-by-side plot of two demo domains: domain 18 and 264 +# data source: domain_consistency_comparison.csv +data <- read.table("domain_consistency_comparison.csv",header=T,sep=",") +dm1 <- data %>% filter(data$domain_numeric == 18) +dm2 <- data %>% filter(data$domain_numeric == 264) + +source("plot_funcs.R") +plot1 <- plotOut(dm1,"red","#67a9cf","#ef8a62",50,50,paste("r^2==",0.75),5,45) +plot2 <- plotOut(dm2,"red","#67a9cf","#ef8a62",50,50,paste("r^2==",0.36),5,45) + +# output two plots side-by-side +# format: pdf +pdf(file="domain_comparison.pdf", + width=14, + height=10) +# png("domain_comparison.png", width = 1600, height = 1600, units = "px", pointsize = 16,bg = "transparent", res=150) +grid.arrange(plot1, plot2, ncol = 2) +dev.off() + + + + + +############################################ +# top10 vs. bottom10 comparison plot +############################################ +top_10p <- read.table("top_10p.csv",header=T,sep=",") +bottom_10p <- read.table("bottom_10p.csv",header=T,sep=",") +plot3 <- plotOut(top_10p, "red", "#67a9cf","#ef8a62",100,100,paste("r^2==",0.69),10,90) +plot4 <- plotOut(bottom_10p, "red", "#67a9cf","#ef8a62",100,100,paste("r^2==",0.34),10,90) +pdf(file="top_bottom_comparison.pdf", + width=14, + height=10) +grid.arrange(plot3, plot4, ncol = 2) +dev.off() + + + + + +###################### +# density plot +###################### +# main plot: March only data +# exclude users who visit more than 50 cases +# exclude users who visit only 1 case that month + +test2 <- read.table("test2.csv",header=T,sep=",") +data <- test2 %>% filter(test2$x <= 50 & test2$x > 1) +mainPlotData<- data %>% filter(as.character(data$calendar_month) == "2014-03-01") +p <- ggplot(mainPlotData, aes(x = x, y = y)) + + geom_point(position=position_jitter(w=0.4,h=0.4), + colour="black", + alpha=0.7) + + xlim(1,50) + ylim(1,50) +p1 <- p + + stat_density2d(aes(fill = rev(..level..)), geom = "polygon") +p2 <- p1 + + labs(x="Cases visited in March 2014", y = "Cases visited in April 2014") + + theme_bw() + + theme(legend.position="none") + + theme(aspect.ratio = 1) +# increase label size, space between axis and labels +p3 <- p2 + theme(plot.margin = unit(c(1.5,1.5,1.5,1.5), "cm"), + axis.title.x = element_text(size=rel(1.5), vjust = -1.5), + axis.title.y = element_text(size=rel(1.5), vjust = 1.5) +) + +ggsave("mainPlot.pdf", p3, dpi = 600, scale = 1.5, width = 4, height = 4) + + + + + +###################### +# Line graph +###################### +attrition <- read.table("leadup.csv", header = T, sep = ",") +month_levels <- rev(levels(attrition$X1)) + +l1 <- ggplot(attrition, aes(x = X1, y = X2, colour = att_duration, group = att_duration, linetype = att_duration)) + + geom_point(shape = 15, size = 4.0, colour="peachpuff4") + + geom_line(size = 1.5) + + scale_x_discrete("Month 'X' before attrition event",limits = month_levels) + +l2 <- l1 + + scale_y_continuous("Number of cases visited",limits=c(0,12),breaks=c(0,4,8,12)) + + theme(aspect.ratio=0.5, +# panel.grid.minor=element_blank(), + panel.grid.major=element_line(colour = "white",size=1), + panel.background=element_blank()) + +l3 <- l2 + theme(axis.title.x = element_text(size=rel(1.5), vjust = -1.5), + axis.title.y = element_text(size=rel(1.5), vjust = 1.5), + legend.title = element_blank()) + + + + + + +########################################## +# user experience: stacked barplot +########################################## + +# use colclasses to load data faster (http://www.r-bloggers.com/using-colclasses-to-load-data-more-quickly-in-r/) +ux_data <- read.csv("user_experience.csv", + stringsAsFactors = FALSE, + colClasses=c("integer","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric", + "factor","factor","factor","factor")) +colnames(ux_data)[14:17] <- c("Q1 to Q2","Q2 to Q3", "Q3 to Q4", "Q1 to Q4") + +# resize bins: +# <-20%, 1 +#[-20%,20%], 2 +#[20%,50%],3 +# >50%,4 + +binCut <- function(var, n1, n2, n3, n4){ + sapply(var, function(x) + if(x <= n1) {return("1") + } else if(x > n1 & x <= n2) {return("2") + } else if (x > n2 & x <= n3) {return("3") + } else if (x > n3 & x <= n4) {return("4") + } else {return("5") + }) +} + +ux_data$Q1_to_Q2 <- binCut(ux_data$percent_change_q1_2, -50, -20, 20, 50) +ux_data$Q2_to_Q3 <- binCut(ux_data$percent_change_q2_3, -50, -20, 20, 50) +ux_data$Q3_to_Q4 <- binCut(ux_data$percent_change_q3_4, -50, -20, 20, 50) +ux_data$Q1_to_Q4 <- binCut(ux_data$percent_change_q1_4, -50, -20, 20, 50) + +# reshape data: wide to long +l <- reshape(ux_data, + varying = c("Q1_to_Q2","Q2_to_Q3","Q3_to_Q4","Q1_to_Q4"), + v.names = "percentage_change", + timevar = "quarter", + times = c("Q1_to_Q2","Q2_to_Q3","Q3_to_Q4","Q1_to_Q4"), + direction = "long") + +# change the order of levels of factor quarter for visualization +l$quarter <- factor(l$quarter, levels = c("Q1_to_Q2","Q2_to_Q3","Q3_to_Q4","Q1_to_Q4")) + +p0 <- ggplot(l, aes(x=factor(quarter), fill=factor(percentage_change), y=4*100*(..count..)/sum(..count..))) +p1 <- p0 + geom_bar(width=0.7) + + scale_fill_manual(values = c("#e34a33","#fdbb84","#e5e5ab","#91bfdb","#67a9cf"), + breaks = c(1:5), + labels = c("Substantial drop (>50%)", + "Moderate drop (20-50%)", + "Stable (+/- 20%)", + "Moderate increase (20-50%)", + "Substantial increase (>50%)")) + +########################################################################## +### control style (http://www.cookbook-r.com/Graphs/Legends_(ggplot2)/)### +########################################################################## + +# change title appearance +p2 <- p1 + + labs(x="Interval", y="Percent of Users") + + scale_x_discrete(labels=c("Q1 to Q2", + "Q2 to Q3", + "Q3 to Q4", + "Q1 to Q4")) + + theme(plot.title = element_text(face = "bold")) + +# set title to twice the base font size +p3 <- p2 + theme(plot.title = element_text(size = rel(1.8))) + +# change panel and plot attributes +p4 <- p3 + theme(panel.background = element_blank()) + +# change legend attributes +p5 <- p4 + theme(legend.text = element_text(size = 10), + legend.title = element_blank(), + legend.background = element_rect(colour = "black"), + legend.key.size = unit(0.75, "cm"), + axis.title.x = element_text(vjust = - 1.5, size = rel(1.35)), + axis.title.y = element_text(vjust = 1.5, size = rel(1.35)), + plot.margin = unit(c(1,1,1,1),"cm")) + +# reverse the legend order +p6 <- p5 + guides(fill = guide_legend(reverse=TRUE)) + +print(p6) +ggsave("version6.pdf", p6, dpi = 600) + + + diff --git a/analysis_scripts/mchen/blog_5/data_prep_no_gap.R b/analysis_scripts/mchen/blog_5/data_prep_no_gap.R new file mode 100644 index 0000000..5c4c4a3 --- /dev/null +++ b/analysis_scripts/mchen/blog_5/data_prep_no_gap.R @@ -0,0 +1,98 @@ +library(plyr) +library(dplyr) +library(zoo) + +# load data +data = tbl_df(read.csv("blog.csv", stringsAsFactors=FALSE)) +data$calendar_month = as.Date(data$calendar_month, "%m/%d/%y") + +# subset columns of interest +all = select(data, domain_numeric, user_pk, active_days, ncases_touched, calendar_month) +all = filter(all, calendar_month >= as.Date("2010-01-01")) %>% + filter(., calendar_month <= as.Date("2014-12-01")) # reformat date + +# flagship project in India (pre-rebalancing) +crs = filter(all, domain_numeric == 40) + +# total active months per user +m = all %>% + group_by(domain_numeric, user_pk) %>% # duplicated user_id exists across domains + summarise(nmonths = n_distinct(calendar_month)) + +# excluding first N months of data (depends on reporting needs) +excdMonth = function(data, n) { + data = data %>% + group_by(domain_numeric, user_pk) %>% + filter(., row_number() > n) + return(data) +} + + +# Rebalancing data: no domain contribute more than 10% to the data +# total active months per domain +totalMonths = function(x) { + y = tbl_df(as.data.frame(table(x$domain_numeric))) + y = arrange(y, desc(Freq)) + names(y) = c("domain_numeric", "months") + y$pct = y$months/sum(y$months) + return(y) +} +# min months to be dropped (or added) on each domain to get to 10% contribution +drop = function(x, N) { + x$bal = (x$months - sum(x$months)*N)/(1-N) + return(x) +} + +rebalanceData = function(data, N) { + ov_month_data = totalMonths(data) + K = drop(ov_month_data, N) + dms = filter(K, bal > 0) + keepVal = dms$months - ceiling(dms$bal) + rebal_dm = as.numeric(as.character(dms$domain_numeric)) + sdata = filter(data, domain_numeric %in% rebal_dm) + ndata = filter(data, !(domain_numeric %in% rebal_dm)) + sp = split(sdata, sdata$domain_numeric) + for (i in seq_len(length(sp))) { + sp[[i]] = sp[[i]][sample(1:nrow(sp[[i]]), keepVal[i], FALSE),] + } + out = tbl_df(do.call(rbind, sp)) + rebalanced_data = tbl_df(rbind(out, ndata)) + return(rebalanced_data) +} + +# subset 1: all data rebalanced +rbl_all = rebalanceData(all, 0.10) +p1 = excdMonth(rbl_all, 3) + +# subset 2: all data excluding first 6 months +p2 = excdMonth(all, 6) +# reblanaced +rbl_p2 = rebalanceData(p2, 0.10) + +# subset 3: all data (unbalanced) including only users that have at least 18 months of usage +# rebalanced +u = select(filter(m, nmonths >= 18), user_pk) +qdata = filter(all, user_pk %in% u$user_pk) # exclude users active for < 18 months + +# indexing each user-month +qdata = arrange(qdata, domain_numeric, user_pk, calendar_month) +qdata = qdata %>% + group_by(domain_numeric, user_pk) %>% + mutate(user_month_index = seq(n())) + +# Bin user months to custom quarters +qdata$user_quarter_index = round_any(qdata$user_month_index, 3, ceiling)/3 +# Rebalance data from quarterly data (only for Q1-Q6) +qdata_sub = filter(qdata, user_quarter_index <= 6) +qdata_split = split(qdata_sub, qdata_sub$user_quarter_index, drop=TRUE) +rbl_qdata_split = lapply(qdata_split, function(x) rebalanceData(x, 0.10)) +rbl_qdata = do.call(rbind, rbl_qdata_split) + +# subset 4: 12 most active domains in terms of total user-months +bb = m %>% + group_by(domain_numeric) %>% + summarise(um = sum(nmonths)) %>% + arrange(., desc(um)) %>% + top_n(12) + +bb_data = filter(rbl_all, domain_numeric %in% bb$domain_numeric) diff --git a/analysis_scripts/mchen/blog_5/data_prep_with_gap.R b/analysis_scripts/mchen/blog_5/data_prep_with_gap.R new file mode 100644 index 0000000..e91c636 --- /dev/null +++ b/analysis_scripts/mchen/blog_5/data_prep_with_gap.R @@ -0,0 +1,78 @@ +################################### +# ADD GAP MONTHS BACK TO THE GAME # +################################### + +library(Hmisc) # monthDays() to get the exact number of days in a specific month + +# for each user, generate a vector of months according to the distance variable +expandMonths = function(data) { + months = seq(from = data$first_active_month, to = data$last_active_month, + by = "months") # length.out is adding extra month for some reason + noGap = data.frame(rep(data$user_pk,length(months)), months) + return(noGap) +} + +# for each user, get distance between first and last active month +gapBack = function(data){ + data = arrange(data, domain_numeric, user_pk, calendar_month) + uf = tbl_df(ddply(data, .(domain_numeric, user_pk), function(x)head(x,n=1))) + ul = tbl_df(ddply(data, .(domain_numeric, user_pk), function(x)tail(x,n=1))) + colnames(uf)[5] = c("first_active_month") + colnames(ul)[5] = c("last_active_month") + uf = select(uf, domain_numeric, user_pk, first_active_month) + ul = select(ul, domain_numeric, user_pk, last_active_month) + ufl = inner_join(uf, ul) + ufl$distance = 12*as.numeric(as.yearmon(ufl$last_active_month) - + as.yearmon(ufl$first_active_month)) + 1 + expanded = tbl_df(ddply(ufl, .(domain_numeric, user_pk), function(x) expandMonths(x))) + expanded = select(expanded, domain_numeric, user_pk, months) + names(expanded) = c("domain_numeric", "user_pk", "calendar_month") + return(expanded) +} + +# join the expanded data frame with all monthly data +all_expanded = gapBack(all) +all_expanded = left_join(expanded, all) +all_expanded[is.na(all_expanded)] <- 0 + +# for all data excluding 3 months, add back gap months +p1$percent_active_days = p1$active_days/monthDays(p1$calendar_month) +p1_expanded = gapBack(p1) +p1_expanded = left_join(p1_expanded, p1) +p1_expanded[is.na(p1_expanded)] <- 0 + +# for all data excluding 6 months, add back gap months +p2$percent_active_days = p2$active_days/monthDays(p2$calendar_month) +p2_expanded = gapBack(p2) +p2_expanded = left_join(p2_expanded, p2) +p2_expanded[is.na(p2_expanded)] <- 0 + +# for 12 big projects, add back gap months +bb_data$percent_active_days = bb_data$active_days/monthDays(bb_data$calendar_month) +bb_list = split(bb_data, bb_data$domain_numeric) +bb_data_expanded = gapBack(bb_data) +bb_data_expanded = left_join(bb_data_expanded, bb_data) +bb_data_expanded[is.na(bb_data_expanded)] <- 0 + +# for quarterly data, add back gap months +# adding back gap months means user_quarter_index needs to be recomputed +qdata$percent_active_days = qdata$active_days/monthDays(qdata$calendar_month) +qdata_expanded = gapBack(qdata) +qdata_expanded = left_join(qdata_expanded, qdata) +qdata_expanded[is.na(qdata_expanded)] <- 0 + +# recompute quarterly index after adding gap months +# reindex user_month_index +qdata_expanded = + qdata_expanded %>% + group_by(domain_numeric, user_pk) %>% + mutate(umi = seq_len(n())) +qdata_expanded$user_month_index <- NULL + +# Bin user months to custom quarters +bins = c(1+(3*(0:ceiling(max(qdata_expanded$umi)/3)))) +labs = paste("Month ", 3*seq(length(bins)-1)-2, "-", 3*seq(length(bins)-1), sep = "") + +qdata_expanded = qdata_expanded %>% + group_by(domain_numeric, user_pk) %>% + mutate(uqi = cut(umi, breaks = bins, labels = labs, right = FALSE)) diff --git a/analysis_scripts/mchen/blog_5/loess_smoothing.R b/analysis_scripts/mchen/blog_5/loess_smoothing.R new file mode 100644 index 0000000..892c462 --- /dev/null +++ b/analysis_scripts/mchen/blog_5/loess_smoothing.R @@ -0,0 +1,195 @@ +# Element: Point(position(user_pk, percent_active_days)) +# Dim(1): percent of active days +# Dim(2): ranksource("data_prep_no_gap.R") + +# functions + # user-month percent active days +loessTbl = function(var){ + var = round(var, digits = 2) + m1 = as.data.frame(table(var)) + names(m1) = c("percent_active_days", "count") + m1$pct = round(m1$count/sum(m1$count), digits = 4) + m1$percent_active_days = as.numeric(as.character(m1$percent_active_days)) + return(m1) +} + +binning = function(x){ + x$bins = cut(x$percent_active_days, breaks = 0.05*(0:20), + labels = c("0.05","0.10","0.15","0.20","0.25","0.30","0.35","0.40", + "0.45","0.50","0.55","0.60","0.65","0.70","0.75", + "0.80","0.85","0.90","0.95","1.0"), + include.lowest = TRUE) + return(x) +} + +dataPerUser = function(dataList){ + d1 = lapply(dataList, function(x){ + y = x %>% + group_by(user_pk) %>% + summarise(pad = mean(percent_active_days)) # obtain average %days for each unique user + z=loessTbl(y$pad) + return(z) + }) +} + +singleBinnedPlot = function(d1){ + d1 = binning(d1) + d2 = d1 %>% + group_by(bins) %>% + summarise(percent = sum(pct)) + return(d2) +} + +binnedDataPlot = function(d1) { + d2 = lapply(d1, function(x)binning(x)) + d3 = lapply(d2, function(x){ + y = x %>% + group_by(bins) %>% + summarise(percent = sum(pct)) + return(y) + }) + for (i in 1:length(d3)){ + d3[[i]]$facet = names(d3)[i] + } + d4 = do.call(rbind, d3) + return(d4) +} + + +# Data: all data excluding first 3 months +# Figure 1.1 %days, user-month, excluding gap months +data_1_1 = loessTbl(p1$percent_active_days) +data_1_1_binned = singleBinnedPlot(data_1_1) + +# Figure 1.3 %days, per user, excluding gap months +data_1_3 = p1 %>% + group_by(domain_numeric, user_pk) %>% + summarise(pad = mean(percent_active_days)) +data_1_3 = loessTbl(data_1_3$pad) +data_1_3 = singleBinnedPlot(data_1_3) + +# Data: all data excluding first 6 months +# Figure 2.1 %days, user-month, excluding gap months +data_2_1 = loessTbl(p2$percent_active_days) +data_2_1_binned = singleBinnedPlot(data_2_1) + +# Loess smooth plot +lineIntercept = c(quantile(p2$percent_active_days, c(.25, .50, .75)), + mean(p2$percent_active_days)) +f1_1 = ggplot(data_2_1_binned, aes(x=as.character(bins), y=percent)) + + geom_point(size = 2.5)+ + labs(x="", + y="")+ + geom_smooth(se=FALSE, col = "#882608", size = 1.5, aes(group=1))+ + geom_vline(xintercept = lineIntercept[1]/0.05, + linetype = "longdash", color = "#005266", size = 1) + + geom_vline(xintercept = lineIntercept[2]/0.05, + linetype = "solid", color = "#005266", size = 1) + + geom_vline(xintercept = lineIntercept[3]/0.05, + linetype = "longdash", color = "#005266", size = 1) + + geom_vline(xintercept = lineIntercept[4]/0.05, + linetype = "dotted", color = "#005266", size = 1) + + ylim(0, 0.20) + + guides(fill=FALSE) + + theme(panel.background = element_rect(fill="#d5e4eb"), + plot.background = element_rect(fill="#d5e4eb"), + panel.grid.major.y = element_line(size=1.0), + panel.grid.minor.y = element_blank(), + panel.grid.major.x = element_blank(), + panel.grid.minor.x = element_blank(), + axis.ticks.y = element_blank(), + legend.position = "none") + +ggsave("f1_1.pdf", f1_1, width=10, height=10) + +# Data: 12 big domains +bb_list = split(bb_data, bb_data$domain_numeric) +bb_expanded_list = split(bb_data_expanded, bb_data_expanded$domain_numeric) +# Figure 3.1 %days, user-month, excluding gap months +data_3_1 = lapply(bb_list, function(x) loessTbl(x$percent_active_days)) +data_3_1_binned = binnedDataPlot(data_3_1) + +# get x-axis intercept for quantile lines for data frames in a list +lineIntercept = function(data){ + y = lapply(data, function(x)c(quantile(x$percent_active_days, + c(.25, .50, .75)), + mean(x$percent_active_days))) + facet = names(y) + li = as.data.frame(do.call("rbind", y)) + li$facet = facet + names(li)[4] <- c("mean") + return(li) +} + +li = lineIntercept(bb_list) + +f3_1 = ggplot(data_3_1_binned, aes(x=as.character(bins), y=percent)) + + geom_point(size = 2.5)+ + facet_wrap(~facet, ncol=3, as.table = TRUE) + + labs(x="", + y="")+ + geom_smooth(se=FALSE, col = "#882608", size = 1.5, aes(group=1))+ + ylim(0, 0.50) + + guides(fill=FALSE) + + theme(panel.background = element_rect(fill="#d5e4eb"), + plot.background = element_rect(fill="#d5e4eb"), + panel.grid.major.y = element_line(size=1.0), + panel.grid.minor.y = element_blank(), + panel.grid.major.x = element_blank(), + panel.grid.minor.x = element_blank(), + axis.ticks.y = element_blank(), + legend.position = "none") + +f3_1 = + f3_1 + geom_vline(aes(xintercept = li[,1]/0.05), li, + linetype = "longdash", color = "#005266", size = 1) + + geom_vline(aes(xintercept = li[,2]/0.05), li, + linetype = "solid", color = "#005266", size = 1) + + geom_vline(aes(xintercept = li[,3]/0.05), li, + linetype = "longdash", color = "#005266", size = 1) + + geom_vline(aes(xintercept = li[,4]/0.05), li, + linetype = "dotted", color = "#005266", size = 1) + + +# Data: First 6 quarters of all users that have been active for at least 18 months on CC +# Figure set 4: Quarterly plot of all data (excluding first 3 months) +qdata_gap_out = split(qdata, qdata$user_quarter_index) +qdata_gap_out = qdata_gap_out[1:6] + +# figure 4.1 %days, user-month, excluding gap months +data_4_1 = lapply(qdata_gap_out, function(x)loessTbl(x$percent_active_days)) +data_4_1_binned = binnedDataPlot(data_4_1) +data_4_1_binned$facet = factor(data_4_1_binned$facet, + levels = c("Month 1-3", "Month 4-6", + "Month 7-9", "Month 10-12", + "Month 13-15", "Month 16-18")) + +li = lineIntercept(qdata_gap_out) + +# plot it +f4_1 = ggplot(data_4_1_binned, aes(x=bins, y=percent)) + + geom_point(size = 2.5)+ + facet_wrap(~facet, ncol=3, as.table = TRUE) + + # facet_grid(facet~.) + + labs(x="", + y="")+ + geom_smooth(se=FALSE, col = "#882608", size = 1.5, aes(group=1))+ + ylim(0, 0.25) + # upper limit is obtained from summary(sub_qd$pct) + guides(fill=FALSE) + + theme(panel.background = element_rect(fill="#d5e4eb"), + plot.background = element_rect(fill="#d5e4eb"), + panel.grid.major.y = element_line(size=1.0), + panel.grid.minor.y = element_blank(), + panel.grid.major.x = element_blank(), + panel.grid.minor.x = element_blank(), + axis.ticks.y = element_blank(), + legend.position = "none") + +f4_1 + geom_vline(aes(xintercept = li[,1]/0.05), li, + linetype = "longdash", color = "#005266", size = 1) + + geom_vline(aes(xintercept = li[,2]/0.05), li, + linetype = "solid", color = "#005266", size = 1) + + geom_vline(aes(xintercept = li[,3]/0.05), li, + linetype = "longdash", color = "#005266", size = 1) + + geom_vline(aes(xintercept = li[,4]/0.05), li, + linetype = "dotted", color = "#005266", size = 1) diff --git a/analysis_scripts/mchen/blog_5/user performance distribution.Rproj b/analysis_scripts/mchen/blog_5/user performance distribution.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/analysis_scripts/mchen/blog_5/user performance distribution.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/analysis_scripts/mchen/blog_6/barcharts.R b/analysis_scripts/mchen/blog_6/barcharts.R new file mode 100644 index 0000000..b701039 --- /dev/null +++ b/analysis_scripts/mchen/blog_6/barcharts.R @@ -0,0 +1,129 @@ +# barcharts +library(readxl) +f1 = read_excel("figures.xlsx", "f1") +f2 = read_excel("figures.xlsx", "f2") +f3 = read_excel("figures.xlsx", "f3") + +names(f1) = c("active_months","active_q4","inactive_q4") +wide_f1 = reshape(f1, idvar="active_months",varying=c("active_q4","inactive_q4"), + v.names="number",direction="long") +names(wide_f1)[2] = c("status") +wide_f1$status = as.factor(wide_f1$status) # position_dodge would not work otherwise +wide_f1$active_months = factor(wide_f1$active_months, + levels(wide_f1$active_months)[c(1,4,7,9,2,3,5,6,8)]) + +library(ggplot2) +library(grid) + +# SCALES +# dim(1): active months on CC +# dim(2): number of programs (active/inactive) +# ELEMENTS: bar +maxnum = max(wide_f1$number) +minnum = min(wide_f1$number) +ggplot(wide_f1, aes(x=active_months, + y=number, + fill=status)) + + geom_bar(width=0.65, position="dodge", stat="identity") + + scale_y_continuous(breaks = seq(minnum, maxnum, by=10))+ + scale_fill_manual(values=c("#01526d","#00a4dc"), + name="", + labels=c("Active in Q4 2014 (ongoing programs)", + "Not active in Q4 2014")) + + labs(x = "Active months on CommCare", + y = "No. of programs") + + theme(legend.position = "top", + legend.background = element_rect(fill = "transparent"), + axis.ticks = element_blank(), + axis.ticks.margin = unit(0.25, "cm"), + axis.text.x = element_text(face="bold",colour="black"), + axis.text.y = element_text(face="bold",colour="black"), + panel.background = element_rect(fill = "#d5e2e8"), + plot.background = element_rect(fill = "#d5e2e8"), + panel.grid.major.x = element_blank(), + panel.grid.minor.y = element_blank() + ) +ggsave("figure_1.pdf",width=8,height=8) + +# chart 2: months on CC +names(f2) = c("months", "total", "active") +f2$months = as.factor(f2$months) +f2$months = factor(f2$months, levels(f2$months)[c(1,4,7,9,2,3,5,6,8)]) +f2_1 = ggplot(subset(f2, select = c(months, total)), aes(x=months, y=total))+ + geom_bar(width=0.6, stat="identity", fill="#01526d") + + labs(x="", + y="")+ + theme(legend.position = "top", + legend.background = element_rect(fill = "transparent"), + axis.ticks = element_blank(), + axis.ticks.margin = unit(0.25, "cm"), + axis.text.x = element_text(face="bold",colour="black",angle=90), + axis.text.y = element_text(face="bold",colour="black",angle=90), + panel.background = element_rect(fill = "#d5e2e8"), + plot.background = element_rect(fill = "#d5e2e8"), + panel.grid.major.x = element_blank(), + panel.grid.minor.y = element_blank() + ) + coord_flip() + +f2_2 = ggplot(subset(f2, select = c(months, active)), aes(x=months, y=active))+ + geom_bar(width=0.6, stat="identity", fill="#74d0f5") + + labs(x="", + y="")+ + theme(legend.position = "top", + legend.background = element_rect(fill = "transparent"), + axis.ticks = element_blank(), + axis.ticks.margin = unit(0.25, "cm"), + axis.text.x = element_text(face="bold",colour="black",angle=90), + axis.text.y = element_text(face="bold",colour="black",angle=90), + panel.background = element_rect(fill = "#d5e2e8"), + plot.background = element_rect(fill = "#d5e2e8"), + panel.grid.major.x = element_blank(), + panel.grid.minor.y = element_blank() + ) + coord_flip() + +# dodged barcharts for comparison +# reshaping data: wide to long +library(reshape) +f2_reshaped = reshape(f2, idvar="months", varying=list(2:3), # put list(3:2) here and you would see interesting result + v.names="status", direction="long") +ggplot(f2_reshaped, aes(x=months, y=status, fill=factor(time))) + + geom_bar(width=0.6, stat="identity", position="dodge") + + labs(x="", + y="")+ + theme(legend.position = "top", + legend.background = element_rect(fill = "transparent"), + legend.title = element_blank(), + axis.ticks = element_blank(), + axis.ticks.margin = unit(0.25, "cm"), + axis.text.x = element_text(face="bold",colour="black"), + axis.text.y = element_text(face="bold",colour="black"), + panel.background = element_rect(fill = "#d5e2e8"), + plot.background = element_rect(fill = "#d5e2e8"), + panel.grid.major.x = element_blank(), + panel.grid.minor.y = element_blank()) + + coord_flip() + +# chart 3: re-start rate +names(f3) = c("month", "attrited", "restarted", "pct") +f3$month = as.factor(f3$month) +f3$month = factor(f3$month, levels(f3$month)[c(1,4,5,6,2,3)]) + +maxpct = max(f3$pct) +minpct = min(f3$pct) +ggplot(f3, aes(x=month,y=pct)) + + geom_bar(width=0.6, stat="identity", fill="#01526d") + + scale_y_continuous(breaks = seq(0, maxnum, by = 0.10)) + + labs(x="", + y="") + + theme(legend.position = "top", + legend.background = element_rect(fill = "transparent"), + axis.ticks = element_blank(), + axis.ticks.margin = unit(0.25, "cm"), + axis.text.x = element_text(face="bold",colour="black"), + axis.text.y = element_text(face="bold",colour="black"), + panel.background = element_rect(fill = "#d5e2e8"), + plot.background = element_rect(fill = "#d5e2e8"), + panel.grid.major.x = element_blank(), + panel.grid.minor.y = element_blank() + ) +ggsave("figure_3.pdf",width=8,height=8) \ No newline at end of file diff --git a/analysis_scripts/mchen/blog_6/blog_6.Rproj b/analysis_scripts/mchen/blog_6/blog_6.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/analysis_scripts/mchen/blog_6/blog_6.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/analysis_scripts/mchen/blog_6/hq_viz.R b/analysis_scripts/mchen/blog_6/hq_viz.R new file mode 100644 index 0000000..0cd92e2 --- /dev/null +++ b/analysis_scripts/mchen/blog_6/hq_viz.R @@ -0,0 +1,110 @@ +# IMPORT DATA # +suppressMessages(library(plyr)) +suppressMessages(library(dplyr)) +suppressMessages(library(zoo)) +suppressMessages(library(lubridate)) + +data = tbl_df(read.csv("blog_data_6_10_15.csv", stringsAsFactors=FALSE)) +data$calendar_month = as.Date(data$calendar_month, "%Y-%m-%d") # time data in DP is stored as numeric distance from 1970-01-01 + +hq = select(data, domain_numeric, user_pk, calendar_month) # keep columns of interest +hq = arrange(hq, domain_numeric, user_pk, calendar_month) +t = filter(hq, calendar_month >= as.Date("2012-01-01")) %>% + filter(., calendar_month <= as.Date("2014-12-01")) # keeping data from 2012 onward + + +# for each domain, get total number of users and number of active months +agg = t %>% + group_by(domain_numeric) %>% # change from domain_numeric to domain + summarise(nusers=n_distinct(user_pk), + nmonths = n_distinct(calendar_month)) + +# for each domain in each month, get the number of active users. TBF'ED +t_agg_1 = t %>% + group_by(domain_numeric, calendar_month) %>% + summarise(nusers = n_distinct(user_pk)) %>% + arrange(., domain_numeric, calendar_month) + +# for each domain, get total user blocks +t_agg_1$blocks = round_any(t_agg_1$nusers, 25, ceiling)/25 + +# rank domains by max user blocks in a given month (big domains go to the bottom) +d_rank = t_agg_1 %>% + group_by(domain_numeric) %>% + summarise(b = max(blocks)) %>% + arrange(., desc(b)) %>% + mutate(b_rank = seq(b)) %>% + select(., domain_numeric, b_rank) + +# Replicate each row; set the number of replications to the number of blocks +expanded = as.numeric(rep(row.names(t_agg_1), t_agg_1$blocks)) +texp = t_agg_1[expanded,] +texp = texp[order(match(texp$domain_numeric, d_rank$domain_numeric)),] # order domains by max active month +texp = left_join(texp, d_rank) +texp = texp %>% + group_by(b_rank, calendar_month) %>% + mutate(incre_blocks = seq(n())) # Index user bins incrementally + +# for each cum_blocks value, sum up cum_blocks and the max value of the domain that's sitting below +piles = texp %>% + group_by(b_rank) %>% + summarise(max_blocks = max(incre_blocks)) +piles$max_blocks = c(0, cumsum(piles$max_blocks[-nrow(piles)])) +texp = left_join(texp, piles) +texp$piled_blocks = texp$incre_blocks + texp$max_blocks # building blocks into mounds for each domain + + +# all HQ domains +# geom_tile plot +suppressMessages(library(ggplot2)) +suppressMessages(library(scales)) +suppressMessages(library(RColorBrewer)) + +x_axis_ticks = as.character(seq(min(texp$calendar_month),max(texp$calendar_month),by="quarter")) # R considers Date variable as continuous +x_axis_labels = format(as.Date(x_axis_ticks), "%b %y") + +g0 = ggplot(texp, aes(x=as.character(calendar_month),y=piled_blocks)) + + geom_tile(aes(fill=factor(texp$b_rank))) + + guides(fill=FALSE) + # turning off legends + scale_fill_manual(values=rep(c("#005266", "#882608", "#fe9c7b"), + ceiling(n_distinct(texp$domain_numeric)/3))) + # repeating colors to distinguish each domain + scale_y_continuous(breaks = NULL) + # suppressing ticks, gridlines + scale_x_discrete(breaks = x_axis_ticks, labels = x_axis_labels) + # reduce total number of breaks + theme(axis.title.x = element_blank(), + axis.title.y = element_blank(), # hide titles for both x- and y-axis + axis.text.x = element_text(face="bold", + colour="#004865", size=11), + panel.background = element_rect(fill="#d5e4eb"), + panel.grid = element_blank()) + +ggsave("hq_viz.pdf", g0, width = 8, height = 12) + + +# subset HQ domains +# subs = c(11, 67, 87, 127, 138) +# run line 17-29 and you will get the data +t = filter(t, domain_numeric %in% subs) +hq_sub = ggplot(t_agg_1, aes(x=calendar_month, y=nusers)) + + geom_bar(width=28,stat="identity") + + facet_wrap(~domain_numeric, nrow=5, scales = "free_y") + + theme(axis.title.x = element_blank(), + axis.title.y = element_blank(), # hide titles for both x- and y-axis + axis.text.x = element_text(face="bold", + colour="#004865", size=11), + panel.background = element_rect(fill="#d5e4eb"), + panel.grid.major.x = element_blank(), + panel.grid.minor.x = element_blank(), + panel.grid.minor.y = element_blank()) + +# highlight time range +rect = data.frame(xmin = as.Date(c("2012-04-01","2012-10-01","2013-04-01","2013-10-01","2014-04-01","2014-10-01")), + xmax = as.Date(c("2012-06-01","2012-12-01","2013-06-01","2013-12-01","2014-06-01","2014-10-01")), + ymin = c(-Inf,-Inf,-Inf,-Inf,-Inf,-Inf), + ymax = c(Inf,Inf,Inf,Inf,Inf,Inf)) +hq_sub_2 = hq_sub + geom_rect(data = rect, aes(xmin=xmin, xmax=xmax, + ymin = ymin, ymax=ymax), + fill = "#aaaea6", + alpha = 0.3, + inherit.aes = FALSE) + +ggsave("hq_subdomains.pdf", hq_sub_2, width = 8, height = 8) diff --git a/analysis_scripts/mchen/blog_visualization/cluster_analysis.R b/analysis_scripts/mchen/blog_visualization/cluster_analysis.R deleted file mode 100644 index ce47eb0..0000000 --- a/analysis_scripts/mchen/blog_visualization/cluster_analysis.R +++ /dev/null @@ -1,20 +0,0 @@ -# cluster analyis -# get the kmeans - -# add scripts to compute kmeans here - -# results: 4 clusters: x = 20, 40, 95, 270 - -# removing all x = y = 1 -test3 <- test2[-which(test2$x == 1 & test2$y == 1),] -ca_sub1 <- test3[which(test3$x <= 30 & test3$y <= 30),] - -crs <- test3[which(test3$domain == "crs-remind"),] -crs_sub <- crs[which(crs$x >= 10 & crs$x <= 40),] -smoothScatter(crs_sub$x, crs_sub$y) - -koraro <- test3[which(test3$domain == "mvp-koraro"),] -koraro_sub <- koraro[which(koraro$x >= 40 & koraro$x <= 100),] -smoothScatter(koraro_sub$x, koraro_sub$y) - -intensityPlotOut(crs_sub, koraro_sub, 100, 100) \ No newline at end of file diff --git a/analysis_scripts/mchen/blog_visualization/plot_funcs.R b/analysis_scripts/mchen/blog_visualization/plot_funcs.R deleted file mode 100644 index be08ff4..0000000 --- a/analysis_scripts/mchen/blog_visualization/plot_funcs.R +++ /dev/null @@ -1,29 +0,0 @@ -# plot functions -plotOut <- function(data, medianColor, palette1, palette2, xmax, ymax, rLabel, rSquareX, rSquareY) { - p <- regression_plot(ncases_touched ~ prev_ncases_touched, data, - shade=FALSE, - spag=TRUE, - median.col=medianColor, # median line (not regression line) - palette=colorRampPalette(c(palette1,palette2))(4)) - - p <- p + - scale_x_continuous("Cases visited in month N-1", limits=c(0,xmax)) + - scale_y_continuous("Cases visited in month N", limits=c(0,ymax)) # modify the axis limits - - # p <- p + labs(x="Cases visited in month N-1", y="Cases visited in month N") # modify axis labels - - # adding annotation layer - p <- p + annotate("text", x=rSquareX, y=rSquareY, size=7, label=rLabel, colour="red", parse=T) - # modify theme settings: - p <- p + - theme(axis.title.x=element_text(vjust = -1.5,size=rel(2)), - axis.title.y=element_text(vjust = 1.5,size=rel(2)), - plot.margin=unit(c(1,1,1,1),"cm"), - panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - panel.background = element_blank()) + - theme(aspect.ratio = 1) + - theme(legend.position="none") - return(p) -} - diff --git a/visualizations/distribution_viz.R b/visualizations/distribution_viz.R new file mode 100644 index 0000000..2fd3960 --- /dev/null +++ b/visualizations/distribution_viz.R @@ -0,0 +1,674 @@ +############################## +##distribution visualization## +############################## + +# create a single line graph from a set of user-months and an activity indicator showing distribution of levels + # For each indicator + # the x-axis should be the value of the indicator (percent of active days) + # the y-axis should be the percent of user-months that have that value + # multiple graphs will be generated on different sets of user-months + # graph naming format: p_n (n: the nth graph) + +data <- tbl_df(read.csv("blog_data_2_2_15.csv", stringsAsFactors = FALSE)) +data$adp <- round(data$active_day_percent) # rounding would make it a discrete variable + +library(plyr) +library(dplyr) +library(ggplot2) +library(Cairo) +library(zoo) +library(RColorBrewer) +library(lattice) +library(grid) +library(gridExtra) + + +# distribution visualization of performance indicators + +# Custom Quarter entirely by domain +d_1 = select(data, user_pk, domain_numeric, calendar_month, active_days, month_index) +d_1 = arrange(d_1, calendar_month, domain_numeric) +# create month_seq by domain +d_1 = d_1 %>% + group_by(domain_numeric) %>% + mutate(month_seq_by_domain = rep(1:length(table(calendar_month)), + table(calendar_month)) + ) +# create custom quarter by month_index +bins = c(1+( + 3*(0:ceiling(max(d_1$month_seq_by_domain)/3)) +)) + +labs = paste("Q_", seq(length(bins)-1), sep = "") + +d_1 = d_1 %>% + group_by(domain_numeric) %>% + mutate(QTR_by_domain = cut(month_seq_by_domain, + breaks = bins, + labels = labs, + right = FALSE) + ) + +pdf(7, 14, file="custom qtr by domain.pdf") +p_1 = densityplot(~active_days|QTR_by_domain, data=d_1, + panel=function(x,...){ + panel.densityplot(x,...) + lower.v <- quantile(x,probs=0.25) + median.v <- median(x) + upper.v <- quantile(x,probs=0.75) + mean.v <- mean(x) + + panel.abline(v=median.v, col.line="#003d71", lwd=1.2) # 50 percentile + # panel.abline(v=lower.v, col.line="#f58220") # 25 percentile + # panel.abline(v=upper.v, col.line="#f58220") # 75 percentile + panel.abline(v=mean.v, col.line="#003d71", lwd=1.2, lty=2) # mean + + dens <- density(x) + min.v <- min(dens$x) + max.v <- max(dens$x) + panel.polygon(c(upper.v, dens$x[dens$x > upper.v & dens$x < max.v], max.v), + c(0, dens$y[dens$x> upper.v & dens$x < max.v], 0), + col="#f58220") + panel.polygon(c(min.v, dens$x[dens$x > min.v & dens$x < lower.v], lower.v), + c(0, dens$y[dens$x > min.v & dens$x < lower.v], 0), + col="#f58220") + }, + scales=list(x=list(at=c(1,2,3,4,5,6,7,8,9,10,20,30))), + xlim=range(d_1$active_days), # reversing the order of x-axis values + ylim=c(0,0.5), + xlab="Active days per month", + ylab="Density", + # aspect=0.1, + layout=c(1,length(bins)-1), + index.cond=list((length(bins)-1):1), + strip=strip.custom(bg="#ffffff", par.strip.text=list(font=0.5,col="#413f3f")), + col="#413f3f", + plot.points=FALSE) +print(p_1) +dev.off() + +# get a subset of users that have no dropout(gap) months from their first active month till 80% of domain active months +d_2 = d_1 %>% + group_by(domain_numeric) %>% + summarise(max_months = max(month_seq_by_domain)) +d_2$pct_80 = floor(d_2$max_months*0.8) # for each domain, calculate the first 80 percent months + +d_2 %>% + group_by(domain_numeric) %>% + filter() + +# for each user, get the distance between two active months +d_1 = arrange(d_1, domain_numeric, user_pk, month_seq_by_domain) +d_1 = d_1%>% + group_by(domain_numeric, user_pk) %>% + mutate(diff_month = c(NA, diff(month_seq_by_domain))) + +# get the last row per user, return users that have lived through the first 80 percent of domain active months +d_2 = left_join(d_1, d_2) +d_2 = arrange(d_2, domain_numeric, user_pk, calendar_month) +d_2_user = d_2 %>% + group_by(domain_numeric, user_pk) %>% + do(tail(.,n=1)) %>% + filter(., month_seq_by_domain >= pct_80) +d_2 = filter(d_2, user_pk %in% d_2_user$user_pk) +# filter out users who lived through but have dropouts during the first 80 percent +d_2_80 = filter(d_2, month_seq_by_domain <= pct_80) # for users kept in d_2_user, truncate their activity to pct_80 +d_2_80 = d_2_80 %>% + group_by(domain_numeric, user_pk) %>% + filter(., n()==max(month_seq_by_domain)) # excluding all users that have gaps during the first 80 percent of domain active months + +d_3 = filter(d_1, user_pk %in% d_2_user$user_pk) # this returns all user-months that live through first 80% of domain active months with no dropouts + +# get the subset of users that have contribute to 14 quarters +d_3_subset_user = d_3 %>% + group_by(domain_numeric, user_pk) %>% + summarise(tot_qtr = length(levels(factor(QTR_by_domain)))) %>% # this returns total active quarters per user + filter(., tot_qtr >= 6) + +# only keep data till QTR_6 for visual +d_4 = filter(d_3, user_pk %in% d_3_subset_user$user_pk) +d_4_subset = filter(d_4, month_seq_by_domain <= 18) +d_4_subset$QTR_by_domain = factor(d_4_subset$QTR_by_domain) # drop unused levels + +pdf(7, 14, file="no dropout in 6 quarters.pdf") +p_qtr_6_by_domain = densityplot(~active_days|QTR_by_domain, data=d_4_subset, + panel=function(x,...){ + panel.densityplot(x,...) + lower.v <- quantile(x,probs=0.25) + median.v <- median(x) + upper.v <- quantile(x,probs=0.75) + mean.v <- mean(x) + + panel.abline(v=median.v, col.line="#003d71", lwd=1.2) # 50 percentile + # panel.abline(v=lower.v, col.line="#f58220") # 25 percentile + # panel.abline(v=upper.v, col.line="#f58220") # 75 percentile + panel.abline(v=mean.v, col.line="#003d71", lwd=1.2, lty=2) # mean + + dens <- density(x) + min.v <- min(dens$x) + max.v <- max(dens$x) + panel.polygon(c(upper.v, dens$x[dens$x > upper.v & dens$x < max.v], max.v), + c(0, dens$y[dens$x> upper.v & dens$x < max.v], 0), + col="#f58220") + panel.polygon(c(min.v, dens$x[dens$x > min.v & dens$x < lower.v], lower.v), + c(0, dens$y[dens$x > min.v & dens$x < lower.v], 0), + col="#f58220") + }, + # scales=list(x=list(at=c(1,2,3,4,5,6,7,8,9,10,20,30))), + xlim=range(d_4_subset$active_days), # reversing the order of x-axis values + ylim=c(0,0.5), + xlab="Active days per month", + ylab="Density", + # aspect=0.1, + layout=c(1,length(levels(d_4_subset$QTR_by_domain))), + index.cond=list(length(levels(d_4_subset$QTR_by_domain)):1), + strip=strip.custom(bg="#ffffff", par.strip.text=list(font=0.5,col="#413f3f")), + col="#413f3f", + plot.points=FALSE) +print(p_qtr_6_by_domain) +dev.off() + + + +# custom quarter by user +pdf(7, 14, file="custom qtr by user (no dropout).pdf") +d_2$QTR = factor(d_2$QTR) +p_2_by_user = densityplot(~active_days|QTR, data=d_2, + panel=function(x,...){ + panel.densityplot(x,...) + lower.v <- quantile(x,probs=0.25) + median.v <- median(x) + upper.v <- quantile(x,probs=0.75) + mean.v <- mean(x) + + panel.abline(v=median.v, col.line="#003d71", lwd=1.2) # 50 percentile + # panel.abline(v=lower.v, col.line="#f58220") # 25 percentile + # panel.abline(v=upper.v, col.line="#f58220") # 75 percentile + panel.abline(v=mean.v, col.line="#003d71", lwd=1.2, lty=2) # mean + + dens <- density(x) + min.v <- min(dens$x) + max.v <- max(dens$x) + panel.polygon(c(upper.v, dens$x[dens$x > upper.v & dens$x < max.v], max.v), + c(0, dens$y[dens$x> upper.v & dens$x < max.v], 0), + col="#f58220") + panel.polygon(c(min.v, dens$x[dens$x > min.v & dens$x < lower.v], lower.v), + c(0, dens$y[dens$x > min.v & dens$x < lower.v], 0), + col="#f58220") + }, + scales=list(x=list(at=c(1,2,3,4,5,6,7,8,9,10,20,30))), + xlim=range(d_2$active_days), # reversing the order of x-axis values + ylim=c(0,0.5), + xlab="Active days per month", + ylab="Density", + # aspect=0.1, + layout=c(1,length(levels(d_2$QTR))), + index.cond=list(length(levels(d_2$QTR)):1), + strip=strip.custom(bg="#ffffff", par.strip.text=list(font=0.5,col="#413f3f")), + col="#413f3f", + plot.points=FALSE) + +print(p_2_by_user) +dev.off() + + + + + + + +# Distribution Graphs 1: Kernel Density Plot for all user-months + # construct data table for graph 1 +adp_groups <- data %>% group_by(adp) # order data table by percent of active days +x <- unique(adp_groups$adp) # extract unique percent of active days +y <- group_size(adp_groups) # calculate the occurrence of each value in percent of active days +data_adp1 <- tbl_df(as.data.frame(cbind(x,y))) # combine percentage of each value occurring and unique values into a new data table +data_adp1 <- arrange(data_adp1, x) # reorder the data table by unique values in percent of active days +data_adp1$y_prob <- round(y/sum(data_adp1$y),digits=6) # calculate the percentage of each value occurring in the whole data set + # run plot +png(600,600,file="pad_all_domains.png") +d1 = density(data$active_days) +p1 = plot(d1, main="Distribution of active days per month (Kernel density estimation)") +dev.off() + + + + + +# Distribution Graph 2: Probability Distribution by Custom Quarter + # Create custom quarters entirely by domain +d_1 = select(data, user_pk, domain_numeric, calendar_month, active_days, month_index) + + + # Create custom quarters entirely by user +d_2 = select(data, user_pk, domain_numeric, calendar_month, active_days) +d_2 %>% + group_by(domain_numeric, user_pk) %>% + mutate(month_seq_by_user = c(1:length(calendar_month))) + + +labs = paste("QTR_by_use ",seq(17),sep="") # there are 17 quarters per n_distinct(calendar_month) by user +d_2$QTR = cut(d_2$month_seq_by_user, breaks=3*(0:17), labels=labs) # adding labels +d_2$QTR = factor(d_2$QTR, levels = levels(d_2$QTR)) + + +p_2 = densityplot(~active_days|QTR, data=d_2, + panel=function(x,...){ + panel.densityplot(x,...) + lower.v <- quantile(x,probs=0.25) + median.v <- median(x) + upper.v <- quantile(x,probs=0.75) + mean.v <- mean(x) + + panel.abline(v=median.v, col.line="red") # 50 percentile +# panel.abline(v=lower.v, col.line="#f58220") # 25 percentile +# panel.abline(v=upper.v, col.line="#f58220") # 75 percentile + panel.abline(v=mean.v, col.line="darkorange", lty=2) # mean + + dens <- density(x) + min.v <- min(dens$x) + max.v <- max(dens$x) + panel.polygon(c(upper.v, dens$x[dens$x > upper.v & dens$x < max.v], max.v), + c(0, dens$y[dens$x> upper.v & dens$x < max.v], 0), + col="#f58220") + panel.polygon(c(min.v, dens$x[dens$x > min.v & dens$x < lower.v], lower.v), + c(0, dens$y[dens$x > min.v & dens$x < lower.v], 0), + col="#f58220") + }, + scales=list(x=list(at=c(1,2,3,4,5,6,7,8,9,10,20,30))), + xlim=range(d_2$active_days), # reversing the order of x-axis values + xlab="Active days per month", + ylab="Density", +# aspect=0.1, + layout=c(1,17), + index.cond=list(17:1), + strip=strip.custom(bg="#003d71", par.strip.text=list(font=1,col="white")), + col="#413f3f", + plot.points=FALSE) + + + +# run plot on a subset of domains that have at least one user active for 10 QTRs +max_qtr = d_2 %>% + group_by(domain_numeric) %>% + summarise(max(as.numeric(QTR))) +colnames(max_qtr) = c("domain_numeric", "max_q") +dm = filter(max_qtr, max_q >= 10)$domain_numeric +d_3 = filter(d_2, domain_numeric %in% dm) + + +pdf("distribution_final.pdf", + width=6, + height=12) + +print(densityplot(~active_days|QTR, data=d_3, + panel=function(x,...){ + panel.densityplot(x,...) + lower.v <- quantile(x,probs=0.25) + median.v <- median(x) + upper.v <- quantile(x,probs=0.75) + mean.v <- mean(x) + + panel.abline(v=median.v, col.line="#f58220", lwd=2) # 50 percentile + # panel.abline(v=lower.v, col.line="#f58220") # 25 percentile + # panel.abline(v=upper.v, col.line="#f58220") # 75 percentile + panel.abline(v=mean.v, col.line="#f58220", lty=2, lwd=2) # mean + + dens <- density(x) + min.v <- min(dens$x) + max.v <- max(dens$x) + panel.polygon(c(upper.v, dens$x[dens$x > upper.v & dens$x < max.v], max.v), + c(0, dens$y[dens$x> upper.v & dens$x < max.v], 0), + col="#f58220") + panel.polygon(c(min.v, dens$x[dens$x > min.v & dens$x < lower.v], lower.v), + c(0, dens$y[dens$x > min.v & dens$x < lower.v], 0), + col="#f58220") + }, + scales=list(x=list(at=c(1,2,3,4,5,6,7,8,9,10,20,30))), + xlim=range(d_3$active_days), # reversing the order of x-axis values + xlab="Active days per month", + ylab="Density", + # aspect=0.1, + layout=c(1,17), + index.cond=list(17:1), + strip=strip.custom(bg="#003d71", par.strip.text=list(font=1,col="white")), + col="#413f3f", + plot.points=FALSE)) + +dev.off() + +round(nrow(d_3)/nrow(data),digits=2) # contribution of these 7 domains to all_domain data is 34% + + +# d_4: for each domain, exclude users that were inactive for the first 80% of the N months, +# namely to exclude users whose month_start > 08*calendar_month_index + +# for each domain, generate continuous id for each unique calendar month +d_4 = select(data, user_pk, domain_numeric, calendar_month, active_days, QTR) +d_4 = arrange(d_4, domain_numeric, calendar_month) +d_4 = + d_4 %>% + group_by(domain_numeric) %>% + mutate(monthCode = rep(seq(n_distinct(calendar_month + )),as.numeric(table(calendar_month)))) + +d_4 = arrange(d_4, domain_numeric, user_pk) + +tot_active_months = + d_4 %>% + group_by(domain_numeric) %>% + summarise(monthMax = max(monthCode)) + + +tot_active_months$month80 = round(tot_active_months$monthMax*0.8) + +# for each user, left join first user-month with month80 +d_4 = arrange(d_4, domain_numeric, user_pk, calendar_month) +d_4_first = d_4 %>% + group_by(domain_numeric, user_pk) %>% + filter(calendar_month == min(calendar_month)) + + +d_4_first_join = left_join(d_4_first, tot_active_months) +# compare the first user-month with the 80%-month per domain +d_4_first_join$later_than_80 = ifelse(d_4_first_join$monthCode >= d_4_first_join$month80, "yes", "no") + +# if the first user-month is later than the 80% cutoff, drop it out +d_4_user_subset = filter(d_4_first_join, later_than_80 == "no")$user_pk + +# extract data for the subset of user +d_4_final = select(filter(data, user_pk %in% d_4_user_subset), + user_pk, domain_numeric, calendar_month, active_days, QTR) + + +# check out which domains are dropped in the process +dropped = + as.numeric( + unique(d_4$domain_numeric) %in% d_4_final$domain_numeric) +dropped_dm = + unique(d_4$domain_numeric)[which(dropped == 0)] + + +# print out the table of user distribution per domain (see if there are skyrocketing domains) +d_4_final_user = + d_4_final %.% + group_by(domain_numeric) %.% + summarise(numUser = n_distinct(user_pk)) + +# barplot users per domain +d_4_final_user = arrange(d_4_final_user, -numUser) +barplot(d_4_final_user$numUser) + +d_4_final_user$userBin = ifelse(d_4_final_user$numUser < 5, "1", + ifelse(d_4_final_user$numUser >= 5 & d_4_final_user$numUser < 25, "2", + ifelse(d_4_final_user$numUser >= 25 & d_4_final_user$numUser < 50, "3", + ifelse(d_4_final_user$numUser >= 50 & d_4_final_user$numUser < 100, "4", "5" + )))) + +barplot(as.numeric(d_4_final_user$userBin)) +table(as.numeric(d_4_final_user$userBin)) + +d_4_final = left_join(d_4_final, d_4_final_user) +d_4_final$QTR = factor(d_4_final$QTR) + +print(densityplot(~active_days|QTR, data=filter(d_4_final, userBin == 3), + panel=function(x,...){ + panel.densityplot(x,...) + lower.v <- quantile(x,probs=0.25) + median.v <- median(x) + upper.v <- quantile(x,probs=0.75) + mean.v <- mean(x) + + panel.abline(v=median.v, col.line="#f58220", lwd=2) # 50 percentile + # panel.abline(v=lower.v, col.line="#f58220") # 25 percentile + # panel.abline(v=upper.v, col.line="#f58220") # 75 percentile + panel.abline(v=mean.v, col.line="#f58220", lty=2, lwd=2) # mean + + dens <- density(x) + min.v <- min(dens$x) + max.v <- max(dens$x) + panel.polygon(c(upper.v, dens$x[dens$x > upper.v & dens$x < max.v], max.v), + c(0, dens$y[dens$x> upper.v & dens$x < max.v], 0), + col="#f58220") + panel.polygon(c(min.v, dens$x[dens$x > min.v & dens$x < lower.v], lower.v), + c(0, dens$y[dens$x > min.v & dens$x < lower.v], 0), + col="#f58220") + }, + scales=list(x=list(at=c(1,2,3,4,5,6,7,8,9,10,20,30))), + between=list(y=0.5), # introduce a gap between rows of vertical panels + par.strip.text=list(cex=0.7), # size down the text in the strip above each panel + par.settings=list(axis.text=list(cex=0.7)), # control size of the text on tick lablels + xlim=range(d_4_final$active_days), # reversing the order of x-axis values + xlab="Active days per month", + ylab="Density", + + # custom arrangement of panel layout + aspect=1, + layout=c(1,17), # single column, 17 rows, one for each quarter + index.cond=list(17:1), # panels should be ordered from the top to the bottom + strip=strip.custom(bg="#003d71", par.strip.text=list(font=1,col="white")), + col="#413f3f", + plot.points=FALSE)) + + + + + +# summary data table as a supplement +p_data = group_by(p_data, QTR) +viz_sum_table = summarise(p_data, round(mean(active_days)), round(median(active_days)), round(quantile(active_days,probs=0.25)), round(quantile(active_days,probs=0.75))) + +# create a dimagi theme + + + +# graph 2: subset data from 10 biggest domains (in terms of user-months) +tum <- tally(group_by(data,domain_numeric)) # for each domain, count total user-month +colnames(tum) <- c("domain_numeric","user_months") # renaming variables +top_10_dm <- tum %>% + arrange(desc(user_months)) %>% + top_n(10) # identify the qualifying 10 domains +top_10 <- filter(data, domain_numeric %in% top_10_dm$domain_numeric) # subset data +top_10$calendar_month <- as.Date(top_10$calendar_month) + +# generate a sequence of active months on each domain +top_10 = top_10 %>% + group_by(domain_numeric,user_pk) %>% + mutate(month_seq_by_user = seq(n())) # for each user this calculates the total active month + +# for each user on each domain, categorize active month by quarter (not calendar quarter) +top_10$YR = ifelse(top_10$month_seq_by_user <= 12, "Y1","Y2") +top_10_Y1 = filter(top_10, YR == "Y1") +top_10_Y1$QR = sapply(top_10_Y1$month_seq_by_user, function(x){ + if(x <= 3) return("Q1") + if(x > 3 & x <= 6) return("Q2") + if(x > 6 & x <= 9) return("Q3") + else return("Q4") +}) + + +# Distribution graph 2 (alternative: use sm.density.compare) +d1 = densityplot(~active_days, + data=top_10_Y1, + groups=QR, + xlab="Active days per month", + main="Top 10 domains", + from=0,to=28, + auto.key=TRUE, + plot.points=FALSE) + + +d2 = densityplot(~active_days|QR, data=top_10_Y1, + panel=function(x,...){ + panel.densityplot(x,...) + lowerp.values <- quantile(x,probs=0.25) + median.values <- median(x) + upperp.values <- quantile(x,probs=0.75) + mean.values <- mean(x) + panel.abline(v=median.values, col.line="red") + panel.abline(v=lowerp.values, col.line="yellow") + panel.abline(v=upperp.values, col.line="green") + panel.abline(v=mean.values, col.line="pink", lty=2) + }, + xlim=range(top_10_Y1$active_days), # reversing the order of x-axis values + layout=c(1,4), + index.cond=list(4:1), + plot.points=FALSE) + +# create plots separately for each domain + +dm_list = split(top_10_Y1, top_10_Y1$domain_numeric) +dm_1 = densityplot(~active_days|QR, data=dm_list[[1]], + panel=function(x,...){ + panel.densityplot(x,...) + lowerp.values <- quantile(x,probs=0.25) + median.values <- median(x) + upperp.values <- quantile(x,probs=0.75) + mean.values <- mean(x) + panel.abline(v=median.values, col.line="red") + panel.abline(v=lowerp.values, col.line="yellow") + panel.abline(v=upperp.values, col.line="green") + panel.abline(v=mean.values, col.line="pink", lty=2) + }, + xlim=range(dm_list[[1]]$active_days), # reversing the order of x-axis values + layout=c(1,4), + index.cond=list(4:1), + plot.points=FALSE) + +densPlot = function(dm_data){ + densityplot(~active_days|QR, data=dm_data, + panel=function(x,...){ + panel.densityplot(x,...) + lowerp.values <- quantile(x,probs=0.25) + median.values <- median(x) + upperp.values <- quantile(x,probs=0.75) + mean.values <- mean(x) + panel.abline(v=median.values, col.line="red") + panel.abline(v=lowerp.values, col.line="yellow") + panel.abline(v=upperp.values, col.line="green") + panel.abline(v=mean.values, col.line="pink", lty=2) + }, + xlim=range(dm_data$active_days), # reversing the order of x-axis values + layout=c(1,4), + index.cond=list(4:1), + plot.points=FALSE) +} + +plots_separate = lapply(names(dm_list), function(x) densPlot(dm_list[[x]])) +png("density_plot_per_domain.png") +do.call(grid.arrange, plots_separate) +dev.off() + + + + +toList = quantile(top_10_Y1$active_days, probs=c(0,0.25,0.5,0.75,1)) + +lattice.theme <- trellis.par.get() +col <- lattice.theme$superpose.symbol$col + +plots = lapply(1:4,function(.x){ + densityplot(~active_days, + data=top_10_Y1, + group=QR, + from=toList[.x], + to=toList[.x + 1], + plot.points=FALSE, + xlab="Active days per month", + key=list(text=levels(top_10_Y1$QR), + lines=TRUE), + main=paste("Percentile", names(toList)[.x], "to", names(toList[.x+1]))) +}) + +do.call(grid.arrange,plots) + + + +# adding summary stats to the graph that calculates and plots quantiles +q1 = ddply(top_10_Y1, .(QR), function(x){ + x1 = quantile(x$active_days, .25) + y1 = density(x$active_days, from=x1, to=x1, n=1)$y # this calculates the density at a single point + df.dens.x1 = data.frame(x=x1,y=y1) + return(df.dens.x1) +}) # this returns the 25 percentile for each quarter + +q2 = ddply(top_10_Y1, .(QR), function(x){ + + x1 = quantile(x$active_days, .5) + y1 = density(x$active_days, from=x1, to=x1, n=1)$y # this calculates the density at a single point + df.dens.x1 = data.frame(x=x1,y=y1) + return(df.dens.x1) +}) + +q3 = ddply(top_10_Y1, .(QR), function(x){ + x1 = quantile(x$active_days, .75) + y1 = density(x$active_days, from=x1, to=x1, n=1)$y # this calculates the density at a single point + df.dens.x1 = data.frame(x=x1,y=y1) + return(df.dens.x1) +}) + +q = rbind(q1,q2,q3) # quantile data for geom plot +p2 = p1 + + geom_point(data=q, aes(x=x,y=y,group=factor(QR),shape=factor(QR),color=factor(QR),size=6)) + +ggsave("top_10_domains.pdf",p2,height=8,width=16) + + +# data check + # is every domain contributing to every quarter? +top_10_Y1 %>% + group_by(domain_numeric) %>% + n_distinct(QR) + +# graph 3: grouped by 3/6/12/more months## + +# for each user, add a sequence of active months +data2 = data %>% + group_by(user_pk) %>% + mutate(nmonths = seq(n())) + +# for each group, compute the distribution of active day percent +data_adp2 = tbl_df(as.data.frame(with(data2, table(cut(nmonths, breaks=c(1,3,6,12,50)), adp)))) +data_adp2 = arrange(data_adp2, Var1) # Var1 is the resulting category +data_adp2 = data_adp2 %>% + group_by(Var1) %>% + mutate(dist_adp = Freq/n()) + +p_adp3 = ggplot(data_adp2,aes(x=adp,y=dist_adp,group=Var1))+ + geom_line(aes(linetype=Var1,color=Var1))+ + xlab("percent of active days")+ + ylab("distribution")+ + theme( + panel.background = element_blank() + ) +ggsave("p_adp3.pdf",p_adp3,width=16,height=8) + +# Graph 4: same with graph 3, only on users who are active for at least 15 of their first 18 months +user_nmonths = tally(group_by(data2, user_pk)) # this returns total active months +user_candidate = select(filter(user_nmonths, n >= 18), user_pk) # only users with at least 18 consecutive active months will be kept +data_adp3 = filter(data2, user_pk %in% user_candidate$user_pk) + +adp3_sub_test = data_adp3 %>% + group_by(user_pk) %>% + summarise(seq_15 = length(row.names(table(seq(15)%in%month_index)))) # if a user is active for 15 consecutive months, row.names would have a length of 1, otherwise 2(TRUE,FALSE) +adp3_users = select(filter(adp3_sub_test, seq_15 == 1),user_pk) # this returns all qualified users +data_adp3 = filter(data_adp3, user_pk %in% adp3_users$user_pk) + +data_adp3 = tbl_df(as.data.frame(with(data_adp3, table(cut(nmonths, breaks=c(1,3,6,12,50)), adp)))) +data_adp3 = arrange(data_adp3, Var1) # Var1 is the resulting category +data_adp3 = data_adp3 %>% + group_by(Var1) %>% + mutate(dist_adp = round(100*(Freq/sum(Freq)))) +data_adp4 = filter(data_adp3, dist_adp > 0) + +p_adp4 = ggplot(data_adp4,aes(x=adp,y=dist_adp,group=Var1))+ + geom_line(aes(linetype=Var1,color=Var1))+ + xlab("percent of active days")+ + ylab("distribution")+ + theme( + panel.background = element_blank() + )+ + ggtitle("422 users who are active for at least 15 of their first 18 months are included") + +ggsave("p_adp4.pdf",p_adp4,width=16,height=8) + diff --git a/visualizations/hq_viz.R b/visualizations/hq_viz.R new file mode 100644 index 0000000..36e1275 --- /dev/null +++ b/visualizations/hq_viz.R @@ -0,0 +1,391 @@ +#################### +##HQ Visualization## +#################### + +library(ggplot2) +library(scales) +library(plyr) +library(dplyr) +library(grDevices) +library(grid) + +data <- tbl_df(read.csv("data.csv", stringsAsFactors = FALSE)) +data$calendar_month <- as.Date(data$calendar_month,format="%m/%d/%Y") +## manually delete a user: user_pk == 6421 +data = filter(data, user_pk != 6421) + +# for each domain, count total unique users +tuu <- data %>% + group_by(domain_numeric) %>% + summarise(tot_unique_users = n_distinct(user_pk)) +tuu = arrange(tuu, tot_unique_users) +# rank domains by total number of users +tuu$dm_rank_tuu = seq(nrow(tuu)) + +# for each domain each month, count blocks of users + # blocks: (0,25],(25,50]... + # expected result: in a domain this month, there would be N blocks of users +bou = data %>% + group_by(domain_numeric, calendar_month) %>% + summarise(user_size = n_distinct(user_pk)) + +bou = tbl_df(cbind(bou,findInterval(bou$user_size, c(25,50,75,100,125,150,175,200,225,250,275,300,325)))) +colnames(bou)[4] = c("user_block") +bou$user_block = bou$user_block + 1 +rep_domain = rle(bou$domain_numeric)$lengths +bou$dm_index = rep(seq(length(unique(bou$domain_numeric))),rep_domain) # adding a new index to correct for the random domain_numeric (hitting 7600s) + +bou = arrange(bou, dm_index, calendar_month) +p_bou = ggplot(bou, aes(x=calendar_month,y=dm_index,fill=factor(dm_index)))+ + geom_tile(aes(height=user_block))+ + scale_fill_manual(values=c(rep(c("blue","yellow"),times=153))) +# customize y-axis scale to be same with the height of geom tiles + + +# for each domain, count total unique months +tum <- data %>% + group_by(domain_numeric) %>% + summarise(tot_unique_months = n_distinct(calendar_month)) +tum <- arrange(tum, tot_unique_months) +tum$dm_rank_tum <- seq(nrow(tum)) + +# cbind by domain_numeric +tuu <- arrange(tuu, domain_numeric) +tum <- arrange(tum, domain_numeric) +cbinded <- tbl_df(merge(tuu, tum)) + +# add dm_rank_tuu, dm_rank_tum, user_bin back to monthly aggregate data +rep_time <- as.data.frame(table(data$domain_numeric))$Freq +cbinded_rep <- cbinded[rep(row.names(cbinded),rep_time),1:6] +data <- arrange(data, domain_numeric) +hq_viz_data <- tbl_df(cbind(data, cbinded_rep[,2:6])) + +# rank domains by total unique users (ascending) +hvd <- arrange(hq_viz_data, dm_rank_tuu, user_pk, calendar_month) +rep_time_user <- rle(hvd$user_pk)$lengths +hvd$user_rank_1 <- rep(seq(length(unique(hvd$user_pk))), rep_time_user) + + +# SETUP PARAMS +# title text style +title.style <- element_text(family="Arial", face="bold", color="black", size=16) +# axis text style +label.style <- element_text(family="Arial Narrow", color="grey50", size=8, angle=90) +# panel setup +horizontal_theme <- theme( + panel.background = element_blank() , + panel.grid.major.x = element_blank() , + panel.grid.minor = element_blank() , + axis.text.y = element_blank(), + axis.title.y = element_blank(), + axis.ticks.y = element_blank(), + axis.line = element_line( size=.2 , colour="#666666" ), + legend.position = "none", # turn off legends + plot.margin = unit( c(.2,.25,.25,.5) , "in") +) +non_horizontal_theme <- theme( + panel.background = element_blank() , + # panel.grid.major.x = element_blank() , + panel.grid.minor.y = element_blank() , + axis.line = element_line( size=.2 , colour="#666666" ), + legend.position = "none", # turn off legends + plot.margin = unit( c(.5,.25,.25,.5) , "in") +) + +# horizontal chart 1: each line represents one user. lines are grouped by domains +monthStart <- min(hvd$calendar_month) +monthEnd <- max(hvd$calendar_month) +p_hvd = ggplot(hvd, aes(x=user_rank_1,y=calendar_month)) + + geom_tile(aes(fill=factor(dm_rank_tuu))) + + scale_fill_manual(values=rep(c("darkgoldenrod1","azure3"),times=26354)) + + scale_y_date(limits=c(monthStart,monthEnd),breaks=date_breaks(width="1 month"),labels=date_format("%m/%y")) + + theme( + panel.background = element_blank() , + panel.grid.major.x = element_blank() , + panel.grid.minor.y = element_blank() , + axis.text.y = element_blank(), + axis.text.x = element_text(size = 10, angle = 90), + axis.title.y = element_blank(), + axis.ticks.y = element_blank(), + axis.line = element_line( size=.2 , colour="#666666" ), + legend.position = "none", # turn off legends + plot.margin = unit( c(.2,.25,.25,.5) , "in"), + aspect.ratio = 1 + ) + + ggtitle(expression(atop(italic("Domain activity over time"), + atop("Each line represents one user", + atop("Two colors are recycled to distinguish between 264 domains", ""))))) + + ylab("calendar month") + + coord_flip() + +# horizontal chart 1.2: one line per domain, domain(geom tile) height is proportional to its user size +hvd2 = filter(hvd, domain_numeric < 310) +hvd2 = arrange(hvd2, user_bin, dm_rank_tuu) +p_hvd_1.1 = ggplot(hvd2, aes(x=dm_rank_tuu,y=calendar_month,fill=factor(user_bin)))+ + geom_tile(aes(height=user_bin*5))+ + scale_y_date(limits=c(monthStart,monthEnd),breaks=date_breaks(width="1 month"),labels=date_format("%m/%y")) + + theme( + panel.background = element_blank() , + panel.grid.major.y = element_blank() , + panel.grid.minor.x = element_blank() , + axis.text.x = element_blank(), + axis.text.y = element_text(size = 10, angle = 0), + axis.title.x = element_blank(), + axis.title.y = element_blank(), + axis.ticks.x = element_blank(), + axis.line = element_line( size=.2 , colour="#666666" ), + legend.position = "none", # turn off legends + plot.margin = unit( c(.2,.25,.25,.5) , "in"), + aspect.ratio = 1 + ) + +ggsave("hq_1.1.pdf", p_hvd_1.1, width=12, height=12) +# horizontal chart 2: for each domain, plot a horizontal line visualizing their total active days +# for each domain each month, calculate total active days +hvd_tad = hvd %>% + group_by(dm_rank_tuu, calendar_month) %>% + summarise(tot_active_days = sum(active_days)) + +p_hvd_tad = ggplot(hvd_tad, aes(x=dm_rank_tuu,y=calendar_month)) + + geom_tile(aes(fill=factor(dm_rank_tuu),alpha=log(tot_active_days))) + + scale_fill_manual(values=rep(c("darkgoldenrod1","azure3"),times=132)) + + scale_alpha(range=c(0.2,1))+ + theme( + panel.background = element_blank() , + panel.grid.major.x = element_blank() , + panel.grid.minor.y = element_blank() , + axis.text.y = element_blank(), + axis.text.x = element_text(size = 10, angle = 90), + axis.title.y = element_blank(), + axis.ticks.y = element_blank(), + axis.line = element_line( size=.2 , colour="#666666" ), + legend.position = "none", # turn off legends + plot.margin = unit( c(.2,.25,.25,.5) , "in"), + aspect.ratio = 1 + ) + + scale_y_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + + ggtitle(expression(atop(italic("Domain activity: total active days per month"), + atop("Each line = one domain", + atop("Darker colors = more active days", ""))))) + + ylab("calendar month") + + coord_flip() + +# horizontal chart 3: for each domain, plot difference in total active days between this and previous month +arrange(hvd_tad, dm_rank_tuu, calendar_month) +hvd_tad = mutate(hvd_tad, prev = lag(tot_active_days)) +hvd_tad = mutate(hvd_tad, tad_diff=ifelse(is.na(prev), "NA", tot_active_days-prev)) +hvd_tad = mutate(hvd_tad, tad_diff_cat=ifelse(tad_diff < 0, 0, 1)) # assign val=1 if one domain is more active this month than last month in terms of days +hvd_tad$tad_diff_cat[which(hvd_tad$tad_diff == "NA")] = NA + +# deprecated +p_hvd_tdc = ggplot(hvd_tad, aes(x=dm_rank_tuu,y=calendar_month)) + + geom_tile(aes(fill=factor(dm_rank_tuu),alpha=factor(tad_diff_cat))) + + scale_fill_manual(values=rep(c("darkgoldenrod1","azure3"),times=132)) + + scale_alpha_discrete(range=c(0.2,1))+ + theme( + panel.background = element_blank() , + panel.grid.major.x = element_blank() , + panel.grid.minor.y = element_blank() , + axis.text.y = element_blank(), + axis.text.x = element_text(size = 10, angle = 90), + axis.title.y = element_blank(), + axis.ticks.y = element_blank(), + axis.line = element_line( size=.2 , colour="#666666" ), + legend.position = "none", # turn off legends + plot.margin = unit( c(.2,.25,.25,.5) , "in"), + aspect.ratio = 1 + ) + + scale_y_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + + ggtitle(expression(atop(italic("Domain activity: changes in total active days over time"), + atop("Each line = one domain (264 domains in total)", + atop("Current vs. Previous Month: Darker color represents a growth in active days", ""))))) + + ylab("calendar month") + + coord_flip() + +# counting total positive values of tad_diff_cat and order domains by count +dm_rank_tp = hvd_tad %>% + group_by(dm_rank_tuu) %>% + summarise(tot_pos = sum(tad_diff_cat[which(tad_diff_cat == 1)])) + +dm_rank_tp = arrange(dm_rank_tp, tot_pos) +dm_rank_tp$dm_rank_pos_prev = seq(nrow(dm_rank_tp)) + +# add the new ranking back to monthly data +rep_time_2 = as.data.frame(table(hvd_tad$dm_rank_tuu))$Freq +dm_rank_tp = arrange(dm_rank_tp, dm_rank_tuu) +dm_rank_tp_rep = dm_rank_tp[rep(row.names(dm_rank_tp),rep_time_2),1:3] +hvd_tad = arrange(hvd_tad, dm_rank_tuu) +hvd_tad <- tbl_df(cbind(hvd_tad, dm_rank_tp_rep[,2:3])) + +hvd_tad = arrange(hvd_tad, dm_rank_pos_prev, calendar_month) + +# colors()[grep("azure",colors())] # returning all color names containing 'azure' + +p_hvd_tad_2 = ggplot(hvd_tad, aes(x=dm_rank_pos_prev,y=calendar_month)) + + geom_tile(aes(fill=factor(dm_rank_pos_prev),alpha=factor(tad_diff_cat))) + + scale_fill_manual(values=rep(c("darkgoldenrod1","azure3"),times=132)) + + scale_alpha_discrete(range=c(0.3,1))+ + theme( + panel.background = element_blank() , + panel.grid.major.x = element_blank() , + panel.grid.minor.y = element_blank() , + axis.text.y = element_blank(), + axis.text.x = element_text(size = 10, angle = 90), + axis.title.y = element_blank(), + axis.ticks.y = element_blank(), + axis.line = element_line( size=.2 , colour="#666666" ), + legend.position = "none", # turn off legends + plot.margin = unit( c(.2,.25,.25,.5) , "in"), + aspect.ratio = 1 + ) + + scale_y_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + + ylab("calendar month") + + ggtitle(expression(atop(italic("Domain activity: changes in total active days over time"), + atop("Current vs. Previous Month: Darker color represents an increase in active days", "")))) + + coord_flip() + +ggsave("1.4_horizontal_domain_active_or_not.pdf", p_hvd, path=mypath, width=12, height=12, limitsize = FALSE, useDingbats=FALSE) +ggsave("1.3_horizontal_domain_total_active_days_by_domain.pdf", p_hvd_tad, path=mypath, width=12, height=12, limitsize = FALSE) +ggsave("1.3_horizontal_domain_change_in_total_active_days_by_domain.pdf", p_hvd_tad_2, path=mypath, width=12, height=12, limitsize = FALSE) + + +################# +# NON-HORIZONTAL# +################# +# for each domain each month, counts total unique users +tau <- data %>% + group_by(domain_numeric, calendar_month) %>% + summarise_each(funs(n_distinct), user_pk) + +p_tau <- ggplot(tau, aes(x=calendar_month, y=user_pk,group=factor(domain_numeric))) + + geom_line(aes(colour=factor(domain_numeric),size=0.5)) + + scale_x_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + + scale_y_discrete("user_pk", breaks = c(0,50,100,200,300,400,500)) + + labs(x="calendar month", y="total users") + + ggtitle(expression(atop("Total active users on each domain", + atop(italic("Aesthetic mapping: color - domain"),"")))) + + non_horizontal_theme + +# for each domain in each month, calculate the change in number of users from the previous month (user change) +uc <- data %>% + group_by(domain_numeric, calendar_month) %>% + summarise(tot_active_users = n_distinct(user_pk)) %>% + mutate(change = tot_active_users - lag(tot_active_users)) # to be rewritten with tally() function + +# bar chart +p_uc_bar <- ggplot(uc, aes(x=calendar_month)) + + geom_bar(aes(y=change, fill=factor(domain_numeric)),stat="identity") + + scale_x_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + + labs(x="calendar month", y="change compared to previous month") + + ggtitle(expression(atop("Changes in active users on each domain", + atop(italic("Aesthetic mapping: color - domain"),"")))) + + non_horizontal_theme + + theme(axis.text.x=element_text(angle=90))+ + theme(title=title.style) + +# for each domain, each month of a year, count total active days (of each user) and sort in descending order +tad <- data %>% + group_by(domain_numeric, calendar_month) %>% + summarise(tot_active_days = sum(active_days)) + +# TO BE UPDATED: Y-AXIS & LEGENDS +p_tad <- ggplot(tad, aes(x=calendar_month, y=tot_active_days,group=factor(domain_numeric))) + + geom_line(aes(colour=factor(domain_numeric), size=1)) + + scale_x_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + + scale_y_discrete("tot_active_days", breaks = c(500,1000,1500,2000,2500,3000,3500,4000,4500)) + + labs(list(x="calendar month", y="total active days")) + + ggtitle(expression(atop("Total active days on each domain", + atop(italic("Aesthetic mapping: color - domain"),"")))) + + non_horizontal_theme + + theme(title=title.style) + + theme(axis.text.x=element_text(angle=90)) + + +# limit data submitted to submissions from 2012-10-31 to 2014-10-31 for all visuals +tau_sub <- filter(tau, calendar_month >= as.Date("2012-10-31") & calendar_month <= as.Date("2014-10-31")) +tad_sub <- filter(tad, calendar_month >= as.Date("2012-10-31") & calendar_month <= as.Date("2014-10-31")) + +# for 10 skyrocketing months in y-axis variables, identity their domain_numeric +tau_skyrockets <- arrange(tau_sub, desc(user_pk))[1:10,] +tad_skyrockets <- arrange(tad_sub, desc(tot_active_days))[1:10,] + +# bar chart viz: total active users +tau_sub <- arrange(tau_sub, calendar_month, user_pk, domain_numeric) +p_tau_sub <- ggplot(tau_sub, aes(x=calendar_month)) + + geom_bar(aes(y=(user_pk), fill=factor(domain_numeric)), # need to add color scale to distinguish adjacent domains with same color fill + stat="identity", + width=20) + + scale_fill_manual(values=rep(brewer.pal(10,"Set3"),times=26)) + + scale_x_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + + non_horizontal_theme + + labs(x="calendar month", + y="total active users") + + ggtitle(expression(atop("Total active users on each domain in last two years", + atop(italic("Aesthetic mapping: color - domain, size - total users",""))))) + + theme(title=title.style) + + +# bar chart viz: total active days +tad_sub <- arrange(tad_sub, calendar_month, tot_active_days, domain_numeric) +p_tad_sub <- ggplot(tad_sub, aes(x=calendar_month)) + + geom_bar(aes(y=(tot_active_days), fill=factor(domain_numeric)), # need to add color scale to distinguish adjacent domains with same color fill + stat="identity", + width=20) + + scale_fill_manual(values=rep(brewer.pal(10,"Set3"),times=26)) + + scale_x_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + + horizontal_theme + + labs(x="calendar month", + y="total active days") + + ggtitle(expression(atop("Total active days on each domain in last two years", + atop(italic("Aesthetic mapping: color - domain, size - active days",""))))) + + theme(title=title.style) + + +# log scale y-axis variables +bu_sub <- data[,c("domain_numeric","business_unit")] +# find unique domain_numeric +bu_sub <- bu_sub[!duplicated(bu_sub$domain_numeric),] +tad_sub_2 <- inner_join(tad_sub, bu_sub, by = "domain_numeric") +tad_sub_2$log_tot_active_days <- round(log2(tad_sub_2$tot_active_days),1) +tad_sub_2 <- arrange(tad_sub_2, domain_numeric, calendar_month) +tad_sub_2$bu <- ifelse(tad_sub_2$business_unit == "DSI", "1", "0") +tad_sub_2[which(is.na(tad_sub_2$bu)),]$bu = 0 + +p_tad_sub_2 <- ggplot(tad_sub_2, aes(x=calendar_month, y=log_tot_active_days, group=factor(domain_numeric))) + + geom_line(aes(color=factor(bu),size=factor(bu))) + # need to add color scale to distinguish adjacent domains with same color fill + scale_size_manual(values=c(0.05, 1.2)) + + scale_x_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + + non_horizontal_theme + + labs(x="calendar month", + y="total active days" + ) + + ggtitle(expression(atop("Total active days on each domain in last two years (log scale)", + atop(italic("Thick blue: DSI domains, Thin Pink: Non-DSI domains"),"")))) + + theme(title=title.style) + + +# path to save plot to (each version should be stored in different sub-folders) +mypath=file.path("D://Dropbox//Dropbox (Dimagi)//data tests", "motion_chart", "1.1") + +ggsave("1.2_domain_total_active_days_log_scale.pdf", p_tad_sub_2, path=mypath, width=12, height=8) +ggsave("1.1_domain_total_active_days_bar_chart.pdf", p_tad_sub, path=mypath, scale=2) +ggsave("1.2_domain_total_active_users.pdf", p_tau, path=mypath, width=12, height=8) +ggsave("1.2_domain_active_user_change.pdf", p_uc_bar, path=mypath, width=12, height=8) +ggsave("1.2_domain_total_active_days.pdf", p_tad, path=mypath, width=12, height=8) + +# polar histogram + +p_hvd_tad_3 = ggplot(hvd_tad, aes(x=dm_rank_pos_prev,y=calendar_month)) + + geom_tile(aes(fill=factor(dm_rank_pos_prev),alpha=factor(tad_diff_cat))) + + scale_fill_manual(values=rep(c("darkgoldenrod1","azure3"),times=132)) + + scale_alpha_discrete(range=c(0.3,1))+ + horizontal_theme + + theme(title=title.style, + aspect.ratio=1) + + scale_y_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + + ggtitle(expression(atop(italic("Domain activity: changes in total active days over time"), + atop("Current vs. Previous Month: Darker color represents an increase in active days", "")))) + + coord_flip() + + coord_polar() + +ggsave("1.2_domain_change_active_days_polar.pdf", p_hvd_tad_3, path=mypath, scale=2) \ No newline at end of file diff --git a/visualizations/hq_viz_data_table.R b/visualizations/hq_viz_data_table.R new file mode 100644 index 0000000..aa86f42 --- /dev/null +++ b/visualizations/hq_viz_data_table.R @@ -0,0 +1,4 @@ +################################## +# data table accompanying hq viz # +################################## + diff --git a/visualizations/hq_viz_levelplot.R b/visualizations/hq_viz_levelplot.R new file mode 100644 index 0000000..098878b --- /dev/null +++ b/visualizations/hq_viz_levelplot.R @@ -0,0 +1,37 @@ +######################################## +# using levelplot for hq visualization # +######################################## + +library(lattice) +library(zoo) +# create a matrix using domain_numeric, calendar_month +ldata = select(data, domain_numeric, calendar_month) + +lmatrix = matrix(0, + nrow=length(unique(ldata$domain_numeric)), + ncol=length(unique(ldata$calendar_month))) # create an empty matrix +rownames(lmatrix) = paste("domain_", unique(ldata$domain_numeric), sep="") +colnames(lmatrix) = as.character(sort(unique(ldata$calendar_month))) + +# split ldata into a list of data frames by domain_numeric +ldata = split(ldata, ldata$domain_numeric) + +# assign values to each cell in the matrix +for (i in seq(nrow(lmatrix))) { + lmatrix[i,] = as.numeric(as.Date(colnames(lmatrix)) %in% + unique(ldata[[i]]$calendar_month)) +} + +# replace 0 with NA +lmatrix[lmatrix == 0] <- NA + +levelplot(t(lmatrix), + colorkey = FALSE, + scales = list(x = list(rot = 90)), + panel = function(x, y, z,...) { + panel.abline(v = unique(as.numeric(x)), + h = unique(as.numeric(y)), + col = "darkgrey") + panel.xyplot(x, y, pch = 16 * z, ...) + }, + xlab = "calendar month")