diff --git a/aggregate_tables.R b/aggregate_tables.R
index bed2d0c..ea593f1 100644
--- a/aggregate_tables.R
+++ b/aggregate_tables.R
@@ -1,6 +1,7 @@
 library(dplyr)
 library(DBI)
 library(rjson)
+library(parallel)
 
 source('s_dplyr.R')
 source('aggregate_tables/indicator_functions.R')
@@ -17,6 +18,11 @@ get_db_connection <- function(system_config_path='config_system.json') {
     return(db)
 }
 
+get_cores <- function(system_config_path='config_system.json') {
+  config <- fromJSON(file=system_config_path)[['data_platform']]
+  if ('parallel' %in% names(config)) {return(config[['parallel']][['cores']])} else {return (1)}
+}
+
 drop_tables <- function(file) {
   config <- fromJSON(file=file)
   
@@ -32,26 +38,38 @@ write_tables <- function(file, debug) {
     db <- get_db_connection()
     for (table.info in config) {
         print(paste('Computing indicators for ', table.info$table, 'indicator table.'))
-        df <- compute_indicators(table.info, db, debug)
+        df <- compute_indicators(table.info, debug)
         print(paste('Writing ', table.info$table, 'indicator table.'))
         dbRemoveTable(db$con, name=table.info$table)
         copy_to(db, df=df, name=table.info$table, temporary=FALSE)
     }
 }
 
-compute_indicators <- function(info, db, debug) {
+compute_indicators <- function(info, debug) {
   debug <- as.logical(debug)
   if (debug == T) {limit = 5000} else {limit = -1}
-    dfs <- lapply(info$components, function(component) {
+  cores <- get_cores()
+  cl <- makeCluster(cores, outfile='/tmp/aggregation.log', type="FORK")
+  clusterExport(cl,varlist=c('info','limit'),envir=environment())
+  clusterExport(cl,varlist=c('get_data_source','s_group_by','aggregate','get_db_connection','fromJSON'))
+  
+  dfs <- parLapply(cl,info$components, function(component) {
         print(paste('Getting data source ', component$table))
+        source('s_dplyr.R')
+        source('aggregate_tables/indicator_functions.R')
+        source('data_sources.R')
+        db <- get_db_connection()
         source.data <- get_data_source(db, component$table, limit)
         group.by.str <- paste(info$by, collapse=', ')
         print(paste('Grouping and aggregating', component$table))
         df <- source.data %.% s_group_by(group.by.str) %.% aggregate(component$columns)
+        print(paste('Returning aggregated data from ', component$table))
         return(df)
     })
+  
     print('merging...')
     merged <- Reduce(function(...) merge(..., all.x=TRUE, all.y=TRUE, by=info$by), dfs)
+    stopCluster(cl)
     return(merged)
 }
 
diff --git a/aggregate_tables.json b/aggregate_tables.json
deleted file mode 100644
index be295b4..0000000
--- a/aggregate_tables.json
+++ /dev/null
@@ -1,117 +0,0 @@
-[
-   {
-      "table":"aggregate_lifetime_interactions",
-      "by":[
-         "domain",
-         "user_id",
-         "user_pk"
-      ],
-      "components":[
-         {
-            "table":"visit_detail",
-            "columns":{
-               "date_first_visit":"date_first_visit",
-               "date_last_visit":"date_last_visit",
-               "nvisits":"nvisits",
-               "calendar_month_on_cc":"calendar_month_on_cc",
-               "nforms":"nforms",
-               "median_visit_duration":"median_visit_duration",
-               "median_visits_per_day":"median_visits_per_day",
-               "median_visits_per_month":"median_visits_per_month",
-               "morning":"morning",
-               "afternoon":"afternoon",
-               "evening":"evening",
-               "night":"night",
-               "time_using_cc":"time_using_cc",
-               "nvisits_travel":"nvisits_travel",
-               "nvisits_travel_batch":"nvisits_travel_batch"
-            }
-         },
-         {
-            "table":"interactions",
-            "columns":{
-               "ninteractions":"ninteractions",
-               "ncases_registered":"ncases_registered",
-               "register_followup":"register_followup",
-               "case_register_followup_rate":"case_register_followup_rate",
-               "ncases_touched":"ncases_touched",
-               "nunique_followups":"nunique_followups"
-            }
-         },
-         {
-            "table":"device_type",
-            "columns":{
-               "summary_device_type":"summary_device_type",
-               "nforms": "nforms",
-               "active_days":"active_days",
-               "active_months":"active_months"
-
-            }
-         },
-         {
-            "table":"device_log_types_by_user",
-            "columns":{
-               "total_logs":"total_logs",
-               "audio_plays":"audio_plays",
-               "network_warnings":"network_warnings"
-            }
-         }
-      ]
-   },
-   {
-      "table":"aggregate_monthly_interactions",
-      "by":[
-         "domain",
-         "user_id",
-         "user_pk",
-         "month.index"
-      ],
-      "components":[
-         {
-            "table":"visit_detail",
-            "columns":{
-               "date_first_visit":"date_first_visit",
-               "date_last_visit":"date_last_visit",
-               "nvisits":"nvisits",
-               "median_visit_duration":"median_visit_duration",
-               "median_visits_per_day":"median_visits_per_day",
-               "morning":"morning",
-               "afternoon":"afternoon",
-               "evening":"evening",
-               "night":"night",
-               "numeric_index":"numeric_index",
-               "time_using_cc":"time_using_cc",
-               "nvisits_travel":"nvisits_travel",
-               "nvisits_travel_batch":"nvisits_travel_batch"
-            }
-         },
-         {
-            "table":"interactions",
-            "columns":{
-               "ninteractions":"ninteractions",
-               "ncases_registered":"ncases_registered",
-               "register_followup":"register_followup",
-               "case_register_followup_rate":"case_register_followup_rate",
-               "ncases_touched":"ncases_touched",
-               "nunique_followups":"nunique_followups"
-            }
-         },
-         {
-            "table":"device_type",
-            "columns":{
-               "summary_device_type":"summary_device_type",
-               "nforms": "nforms",
-               "active_days":"active_days"
-            }
-         },
-         {
-            "table":"device_log_types_by_user",
-            "columns":{
-               "total_logs":"total_logs",
-               "audio_plays":"audio_plays",
-               "network_warnings":"network_warnings"
-            }
-         }
-      ]
-   }
-]
\ No newline at end of file
diff --git a/aggregate_tables/indicator_functions.R b/aggregate_tables/indicator_functions.R
deleted file mode 100644
index b15edfe..0000000
--- a/aggregate_tables/indicator_functions.R
+++ /dev/null
@@ -1,92 +0,0 @@
-## TODO: I think it would be good to switch over to lubridate to do
-## all our date manipulations. We have some legacy code that uses
-## zoo.
-library(lubridate)
-library(zoo)
-
-# VISIT TABLE INDICATORS:
-date_first_visit <- function(x) min(x$visit_date, na.rm=TRUE)
-date_last_visit <- function(x) max(x$visit_date, na.rm=TRUE)
-nvisits <- function(x) NROW(x)
-
-get_visit_duration <- function(start, end) {
-  dur <- as.numeric(end - start, units="mins") 
-  if (is.na(dur)) return(NA)
-  if (dur < 0) return(0)
-  if (dur > 30) return(30)
-  return(dur)
-}
-durations <- function(x) na.omit(mapply(get_visit_duration, x$time_start, x$time_end))
-median_visit_duration <- function(x) round(as.numeric(median(durations(x))), digits = 1)
-time_using_cc <- function(x) sum(durations(x), na.rm = T)
-
-median_visits_per_day <- function(x) median(as.numeric(table(x$visit_date)), na.rm=TRUE)
-nvisits_travel <- function(x) sum(x$home_visit, na.rm=T)
-nvisits_travel_batch <- function(x) sum(x$time_since_previous_hv/60<10, na.rm = T)
-
-# Proportion of visits by time of day
-morning <- function(x) mean(x$visit_time == 'morning')*100
-afternoon <- function(x) mean(x$visit_time == 'afternoon')*100
-evening <- function(x) mean(x$visit_time == 'night')*100
-night <- function(x) mean(x$visit_time == 'after midnight')*100
-
-# User's first, second, third... etc. month on CC
-numeric_index <- function (x) {
-  first_possible_visit_date <- as.POSIXct(strptime("2010-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"))
-  
-  this_month <- as.POSIXct(format(min(x$time_start),"%Y-%m-01"), tz = "UTC")
-  if (this_month < first_possible_visit_date) { return (1) }
-  
-  start_month <- as.POSIXct(format(min(x$user_start_date),"%Y-%m-01"), tz = "UTC")
-  if (start_month < first_possible_visit_date) {start_month <- first_possible_visit_date}
-  
-  total_months <- length(seq(from=start_month, to=this_month, by='month'))
-  return (total_months)
-}
-
-## The next indicators are only applicable for the lifetime table.
-calendar_month_on_cc <- function(x) {
-  first.month <- as.yearmon(date_first_visit(x))
-  last.month <- as.yearmon(date_last_visit(x))
-  nmonths <- 12 * as.numeric(last.month - first.month) + 1
-  return(nmonths)
-}
-median_visits_per_month <- function(x) median(as.numeric(table(as.yearmon(x$visit_date))), na.rm=TRUE)
-
-# INTERACTION TABLE INDICATORS:
-ninteractions <- function(x) NROW(x)
-ncases_registered <- function(x) sum(x$created, na.rm=TRUE)
-register_followup <- function(x) sum(!x$created)
-case_register_followup_rate <- function(x) mean(!x$created)*100
-
-ncases_touched <- function(x) length(unique(x$case_id))
-n_followups <- function(x) {
-  stopifnot(!any(is.na(x$created)))
-  stopifnot(all(x$created == 0 | x$created == 1))
-  return(length(x$case_id[x$created == 0]))
-}
-nunique_followups <- function(x) {
-  stopifnot(!any(is.na(x$created)))
-  stopifnot(all(x$created == 0 | x$created == 1))
-  return(length(unique(x$case_id[x$created == 0])))
-}
-median_days_btw_followup <- function(x) median(x$days_elapsed_case)
-
-# DEVICE TYPE TABLE INDICATORS:
-summary_device_type <- function (x) {
-  if (length(unique(x$device)) == 1) {
-    s <- paste(toupper(substring(x$device[1], 1,1)), substring(x$device[1], 2),
-               sep="", collapse=" ")
-    return (s)
-  } else {
-    return ('Multi')
-  }
-}
-nforms <- function(x) NROW(x)
-active_days <- function(x) length(unique(as.Date(x$time_start)))
-active_months <- function(x) length(unique(x$month.index))
-
-# DEVICE LOG TABLE INDICATORS:
-total_logs <-function(x) sum(x$num_logs)
-audio_plays <-function(x) if (NROW(x[x$log_type=='audio',]) > 0) sum(x[x$log_type=='audio',c('num_logs')]) else 0
-network_warnings <-function(x) if (NROW(x[x$log_type=='warning-network',]) > 0) sum(x[x$log_type=='warning-network',c('num_logs')]) else 0
diff --git a/analysis_scripts/mchen/blog_3/user_consistency.R b/analysis_scripts/mchen/blog_3/user_consistency.R
new file mode 100644
index 0000000..0e8e975
--- /dev/null
+++ b/analysis_scripts/mchen/blog_3/user_consistency.R
@@ -0,0 +1,210 @@
+#####################################################################
+# visualization code developed for Dimagi's data blog series  
+# Dec 17, 2014
+#####################################################################
+
+library(grid)
+library(gridExtra)
+library(ggplot2)
+library(sorvi)
+library(dplyr)
+library(RColorBrewer)
+
+
+
+
+
+############################################ 
+# domain consistency comparison plot
+############################################
+
+# side-by-side plot of two demo domains: domain 18 and 264
+# data source: domain_consistency_comparison.csv
+data <- read.table("domain_consistency_comparison.csv",header=T,sep=",")
+dm1 <- data %>% filter(data$domain_numeric == 18)
+dm2 <- data %>% filter(data$domain_numeric == 264)
+
+source("plot_funcs.R")
+plot1 <- plotOut(dm1,"red","#67a9cf","#ef8a62",50,50,paste("r^2==",0.75),5,45)
+plot2 <- plotOut(dm2,"red","#67a9cf","#ef8a62",50,50,paste("r^2==",0.36),5,45)
+
+# output two plots side-by-side 
+# format: pdf
+pdf(file="domain_comparison.pdf",
+    width=14,
+    height=10)
+# png("domain_comparison.png", width = 1600, height = 1600, units = "px", pointsize = 16,bg = "transparent", res=150)
+grid.arrange(plot1, plot2, ncol = 2)
+dev.off()
+
+
+
+
+
+############################################ 
+# top10 vs. bottom10 comparison plot
+############################################
+top_10p <- read.table("top_10p.csv",header=T,sep=",")
+bottom_10p <- read.table("bottom_10p.csv",header=T,sep=",")
+plot3 <- plotOut(top_10p, "red", "#67a9cf","#ef8a62",100,100,paste("r^2==",0.69),10,90)
+plot4 <- plotOut(bottom_10p, "red", "#67a9cf","#ef8a62",100,100,paste("r^2==",0.34),10,90)
+pdf(file="top_bottom_comparison.pdf",
+    width=14,
+    height=10)
+grid.arrange(plot3, plot4, ncol = 2)
+dev.off()
+
+
+
+
+
+######################
+# density plot
+######################
+# main plot: March only data
+# exclude users who visit more than 50 cases 
+# exclude users who visit only 1 case that month
+
+test2 <- read.table("test2.csv",header=T,sep=",")
+data <- test2 %>% filter(test2$x <= 50 & test2$x > 1)
+mainPlotData<- data %>% filter(as.character(data$calendar_month) == "2014-03-01")
+p <- ggplot(mainPlotData, aes(x = x, y = y)) + 
+  geom_point(position=position_jitter(w=0.4,h=0.4),
+             colour="black",
+             alpha=0.7) + 
+  xlim(1,50) + ylim(1,50)
+p1 <- p + 
+  stat_density2d(aes(fill = rev(..level..)), geom = "polygon")  
+p2 <- p1 + 
+  labs(x="Cases visited in March 2014", y = "Cases visited in April 2014") + 
+  theme_bw() + 
+  theme(legend.position="none") + 
+  theme(aspect.ratio = 1) 
+# increase label size, space between axis and labels
+p3 <- p2 + theme(plot.margin = unit(c(1.5,1.5,1.5,1.5), "cm"),
+                 axis.title.x = element_text(size=rel(1.5), vjust = -1.5),
+                 axis.title.y = element_text(size=rel(1.5), vjust = 1.5)
+)
+
+ggsave("mainPlot.pdf", p3, dpi = 600, scale = 1.5, width = 4, height = 4)
+
+
+
+
+
+######################
+# Line graph
+######################
+attrition <- read.table("leadup.csv", header = T, sep = ",")
+month_levels <- rev(levels(attrition$X1))
+
+l1 <- ggplot(attrition, aes(x = X1, y = X2, colour = att_duration, group = att_duration, linetype = att_duration)) +
+  geom_point(shape = 15, size = 4.0, colour="peachpuff4") +
+  geom_line(size = 1.5) + 
+  scale_x_discrete("Month 'X' before attrition event",limits = month_levels) 
+
+l2 <- l1 + 
+  scale_y_continuous("Number of cases visited",limits=c(0,12),breaks=c(0,4,8,12)) + 
+  theme(aspect.ratio=0.5,
+#        panel.grid.minor=element_blank(),
+        panel.grid.major=element_line(colour = "white",size=1),
+        panel.background=element_blank()) 
+
+l3 <- l2 + theme(axis.title.x = element_text(size=rel(1.5), vjust = -1.5),
+                 axis.title.y = element_text(size=rel(1.5), vjust = 1.5),
+                 legend.title = element_blank())
+
+
+
+
+
+
+##########################################
+# user experience: stacked barplot
+##########################################
+
+# use colclasses to load data faster (http://www.r-bloggers.com/using-colclasses-to-load-data-more-quickly-in-r/)
+ux_data <- read.csv("user_experience.csv", 
+                    stringsAsFactors = FALSE, 
+                    colClasses=c("integer","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric",
+                                 "factor","factor","factor","factor"))
+colnames(ux_data)[14:17] <- c("Q1 to Q2","Q2 to Q3", "Q3 to Q4", "Q1 to Q4")
+
+# resize bins: 
+# <-20%, 1
+#[-20%,20%], 2
+#[20%,50%],3
+# >50%,4
+
+binCut <- function(var, n1, n2, n3, n4){
+  sapply(var, function(x) 
+    if(x <= n1)                  {return("1")
+    } else if(x > n1 & x <= n2)  {return("2")
+    } else if (x > n2 & x <= n3) {return("3")
+    } else if (x > n3 & x <= n4) {return("4")
+    } else {return("5")
+    })  
+}
+
+ux_data$Q1_to_Q2 <- binCut(ux_data$percent_change_q1_2, -50, -20, 20, 50)
+ux_data$Q2_to_Q3 <- binCut(ux_data$percent_change_q2_3, -50, -20, 20, 50)
+ux_data$Q3_to_Q4 <- binCut(ux_data$percent_change_q3_4, -50, -20, 20, 50)
+ux_data$Q1_to_Q4 <- binCut(ux_data$percent_change_q1_4, -50, -20, 20, 50)
+
+# reshape data: wide to long
+l <- reshape(ux_data, 
+             varying = c("Q1_to_Q2","Q2_to_Q3","Q3_to_Q4","Q1_to_Q4"),
+             v.names = "percentage_change",
+             timevar = "quarter",
+             times = c("Q1_to_Q2","Q2_to_Q3","Q3_to_Q4","Q1_to_Q4"),
+             direction = "long")
+
+# change the order of levels of factor quarter for visualization
+l$quarter <- factor(l$quarter, levels = c("Q1_to_Q2","Q2_to_Q3","Q3_to_Q4","Q1_to_Q4"))
+
+p0 <- ggplot(l, aes(x=factor(quarter), fill=factor(percentage_change), y=4*100*(..count..)/sum(..count..)))
+p1 <- p0 + geom_bar(width=0.7) +
+  scale_fill_manual(values = c("#e34a33","#fdbb84","#e5e5ab","#91bfdb","#67a9cf"), 
+                    breaks = c(1:5), 
+                    labels = c("Substantial drop (>50%)",
+                               "Moderate drop (20-50%)",
+                               "Stable (+/- 20%)",
+                               "Moderate increase (20-50%)",
+                               "Substantial increase (>50%)")) 
+
+##########################################################################
+### control style (http://www.cookbook-r.com/Graphs/Legends_(ggplot2)/)###
+##########################################################################
+
+# change title appearance
+p2 <- p1 + 
+  labs(x="Interval", y="Percent of Users") +
+  scale_x_discrete(labels=c("Q1 to Q2",
+                            "Q2 to Q3",
+                            "Q3 to Q4",
+                            "Q1 to Q4")) +
+  theme(plot.title = element_text(face = "bold"))
+
+# set title to twice the base font size
+p3 <- p2 + theme(plot.title = element_text(size = rel(1.8)))
+
+# change panel and plot attributes
+p4 <- p3 + theme(panel.background = element_blank()) 
+
+# change legend attributes
+p5 <- p4 + theme(legend.text = element_text(size = 10),
+                 legend.title = element_blank(),
+                 legend.background = element_rect(colour = "black"),
+                 legend.key.size = unit(0.75, "cm"),
+                 axis.title.x = element_text(vjust = - 1.5, size = rel(1.35)),
+                 axis.title.y = element_text(vjust = 1.5, size = rel(1.35)),
+                 plot.margin = unit(c(1,1,1,1),"cm")) 
+
+# reverse the legend order
+p6 <- p5 + guides(fill = guide_legend(reverse=TRUE))
+
+print(p6)
+ggsave("version6.pdf", p6, dpi = 600)
+
+
+
diff --git a/analysis_scripts/mchen/blog_5/data_prep_no_gap.R b/analysis_scripts/mchen/blog_5/data_prep_no_gap.R
new file mode 100644
index 0000000..5c4c4a3
--- /dev/null
+++ b/analysis_scripts/mchen/blog_5/data_prep_no_gap.R
@@ -0,0 +1,98 @@
+library(plyr)
+library(dplyr)
+library(zoo)
+
+# load data
+data = tbl_df(read.csv("blog.csv", stringsAsFactors=FALSE))
+data$calendar_month = as.Date(data$calendar_month, "%m/%d/%y") 
+
+# subset columns of interest
+all = select(data, domain_numeric, user_pk, active_days, ncases_touched, calendar_month)
+all = filter(all, calendar_month >= as.Date("2010-01-01")) %>% 
+  filter(., calendar_month <= as.Date("2014-12-01")) # reformat date
+
+# flagship project in India (pre-rebalancing)
+crs = filter(all, domain_numeric == 40)
+
+# total active months per user 
+m = all %>%
+  group_by(domain_numeric, user_pk) %>% # duplicated user_id exists across domains
+  summarise(nmonths = n_distinct(calendar_month))
+
+# excluding first N months of data (depends on reporting needs)
+excdMonth = function(data, n) {
+  data = data %>%
+    group_by(domain_numeric, user_pk) %>%
+    filter(., row_number() > n) 
+  return(data)
+}
+
+
+# Rebalancing data: no domain contribute more than 10% to the data
+# total active months per domain
+totalMonths = function(x) {
+  y = tbl_df(as.data.frame(table(x$domain_numeric)))
+  y = arrange(y, desc(Freq))
+  names(y) = c("domain_numeric", "months")
+  y$pct = y$months/sum(y$months)
+  return(y)
+}
+# min months to be dropped (or added) on each domain to get to 10% contribution 
+drop = function(x, N) {
+  x$bal = (x$months - sum(x$months)*N)/(1-N)
+  return(x)
+}
+
+rebalanceData = function(data, N) {
+  ov_month_data = totalMonths(data)
+  K = drop(ov_month_data, N)
+  dms = filter(K, bal > 0)
+  keepVal = dms$months - ceiling(dms$bal)
+  rebal_dm = as.numeric(as.character(dms$domain_numeric))
+  sdata = filter(data, domain_numeric %in% rebal_dm) 
+  ndata = filter(data, !(domain_numeric %in% rebal_dm)) 
+  sp = split(sdata, sdata$domain_numeric)
+  for (i in seq_len(length(sp))) {
+    sp[[i]] = sp[[i]][sample(1:nrow(sp[[i]]), keepVal[i], FALSE),]
+  }
+  out = tbl_df(do.call(rbind, sp))
+  rebalanced_data = tbl_df(rbind(out, ndata))  
+  return(rebalanced_data)
+}
+
+# subset 1: all data rebalanced
+rbl_all = rebalanceData(all, 0.10)
+p1 = excdMonth(rbl_all, 3) 
+
+# subset 2: all data excluding first 6 months 
+p2 = excdMonth(all, 6) 
+# reblanaced
+rbl_p2 = rebalanceData(p2, 0.10) 
+
+# subset 3: all data (unbalanced) including only users that have at least 18 months of usage
+# rebalanced
+u = select(filter(m, nmonths >= 18), user_pk)
+qdata = filter(all, user_pk %in% u$user_pk) # exclude users active for < 18 months
+
+# indexing each user-month
+qdata = arrange(qdata, domain_numeric, user_pk, calendar_month)
+qdata = qdata %>%
+  group_by(domain_numeric, user_pk) %>%  
+  mutate(user_month_index = seq(n()))  
+
+# Bin user months to custom quarters
+qdata$user_quarter_index = round_any(qdata$user_month_index, 3, ceiling)/3
+# Rebalance data from quarterly data (only for Q1-Q6)
+qdata_sub = filter(qdata, user_quarter_index <= 6)
+qdata_split = split(qdata_sub, qdata_sub$user_quarter_index, drop=TRUE) 
+rbl_qdata_split = lapply(qdata_split, function(x) rebalanceData(x, 0.10))
+rbl_qdata = do.call(rbind, rbl_qdata_split)
+
+# subset 4: 12 most active domains in terms of total user-months
+bb = m %>%
+  group_by(domain_numeric) %>%
+  summarise(um = sum(nmonths)) %>%
+  arrange(., desc(um)) %>%
+  top_n(12)
+
+bb_data = filter(rbl_all, domain_numeric %in% bb$domain_numeric)
diff --git a/analysis_scripts/mchen/blog_5/data_prep_with_gap.R b/analysis_scripts/mchen/blog_5/data_prep_with_gap.R
new file mode 100644
index 0000000..e91c636
--- /dev/null
+++ b/analysis_scripts/mchen/blog_5/data_prep_with_gap.R
@@ -0,0 +1,78 @@
+###################################
+# ADD GAP MONTHS BACK TO THE GAME #
+###################################
+
+library(Hmisc) # monthDays() to get the exact number of days in a specific month
+
+# for each user, generate a vector of months according to the distance variable
+expandMonths = function(data) {
+  months = seq(from = data$first_active_month, to = data$last_active_month,
+               by = "months") # length.out is adding extra month for some reason
+  noGap = data.frame(rep(data$user_pk,length(months)), months)
+  return(noGap)
+}
+
+# for each user, get distance between first and last active month
+gapBack = function(data){
+  data = arrange(data, domain_numeric, user_pk, calendar_month)
+  uf = tbl_df(ddply(data, .(domain_numeric, user_pk), function(x)head(x,n=1)))
+  ul = tbl_df(ddply(data, .(domain_numeric, user_pk), function(x)tail(x,n=1)))   
+  colnames(uf)[5] = c("first_active_month")
+  colnames(ul)[5] = c("last_active_month")
+  uf = select(uf, domain_numeric, user_pk, first_active_month)
+  ul = select(ul, domain_numeric, user_pk, last_active_month)
+  ufl = inner_join(uf, ul)
+  ufl$distance = 12*as.numeric(as.yearmon(ufl$last_active_month) - 
+                                 as.yearmon(ufl$first_active_month)) + 1
+  expanded = tbl_df(ddply(ufl, .(domain_numeric, user_pk), function(x) expandMonths(x)))
+  expanded = select(expanded, domain_numeric, user_pk, months)
+  names(expanded) = c("domain_numeric", "user_pk", "calendar_month")  
+  return(expanded)
+}
+
+# join the expanded data frame with all monthly data
+all_expanded = gapBack(all)
+all_expanded = left_join(expanded, all)
+all_expanded[is.na(all_expanded)] <- 0
+
+# for all data excluding 3 months, add back gap months
+p1$percent_active_days = p1$active_days/monthDays(p1$calendar_month)
+p1_expanded = gapBack(p1)
+p1_expanded = left_join(p1_expanded, p1)
+p1_expanded[is.na(p1_expanded)] <- 0
+
+# for all data excluding 6 months, add back gap months
+p2$percent_active_days = p2$active_days/monthDays(p2$calendar_month)
+p2_expanded = gapBack(p2)
+p2_expanded = left_join(p2_expanded, p2)
+p2_expanded[is.na(p2_expanded)] <- 0
+
+# for 12 big projects, add back gap months
+bb_data$percent_active_days = bb_data$active_days/monthDays(bb_data$calendar_month)
+bb_list = split(bb_data, bb_data$domain_numeric)
+bb_data_expanded = gapBack(bb_data)
+bb_data_expanded = left_join(bb_data_expanded, bb_data)
+bb_data_expanded[is.na(bb_data_expanded)] <- 0
+
+# for quarterly data, add back gap months
+# adding back gap months means user_quarter_index needs to be recomputed
+qdata$percent_active_days = qdata$active_days/monthDays(qdata$calendar_month)
+qdata_expanded = gapBack(qdata)
+qdata_expanded = left_join(qdata_expanded, qdata)
+qdata_expanded[is.na(qdata_expanded)] <- 0
+
+# recompute quarterly index after adding gap months
+# reindex user_month_index
+qdata_expanded = 
+  qdata_expanded %>%
+  group_by(domain_numeric, user_pk) %>%
+  mutate(umi = seq_len(n()))
+qdata_expanded$user_month_index <- NULL                    
+
+# Bin user months to custom quarters
+bins = c(1+(3*(0:ceiling(max(qdata_expanded$umi)/3))))
+labs = paste("Month ", 3*seq(length(bins)-1)-2, "-", 3*seq(length(bins)-1), sep = "")
+
+qdata_expanded = qdata_expanded %>% 
+  group_by(domain_numeric, user_pk) %>%
+  mutate(uqi = cut(umi, breaks = bins, labels = labs, right = FALSE))
diff --git a/analysis_scripts/mchen/blog_5/loess_smoothing.R b/analysis_scripts/mchen/blog_5/loess_smoothing.R
new file mode 100644
index 0000000..892c462
--- /dev/null
+++ b/analysis_scripts/mchen/blog_5/loess_smoothing.R
@@ -0,0 +1,195 @@
+# Element: Point(position(user_pk, percent_active_days))
+# Dim(1): percent of active days
+# Dim(2): ranksource("data_prep_no_gap.R")
+
+# functions
+  # user-month percent active days
+loessTbl = function(var){
+  var = round(var, digits = 2)
+  m1 = as.data.frame(table(var))
+  names(m1) = c("percent_active_days", "count")
+  m1$pct = round(m1$count/sum(m1$count), digits = 4)
+  m1$percent_active_days = as.numeric(as.character(m1$percent_active_days))
+  return(m1)
+}
+
+binning = function(x){
+  x$bins = cut(x$percent_active_days, breaks = 0.05*(0:20), 
+               labels = c("0.05","0.10","0.15","0.20","0.25","0.30","0.35","0.40",
+                          "0.45","0.50","0.55","0.60","0.65","0.70","0.75",
+                          "0.80","0.85","0.90","0.95","1.0"), 
+               include.lowest = TRUE)
+  return(x)
+}
+
+dataPerUser = function(dataList){
+  d1 = lapply(dataList, function(x){
+    y = x %>%
+      group_by(user_pk) %>%
+      summarise(pad = mean(percent_active_days)) # obtain average %days for each unique user
+    z=loessTbl(y$pad)
+    return(z)    
+  })
+}
+
+singleBinnedPlot = function(d1){
+  d1 = binning(d1)
+  d2 = d1 %>%
+    group_by(bins) %>%
+    summarise(percent = sum(pct))
+  return(d2)
+}
+
+binnedDataPlot = function(d1) {
+  d2 = lapply(d1, function(x)binning(x))
+  d3 = lapply(d2, function(x){
+    y = x %>%
+      group_by(bins) %>%
+      summarise(percent = sum(pct))
+    return(y)
+  })
+  for (i in 1:length(d3)){
+    d3[[i]]$facet = names(d3)[i]
+  }
+  d4 = do.call(rbind, d3)
+  return(d4)
+}
+
+
+# Data: all data excluding first 3 months
+# Figure 1.1 %days, user-month, excluding gap months 
+data_1_1 = loessTbl(p1$percent_active_days)
+data_1_1_binned = singleBinnedPlot(data_1_1)
+
+# Figure 1.3 %days, per user, excluding gap months
+data_1_3 = p1 %>%
+  group_by(domain_numeric, user_pk) %>%
+  summarise(pad = mean(percent_active_days)) 
+data_1_3 = loessTbl(data_1_3$pad)
+data_1_3 = singleBinnedPlot(data_1_3)
+
+# Data: all data excluding first 6 months
+# Figure 2.1 %days, user-month, excluding gap months
+data_2_1 = loessTbl(p2$percent_active_days)
+data_2_1_binned = singleBinnedPlot(data_2_1)
+
+# Loess smooth plot
+lineIntercept = c(quantile(p2$percent_active_days, c(.25, .50, .75)), 
+                  mean(p2$percent_active_days))
+f1_1 = ggplot(data_2_1_binned, aes(x=as.character(bins), y=percent)) + 
+  geom_point(size = 2.5)+
+  labs(x="",
+       y="")+
+  geom_smooth(se=FALSE, col = "#882608", size = 1.5, aes(group=1))+ 
+  geom_vline(xintercept = lineIntercept[1]/0.05, 
+             linetype = "longdash", color = "#005266", size = 1) +
+  geom_vline(xintercept = lineIntercept[2]/0.05, 
+             linetype = "solid", color = "#005266", size = 1) +
+  geom_vline(xintercept = lineIntercept[3]/0.05, 
+             linetype = "longdash", color = "#005266", size = 1) +
+  geom_vline(xintercept = lineIntercept[4]/0.05, 
+             linetype = "dotted", color = "#005266", size = 1) +
+  ylim(0, 0.20) +
+  guides(fill=FALSE) +
+  theme(panel.background = element_rect(fill="#d5e4eb"), 
+        plot.background = element_rect(fill="#d5e4eb"),
+        panel.grid.major.y = element_line(size=1.0),
+        panel.grid.minor.y = element_blank(),
+        panel.grid.major.x = element_blank(),
+        panel.grid.minor.x = element_blank(),
+        axis.ticks.y = element_blank(),
+        legend.position = "none") 
+
+ggsave("f1_1.pdf", f1_1, width=10, height=10)
+
+# Data: 12 big domains
+bb_list = split(bb_data, bb_data$domain_numeric)
+bb_expanded_list = split(bb_data_expanded, bb_data_expanded$domain_numeric)
+# Figure 3.1 %days, user-month, excluding gap months
+data_3_1 = lapply(bb_list, function(x) loessTbl(x$percent_active_days))
+data_3_1_binned = binnedDataPlot(data_3_1)
+
+# get x-axis intercept for quantile lines for data frames in a list  
+lineIntercept = function(data){
+  y = lapply(data, function(x)c(quantile(x$percent_active_days, 
+                                         c(.25, .50, .75)), 
+                                mean(x$percent_active_days)))
+  facet = names(y)
+  li = as.data.frame(do.call("rbind", y))
+  li$facet = facet
+  names(li)[4] <- c("mean")
+  return(li)
+}
+
+li = lineIntercept(bb_list)
+
+f3_1 = ggplot(data_3_1_binned, aes(x=as.character(bins), y=percent)) + 
+  geom_point(size = 2.5)+
+  facet_wrap(~facet, ncol=3, as.table = TRUE) +
+  labs(x="",
+       y="")+
+  geom_smooth(se=FALSE, col = "#882608", size = 1.5, aes(group=1))+ 
+  ylim(0, 0.50) +
+  guides(fill=FALSE) +
+  theme(panel.background = element_rect(fill="#d5e4eb"), 
+        plot.background = element_rect(fill="#d5e4eb"),
+        panel.grid.major.y = element_line(size=1.0),
+        panel.grid.minor.y = element_blank(),
+        panel.grid.major.x = element_blank(),
+        panel.grid.minor.x = element_blank(),
+        axis.ticks.y = element_blank(),
+        legend.position = "none") 
+
+f3_1 = 
+  f3_1 + geom_vline(aes(xintercept = li[,1]/0.05), li,
+                  linetype = "longdash", color = "#005266", size = 1) +
+  geom_vline(aes(xintercept = li[,2]/0.05), li,
+             linetype = "solid", color = "#005266", size = 1) +
+  geom_vline(aes(xintercept = li[,3]/0.05), li,
+             linetype = "longdash", color = "#005266", size = 1) +
+  geom_vline(aes(xintercept = li[,4]/0.05), li,
+             linetype = "dotted", color = "#005266", size = 1) 
+
+
+# Data: First 6 quarters of all users that have been active for at least 18 months on CC
+# Figure set 4: Quarterly plot of all data (excluding first 3 months)
+qdata_gap_out = split(qdata, qdata$user_quarter_index)
+qdata_gap_out = qdata_gap_out[1:6]
+
+# figure 4.1 %days, user-month, excluding gap months
+data_4_1 = lapply(qdata_gap_out, function(x)loessTbl(x$percent_active_days))
+data_4_1_binned = binnedDataPlot(data_4_1)
+data_4_1_binned$facet = factor(data_4_1_binned$facet, 
+                               levels = c("Month 1-3", "Month 4-6",
+                                          "Month 7-9", "Month 10-12",
+                                          "Month 13-15", "Month 16-18"))
+
+li = lineIntercept(qdata_gap_out)
+
+# plot it
+f4_1 = ggplot(data_4_1_binned, aes(x=bins, y=percent)) + 
+  geom_point(size = 2.5)+
+  facet_wrap(~facet, ncol=3, as.table = TRUE) +
+  #          facet_grid(facet~.) +
+  labs(x="",
+       y="")+
+  geom_smooth(se=FALSE, col = "#882608", size = 1.5, aes(group=1))+ 
+  ylim(0, 0.25) + # upper limit is obtained from summary(sub_qd$pct)
+  guides(fill=FALSE) +
+  theme(panel.background = element_rect(fill="#d5e4eb"), 
+        plot.background = element_rect(fill="#d5e4eb"),
+        panel.grid.major.y = element_line(size=1.0),
+        panel.grid.minor.y = element_blank(),
+        panel.grid.major.x = element_blank(),
+        panel.grid.minor.x = element_blank(),
+        axis.ticks.y = element_blank(),
+        legend.position = "none") 
+
+f4_1 + geom_vline(aes(xintercept = li[,1]/0.05), li,
+                  linetype = "longdash", color = "#005266", size = 1) +
+  geom_vline(aes(xintercept = li[,2]/0.05), li,
+             linetype = "solid", color = "#005266", size = 1) +
+  geom_vline(aes(xintercept = li[,3]/0.05), li,
+             linetype = "longdash", color = "#005266", size = 1) +
+  geom_vline(aes(xintercept = li[,4]/0.05), li,
+             linetype = "dotted", color = "#005266", size = 1) 
diff --git a/analysis_scripts/mchen/blog_5/user performance distribution.Rproj b/analysis_scripts/mchen/blog_5/user performance distribution.Rproj
new file mode 100644
index 0000000..8e3c2eb
--- /dev/null
+++ b/analysis_scripts/mchen/blog_5/user performance distribution.Rproj	
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
diff --git a/analysis_scripts/mchen/blog_6/barcharts.R b/analysis_scripts/mchen/blog_6/barcharts.R
new file mode 100644
index 0000000..b701039
--- /dev/null
+++ b/analysis_scripts/mchen/blog_6/barcharts.R
@@ -0,0 +1,129 @@
+# barcharts
+library(readxl)
+f1 = read_excel("figures.xlsx", "f1")
+f2 = read_excel("figures.xlsx", "f2")
+f3 = read_excel("figures.xlsx", "f3")
+
+names(f1) = c("active_months","active_q4","inactive_q4")
+wide_f1 = reshape(f1, idvar="active_months",varying=c("active_q4","inactive_q4"),
+                  v.names="number",direction="long")
+names(wide_f1)[2] = c("status")
+wide_f1$status = as.factor(wide_f1$status) # position_dodge would not work otherwise
+wide_f1$active_months = factor(wide_f1$active_months, 
+                               levels(wide_f1$active_months)[c(1,4,7,9,2,3,5,6,8)])  
+
+library(ggplot2)
+library(grid)
+
+# SCALES
+# dim(1): active months on CC
+# dim(2): number of programs (active/inactive)
+# ELEMENTS: bar
+maxnum = max(wide_f1$number)
+minnum = min(wide_f1$number)
+ggplot(wide_f1, aes(x=active_months, 
+                    y=number,
+                    fill=status)) + 
+  geom_bar(width=0.65, position="dodge", stat="identity") +
+  scale_y_continuous(breaks = seq(minnum, maxnum, by=10))+
+  scale_fill_manual(values=c("#01526d","#00a4dc"),
+                    name="",
+                    labels=c("Active in Q4 2014 (ongoing programs)",
+                             "Not active in Q4 2014")) +
+  labs(x = "Active months on CommCare",
+       y = "No. of programs") +
+  theme(legend.position = "top",
+        legend.background = element_rect(fill = "transparent"),
+        axis.ticks = element_blank(),
+        axis.ticks.margin = unit(0.25, "cm"),
+        axis.text.x = element_text(face="bold",colour="black"),
+        axis.text.y = element_text(face="bold",colour="black"),
+        panel.background = element_rect(fill = "#d5e2e8"),
+        plot.background = element_rect(fill = "#d5e2e8"),
+        panel.grid.major.x = element_blank(),
+        panel.grid.minor.y = element_blank()
+  )
+ggsave("figure_1.pdf",width=8,height=8)  
+
+# chart 2: months on CC
+names(f2) = c("months", "total", "active")
+f2$months = as.factor(f2$months)
+f2$months = factor(f2$months, levels(f2$months)[c(1,4,7,9,2,3,5,6,8)])
+f2_1 = ggplot(subset(f2, select = c(months, total)), aes(x=months, y=total))+
+  geom_bar(width=0.6, stat="identity", fill="#01526d") + 
+  labs(x="",
+       y="")+
+  theme(legend.position = "top",
+        legend.background = element_rect(fill = "transparent"),
+        axis.ticks = element_blank(),
+        axis.ticks.margin = unit(0.25, "cm"),
+        axis.text.x = element_text(face="bold",colour="black",angle=90),
+        axis.text.y = element_text(face="bold",colour="black",angle=90),
+        panel.background = element_rect(fill = "#d5e2e8"),
+        plot.background = element_rect(fill = "#d5e2e8"),
+        panel.grid.major.x = element_blank(),
+        panel.grid.minor.y = element_blank()
+  ) + coord_flip() 
+
+f2_2 = ggplot(subset(f2, select = c(months, active)), aes(x=months, y=active))+
+  geom_bar(width=0.6, stat="identity", fill="#74d0f5") + 
+  labs(x="",
+       y="")+
+  theme(legend.position = "top",
+        legend.background = element_rect(fill = "transparent"),
+        axis.ticks = element_blank(),
+        axis.ticks.margin = unit(0.25, "cm"),
+        axis.text.x = element_text(face="bold",colour="black",angle=90),
+        axis.text.y = element_text(face="bold",colour="black",angle=90),
+        panel.background = element_rect(fill = "#d5e2e8"),
+        plot.background = element_rect(fill = "#d5e2e8"),
+        panel.grid.major.x = element_blank(),
+        panel.grid.minor.y = element_blank()
+  ) + coord_flip()
+
+# dodged barcharts for comparison
+# reshaping data: wide to long
+library(reshape)
+f2_reshaped = reshape(f2, idvar="months", varying=list(2:3), # put list(3:2) here and you would see interesting result
+                      v.names="status", direction="long")
+ggplot(f2_reshaped, aes(x=months, y=status, fill=factor(time))) +
+  geom_bar(width=0.6, stat="identity", position="dodge") +
+  labs(x="",
+       y="")+
+  theme(legend.position = "top",
+        legend.background = element_rect(fill = "transparent"),
+        legend.title = element_blank(),
+        axis.ticks = element_blank(),
+        axis.ticks.margin = unit(0.25, "cm"),
+        axis.text.x = element_text(face="bold",colour="black"),
+        axis.text.y = element_text(face="bold",colour="black"),
+        panel.background = element_rect(fill = "#d5e2e8"),
+        plot.background = element_rect(fill = "#d5e2e8"),
+        panel.grid.major.x = element_blank(),
+        panel.grid.minor.y = element_blank()) + 
+  coord_flip()  
+
+# chart 3: re-start rate
+names(f3) = c("month", "attrited", "restarted", "pct")
+f3$month = as.factor(f3$month)
+f3$month = factor(f3$month, levels(f3$month)[c(1,4,5,6,2,3)])
+
+maxpct = max(f3$pct)
+minpct = min(f3$pct)
+ggplot(f3, aes(x=month,y=pct)) +
+  geom_bar(width=0.6, stat="identity", fill="#01526d") +
+  scale_y_continuous(breaks = seq(0, maxnum, by = 0.10)) + 
+  labs(x="",
+       y="") + 
+  theme(legend.position = "top",
+        legend.background = element_rect(fill = "transparent"),
+        axis.ticks = element_blank(),
+        axis.ticks.margin = unit(0.25, "cm"),
+        axis.text.x = element_text(face="bold",colour="black"),
+        axis.text.y = element_text(face="bold",colour="black"),
+        panel.background = element_rect(fill = "#d5e2e8"),
+        plot.background = element_rect(fill = "#d5e2e8"),
+        panel.grid.major.x = element_blank(),
+        panel.grid.minor.y = element_blank()
+  )
+ggsave("figure_3.pdf",width=8,height=8)  
\ No newline at end of file
diff --git a/analysis_scripts/mchen/blog_6/blog_6.Rproj b/analysis_scripts/mchen/blog_6/blog_6.Rproj
new file mode 100644
index 0000000..8e3c2eb
--- /dev/null
+++ b/analysis_scripts/mchen/blog_6/blog_6.Rproj
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
diff --git a/analysis_scripts/mchen/blog_6/hq_viz.R b/analysis_scripts/mchen/blog_6/hq_viz.R
new file mode 100644
index 0000000..0cd92e2
--- /dev/null
+++ b/analysis_scripts/mchen/blog_6/hq_viz.R
@@ -0,0 +1,110 @@
+# IMPORT DATA #
+suppressMessages(library(plyr))
+suppressMessages(library(dplyr))
+suppressMessages(library(zoo))
+suppressMessages(library(lubridate))
+
+data = tbl_df(read.csv("blog_data_6_10_15.csv", stringsAsFactors=FALSE))
+data$calendar_month = as.Date(data$calendar_month, "%Y-%m-%d") # time data in DP is stored as numeric distance from 1970-01-01
+
+hq = select(data, domain_numeric, user_pk, calendar_month) # keep columns of interest
+hq = arrange(hq, domain_numeric, user_pk, calendar_month)
+t = filter(hq, calendar_month >= as.Date("2012-01-01")) %>% 
+  filter(., calendar_month <= as.Date("2014-12-01")) # keeping data from 2012 onward
+
+
+# for each domain, get total number of users and number of active months 
+agg = t %>%
+  group_by(domain_numeric) %>% # change from domain_numeric to domain
+  summarise(nusers=n_distinct(user_pk),
+            nmonths = n_distinct(calendar_month))
+
+# for each domain in each month, get the number of active users. TBF'ED
+t_agg_1 = t %>%
+  group_by(domain_numeric, calendar_month) %>%
+  summarise(nusers = n_distinct(user_pk)) %>%
+  arrange(., domain_numeric, calendar_month)
+
+# for each domain, get total user blocks
+t_agg_1$blocks = round_any(t_agg_1$nusers, 25, ceiling)/25
+
+# rank domains by max user blocks in a given month (big domains go to the bottom)
+d_rank = t_agg_1 %>% 
+  group_by(domain_numeric) %>%
+  summarise(b = max(blocks)) %>%
+  arrange(., desc(b)) %>%
+  mutate(b_rank = seq(b)) %>%
+  select(., domain_numeric, b_rank)
+
+# Replicate each row; set the number of replications to the number of blocks
+expanded = as.numeric(rep(row.names(t_agg_1), t_agg_1$blocks))
+texp = t_agg_1[expanded,] 
+texp = texp[order(match(texp$domain_numeric, d_rank$domain_numeric)),] # order domains by max active month
+texp = left_join(texp, d_rank)
+texp = texp %>%
+  group_by(b_rank, calendar_month) %>%
+  mutate(incre_blocks = seq(n())) # Index user bins incrementally
+
+# for each cum_blocks value, sum up cum_blocks and the max value of the domain that's sitting below
+piles = texp %>%
+  group_by(b_rank) %>%
+  summarise(max_blocks = max(incre_blocks))
+piles$max_blocks = c(0, cumsum(piles$max_blocks[-nrow(piles)])) 
+texp = left_join(texp, piles) 
+texp$piled_blocks = texp$incre_blocks + texp$max_blocks # building blocks into mounds for each domain
+
+
+# all HQ domains
+# geom_tile plot 
+suppressMessages(library(ggplot2))
+suppressMessages(library(scales))
+suppressMessages(library(RColorBrewer))
+
+x_axis_ticks = as.character(seq(min(texp$calendar_month),max(texp$calendar_month),by="quarter")) # R considers Date variable as continuous
+x_axis_labels = format(as.Date(x_axis_ticks), "%b %y")  
+
+g0 = ggplot(texp, aes(x=as.character(calendar_month),y=piled_blocks)) + 
+  geom_tile(aes(fill=factor(texp$b_rank))) + 
+  guides(fill=FALSE) + # turning off legends
+  scale_fill_manual(values=rep(c("#005266", "#882608", "#fe9c7b"),
+                               ceiling(n_distinct(texp$domain_numeric)/3))) + # repeating colors to distinguish each domain
+  scale_y_continuous(breaks = NULL) + # suppressing ticks, gridlines
+  scale_x_discrete(breaks = x_axis_ticks, labels = x_axis_labels) + # reduce total number of breaks
+  theme(axis.title.x = element_blank(), 
+        axis.title.y = element_blank(), # hide titles for both x- and y-axis
+        axis.text.x = element_text(face="bold",
+                                   colour="#004865", size=11),
+        panel.background = element_rect(fill="#d5e4eb"),
+        panel.grid = element_blank())  
+
+ggsave("hq_viz.pdf", g0, width = 8, height = 12)
+
+
+# subset HQ domains 
+# subs = c(11, 67, 87, 127, 138)
+# run line 17-29 and you will get the data
+t = filter(t, domain_numeric %in% subs)
+hq_sub = ggplot(t_agg_1, aes(x=calendar_month, y=nusers)) +
+  geom_bar(width=28,stat="identity") +
+  facet_wrap(~domain_numeric, nrow=5, scales = "free_y") + 
+  theme(axis.title.x = element_blank(), 
+        axis.title.y = element_blank(), # hide titles for both x- and y-axis
+        axis.text.x = element_text(face="bold",
+                                   colour="#004865", size=11),
+        panel.background = element_rect(fill="#d5e4eb"),
+        panel.grid.major.x = element_blank(),
+        panel.grid.minor.x = element_blank(),
+        panel.grid.minor.y = element_blank())
+
+# highlight time range
+rect = data.frame(xmin = as.Date(c("2012-04-01","2012-10-01","2013-04-01","2013-10-01","2014-04-01","2014-10-01")), 
+                  xmax = as.Date(c("2012-06-01","2012-12-01","2013-06-01","2013-12-01","2014-06-01","2014-10-01")),
+                  ymin = c(-Inf,-Inf,-Inf,-Inf,-Inf,-Inf),
+                  ymax = c(Inf,Inf,Inf,Inf,Inf,Inf))
+hq_sub_2 = hq_sub + geom_rect(data = rect, aes(xmin=xmin, xmax=xmax,
+                                      ymin = ymin, ymax=ymax),
+                     fill = "#aaaea6",
+                     alpha = 0.3,
+                     inherit.aes = FALSE) 
+
+ggsave("hq_subdomains.pdf", hq_sub_2, width = 8, height = 8)
diff --git a/analysis_scripts/mchen/blog_visualization/cluster_analysis.R b/analysis_scripts/mchen/blog_visualization/cluster_analysis.R
deleted file mode 100644
index ce47eb0..0000000
--- a/analysis_scripts/mchen/blog_visualization/cluster_analysis.R
+++ /dev/null
@@ -1,20 +0,0 @@
-# cluster analyis
-# get the kmeans 
-
-# add scripts to compute kmeans here
-
-# results: 4 clusters: x = 20, 40, 95, 270
-
-# removing all x = y = 1
-test3 <- test2[-which(test2$x == 1 & test2$y == 1),]
-ca_sub1 <- test3[which(test3$x <= 30 & test3$y <= 30),]
-
-crs <- test3[which(test3$domain == "crs-remind"),]
-crs_sub <- crs[which(crs$x >= 10 & crs$x <= 40),]
-smoothScatter(crs_sub$x, crs_sub$y)  
-
-koraro <- test3[which(test3$domain == "mvp-koraro"),]
-koraro_sub <- koraro[which(koraro$x >= 40 & koraro$x <= 100),]
-smoothScatter(koraro_sub$x, koraro_sub$y)  
-
-intensityPlotOut(crs_sub, koraro_sub, 100, 100)
\ No newline at end of file
diff --git a/analysis_scripts/mchen/blog_visualization/plot_funcs.R b/analysis_scripts/mchen/blog_visualization/plot_funcs.R
deleted file mode 100644
index be08ff4..0000000
--- a/analysis_scripts/mchen/blog_visualization/plot_funcs.R
+++ /dev/null
@@ -1,29 +0,0 @@
-# plot functions
-plotOut <- function(data, medianColor, palette1, palette2, xmax, ymax, rLabel, rSquareX, rSquareY) {
-  p <- regression_plot(ncases_touched ~ prev_ncases_touched, data, 
-                       shade=FALSE,
-                       spag=TRUE, 
-                       median.col=medianColor, # median line (not regression line)
-                       palette=colorRampPalette(c(palette1,palette2))(4))
-  
-  p <- p + 
-    scale_x_continuous("Cases visited in month N-1", limits=c(0,xmax)) +
-    scale_y_continuous("Cases visited in month N", limits=c(0,ymax)) # modify the axis limits
-  
-  #  p <- p + labs(x="Cases visited in month N-1", y="Cases visited in month N") # modify axis labels
-  
-  # adding annotation layer
-  p <- p + annotate("text", x=rSquareX, y=rSquareY, size=7, label=rLabel, colour="red", parse=T) 
-  # modify theme settings:
-  p <- p + 
-    theme(axis.title.x=element_text(vjust = -1.5,size=rel(2)),
-          axis.title.y=element_text(vjust = 1.5,size=rel(2)),
-          plot.margin=unit(c(1,1,1,1),"cm"),
-          panel.grid.major = element_blank(),
-          panel.grid.minor = element_blank(),
-          panel.background = element_blank()) +
-    theme(aspect.ratio = 1) +
-    theme(legend.position="none") 
-  return(p)
-}
-
diff --git a/visualizations/distribution_viz.R b/visualizations/distribution_viz.R
new file mode 100644
index 0000000..2fd3960
--- /dev/null
+++ b/visualizations/distribution_viz.R
@@ -0,0 +1,674 @@
+##############################
+##distribution visualization##
+##############################
+
+# create a single line graph from a set of user-months and an activity indicator showing distribution of levels   
+  # For each indicator
+  # the x-axis should be the value of the indicator (percent of active days)
+  # the y-axis should be the percent of user-months that have that value
+  # multiple graphs will be generated on different sets of user-months 
+  # graph naming format: p_n (n: the nth graph)
+
+data <- tbl_df(read.csv("blog_data_2_2_15.csv", stringsAsFactors = FALSE))
+data$adp <- round(data$active_day_percent) # rounding would make it a discrete variable
+
+library(plyr)
+library(dplyr)
+library(ggplot2)
+library(Cairo)
+library(zoo)
+library(RColorBrewer)
+library(lattice)
+library(grid)
+library(gridExtra)
+
+
+# distribution visualization of performance indicators
+
+# Custom Quarter entirely by domain
+d_1 = select(data, user_pk, domain_numeric, calendar_month, active_days, month_index)
+d_1 = arrange(d_1, calendar_month, domain_numeric)
+# create month_seq by domain
+d_1 = d_1 %>%
+  group_by(domain_numeric) %>%
+  mutate(month_seq_by_domain = rep(1:length(table(calendar_month)),
+                                   table(calendar_month))
+  )	
+# create custom quarter by month_index 
+bins = c(1+(
+  3*(0:ceiling(max(d_1$month_seq_by_domain)/3))
+))
+
+labs = paste("Q_", seq(length(bins)-1), sep = "")
+
+d_1 = d_1 %>% 
+  group_by(domain_numeric) %>%
+  mutate(QTR_by_domain = cut(month_seq_by_domain,
+                             breaks = bins,
+                             labels = labs,
+                             right = FALSE)
+  )
+
+pdf(7, 14, file="custom qtr by domain.pdf")
+p_1 = densityplot(~active_days|QTR_by_domain, data=d_1,
+                  panel=function(x,...){
+                    panel.densityplot(x,...)
+                    lower.v <- quantile(x,probs=0.25)
+                    median.v <- median(x) 
+                    upper.v <- quantile(x,probs=0.75)
+                    mean.v <- mean(x)
+                    
+                    panel.abline(v=median.v, col.line="#003d71", lwd=1.2)  # 50 percentile
+                    #                   panel.abline(v=lower.v, col.line="#f58220") # 25 percentile
+                    #                   panel.abline(v=upper.v, col.line="#f58220") # 75 percentile
+                    panel.abline(v=mean.v, col.line="#003d71", lwd=1.2, lty=2) # mean
+                    
+                    dens <- density(x)                   
+                    min.v <- min(dens$x)
+                    max.v <- max(dens$x)
+                    panel.polygon(c(upper.v, dens$x[dens$x > upper.v & dens$x < max.v], max.v), 
+                                  c(0, dens$y[dens$x> upper.v & dens$x < max.v], 0),
+                                  col="#f58220")
+                    panel.polygon(c(min.v, dens$x[dens$x > min.v & dens$x < lower.v], lower.v),
+                                  c(0, dens$y[dens$x > min.v & dens$x < lower.v], 0),
+                                  col="#f58220")
+                  },
+                  scales=list(x=list(at=c(1,2,3,4,5,6,7,8,9,10,20,30))),
+                  xlim=range(d_1$active_days), # reversing the order of x-axis values
+                  ylim=c(0,0.5),
+                  xlab="Active days per month",
+                  ylab="Density",
+                  #                 aspect=0.1,
+                  layout=c(1,length(bins)-1),
+                  index.cond=list((length(bins)-1):1),
+                  strip=strip.custom(bg="#ffffff", par.strip.text=list(font=0.5,col="#413f3f")),
+                  col="#413f3f",
+                  plot.points=FALSE)
+print(p_1)
+dev.off()
+
+# get a subset of users that have no dropout(gap) months from their first active month till 80% of domain active months
+d_2 = d_1 %>%
+  group_by(domain_numeric) %>%
+  summarise(max_months = max(month_seq_by_domain))
+d_2$pct_80 = floor(d_2$max_months*0.8)  # for each domain, calculate the first 80 percent months
+
+d_2 %>%
+  group_by(domain_numeric) %>%
+  filter() 
+
+# for each user, get the distance between two active months 
+d_1 = arrange(d_1, domain_numeric, user_pk, month_seq_by_domain)
+d_1 = d_1%>%
+  group_by(domain_numeric, user_pk) %>%
+  mutate(diff_month = c(NA, diff(month_seq_by_domain)))	
+
+# get the last row per user, return users that have lived through the first 80 percent of domain active months
+d_2 = left_join(d_1, d_2)	
+d_2 = arrange(d_2, domain_numeric, user_pk, calendar_month)
+d_2_user = d_2 %>%
+  group_by(domain_numeric, user_pk) %>%
+  do(tail(.,n=1)) %>% 
+  filter(., month_seq_by_domain >= pct_80)
+d_2 = filter(d_2, user_pk %in% d_2_user$user_pk)
+# filter out users who lived through but have dropouts during the first 80 percent 
+d_2_80 = filter(d_2, month_seq_by_domain <= pct_80)  # for users kept in d_2_user, truncate their activity to pct_80
+d_2_80 = d_2_80 %>%
+  group_by(domain_numeric, user_pk) %>%
+  filter(., n()==max(month_seq_by_domain)) # excluding all users that have gaps during the first 80 percent of domain active months
+
+d_3 = filter(d_1, user_pk %in% d_2_user$user_pk) # this returns all user-months that live through first 80% of domain active months with no dropouts
+
+# get the subset of users that have contribute to 14 quarters
+d_3_subset_user = d_3 %>%
+  group_by(domain_numeric, user_pk) %>%
+  summarise(tot_qtr = length(levels(factor(QTR_by_domain)))) %>% # this returns total active quarters per user
+  filter(., tot_qtr >= 6)
+
+# only keep data till QTR_6 for visual
+d_4 = filter(d_3, user_pk %in% d_3_subset_user$user_pk)
+d_4_subset = filter(d_4, month_seq_by_domain <= 18)
+d_4_subset$QTR_by_domain = factor(d_4_subset$QTR_by_domain) # drop unused levels
+
+pdf(7, 14, file="no dropout in 6 quarters.pdf")
+p_qtr_6_by_domain = densityplot(~active_days|QTR_by_domain, data=d_4_subset,
+                                panel=function(x,...){
+                                  panel.densityplot(x,...)
+                                  lower.v <- quantile(x,probs=0.25)
+                                  median.v <- median(x) 
+                                  upper.v <- quantile(x,probs=0.75)
+                                  mean.v <- mean(x)
+                                  
+                                  panel.abline(v=median.v, col.line="#003d71", lwd=1.2)  # 50 percentile
+                                  #                   panel.abline(v=lower.v, col.line="#f58220") # 25 percentile
+                                  #                   panel.abline(v=upper.v, col.line="#f58220") # 75 percentile
+                                  panel.abline(v=mean.v, col.line="#003d71", lwd=1.2, lty=2) # mean
+                                  
+                                  dens <- density(x)                   
+                                  min.v <- min(dens$x)
+                                  max.v <- max(dens$x)
+                                  panel.polygon(c(upper.v, dens$x[dens$x > upper.v & dens$x < max.v], max.v), 
+                                                c(0, dens$y[dens$x> upper.v & dens$x < max.v], 0),
+                                                col="#f58220")
+                                  panel.polygon(c(min.v, dens$x[dens$x > min.v & dens$x < lower.v], lower.v),
+                                                c(0, dens$y[dens$x > min.v & dens$x < lower.v], 0),
+                                                col="#f58220")
+                                },
+                                # scales=list(x=list(at=c(1,2,3,4,5,6,7,8,9,10,20,30))),
+                                xlim=range(d_4_subset$active_days), # reversing the order of x-axis values
+                                ylim=c(0,0.5),
+                                xlab="Active days per month",
+                                ylab="Density",
+                                #                 aspect=0.1,
+                                layout=c(1,length(levels(d_4_subset$QTR_by_domain))),
+                                index.cond=list(length(levels(d_4_subset$QTR_by_domain)):1),
+                                strip=strip.custom(bg="#ffffff", par.strip.text=list(font=0.5,col="#413f3f")),
+                                col="#413f3f",
+                                plot.points=FALSE)
+print(p_qtr_6_by_domain)
+dev.off()
+
+
+
+# custom quarter by user
+pdf(7, 14, file="custom qtr by user (no dropout).pdf")
+d_2$QTR = factor(d_2$QTR)
+p_2_by_user = densityplot(~active_days|QTR, data=d_2,
+                          panel=function(x,...){
+                            panel.densityplot(x,...)
+                            lower.v <- quantile(x,probs=0.25)
+                            median.v <- median(x) 
+                            upper.v <- quantile(x,probs=0.75)
+                            mean.v <- mean(x)
+                            
+                            panel.abline(v=median.v, col.line="#003d71", lwd=1.2)  # 50 percentile
+                            #                   panel.abline(v=lower.v, col.line="#f58220") # 25 percentile
+                            #                   panel.abline(v=upper.v, col.line="#f58220") # 75 percentile
+                            panel.abline(v=mean.v, col.line="#003d71", lwd=1.2, lty=2) # mean
+                            
+                            dens <- density(x)                   
+                            min.v <- min(dens$x)
+                            max.v <- max(dens$x)
+                            panel.polygon(c(upper.v, dens$x[dens$x > upper.v & dens$x < max.v], max.v), 
+                                          c(0, dens$y[dens$x> upper.v & dens$x < max.v], 0),
+                                          col="#f58220")
+                            panel.polygon(c(min.v, dens$x[dens$x > min.v & dens$x < lower.v], lower.v),
+                                          c(0, dens$y[dens$x > min.v & dens$x < lower.v], 0),
+                                          col="#f58220")
+                          },
+                          scales=list(x=list(at=c(1,2,3,4,5,6,7,8,9,10,20,30))),
+                          xlim=range(d_2$active_days), # reversing the order of x-axis values
+                          ylim=c(0,0.5),
+                          xlab="Active days per month",
+                          ylab="Density",
+                          #                 aspect=0.1,
+                          layout=c(1,length(levels(d_2$QTR))),
+                          index.cond=list(length(levels(d_2$QTR)):1),
+                          strip=strip.custom(bg="#ffffff", par.strip.text=list(font=0.5,col="#413f3f")),
+                          col="#413f3f",
+                          plot.points=FALSE)
+
+print(p_2_by_user)
+dev.off()
+
+
+
+
+
+
+
+# Distribution Graphs 1: Kernel Density Plot for all user-months
+  # construct data table for graph 1
+adp_groups <- data %>% group_by(adp) # order data table by percent of active days
+x <- unique(adp_groups$adp) # extract unique percent of active days 
+y <- group_size(adp_groups) # calculate the occurrence of each value in percent of active days
+data_adp1 <- tbl_df(as.data.frame(cbind(x,y))) # combine percentage of each value occurring and unique values into a new data table
+data_adp1 <- arrange(data_adp1, x) # reorder the data table by unique values in percent of active days
+data_adp1$y_prob <- round(y/sum(data_adp1$y),digits=6) # calculate the percentage of each value occurring in the whole data set
+  # run plot
+png(600,600,file="pad_all_domains.png")
+d1 = density(data$active_days)
+p1 = plot(d1, main="Distribution of active days per month (Kernel density estimation)")
+dev.off()
+
+
+
+
+
+# Distribution Graph 2: Probability Distribution by Custom Quarter
+  # Create custom quarters entirely by domain
+d_1 = select(data, user_pk, domain_numeric, calendar_month, active_days, month_index)
+
+
+  # Create custom quarters entirely by user
+d_2 = select(data, user_pk, domain_numeric, calendar_month, active_days)
+d_2 %>%
+  group_by(domain_numeric, user_pk) %>%
+  mutate(month_seq_by_user = c(1:length(calendar_month)))
+
+
+labs = paste("QTR_by_use ",seq(17),sep="") # there are 17 quarters per n_distinct(calendar_month) by user
+d_2$QTR = cut(d_2$month_seq_by_user, breaks=3*(0:17), labels=labs) # adding labels
+d_2$QTR = factor(d_2$QTR, levels = levels(d_2$QTR))
+
+
+p_2 = densityplot(~active_days|QTR, data=d_2,
+                 panel=function(x,...){
+                   panel.densityplot(x,...)
+                   lower.v <- quantile(x,probs=0.25)
+                   median.v <- median(x) 
+                   upper.v <- quantile(x,probs=0.75)
+                   mean.v <- mean(x)
+                   
+                   panel.abline(v=median.v, col.line="red")  # 50 percentile
+#                   panel.abline(v=lower.v, col.line="#f58220") # 25 percentile
+#                   panel.abline(v=upper.v, col.line="#f58220") # 75 percentile
+                   panel.abline(v=mean.v, col.line="darkorange", lty=2) # mean
+
+                   dens <- density(x)                   
+                   min.v <- min(dens$x)
+                   max.v <- max(dens$x)
+                   panel.polygon(c(upper.v, dens$x[dens$x > upper.v & dens$x < max.v], max.v), 
+                                 c(0, dens$y[dens$x> upper.v & dens$x < max.v], 0),
+                                 col="#f58220")
+                   panel.polygon(c(min.v, dens$x[dens$x > min.v & dens$x < lower.v], lower.v),
+                                 c(0, dens$y[dens$x > min.v & dens$x < lower.v], 0),
+                                 col="#f58220")
+                 },
+                 scales=list(x=list(at=c(1,2,3,4,5,6,7,8,9,10,20,30))),
+                 xlim=range(d_2$active_days), # reversing the order of x-axis values
+                 xlab="Active days per month",
+                 ylab="Density",
+#                 aspect=0.1,
+                 layout=c(1,17),
+                 index.cond=list(17:1),
+                 strip=strip.custom(bg="#003d71", par.strip.text=list(font=1,col="white")),
+                 col="#413f3f",
+                 plot.points=FALSE)
+
+
+
+# run plot on a subset of domains that have at least one user active for 10 QTRs
+max_qtr = d_2 %>%
+  group_by(domain_numeric) %>%
+  summarise(max(as.numeric(QTR)))
+colnames(max_qtr) = c("domain_numeric", "max_q")
+dm = filter(max_qtr, max_q >= 10)$domain_numeric
+d_3 = filter(d_2, domain_numeric %in% dm) 
+
+
+pdf("distribution_final.pdf",
+    width=6,
+    height=12)
+
+print(densityplot(~active_days|QTR, data=d_3,
+                  panel=function(x,...){
+                    panel.densityplot(x,...)
+                    lower.v <- quantile(x,probs=0.25)
+                    median.v <- median(x) 
+                    upper.v <- quantile(x,probs=0.75)
+                    mean.v <- mean(x)
+                    
+                    panel.abline(v=median.v, col.line="#f58220", lwd=2)  # 50 percentile
+                    #                   panel.abline(v=lower.v, col.line="#f58220") # 25 percentile
+                    #                   panel.abline(v=upper.v, col.line="#f58220") # 75 percentile
+                    panel.abline(v=mean.v, col.line="#f58220", lty=2, lwd=2) # mean
+                    
+                    dens <- density(x)                   
+                    min.v <- min(dens$x)
+                    max.v <- max(dens$x)
+                    panel.polygon(c(upper.v, dens$x[dens$x > upper.v & dens$x < max.v], max.v), 
+                                  c(0, dens$y[dens$x> upper.v & dens$x < max.v], 0),
+                                  col="#f58220")
+                    panel.polygon(c(min.v, dens$x[dens$x > min.v & dens$x < lower.v], lower.v),
+                                  c(0, dens$y[dens$x > min.v & dens$x < lower.v], 0),
+                                  col="#f58220")
+                  },
+                  scales=list(x=list(at=c(1,2,3,4,5,6,7,8,9,10,20,30))),
+                  xlim=range(d_3$active_days), # reversing the order of x-axis values
+                  xlab="Active days per month",
+                  ylab="Density",
+                  #                 aspect=0.1,
+                  layout=c(1,17),
+                  index.cond=list(17:1),
+                  strip=strip.custom(bg="#003d71", par.strip.text=list(font=1,col="white")),
+                  col="#413f3f",
+                  plot.points=FALSE))
+
+dev.off()
+
+round(nrow(d_3)/nrow(data),digits=2) # contribution of these 7 domains to all_domain data is 34%
+
+
+# d_4: for each domain, exclude users that were inactive for the first 80% of the N months, 
+# namely to exclude users whose month_start > 08*calendar_month_index
+
+# for each domain, generate continuous id for each unique calendar month 
+d_4 = select(data, user_pk, domain_numeric, calendar_month, active_days, QTR)
+d_4 = arrange(d_4, domain_numeric, calendar_month)
+d_4 = 
+  d_4 %>%
+  group_by(domain_numeric) %>%
+  mutate(monthCode = rep(seq(n_distinct(calendar_month
+                            )),as.numeric(table(calendar_month))))
+
+d_4 = arrange(d_4, domain_numeric, user_pk)
+
+tot_active_months = 
+  d_4 %>%
+  group_by(domain_numeric) %>%
+  summarise(monthMax = max(monthCode))
+
+
+tot_active_months$month80 = round(tot_active_months$monthMax*0.8)
+
+# for each user, left join first user-month with month80
+d_4 = arrange(d_4, domain_numeric, user_pk, calendar_month)
+d_4_first = d_4 %>%
+  group_by(domain_numeric, user_pk) %>%
+  filter(calendar_month == min(calendar_month))
+
+
+d_4_first_join = left_join(d_4_first, tot_active_months)
+# compare the first user-month with the 80%-month per domain
+d_4_first_join$later_than_80 = ifelse(d_4_first_join$monthCode >= d_4_first_join$month80, "yes", "no")
+
+# if the first user-month is later than the 80% cutoff, drop it out
+d_4_user_subset = filter(d_4_first_join, later_than_80 == "no")$user_pk
+
+# extract data for the subset of user
+d_4_final = select(filter(data, user_pk %in% d_4_user_subset), 
+                   user_pk, domain_numeric, calendar_month, active_days, QTR)
+
+
+# check out which domains are dropped in the process 
+dropped = 
+  as.numeric(
+  unique(d_4$domain_numeric) %in% d_4_final$domain_numeric)
+dropped_dm = 
+  unique(d_4$domain_numeric)[which(dropped == 0)]
+
+
+# print out the table of user distribution per domain (see if there are skyrocketing domains)
+d_4_final_user = 
+  d_4_final %.%
+  group_by(domain_numeric) %.%
+  summarise(numUser = n_distinct(user_pk))
+
+# barplot users per domain
+d_4_final_user = arrange(d_4_final_user, -numUser)
+barplot(d_4_final_user$numUser)
+
+d_4_final_user$userBin = ifelse(d_4_final_user$numUser < 5, "1",
+                                ifelse(d_4_final_user$numUser >= 5 & d_4_final_user$numUser < 25, "2",
+                                       ifelse(d_4_final_user$numUser >= 25 & d_4_final_user$numUser < 50, "3",
+                                              ifelse(d_4_final_user$numUser >= 50 & d_4_final_user$numUser < 100, "4", "5"
+                                                    ))))
+
+barplot(as.numeric(d_4_final_user$userBin))
+table(as.numeric(d_4_final_user$userBin))
+
+d_4_final = left_join(d_4_final, d_4_final_user)
+d_4_final$QTR = factor(d_4_final$QTR)
+
+print(densityplot(~active_days|QTR, data=filter(d_4_final, userBin == 3),
+                  panel=function(x,...){
+                    panel.densityplot(x,...)
+                    lower.v <- quantile(x,probs=0.25)
+                    median.v <- median(x) 
+                    upper.v <- quantile(x,probs=0.75)
+                    mean.v <- mean(x)
+                    
+                    panel.abline(v=median.v, col.line="#f58220", lwd=2)  # 50 percentile
+                    #                   panel.abline(v=lower.v, col.line="#f58220") # 25 percentile
+                    #                   panel.abline(v=upper.v, col.line="#f58220") # 75 percentile
+                    panel.abline(v=mean.v, col.line="#f58220", lty=2, lwd=2) # mean
+                    
+                    dens <- density(x)                   
+                    min.v <- min(dens$x)
+                    max.v <- max(dens$x)
+                    panel.polygon(c(upper.v, dens$x[dens$x > upper.v & dens$x < max.v], max.v), 
+                                  c(0, dens$y[dens$x> upper.v & dens$x < max.v], 0),
+                                  col="#f58220")
+                    panel.polygon(c(min.v, dens$x[dens$x > min.v & dens$x < lower.v], lower.v),
+                                  c(0, dens$y[dens$x > min.v & dens$x < lower.v], 0),
+                                  col="#f58220")
+                  },
+                  scales=list(x=list(at=c(1,2,3,4,5,6,7,8,9,10,20,30))),
+                  between=list(y=0.5), # introduce a gap between rows of vertical panels
+                  par.strip.text=list(cex=0.7), # size down the text in the strip above each panel
+                  par.settings=list(axis.text=list(cex=0.7)), # control size of the text on tick lablels
+                  xlim=range(d_4_final$active_days), # reversing the order of x-axis values
+                  xlab="Active days per month",
+                  ylab="Density",
+                  
+                  # custom arrangement of panel layout  
+                  aspect=1,
+                  layout=c(1,17), # single column, 17 rows, one for each quarter
+                  index.cond=list(17:1), # panels should be ordered from the top to the bottom
+                  strip=strip.custom(bg="#003d71", par.strip.text=list(font=1,col="white")),
+                  col="#413f3f",
+                  plot.points=FALSE))
+
+
+
+
+
+# summary data table as a supplement 
+p_data = group_by(p_data, QTR)
+viz_sum_table = summarise(p_data, round(mean(active_days)), round(median(active_days)), round(quantile(active_days,probs=0.25)), round(quantile(active_days,probs=0.75)))
+
+# create a dimagi theme
+
+
+
+# graph 2: subset data from 10 biggest domains (in terms of user-months)
+tum <- tally(group_by(data,domain_numeric)) # for each domain, count total user-month 
+colnames(tum) <- c("domain_numeric","user_months") # renaming variables
+top_10_dm <- tum %>%
+  arrange(desc(user_months)) %>%
+  top_n(10) # identify the qualifying 10 domains
+top_10 <- filter(data, domain_numeric %in% top_10_dm$domain_numeric) # subset data 
+top_10$calendar_month <- as.Date(top_10$calendar_month)
+
+# generate a sequence of active months on each domain
+top_10 = top_10 %>%
+  group_by(domain_numeric,user_pk) %>%
+  mutate(month_seq_by_user = seq(n())) # for each user this calculates the total active month 
+
+# for each user on each domain, categorize active month by quarter (not calendar quarter)
+top_10$YR = ifelse(top_10$month_seq_by_user <= 12, "Y1","Y2")
+top_10_Y1 = filter(top_10, YR == "Y1")
+top_10_Y1$QR = sapply(top_10_Y1$month_seq_by_user, function(x){
+  if(x <= 3) return("Q1")
+  if(x > 3 & x <= 6) return("Q2")
+  if(x > 6 & x <= 9) return("Q3")
+  else return("Q4")
+})
+
+
+# Distribution graph 2 (alternative: use sm.density.compare)
+d1 = densityplot(~active_days,
+                data=top_10_Y1,
+                groups=QR,
+                xlab="Active days per month",
+                main="Top 10 domains",
+                from=0,to=28,
+                auto.key=TRUE,
+                plot.points=FALSE)
+
+
+d2 = densityplot(~active_days|QR, data=top_10_Y1,
+            panel=function(x,...){
+              panel.densityplot(x,...)
+              lowerp.values <- quantile(x,probs=0.25)
+              median.values <- median(x) 
+              upperp.values <- quantile(x,probs=0.75)
+              mean.values <- mean(x)
+              panel.abline(v=median.values, col.line="red") 
+              panel.abline(v=lowerp.values, col.line="yellow")
+              panel.abline(v=upperp.values, col.line="green")
+              panel.abline(v=mean.values, col.line="pink", lty=2)
+            },
+            xlim=range(top_10_Y1$active_days), # reversing the order of x-axis values
+            layout=c(1,4),
+            index.cond=list(4:1),
+            plot.points=FALSE)
+
+# create plots separately for each domain
+
+dm_list = split(top_10_Y1, top_10_Y1$domain_numeric)
+dm_1 = densityplot(~active_days|QR, data=dm_list[[1]],
+                   panel=function(x,...){
+                     panel.densityplot(x,...)
+                     lowerp.values <- quantile(x,probs=0.25)
+                     median.values <- median(x) 
+                     upperp.values <- quantile(x,probs=0.75)
+                     mean.values <- mean(x)
+                     panel.abline(v=median.values, col.line="red") 
+                     panel.abline(v=lowerp.values, col.line="yellow")
+                     panel.abline(v=upperp.values, col.line="green")
+                     panel.abline(v=mean.values, col.line="pink", lty=2)
+                   },
+                   xlim=range(dm_list[[1]]$active_days), # reversing the order of x-axis values
+                   layout=c(1,4),
+                   index.cond=list(4:1),
+                   plot.points=FALSE)
+
+densPlot = function(dm_data){
+  densityplot(~active_days|QR, data=dm_data,
+              panel=function(x,...){
+                panel.densityplot(x,...)
+                lowerp.values <- quantile(x,probs=0.25)
+                median.values <- median(x) 
+                upperp.values <- quantile(x,probs=0.75)
+                mean.values <- mean(x)
+                panel.abline(v=median.values, col.line="red") 
+                panel.abline(v=lowerp.values, col.line="yellow")
+                panel.abline(v=upperp.values, col.line="green")
+                panel.abline(v=mean.values, col.line="pink", lty=2)
+              },
+              xlim=range(dm_data$active_days), # reversing the order of x-axis values
+              layout=c(1,4),
+              index.cond=list(4:1),
+              plot.points=FALSE)
+}
+
+plots_separate = lapply(names(dm_list), function(x) densPlot(dm_list[[x]]))
+png("density_plot_per_domain.png")
+do.call(grid.arrange, plots_separate)
+dev.off()
+
+
+
+
+toList = quantile(top_10_Y1$active_days, probs=c(0,0.25,0.5,0.75,1))
+
+lattice.theme <- trellis.par.get()
+col <- lattice.theme$superpose.symbol$col
+
+plots = lapply(1:4,function(.x){
+  densityplot(~active_days,
+              data=top_10_Y1,
+              group=QR,
+              from=toList[.x],
+              to=toList[.x + 1],
+              plot.points=FALSE,
+              xlab="Active days per month",
+              key=list(text=levels(top_10_Y1$QR),
+                             lines=TRUE),
+              main=paste("Percentile", names(toList)[.x], "to", names(toList[.x+1])))
+})
+
+do.call(grid.arrange,plots)
+
+
+
+# adding summary stats to the graph that calculates and plots quantiles
+q1 = ddply(top_10_Y1, .(QR), function(x){
+  x1 = quantile(x$active_days, .25)
+  y1 = density(x$active_days, from=x1, to=x1, n=1)$y # this calculates the density at a single point
+  df.dens.x1 = data.frame(x=x1,y=y1)
+  return(df.dens.x1)
+}) # this returns the 25 percentile for each quarter
+
+q2 = ddply(top_10_Y1, .(QR), function(x){
+
+  x1 = quantile(x$active_days, .5)
+  y1 = density(x$active_days, from=x1, to=x1, n=1)$y # this calculates the density at a single point
+  df.dens.x1 = data.frame(x=x1,y=y1)
+  return(df.dens.x1)
+})
+
+q3 = ddply(top_10_Y1, .(QR), function(x){
+  x1 = quantile(x$active_days, .75)
+  y1 = density(x$active_days, from=x1, to=x1, n=1)$y # this calculates the density at a single point
+  df.dens.x1 = data.frame(x=x1,y=y1)
+  return(df.dens.x1)
+})
+
+q = rbind(q1,q2,q3) # quantile data for geom plot
+p2 = p1 + 
+  geom_point(data=q, aes(x=x,y=y,group=factor(QR),shape=factor(QR),color=factor(QR),size=6))  
+
+ggsave("top_10_domains.pdf",p2,height=8,width=16)
+
+
+# data check
+  # is every domain contributing to every quarter?
+top_10_Y1 %>%
+  group_by(domain_numeric) %>%
+  n_distinct(QR)
+
+# graph 3: grouped by 3/6/12/more months##
+
+# for each user, add a sequence of active months
+data2 = data %>%
+  group_by(user_pk) %>%
+  mutate(nmonths = seq(n())) 
+
+# for each group, compute the distribution of active day percent
+data_adp2 = tbl_df(as.data.frame(with(data2, table(cut(nmonths, breaks=c(1,3,6,12,50)), adp))))
+data_adp2 = arrange(data_adp2, Var1) # Var1 is the resulting category
+data_adp2 = data_adp2 %>%
+  group_by(Var1) %>%
+  mutate(dist_adp = Freq/n())
+
+p_adp3 = ggplot(data_adp2,aes(x=adp,y=dist_adp,group=Var1))+
+  geom_line(aes(linetype=Var1,color=Var1))+
+  xlab("percent of active days")+
+  ylab("distribution")+
+  theme(
+    panel.background = element_blank()
+  )
+ggsave("p_adp3.pdf",p_adp3,width=16,height=8)
+
+# Graph 4: same with graph 3, only on users who are active for at least 15 of their first 18 months
+user_nmonths = tally(group_by(data2, user_pk)) # this returns total active months
+user_candidate = select(filter(user_nmonths, n >= 18), user_pk) # only users with at least 18 consecutive active months will be kept
+data_adp3 = filter(data2, user_pk %in% user_candidate$user_pk)
+
+adp3_sub_test = data_adp3 %>%
+  group_by(user_pk) %>%
+  summarise(seq_15 = length(row.names(table(seq(15)%in%month_index)))) # if a user is active for 15 consecutive months, row.names would have a length of 1, otherwise 2(TRUE,FALSE)
+adp3_users = select(filter(adp3_sub_test, seq_15 == 1),user_pk) # this returns all qualified users
+data_adp3 = filter(data_adp3, user_pk %in% adp3_users$user_pk) 
+
+data_adp3 = tbl_df(as.data.frame(with(data_adp3, table(cut(nmonths, breaks=c(1,3,6,12,50)), adp))))
+data_adp3 = arrange(data_adp3, Var1) # Var1 is the resulting category
+data_adp3 = data_adp3 %>%
+  group_by(Var1) %>%
+  mutate(dist_adp = round(100*(Freq/sum(Freq))))
+data_adp4 = filter(data_adp3, dist_adp > 0)
+
+p_adp4 = ggplot(data_adp4,aes(x=adp,y=dist_adp,group=Var1))+
+  geom_line(aes(linetype=Var1,color=Var1))+
+  xlab("percent of active days")+
+  ylab("distribution")+
+  theme(
+    panel.background = element_blank()
+  )+
+  ggtitle("422 users who are active for at least 15 of their first 18 months are included")
+
+ggsave("p_adp4.pdf",p_adp4,width=16,height=8)
+
diff --git a/visualizations/hq_viz.R b/visualizations/hq_viz.R
new file mode 100644
index 0000000..36e1275
--- /dev/null
+++ b/visualizations/hq_viz.R
@@ -0,0 +1,391 @@
+####################
+##HQ Visualization##
+####################
+
+library(ggplot2)
+library(scales)
+library(plyr)
+library(dplyr)
+library(grDevices)
+library(grid)
+
+data <- tbl_df(read.csv("data.csv", stringsAsFactors = FALSE))
+data$calendar_month <- as.Date(data$calendar_month,format="%m/%d/%Y")
+## manually delete a user: user_pk == 6421
+data = filter(data, user_pk != 6421)
+
+# for each domain, count total unique users 
+tuu <- data %>%
+  group_by(domain_numeric) %>%
+  summarise(tot_unique_users = n_distinct(user_pk))
+tuu = arrange(tuu, tot_unique_users)
+# rank domains by total number of users
+tuu$dm_rank_tuu = seq(nrow(tuu))
+
+# for each domain each month, count blocks of users
+  # blocks: (0,25],(25,50]... 
+  # expected result: in a domain this month, there would be N blocks of users
+bou = data %>%
+  group_by(domain_numeric, calendar_month) %>%
+  summarise(user_size = n_distinct(user_pk))
+
+bou = tbl_df(cbind(bou,findInterval(bou$user_size, c(25,50,75,100,125,150,175,200,225,250,275,300,325))))
+colnames(bou)[4] = c("user_block")
+bou$user_block = bou$user_block + 1
+rep_domain = rle(bou$domain_numeric)$lengths
+bou$dm_index = rep(seq(length(unique(bou$domain_numeric))),rep_domain) # adding a new index to correct for the random domain_numeric (hitting 7600s)
+
+bou = arrange(bou, dm_index, calendar_month)
+p_bou = ggplot(bou, aes(x=calendar_month,y=dm_index,fill=factor(dm_index)))+
+  geom_tile(aes(height=user_block))+
+  scale_fill_manual(values=c(rep(c("blue","yellow"),times=153)))
+# customize y-axis scale to be same with the height of geom tiles
+
+
+# for each domain, count total unique months
+tum <- data %>%
+  group_by(domain_numeric) %>%
+  summarise(tot_unique_months = n_distinct(calendar_month))
+tum <- arrange(tum, tot_unique_months)
+tum$dm_rank_tum <- seq(nrow(tum))
+
+# cbind by domain_numeric
+tuu <- arrange(tuu, domain_numeric)
+tum <- arrange(tum, domain_numeric)
+cbinded <- tbl_df(merge(tuu, tum))
+
+# add dm_rank_tuu, dm_rank_tum, user_bin back to monthly aggregate data
+rep_time <- as.data.frame(table(data$domain_numeric))$Freq
+cbinded_rep <- cbinded[rep(row.names(cbinded),rep_time),1:6]
+data <- arrange(data, domain_numeric)
+hq_viz_data <- tbl_df(cbind(data, cbinded_rep[,2:6]))
+
+# rank domains by total unique users (ascending)
+hvd <- arrange(hq_viz_data, dm_rank_tuu, user_pk, calendar_month)
+rep_time_user <- rle(hvd$user_pk)$lengths
+hvd$user_rank_1 <- rep(seq(length(unique(hvd$user_pk))), rep_time_user)
+
+
+# SETUP PARAMS
+# title text style 
+title.style <- element_text(family="Arial", face="bold", color="black", size=16)
+# axis text style
+label.style <- element_text(family="Arial Narrow", color="grey50", size=8, angle=90)
+# panel setup
+horizontal_theme <- theme(  
+  panel.background    = element_blank() ,
+  panel.grid.major.x  = element_blank() ,
+  panel.grid.minor    = element_blank() ,
+  axis.text.y = element_blank(),
+  axis.title.y = element_blank(),
+  axis.ticks.y = element_blank(),
+  axis.line  = element_line( size=.2 , colour="#666666" ),
+  legend.position = "none", # turn off legends
+  plot.margin = unit( c(.2,.25,.25,.5) , "in")
+) 
+non_horizontal_theme <- theme(
+  panel.background    = element_blank() ,
+  #  panel.grid.major.x  = element_blank() ,
+  panel.grid.minor.y    = element_blank() ,
+  axis.line  = element_line( size=.2 , colour="#666666" ),
+  legend.position = "none", # turn off legends
+  plot.margin = unit( c(.5,.25,.25,.5) , "in") 
+)
+
+# horizontal chart 1: each line represents one user. lines are grouped by domains
+monthStart <- min(hvd$calendar_month)
+monthEnd <- max(hvd$calendar_month)
+p_hvd = ggplot(hvd, aes(x=user_rank_1,y=calendar_month)) + 
+  geom_tile(aes(fill=factor(dm_rank_tuu))) + 
+  scale_fill_manual(values=rep(c("darkgoldenrod1","azure3"),times=26354)) +
+  scale_y_date(limits=c(monthStart,monthEnd),breaks=date_breaks(width="1 month"),labels=date_format("%m/%y")) +
+  theme(  
+    panel.background    = element_blank() ,
+    panel.grid.major.x  = element_blank() ,
+    panel.grid.minor.y  = element_blank() ,
+    axis.text.y = element_blank(),
+    axis.text.x = element_text(size = 10, angle = 90),
+    axis.title.y = element_blank(),
+    axis.ticks.y = element_blank(),
+    axis.line  = element_line( size=.2 , colour="#666666" ),
+    legend.position = "none", # turn off legends
+    plot.margin = unit( c(.2,.25,.25,.5) , "in"),
+    aspect.ratio = 1
+  ) +
+  ggtitle(expression(atop(italic("Domain activity over time"), 
+                          atop("Each line represents one user",
+                               atop("Two colors are recycled to distinguish between 264 domains", ""))))) +
+  ylab("calendar month") +
+  coord_flip() 
+
+# horizontal chart 1.2: one line per domain, domain(geom tile) height is proportional to its user size
+hvd2 = filter(hvd, domain_numeric < 310)
+hvd2 = arrange(hvd2, user_bin, dm_rank_tuu)
+p_hvd_1.1 = ggplot(hvd2, aes(x=dm_rank_tuu,y=calendar_month,fill=factor(user_bin)))+
+  geom_tile(aes(height=user_bin*5))+
+  scale_y_date(limits=c(monthStart,monthEnd),breaks=date_breaks(width="1 month"),labels=date_format("%m/%y")) +
+  theme(  
+    panel.background    = element_blank() ,
+    panel.grid.major.y  = element_blank() ,
+    panel.grid.minor.x  = element_blank() ,
+    axis.text.x = element_blank(),
+    axis.text.y = element_text(size = 10, angle = 0),
+    axis.title.x = element_blank(),
+    axis.title.y = element_blank(),
+    axis.ticks.x = element_blank(),
+    axis.line  = element_line( size=.2 , colour="#666666" ),
+    legend.position = "none", # turn off legends
+    plot.margin = unit( c(.2,.25,.25,.5) , "in"),
+    aspect.ratio = 1
+  )
+
+ggsave("hq_1.1.pdf", p_hvd_1.1, width=12, height=12)
+# horizontal chart 2: for each domain, plot a horizontal line visualizing their total active days
+# for each domain each month, calculate total active days
+hvd_tad = hvd %>%
+  group_by(dm_rank_tuu, calendar_month) %>%
+  summarise(tot_active_days = sum(active_days))
+
+p_hvd_tad = ggplot(hvd_tad, aes(x=dm_rank_tuu,y=calendar_month)) +
+  geom_tile(aes(fill=factor(dm_rank_tuu),alpha=log(tot_active_days))) + 
+  scale_fill_manual(values=rep(c("darkgoldenrod1","azure3"),times=132)) +  
+  scale_alpha(range=c(0.2,1))+
+  theme(  
+    panel.background    = element_blank() ,
+    panel.grid.major.x  = element_blank() ,
+    panel.grid.minor.y  = element_blank() ,
+    axis.text.y = element_blank(),
+    axis.text.x = element_text(size = 10, angle = 90),
+    axis.title.y = element_blank(),
+    axis.ticks.y = element_blank(),
+    axis.line  = element_line( size=.2 , colour="#666666" ),
+    legend.position = "none", # turn off legends
+    plot.margin = unit( c(.2,.25,.25,.5) , "in"),
+    aspect.ratio = 1
+  ) +
+  scale_y_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) +    
+  ggtitle(expression(atop(italic("Domain activity: total active days per month"), 
+                          atop("Each line = one domain", 
+                               atop("Darker colors = more active days", ""))))) +
+  ylab("calendar month") +
+  coord_flip() 
+
+# horizontal chart 3: for each domain, plot difference in total active days between this and previous month
+arrange(hvd_tad, dm_rank_tuu, calendar_month)
+hvd_tad = mutate(hvd_tad, prev = lag(tot_active_days))
+hvd_tad = mutate(hvd_tad, tad_diff=ifelse(is.na(prev), "NA", tot_active_days-prev))
+hvd_tad = mutate(hvd_tad, tad_diff_cat=ifelse(tad_diff < 0, 0, 1)) # assign val=1 if one domain is more active this month than last month in terms of days
+hvd_tad$tad_diff_cat[which(hvd_tad$tad_diff == "NA")] = NA
+
+# deprecated 
+p_hvd_tdc = ggplot(hvd_tad, aes(x=dm_rank_tuu,y=calendar_month)) +
+  geom_tile(aes(fill=factor(dm_rank_tuu),alpha=factor(tad_diff_cat))) + 
+  scale_fill_manual(values=rep(c("darkgoldenrod1","azure3"),times=132)) +  
+  scale_alpha_discrete(range=c(0.2,1))+
+  theme(  
+    panel.background    = element_blank() ,
+    panel.grid.major.x  = element_blank() ,
+    panel.grid.minor.y  = element_blank() ,
+    axis.text.y = element_blank(),
+    axis.text.x = element_text(size = 10, angle = 90),
+    axis.title.y = element_blank(),
+    axis.ticks.y = element_blank(),
+    axis.line  = element_line( size=.2 , colour="#666666" ),
+    legend.position = "none", # turn off legends
+    plot.margin = unit( c(.2,.25,.25,.5) , "in"),
+    aspect.ratio = 1
+  ) +
+  scale_y_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + 
+  ggtitle(expression(atop(italic("Domain activity: changes in total active days over time"), 
+                          atop("Each line = one domain (264 domains in total)",
+                               atop("Current vs. Previous Month: Darker color represents a growth in active days", ""))))) +
+  ylab("calendar month") +  
+  coord_flip() 
+
+# counting total positive values of tad_diff_cat and order domains by count
+dm_rank_tp = hvd_tad %>%
+  group_by(dm_rank_tuu) %>%
+  summarise(tot_pos = sum(tad_diff_cat[which(tad_diff_cat == 1)]))
+
+dm_rank_tp = arrange(dm_rank_tp, tot_pos)
+dm_rank_tp$dm_rank_pos_prev = seq(nrow(dm_rank_tp))
+
+# add the new ranking back to monthly data
+rep_time_2 = as.data.frame(table(hvd_tad$dm_rank_tuu))$Freq
+dm_rank_tp = arrange(dm_rank_tp, dm_rank_tuu)
+dm_rank_tp_rep = dm_rank_tp[rep(row.names(dm_rank_tp),rep_time_2),1:3]
+hvd_tad = arrange(hvd_tad, dm_rank_tuu)
+hvd_tad <- tbl_df(cbind(hvd_tad, dm_rank_tp_rep[,2:3]))
+
+hvd_tad = arrange(hvd_tad, dm_rank_pos_prev, calendar_month)
+
+# colors()[grep("azure",colors())] # returning all color names containing 'azure'
+
+p_hvd_tad_2 = ggplot(hvd_tad, aes(x=dm_rank_pos_prev,y=calendar_month)) +
+  geom_tile(aes(fill=factor(dm_rank_pos_prev),alpha=factor(tad_diff_cat))) + 
+  scale_fill_manual(values=rep(c("darkgoldenrod1","azure3"),times=132)) +  
+  scale_alpha_discrete(range=c(0.3,1))+
+  theme(  
+    panel.background    = element_blank() ,
+    panel.grid.major.x  = element_blank() ,
+    panel.grid.minor.y  = element_blank() ,
+    axis.text.y = element_blank(),
+    axis.text.x = element_text(size = 10, angle = 90),
+    axis.title.y = element_blank(),
+    axis.ticks.y = element_blank(),
+    axis.line  = element_line( size=.2 , colour="#666666" ),
+    legend.position = "none", # turn off legends
+    plot.margin = unit( c(.2,.25,.25,.5) , "in"),
+    aspect.ratio = 1
+  ) +
+  scale_y_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + 
+  ylab("calendar month") +
+  ggtitle(expression(atop(italic("Domain activity: changes in total active days over time"), 
+                          atop("Current vs. Previous Month: Darker color represents an increase in active days", "")))) +
+  coord_flip() 
+
+ggsave("1.4_horizontal_domain_active_or_not.pdf", p_hvd, path=mypath, width=12, height=12, limitsize = FALSE, useDingbats=FALSE)
+ggsave("1.3_horizontal_domain_total_active_days_by_domain.pdf", p_hvd_tad, path=mypath, width=12, height=12, limitsize = FALSE)
+ggsave("1.3_horizontal_domain_change_in_total_active_days_by_domain.pdf", p_hvd_tad_2, path=mypath, width=12, height=12, limitsize = FALSE)
+
+
+#################
+# NON-HORIZONTAL#
+#################
+# for each domain each month, counts total unique users 
+tau <- data %>%
+  group_by(domain_numeric, calendar_month) %>%
+  summarise_each(funs(n_distinct), user_pk)
+
+p_tau <- ggplot(tau, aes(x=calendar_month, y=user_pk,group=factor(domain_numeric))) + 
+  geom_line(aes(colour=factor(domain_numeric),size=0.5)) +
+  scale_x_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + 
+  scale_y_discrete("user_pk", breaks = c(0,50,100,200,300,400,500)) +
+  labs(x="calendar month", y="total users") + 
+  ggtitle(expression(atop("Total active users on each domain",
+                          atop(italic("Aesthetic mapping: color - domain"),"")))) + 
+  non_horizontal_theme 
+
+# for each domain in each month, calculate the change in number of users from the previous month (user change)
+uc <- data %>% 
+  group_by(domain_numeric, calendar_month) %>%
+  summarise(tot_active_users = n_distinct(user_pk)) %>%
+  mutate(change = tot_active_users - lag(tot_active_users)) # to be rewritten with tally() function
+
+# bar chart
+p_uc_bar <- ggplot(uc, aes(x=calendar_month)) + 
+  geom_bar(aes(y=change, fill=factor(domain_numeric)),stat="identity") + 
+  scale_x_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + 
+  labs(x="calendar month", y="change compared to previous month") +
+  ggtitle(expression(atop("Changes in active users on each domain",
+                          atop(italic("Aesthetic mapping: color - domain"),"")))) +
+  non_horizontal_theme +
+  theme(axis.text.x=element_text(angle=90))+
+  theme(title=title.style)
+
+# for each domain, each month of a year, count total active days (of each user) and sort in descending order 
+tad <- data %>% 
+  group_by(domain_numeric, calendar_month) %>%
+  summarise(tot_active_days = sum(active_days)) 
+
+# TO BE UPDATED: Y-AXIS & LEGENDS 
+p_tad <- ggplot(tad, aes(x=calendar_month, y=tot_active_days,group=factor(domain_numeric))) + 
+  geom_line(aes(colour=factor(domain_numeric), size=1)) +
+  scale_x_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + 
+  scale_y_discrete("tot_active_days", breaks = c(500,1000,1500,2000,2500,3000,3500,4000,4500)) +
+  labs(list(x="calendar month", y="total active days")) +
+  ggtitle(expression(atop("Total active days on each domain",
+                          atop(italic("Aesthetic mapping: color - domain"),"")))) +
+  non_horizontal_theme +
+  theme(title=title.style) +
+  theme(axis.text.x=element_text(angle=90))
+
+
+# limit data submitted to submissions from 2012-10-31 to 2014-10-31 for all visuals 
+tau_sub <- filter(tau, calendar_month >= as.Date("2012-10-31") & calendar_month <= as.Date("2014-10-31"))
+tad_sub <- filter(tad, calendar_month >= as.Date("2012-10-31") & calendar_month <= as.Date("2014-10-31"))
+
+# for 10 skyrocketing months in y-axis variables, identity their domain_numeric
+tau_skyrockets <- arrange(tau_sub, desc(user_pk))[1:10,]
+tad_skyrockets <- arrange(tad_sub, desc(tot_active_days))[1:10,]
+
+# bar chart viz: total active users
+tau_sub <- arrange(tau_sub, calendar_month, user_pk, domain_numeric)
+p_tau_sub <- ggplot(tau_sub, aes(x=calendar_month)) + 
+  geom_bar(aes(y=(user_pk), fill=factor(domain_numeric)), # need to add color scale to distinguish adjacent domains with same color fill 
+           stat="identity", 
+           width=20) +
+  scale_fill_manual(values=rep(brewer.pal(10,"Set3"),times=26)) + 
+  scale_x_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + 
+  non_horizontal_theme + 
+  labs(x="calendar month", 
+       y="total active users") +
+  ggtitle(expression(atop("Total active users on each domain in last two years",
+                          atop(italic("Aesthetic mapping: color - domain, size - total users",""))))) +
+  theme(title=title.style)
+
+
+# bar chart viz: total active days
+tad_sub <- arrange(tad_sub, calendar_month, tot_active_days, domain_numeric)
+p_tad_sub <- ggplot(tad_sub, aes(x=calendar_month)) + 
+  geom_bar(aes(y=(tot_active_days), fill=factor(domain_numeric)), # need to add color scale to distinguish adjacent domains with same color fill 
+           stat="identity", 
+           width=20) +
+  scale_fill_manual(values=rep(brewer.pal(10,"Set3"),times=26)) + 
+  scale_x_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + 
+  horizontal_theme + 
+  labs(x="calendar month", 
+       y="total active days") +
+  ggtitle(expression(atop("Total active days on each domain in last two years",
+                          atop(italic("Aesthetic mapping: color - domain, size - active days",""))))) +
+  theme(title=title.style)
+
+
+# log scale y-axis variables
+bu_sub <- data[,c("domain_numeric","business_unit")]
+# find unique domain_numeric
+bu_sub <- bu_sub[!duplicated(bu_sub$domain_numeric),]
+tad_sub_2 <- inner_join(tad_sub, bu_sub, by = "domain_numeric")
+tad_sub_2$log_tot_active_days <- round(log2(tad_sub_2$tot_active_days),1)
+tad_sub_2 <- arrange(tad_sub_2, domain_numeric, calendar_month)
+tad_sub_2$bu <- ifelse(tad_sub_2$business_unit == "DSI", "1", "0")
+tad_sub_2[which(is.na(tad_sub_2$bu)),]$bu = 0
+
+p_tad_sub_2 <- ggplot(tad_sub_2, aes(x=calendar_month, y=log_tot_active_days, group=factor(domain_numeric))) + 
+  geom_line(aes(color=factor(bu),size=factor(bu))) + # need to add color scale to distinguish adjacent domains with same color fill 
+  scale_size_manual(values=c(0.05, 1.2)) + 
+  scale_x_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + 
+  non_horizontal_theme +
+  labs(x="calendar month", 
+       y="total active days"
+  ) +
+  ggtitle(expression(atop("Total active days on each domain in last two years (log scale)",
+                          atop(italic("Thick blue: DSI domains, Thin Pink: Non-DSI domains"),"")))) +
+  theme(title=title.style)
+
+
+# path to save plot to (each version should be stored in different sub-folders)
+mypath=file.path("D://Dropbox//Dropbox (Dimagi)//data tests", "motion_chart", "1.1")
+
+ggsave("1.2_domain_total_active_days_log_scale.pdf", p_tad_sub_2, path=mypath, width=12, height=8)
+ggsave("1.1_domain_total_active_days_bar_chart.pdf", p_tad_sub, path=mypath, scale=2)
+ggsave("1.2_domain_total_active_users.pdf", p_tau, path=mypath, width=12, height=8)
+ggsave("1.2_domain_active_user_change.pdf", p_uc_bar, path=mypath, width=12, height=8)
+ggsave("1.2_domain_total_active_days.pdf", p_tad, path=mypath, width=12, height=8)
+
+# polar histogram
+
+p_hvd_tad_3 = ggplot(hvd_tad, aes(x=dm_rank_pos_prev,y=calendar_month)) +
+  geom_tile(aes(fill=factor(dm_rank_pos_prev),alpha=factor(tad_diff_cat))) + 
+  scale_fill_manual(values=rep(c("darkgoldenrod1","azure3"),times=132)) +  
+  scale_alpha_discrete(range=c(0.3,1))+
+  horizontal_theme + 
+  theme(title=title.style,
+        aspect.ratio=1) +  
+  scale_y_date(breaks=date_breaks("months"), labels=date_format("%m/%y")) + 
+  ggtitle(expression(atop(italic("Domain activity: changes in total active days over time"), 
+                          atop("Current vs. Previous Month: Darker color represents an increase in active days", "")))) +
+  coord_flip() +
+  coord_polar()
+
+ggsave("1.2_domain_change_active_days_polar.pdf", p_hvd_tad_3, path=mypath, scale=2)
\ No newline at end of file
diff --git a/visualizations/hq_viz_data_table.R b/visualizations/hq_viz_data_table.R
new file mode 100644
index 0000000..aa86f42
--- /dev/null
+++ b/visualizations/hq_viz_data_table.R
@@ -0,0 +1,4 @@
+##################################
+# data table accompanying hq viz #
+##################################
+
diff --git a/visualizations/hq_viz_levelplot.R b/visualizations/hq_viz_levelplot.R
new file mode 100644
index 0000000..098878b
--- /dev/null
+++ b/visualizations/hq_viz_levelplot.R
@@ -0,0 +1,37 @@
+########################################
+# using levelplot for hq visualization #
+########################################
+
+library(lattice)
+library(zoo)
+# create a matrix using domain_numeric, calendar_month 
+ldata = select(data, domain_numeric, calendar_month)
+
+lmatrix = matrix(0,
+                 nrow=length(unique(ldata$domain_numeric)),
+                 ncol=length(unique(ldata$calendar_month))) # create an empty matrix
+rownames(lmatrix) = paste("domain_", unique(ldata$domain_numeric), sep="")
+colnames(lmatrix) = as.character(sort(unique(ldata$calendar_month)))
+
+# split ldata into a list of data frames by domain_numeric
+ldata = split(ldata, ldata$domain_numeric)
+
+# assign values to each cell in the matrix
+for (i in seq(nrow(lmatrix))) {  
+  lmatrix[i,] = as.numeric(as.Date(colnames(lmatrix)) %in% 
+                             unique(ldata[[i]]$calendar_month))  
+}
+
+# replace 0 with NA
+lmatrix[lmatrix == 0] <- NA
+
+levelplot(t(lmatrix), 
+          colorkey = FALSE,
+          scales = list(x = list(rot = 90)),
+          panel = function(x, y, z,...) {
+            panel.abline(v = unique(as.numeric(x)), 
+                         h = unique(as.numeric(y)), 
+                         col = "darkgrey")
+            panel.xyplot(x, y, pch = 16 * z, ...)
+          },
+          xlab = "calendar month")