From f3d8c1914188473e3947bbe7567e901fd25120df Mon Sep 17 00:00:00 2001
From: Liz Keller <alias1132@users.noreply.github.com>
Date: Wed, 12 Jul 2023 16:47:01 +1200
Subject: [PATCH] Add files via upload

Changes made to analysis for revised manuscript, submitted to Animals Jul 2023
---
 src/notebook-1-eda-revision.Rmd       | 805 ++++++++++++++++++++++++++
 src/notebook-2-inference-revision.Rmd | 574 ++++++++++++++++++
 2 files changed, 1379 insertions(+)
 create mode 100644 src/notebook-1-eda-revision.Rmd
 create mode 100644 src/notebook-2-inference-revision.Rmd

diff --git a/src/notebook-1-eda-revision.Rmd b/src/notebook-1-eda-revision.Rmd
new file mode 100644
index 0000000..8726a36
--- /dev/null
+++ b/src/notebook-1-eda-revision.Rmd
@@ -0,0 +1,805 @@
+---
+title: "Exploratory Data Analysis"
+date: "`r Sys.Date()`"
+output:
+  html_document: default
+  pdf_document: default
+---
+
+setwd("C:/Users/keller/Desktop/CCBS/ccbs-study-3/")
+
+
+```{r setup, include=FALSE}
+library(tidyverse)
+library(ggpubr)
+library(GGally)
+library(moments)
+library(scales)
+knitr::opts_chunk$set(echo=TRUE)
+
+
+```
+
+# Overview
+
+This notebook documents the exploratory data analysis undertaken for this
+project.
+
+# Loading the Data
+
+Load the tidy data set into memory. If it does not exist, then run the notebook
+that builds it.
+
+```{r}
+tidy_data_filepath <- '../data/tidy.Rds'
+if (!file.exists(tidy_data_filepath)) {
+  rmarkdown::render('./notebook-0-tidy.Rmd')
+}
+df <- readRDS(tidy_data_filepath)
+stopifnot(identical(dim(df)+0, c(1011, 111)))
+str(df)
+```
+## Overview of Data Set
+
+# summarize data
+
+```{r}
+summary(df)
+
+```
+
+```{r}
+# Number of unique owners after inclusion criteria.
+length(unique(df$owner_id))
+
+```
+
+```{r}
+#number of dogs per household.
+summary(plyr::count(df, 'owner_id'))
+```
+Median number of dogs per household = 1 (range: 1 to 21), mean 1.6. Now we can
+drop the column to simply the data set.
+
+```{r}
+df <- subset(df, select=-c(owner_id))
+```
+
+
+Add a column indicating number of reported behavior problems
+
+#how may dogs have at least one behavior issue? 
+#what is the average number of behavior problems?
+
+```{r}
+
+df <- df %>%
+  mutate(behav_problem = ifelse(
+    dog_was_aggressive | dog_has_fear_or_anxiety | dog_has_jumped_problematically | 
+      dog_has_barked_excessively | dog_has_coprophagia | dog_has_repetitive_behavior |
+      dog_has_house_soiled | dog_has_rolled_in_repulsive_material | dog_was_hyperactive | 
+      dog_was_destructive | dog_has_escaped | dog_has_mounted, TRUE, FALSE))
+summary(df$behav_problem)
+
+#create variable with number of behavior problems
+df_temp <- df[,c('dog_was_aggressive', 'dog_has_fear_or_anxiety', 'dog_has_jumped_problematically', 
+      'dog_has_barked_excessively', 'dog_has_coprophagia', 'dog_has_repetitive_behavior',
+      'dog_has_house_soiled', 'dog_has_rolled_in_repulsive_material', 'dog_was_hyperactive', 
+      'dog_was_destructive', 'dog_has_escaped', 'dog_has_mounted')]
+
+#count number of 'True' responses per row
+df$num_behv_problems <- rowSums(df_temp,na.rm=TRUE)
+summary(df$num_behv_problems)
+  
+```
+
+# Summary stats specific to only valid responses which meet the inclusion criteria (valid response = dog > 12 months of age, answered True or False to dog_adolescent_or_adult_training)
+
+```{r}
+
+df_valid_responses <- subset(df, dog_adolescent_or_adult_training==TRUE | dog_adolescent_or_adult_training==FALSE)
+summary(df_valid_responses)
+
+```
+
+#Break into control and experimental groups
+# Hypotheses: 
+# 1. adolescent / adult training will reduce odds of observing a behavior problem
+# 2. puppy training will reduce odds of observing a behavior problems
+# 3. Having both puppy and adult training will further reduce odds of behavior problems
+
+```{r}
+
+# Experiment 1: 
+# control and experimental groups
+# control = adult training = FALSE; exp = adult training = TRUE
+df1_control_no_adult_training <- subset(df_valid_responses, dog_adolescent_or_adult_training==FALSE)
+df1_exp_yes_adult_training <- subset(df_valid_responses, dog_adolescent_or_adult_training==TRUE)
+
+# Experiment 2: controlling for puppy training 
+# control and experimental groups
+# 4 groupings
+# first eliminate NA's from puppy training question
+df_puppy_train_valid_responses <- subset(df_valid_responses, dog_attended_pre_adolescent_training==FALSE | 
+         dog_attended_pre_adolescent_training==TRUE)
+
+df2_puppy_training_only <- subset(df_puppy_train_valid_responses , dog_attended_pre_adolescent_training==TRUE & dog_adolescent_or_adult_training==FALSE)
+df2_puppy_and_adult_training <- subset(df_puppy_train_valid_responses , dog_attended_pre_adolescent_training==TRUE & dog_adolescent_or_adult_training==TRUE)
+df2_adult_only <- subset(df_puppy_train_valid_responses , dog_attended_pre_adolescent_training==FALSE & dog_adolescent_or_adult_training==TRUE)
+df2_no_training <- subset(df_puppy_train_valid_responses , dog_attended_pre_adolescent_training==FALSE & dog_adolescent_or_adult_training==FALSE)
+
+nrow(df2_puppy_training_only) + nrow(df2_puppy_and_adult_training) + nrow(df2_adult_only) + nrow(df2_no_training)
+
+
+```
+
+
+```{r}
+
+#summary of number of behavior problems 
+
+summary(df_valid_responses$num_behv_problems)
+summary(df1_control_no_adult_training$num_behv_problems)
+summary(df1_exp_yes_adult_training$num_behv_problems)
+
+```
+
+```{r}
+#summary(df_puppy_train_valid_responses$num_behv_problems)
+
+```
+
+
+#look at distribution of number of behavior problems, by grouping
+
+```{r}
+
+p <- ggplot(df_valid_responses, aes(x=num_behv_problems)) + geom_bar(fill='steelblue') + 
+  scale_x_continuous(breaks=0:10) + theme(axis.text=element_text(size=14),    axis.title=element_text(size=18,face="bold")) # + geom_text(aes(label = after_stat(count)), stat = "count", vjust = 1.1, colour = "white")  
+
+p.labs <- p + labs(x = "Number of behavior problems per dog", y="Count") 
+p.labs
+
+```
+
+
+```{r} 
+#save figure
+filename = "C:/Users/keller/Desktop/CCBS/ccbs-study-3/figures/fig1_number_of_behavior_problems_all_valid_responses.pdf"
+
+pdf(file=filename, width=12, height=8)
+p.labs
+dev.off()
+```
+
+
+```{r}
+
+p <- ggplot(df1_control_no_adult_training, aes(x=num_behv_problems)) + geom_bar(fill='steelblue') + 
+  scale_x_continuous(breaks=0:10) + theme(axis.text=element_text(size=14),    axis.title=element_text(size=18,face="bold")) 
+
+p.labs <- p + labs(x = "Number of behavior problems per dog", y="Count") 
+p.labs
+
+```
+
+```{r} 
+#save figure
+filename = "C:/Users/keller/Desktop/CCBS/ccbs-study-3/figures/fig1_number_of_behavior_problems_df1_control_no_adult_training.pdf"
+
+pdf(file=filename, width=12, height=8)
+p.labs
+dev.off()
+```
+
+```{r}
+
+p <- ggplot(df1_exp_yes_adult_training, aes(x=num_behv_problems)) + geom_bar(fill='steelblue') + 
+  scale_x_continuous(breaks=0:10) + theme(axis.text=element_text(size=14),    axis.title=element_text(size=18,face="bold")) 
+
+p.labs <- p + labs(x = "Number of behavior problems per dog", y="Count") 
+p.labs
+
+```
+
+```{r} 
+#save figure
+filename = "C:/Users/keller/Desktop/CCBS/ccbs-study-3/figures/fig1_number_of_behavior_problems_df1_exp_yes_adult_training.pdf"
+
+pdf(file=filename, width=12, height=8)
+p.labs
+dev.off()
+```
+
+# is the mean number of behavior problems significantly different between control and exp?
+
+```{r}
+
+t.test(df1_control_no_adult_training$num_behv_problems, df1_exp_yes_adult_training$num_behv_problems, paired=FALSE, var.equal=TRUE)
+
+```
+# The t-test shows that the mean is not significantly different beween the two groups (p = 0.13)
+
+
+
+#calculate percentage of dogs with each behavior problem, by grouping
+
+```{r}
+
+behav_probs <- c(
+  'dog_was_aggressive',
+  'dog_has_fear_or_anxiety',
+  'dog_has_jumped_problematically',
+  'dog_has_barked_excessively',
+  'dog_has_coprophagia',
+  'dog_has_repetitive_behavior',
+  'dog_has_house_soiled',
+  'dog_has_rolled_in_repulsive_material',
+  'dog_was_hyperactive',
+  'dog_was_destructive',
+  'dog_has_escaped',
+  'dog_has_mounted'
+)
+behav_probs <- sort(behav_probs)
+
+tmp <- df_valid_responses %>% select(all_of(behav_probs))
+
+```
+
+```{r}
+
+#create table for percentage occurence
+
+# Define a dataframe 
+pct_df <- data.frame(Behavior  = behav_probs,
+                  Occurence = numeric(length(behav_probs)))
+
+for (behav_prob in behav_probs){
+  
+  tbl <- plyr::count(tmp, behav_prob) / nrow(tmp) * 100
+  #tmp <- tmp %>% 
+  ## add percentage label with `sprintf()`
+  #perc = paste0(sprintf("%4.1f", tbl$freq[2]), "%")
+  perc = tbl$freq[2]
+  
+  pct_df[pct_df$Behavior==behav_prob,2] = perc
+  
+}
+
+pct_df <- pct_df[order(pct_df$Occurence,decreasing = TRUE),] 
+
+
+```
+
+```{r}
+
+my_labels <- c("Excessive Barking","Coprophagia","Escaping", "Fear or Anxiety", "House soiling",               "Problematic jumping", "Mounting", "Repetitive Behavior", "Rolling in Repulsive Material", "Aggression","Destructive", "Hyperactivity" )
+
+pct_labels = paste0(sprintf("%4.1f", pct_df$Occurence), "%")
+
+df_sorted <- arrange(pct_df, Occurence) 
+
+p <- ggplot(pct_df, aes(x=Behavior,y=Occurence)) + geom_bar(stat="identity",fill='steelblue') + scale_x_discrete(labels=my_labels) + scale_y_continuous(limits=c(0,100)) + geom_text(aes(label = pct_labels), hjust = -0.2, colour = "black", size=5)  + theme(axis.text=element_text(size=14),    axis.title=element_text(size=16,face="bold")) 
+
+p.labs <- p + coord_flip() + labs(y = "percentage of dogs", ) 
+p.labs
+
+
+```
+
+
+```{r} 
+#save figure
+filename="C:/Users/keller/Desktop/CCBS/ccbs-study-3/figures/behavior_problems_pct_all_valid_responses.pdf"
+pdf(file=filename, width=12, height=8)
+p.labs
+dev.off()
+
+```
+
+
+# general notes on dataset: (all stats exclude no answer or NA's unless otherwise stated)
+-- 322 (out of 1011 total obs) responses are excluded due to no response (NA's) in either training or behavior questions; many of these dogs were too young to take part in the survey (<12 mo)
+-- 671 out of 689 or 97.4% reported at least one behavior problem
+-- median number of behavior problems is 4, and mean is 3.7
+-- Dog gender spilt almost even, slight bias towards male (478 F / 507 M total, 325 F / 364 M with valid responses)
+-- 634 unique dog owners, median 1 dog owned, mean 1.6 (all respondents)
+-- Mean age (in months) is 80.9 (6.7 years) and median age is 75 (6.25 years) [Note all dogs less than 12 months excluded from analysis]
+-- a majority of respondents (389/689 = 56%) acquired their dog in the pre-adolescent stage
+-- of the respondents who provided an answer (559), just under half of dogs had attended pre-adolescent training (260/559 = 46.5%). The majority of these dogs (209/260 = 80.4%) had received reward-based training only. A further 15% (39/260) received a mix of reward and discipline-based training. The vast majority (229/260 = 88%) attended group classes (as opposed to private training or boot camp).
+-- More than half of dogs had attended adolescent or adult training (442/689 = 64.2%). The vast majority (358/442 = 81.0%) reported receiving reward-based training only and attended group classes (387/442 = 87.6%)
+
+-- the majority continued some form of training post-puppyhood: 11% (77/689) completed in adolescence (defined as 6 to 18 months), 10.6% (73/689) completed between ages 18-36 months (social maturity), 39.6% (273/689) continue to train in-home, and 32.8% (226/689) continue to attend classes, with 53% (365/689) reporting continued training either in home or through classes 
+
+
+#specific behavior problems (overall survey population):
+-- 56.2% reported aggression (387/689)
+-- 73.9% reported fear or anxiety (509/689)
+-- 19.4% reported problematic jumping (134/689)
+-- 19.0% reported excessive barking (131/689)
+-- 41.7% reported coprophagia (287/689)
+-- 26.9% reported repetitive behavior (185/689)
+-- 23.1% reported house soiling (159/689)
+-- 41.1% reported rolling in repulsive material (283/689)
+-- 12.5% reported hyperactivity (86/689)
+-- 15.4% reported destructive behavior (106/689)
+-- 26.3% reported escaping (181/689)
+-- 18.4% reported mounting (127/689)
+
+
+#split by control and exp group
+
+#control
+```{r}
+
+tmp <- df1_control_no_adult_training %>% select(all_of(behav_probs))
+
+#create table for percentage occurence
+
+# Define a dataframe 
+pct_df <- data.frame(Behavior  = behav_probs,
+                  Occurence = numeric(length(behav_probs)))
+
+for (behav_prob in behav_probs){
+  
+  tbl <- plyr::count(tmp, behav_prob) / nrow(tmp) * 100
+  
+  perc = tbl$freq[2]
+  
+  pct_df[pct_df$Behavior==behav_prob,2] = perc
+  
+}
+
+pct_df <- pct_df[order(pct_df$Occurence,decreasing = TRUE),] 
+
+
+```
+
+```{r}
+
+my_labels <- c("Excessive Barking","Coprophagia","Escaping", "Fear or Anxiety", "House soiling",               "Problematic jumping", "Mounting", "Repetitive Behavior", "Rolling in Repulsive Material", "Aggression","Destructive", "Hyperactivity" )
+
+pct_labels = paste0(sprintf("%4.1f", pct_df$Occurence), "%")
+
+df_sorted <- arrange(pct_df, Occurence) 
+
+p <- ggplot(pct_df, aes(x=Behavior,y=Occurence)) + geom_bar(stat="identity",fill='steelblue') + scale_x_discrete(labels=my_labels) + scale_y_continuous(limits=c(0,100)) + geom_text(aes(label = pct_labels), hjust = -0.2, colour = "black", size=5)  + theme(axis.text=element_text(size=14),    axis.title=element_text(size=16,face="bold")) 
+
+p.labs <- p + coord_flip() + labs(y = "percentage of dogs", ) 
+p.labs
+
+
+```
+
+```{r} 
+#save figure
+filename="C:/Users/keller/Desktop/CCBS/ccbs-study-3/figures/behavior_problems_pct_df1_control_no_adult_training.pdf"
+pdf(file=filename, width=12, height=8)
+p.labs
+dev.off()
+
+```
+
+#experiment
+```{r}
+
+tmp <- df1_exp_yes_adult_training %>% select(all_of(behav_probs))
+
+#create table for percentage occurence
+
+# Define a dataframe 
+pct_df <- data.frame(Behavior  = behav_probs,
+                  Occurence = numeric(length(behav_probs)))
+
+for (behav_prob in behav_probs){
+  
+  tbl <- plyr::count(tmp, behav_prob) / nrow(tmp) * 100
+  
+  perc = tbl$freq[2]
+  
+  pct_df[pct_df$Behavior==behav_prob,2] = perc
+  
+}
+
+pct_df <- pct_df[order(pct_df$Occurence,decreasing = TRUE),] 
+
+
+```
+
+```{r}
+
+my_labels <- c("Excessive Barking","Coprophagia","Escaping", "Fear or Anxiety", "House soiling",               "Problematic jumping", "Mounting", "Repetitive Behavior", "Rolling in Repulsive Material", "Aggression","Destructive", "Hyperactivity" )
+
+pct_labels = paste0(sprintf("%4.1f", pct_df$Occurence), "%")
+
+df_sorted <- arrange(pct_df, Occurence) 
+
+p <- ggplot(pct_df, aes(x=Behavior,y=Occurence)) + geom_bar(stat="identity",fill='steelblue') + scale_x_discrete(labels=my_labels) + scale_y_continuous(limits=c(0,100)) + geom_text(aes(label = pct_labels), hjust = -0.2, colour = "black", size=5)  + theme(axis.text=element_text(size=14),    axis.title=element_text(size=16,face="bold")) 
+
+p.labs <- p + coord_flip() + labs(y = "percentage of dogs", ) 
+p.labs
+
+
+```
+```{r} 
+#save figure
+filename="C:/Users/keller/Desktop/CCBS/ccbs-study-3/figures/behavior_problems_pct_df1_exp_yes_adult_training.pdf"
+pdf(file=filename, width=12, height=8)
+p.labs
+dev.off()
+
+```
+#demographics for each group separately
+
+```{r}
+
+#how many dogs in each group have at least one behavior problem?
+nrow(subset(df1_control_no_adult_training,num_behv_problems > 0)) / nrow(df1_control_no_adult_training) *100
+nrow(subset(df1_exp_yes_adult_training,num_behv_problems > 0)) / nrow(df1_exp_yes_adult_training) *100
+
+```
+```{r}
+
+#mean and median # of behavior problems
+summary(df1_control_no_adult_training$num_behv_problems) 
+summary(df1_exp_yes_adult_training$num_behv_problems)
+
+```
+```{r}
+
+#what % of dogs in each group are female?
+nrow(subset(df1_control_no_adult_training,dog_gender=="Female")) / nrow(df1_control_no_adult_training) *100
+nrow(subset(df1_exp_yes_adult_training,dog_gender=="Female")) / nrow(df1_exp_yes_adult_training) *100
+
+```
+```{r}
+
+#mean and median age of dogs
+summary(df1_control_no_adult_training$dog_age_in_months) 
+summary(df1_exp_yes_adult_training$dog_age_in_months)
+
+```
+
+```{r}
+
+#what % of dogs in each group were acquired pre-adolescent, as puppies (6 mo or younger)?
+nrow(subset(df1_control_no_adult_training,dog_was_acquired_pre_adolescent==TRUE)) / nrow(df1_control_no_adult_training) *100
+nrow(subset(df1_exp_yes_adult_training,dog_was_acquired_pre_adolescent==TRUE)) / nrow(df1_exp_yes_adult_training) *100
+
+```
+```{r}
+
+#what % of dogs in each group attended puppy training?
+nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE)) / nrow(df1_control_no_adult_training) *100
+nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE)) / nrow(df1_exp_yes_adult_training) *100
+
+```
+
+```{r}
+
+#what % of dogs in each group received reward-based puppy training?
+nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE & dog_pre_adolescent_training_type=="Reward based")) / nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100
+
+nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE  & dog_pre_adolescent_training_type=="Reward based")) / nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100
+
+```
+```{r}
+
+#what % of dogs in each group received discipline-based puppy training?
+nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE & dog_pre_adolescent_training_type=="Discipline based")) / nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100
+
+nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE  & dog_pre_adolescent_training_type=="Discipline based")) / nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100
+
+
+#Mixed balance of both reward and discipline
+nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE & dog_pre_adolescent_training_type=="Mixed balance of both reward and discipline")) / nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100
+
+nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE  & dog_pre_adolescent_training_type=="Mixed balance of both reward and discipline")) / nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100
+
+```
+
+
+
+```{r}
+
+#what % of dogs in each group received group puppy training?
+nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training=="TRUE" & dog_pre_adolescent_training_location=="Myself/family member attended group classes")) / nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100
+
+nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE  & dog_pre_adolescent_training_location=="Myself/family member attended group classes")) / nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100
+
+```
+
+```{r}
+
+#what % of dogs in each group received private in-home trainer as puppies?
+nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training=="TRUE" & dog_pre_adolescent_training_location=="Hired private in-home trainer")) / nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100
+
+nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE  & dog_pre_adolescent_training_location=="Hired private in-home trainer")) / nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100
+
+
+
+#Sent to a professional training school/boot camp as puppies
+nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training=="TRUE" & dog_pre_adolescent_training_location=="Sent to a professional training school/boot camp")) / nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100
+
+nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE  & dog_pre_adolescent_training_location=="Sent to a professional training school/boot camp")) / nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100
+
+```
+
+
+
+```{r}
+
+# exp group only by definition
+
+#what % of dogs received reward-based adult training?
+nrow(subset(df1_exp_yes_adult_training, reward_only_adult_training==TRUE)) / nrow(df1_exp_yes_adult_training) *100
+
+nrow(subset(df1_exp_yes_adult_training, dog_adolescent_or_adult_training_style=="Discipline based")) / nrow(df1_exp_yes_adult_training) *100
+
+nrow(subset(df1_exp_yes_adult_training, dog_adolescent_or_adult_training_style=="Mixed balance of both reward and discipline")) / nrow(df1_exp_yes_adult_training) *100
+
+
+
+```
+
+```{r}
+
+#what % of dogs in each group received group puppy training?
+nrow(subset(df1_exp_yes_adult_training, trainer_group_classes==TRUE)) / nrow(df1_exp_yes_adult_training) *100
+
+nrow(subset(df1_exp_yes_adult_training, trainer_home_private==TRUE)) / nrow(df1_exp_yes_adult_training) *100
+
+nrow(subset(df1_exp_yes_adult_training, trainer_boot_camp==TRUE)) / nrow(df1_exp_yes_adult_training) *100
+
+```
+
+```{r}
+
+#when did dogs "complete" training?
+nrow(subset(df1_exp_yes_adult_training,training_complete_age_6_to_18mo==TRUE)) / nrow(df1_exp_yes_adult_training) *100
+
+nrow(subset(df1_exp_yes_adult_training,training_complete_age_18_to_36mo==TRUE)) / nrow(df1_exp_yes_adult_training) *100
+
+nrow(subset(df1_exp_yes_adult_training,training_complete_age_continue_in_home==TRUE)) / nrow(df1_exp_yes_adult_training) *100
+
+nrow(subset(df1_exp_yes_adult_training,training_complete_age_continue_classes==TRUE)) / nrow(df1_exp_yes_adult_training) *100
+
+```
+
+# make some plots to visualise data
+
+# examine variable distribution for all variables
+
+```{r}
+
+# all responses
+ggplot(df, aes(dog_age_in_months)) + geom_bar(fill='steelblue')
+skewness(df$dog_age_in_months,na.rm=TRUE)
+
+ggplot(df, aes(owner_age)) + geom_bar(fill='steelblue')
+skewness(df$owner_age,na.rm=TRUE)
+
+#valid responses only
+ggplot(df_valid_responses, aes(dog_age_in_months)) + geom_bar(fill='steelblue')
+skewness(df_valid_responses$dog_age_in_months,na.rm=TRUE)
+
+ggplot(df_valid_responses, aes(owner_age)) + geom_bar(fill='steelblue')
+skewness(df_valid_responses$owner_age,na.rm=TRUE)
+
+```
+
+We see a modest right skew (younger) in the dog age plot, and a left skew (older) in the owner age plot. 
+
+
+# We want to isolate the effect of adolescent / adult training
+
+ -- using all valid answers which meet our inclusion criteria = df_valid_responses
+
+```{r} 
+
+#testing some different groupings of the data
+
+#subset of data with dogs that are continuing training at the time of survey
+df_continue_training <- subset(df, training_complete_age_continue_in_home==TRUE | training_complete_age_continue_classes==TRUE)
+
+#subset of data to eliminate dogs that received some discipline-based training
+df_reward_only_training <- subset(df_valid_responses, reward_only_adult_training==TRUE)
+df_control_no_adult_training <- subset(df_valid_responses, dog_adolescent_or_adult_training==FALSE)
+
+nrow(subset(df_reward_only_training))
+nrow(subset(df_control_no_adult_training))
+
+```
+
+# Including the effect of pre-adolescent training:
+Of the 1011 total survey responses, 452 are excluded because they did not answer whether the dog had received pre-adolescent training and/or adult training. This leaves 559 responses. The breakdown of the group:
+
+177 had NO training (neither pre-adolescent or adult)
+37 had pre-adolescent but NO further training
+223 had pre-adolescent AND adult training
+122 had NO pre-adolescent training but did have adult training
+
+#These groups are not well-balanced, therefore they will not be used for further analysis
+
+
+# Controlling for the adverse impact of discipline-based training
+
+Looking at reward-only based training vs. no training (eliminating dogs that had received discipline-based or mixed discipline and reward training):
+605 responses in dataset
+358 received reward-only adult training (experiment)
+247 did not receive adult training (control)
+
+
+# How many dogs with any type of training (either puppy or adult) have behavior problems? 
+
+```{r}
+
+# Create column indicating whether the dog had any type of training (pre-adolescent or adolescent/adult)
+
+df_valid_responses <- df_valid_responses %>%
+  mutate(any_training = ifelse(
+    dog_attended_pre_adolescent_training | dog_adolescent_or_adult_training & !is.na(dog_attended_pre_adolescent_training), 
+    TRUE, FALSE))
+summary(df_valid_responses$any_training)
+
+```
+
+
+```{r}
+
+df1 = subset(df_valid_responses, any_training == FALSE & behav_problem == FALSE)
+df2 = subset(df_valid_responses, any_training == FALSE & behav_problem == TRUE)
+
+df3 = subset(df_valid_responses, any_training == TRUE & behav_problem == FALSE)
+df4 = subset(df_valid_responses, any_training == TRUE & behav_problem == TRUE)
+
+```
+
+6 dogs with no training have no behavior problems
+171 dogs with no training do have behavior problems
+11 dogs with some training have no behavior problems
+371 dogs with some training do have behavior problems
+
+Most dogs in the study have at least one behavior problem so this is not very insightful
+
+
+
+the majority of dogs with behavior problems have between 2-5 separate behavior issues
+
+Is there a correlation between dog age and behavior problems?
+
+```{r}
+
+data<-dplyr::select(df_valid_responses,num_behv_problems,dog_age_in_months)
+ggpairs(data)
+
+```
+
+There does not appear to be a significant correlation between a dog's age and behavior problems, except perhaps at the extreme end (the dogs with 9 or 10 problems tend to be younger)
+
+#visualise continuous variables (dog age and num of behavior problems)
+
+```{r}
+df_valid_responses %>%
+  ggpairs(columns=c('dog_age_in_months', 
+                    'num_behv_problems'),
+          mapping=ggplot2::aes(color=dog_attended_pre_adolescent_training),
+          diag=list(discrete='barDiag',
+                    continuous=wrap('densityDiag', alpha=0.5)),
+          legend=1,
+          progress=FALSE) +
+  theme(legend.position='bottom')
+
+
+df_valid_responses %>%
+  ggpairs(columns=c('dog_age_in_months', 
+                    'num_behv_problems'),
+          mapping=ggplot2::aes(color=dog_adolescent_or_adult_training),
+          diag=list(discrete='barDiag',
+                    continuous=wrap('densityDiag', alpha=0.5)),
+          legend=1,
+          progress=FALSE) +
+  theme(legend.position='bottom')
+```
+Older dogs are more likely to have received some adult training. 
+
+
+
+## Discrete Variables
+
+### Independent Variables
+
+```{r, fig.height=5, fig.width=5}
+vars <- c(
+  'any_training',
+  'dog_was_acquired_pre_adolescent',
+  'dog_attended_pre_adolescent_training',
+  'dog_adolescent_or_adult_training',
+  'dog_gender',
+  'trainer_group_classes',
+  'trainer_boot_camp',
+  'trainer_home_private',
+  'training_complete_age_6_to_18mo',
+  'training_complete_age_18_to_36mo',
+  'training_complete_age_continue_in_home',
+  'training_complete_age_continue_classes'
+)
+
+plot_list <- list()
+for (i in 1:length(vars)) {
+  col <- vars[i]
+  p <- df_valid_responses %>%
+    select(col) %>%
+    drop_na(col) %>%
+    ggplot(aes_string(x = col)) +
+    geom_bar(fill='steelblue')
+  plot_list[[i]] <- p
+}
+ggarrange(plotlist=plot_list, ncol=3, nrow=4)
+
+```
+
+
+```{r} 
+#save figure
+png(file="/Users/ekeller/ccbs/ccbs-study-3/figures/training_variables_barplot.png",
+width=12, height=8, units="in", res=300)
+ggarrange(plotlist=plot_list, ncol=3, nrow=4)
+dev.off()
+```
+
+
+### Dependent Variables
+
+```{r, fig.height=4, fig.width=5}
+outcomes <- c(
+  'dog_was_aggressive',
+  'dog_has_fear_or_anxiety',
+  'dog_has_jumped_problematically',
+  'dog_has_barked_excessively',
+  'dog_has_coprophagia',
+  'dog_has_repetitive_behavior',
+  'dog_has_house_soiled',
+  'dog_has_rolled_in_repulsive_material',
+  'dog_was_hyperactive',
+  'dog_was_destructive',
+  'dog_has_escaped',
+  'dog_has_mounted'
+)
+
+plot_list <- list()
+for (i in 1:length(outcomes)) {
+  col <- outcomes[i]
+  p <- df_valid_responses %>%
+    select(col) %>%
+    drop_na(col) %>%
+    ggplot(aes_string(x = col)) +
+    geom_bar(fill='steelblue')
+  plot_list[[i]] <- p
+}
+ggarrange(plotlist=plot_list, ncol=4, nrow=3)
+```
+
+```{r} 
+#save figure
+png(file="/Users/ekeller/ccbs/ccbs-study-3/figures/behavior_problems_barplot.png",
+width=12, height=8, units="in", res=300)
+ggarrange(plotlist=plot_list, ncol=3, nrow=4)
+dev.off()
+```
+
+
+
+```{r}
+
+#save all valid responses as dataset
+saveRDS(df_valid_responses, '/Users/ekeller/ccbs/ccbs-study-3/data/df_valid_responses.Rds')
+
+
+#save datasets separated by training style 
+saveRDS(df_reward_only_training, '/Users/ekeller/ccbs/ccbs-study-3/data/df_reward_only_training.Rds')
+saveRDS(df_control_no_adult_training,'/Users/ekeller/ccbs/ccbs-study-3/data/df_control_no_adult_training.Rds')
+
+
+```
+
diff --git a/src/notebook-2-inference-revision.Rmd b/src/notebook-2-inference-revision.Rmd
new file mode 100644
index 0000000..03182c6
--- /dev/null
+++ b/src/notebook-2-inference-revision.Rmd
@@ -0,0 +1,574 @@
+---
+title: "Inferential Data Analysis"
+author: "Liz Keller"
+output: html_notebook
+date: "`r Sys.Date()`"
+
+---
+
+```{r setup, include=FALSE}
+library(tidyverse)
+library(ggpubr)
+library(GGally)
+library(car)
+
+knitr::opts_chunk$set(echo=TRUE)
+
+#set.seed()
+```
+
+# Loading the Data
+
+Load the raw data and verify its dimensions and structure.
+
+```{r}
+df <- readRDS('../data/tidy.Rds')
+dim(df)
+summary(df)
+
+```
+
+# subset only valid responses
+# this is defined by dogs > 12 months of age, and respondent answered question whether dog had received adolescent/adult training
+
+```{r}
+df_valid_responses <- readRDS('../data/df_valid_responses.Rds')
+dim(df_valid_responses)
+summary(df_valid_responses)
+
+#command to derive from full dataset:
+#df_valid_responses <- subset(df, dog_adolescent_or_adult_training==TRUE | dog_adolescent_or_adult_training==FALSE)
+
+```
+
+# Control vs Experimental Groups
+
+# with all valid responses:
+control = adult training FALSE
+exp = adult training TRUE
+
+
+# using reward-only based adult training as exp
+# (dogs who received discipline-based or mixed training excluded from exp group)
+control = adult training FALSE
+exp = reward_only_adult_training TRUE
+
+```{r}
+
+#drawing from all valid responses
+df_control <- df_valid_responses %>%
+  filter(dog_adolescent_or_adult_training == FALSE)
+dim(df_control)
+
+df_exp <- df_valid_responses %>%
+  filter(dog_adolescent_or_adult_training == TRUE)
+dim(df_exp)
+
+
+# only dogs that received reward-based training
+df_exp_reward <- readRDS('../data/df_reward_only_training.Rds')
+dim(df_exp_reward)
+
+
+```
+
+## Overview
+
+The first question we want to answer is if training during adolescence and/or 
+adulthood has an impact on the likelihood of a dog having certain behavior problems. 
+
+The behavior problems we are exploring are:
+
+- Aggression
+- Barking excessively
+- Coprophagia
+- Destructive behavior
+- Escaping/running away
+- Fear/anxiety
+- House soiling
+- Hyperactivity
+- Jumping problematically
+- Mounting
+- Repetitive behavior
+- Rolling in repulsive materials
+
+```{r}
+outcomes <- c(
+  'dog_was_aggressive',
+  'dog_has_fear_or_anxiety',
+  'dog_has_jumped_problematically',
+  'dog_has_barked_excessively',
+  'dog_has_coprophagia',
+  'dog_has_repetitive_behavior',
+  'dog_has_house_soiled',
+  'dog_has_rolled_in_repulsive_material',
+  'dog_was_hyperactive',
+  'dog_was_destructive',
+  'dog_has_escaped',
+  'dog_has_mounted'
+)
+outcomes <- sort(outcomes)
+```
+
+## Statistical Models
+
+We check for correlation between the predictor (attending adolescent / adult training) and
+each outcome (the presence of a specific behavior problem). 
+
+Here we perform the Fisher Exact test with a Benjamini-Hochberg correction.
+(The Fischer test is useful for categorical data where the sample size is small)
+
+```{r}
+
+pred <- 'dog_adolescent_or_adult_training'
+
+idx <- NULL
+p_values <- NULL
+odds_ratios <- NULL 
+conf_int_low <- NULL
+conf_int_high <- NULL
+
+for (outcome in outcomes) {
+  tbl <- table(df_valid_responses[, pred], df_valid_responses[, outcome], dnn=c(pred, outcome))
+  fisher <- fisher.test(tbl)
+  idx <- c(idx, outcome)
+  p_values <- c(p_values, fisher$p.value)
+  odds_ratios <- c(odds_ratios, fisher$estimate[[1]])
+  conf_int_low <- c(conf_int_low,fisher$conf.int[[1]])
+  conf_int_high <- c(conf_int_high,fisher$conf.int[[2]])
+}
+
+# Correct for the possibility of Type I errors.
+p_values <- p.adjust(p_values, method='BH')
+
+# Form a result data frame.
+df_out <- data.frame(outcome=idx, p_value=p_values,
+                     odds_ratio=odds_ratios, 
+                     conf_int1=conf_int_low, conf_int2=conf_int_high)
+
+add_sig_columns <- function(df) {
+  df$level <- ''
+  df[df$p_value <= .05, 'level'] <- '*'
+  df[df$p_value <= .01, 'level'] <- '**'
+  df[df$p_value <= .001, 'level'] <- '***'
+  
+  df$dir <- ''
+  df[df$odds_ratio < 1, 'dir'] <- '-'
+  df[df$odds_ratio > 1, 'dir'] <- '+'
+  
+  return (df)
+}
+
+df_out <- add_sig_columns(df_out)
+print(knitr::kable(df_out))
+
+```
+There is a significant negative correlation between adult 
+training and some specific behaviors: house soiling and destructiveness
+
+there is a significant positive correlation with adult training and
+problematic jumping
+
+
+#check chi-squared test for comparison
+
+```{r}
+
+pred <- 'dog_adolescent_or_adult_training'
+
+idx <- NULL
+p_values <- NULL
+odds_ratios <- NULL 
+for (outcome in outcomes) {
+  tbl <- table(df_valid_responses[, pred], df_valid_responses[, outcome], dnn=c(pred, outcome))
+  chisq <- chisq.test(tbl)
+  idx <- c(idx, outcome)
+  p_values <- c(p_values, chisq$p.value)
+  #odds_ratios <- c(odds_ratios, chisq$estimate[[1]])
+}
+
+# Correct for the possibility of Type I errors (false positive/discovery rate).
+p_values <- p.adjust(p_values, method='BH')
+
+# Form a result data frame.
+df_out_chisq <- data.frame(outcome=idx, p_value=p_values)
+
+df_out_chisq <- add_sig_columns(df_out_chisq)
+print(knitr::kable(df_out_chisq))
+
+```
+Result: same variables are of same order of significance with chi squared test
+
+
+## Binary Logistic Regression
+
+#The simplest possible model: adult / adolescent training as the sole predictor
+
+```{r}
+pred <- 'dog_adolescent_or_adult_training'
+
+for (outcome in outcomes) {
+  cat(paste(replicate(80, '-'), collapse=''))
+  cat(paste0('\n', outcome, '\n'))
+  f <- as.formula(paste0(outcome, '~', '.'))
+  
+  df_tmp <- df_valid_responses[,c(outcome, pred)]
+
+  glm_fit <- glm(f, data=df_tmp, family='binomial')
+  print(summary(glm_fit))
+  print(exp(cbind(OR=coef(glm_fit), suppressMessages(confint(glm_fit)))))
+  #cat('\nVIF:\n')
+  #print(car::vif(glm_fit))
+  cat('\n')
+}
+```
+
+Let's consider what other factors may also come in to play:
+
+- Dog Age
+- Dog Sex
+- Acquiring the dog pre-adolescent
+- Pre-adolescent (puppy) training
+- interaction between puppy and adult training
+
+We'll perform logistic regression to determine the impact of these factors. To
+perform logistic regression we'll need to ensure out data subsets have enough
+responses (`n >= 10`) for each possible answer to be included in the model.
+
+```{r}
+apply_min_xtab <- function(df, outcome, cutoff=10)
+{
+  drops <- NULL
+  for (col in names(df)) {
+    if (col == outcome) next
+    if (is.integer(df[,col])) next
+    if (is.numeric(df[,col])) next # added in order not to exclude dog age
+    
+    xtab <- table(df[,col], df[,outcome])
+    if (min(xtab) < cutoff) {
+      drops <- c(drops, col)
+      break
+    }
+  }
+  
+  if (length(drops) > 0) {
+    cat('\nDropped from model due to insufficient responses:\n')
+    cat(drops)
+    cat('\n')
+  }
+  
+  return(df[, !(names(df) %in% drops)])
+}
+```
+
+Now we perform the logistic regression.
+
+when including age (the only continuous variable)
+do not apply exclusion for min number of responses 
+
+#without interaction terms
+
+```{r}
+pred <- 'dog_adolescent_or_adult_training'
+glm_attribs <- c(
+  'dog_age_in_months',
+  'dog_gender',
+  'dog_was_acquired_pre_adolescent',
+  'dog_attended_pre_adolescent_training'
+)
+
+for (outcome in outcomes) {
+  cat(paste(replicate(80, '-'), collapse=''))
+  cat(paste0('\n', outcome, '\n'))
+  f <- as.formula(paste0(outcome, '~', '.'))
+  
+  df_tmp <- df_valid_responses[,c(outcome, pred, glm_attribs)]
+  #do not apply min function-- this only excludes age, which is continuous
+  #df_tmp <- apply_min_xtab(df_tmp, outcome) 
+  
+  glm_fit <- glm(f, data=df_tmp, family='binomial')
+  print(summary(glm_fit))
+  print(exp(cbind(OR=coef(glm_fit), suppressMessages(confint(glm_fit)))))
+  cat('\nVIF:\n')
+  print(car::vif(glm_fit))
+  cat('\n')
+}
+
+# note VIF above 5 is red flag
+
+```
+
+# now try a model with an interaction term
+
+
+```{r}
+
+
+for (outcome in outcomes){
+  
+  #new section
+  cat(paste(replicate(80, '-'), collapse=''))
+  cat(paste0('\n', outcome, '\n'))
+  
+  #format variables as formula
+  f <- as.formula(paste0(outcome, '~', 'dog_adolescent_or_adult_training  + dog_attended_pre_adolescent_training        + dog_adolescent_or_adult_training*dog_attended_pre_adolescent_training + dog_was_acquired_pre_adolescent        + dog_was_acquired_pre_adolescent*dog_attended_pre_adolescent_training + dog_age_in_months +  dog_gender'))
+  
+
+  df_tmp1 <- df_valid_responses[,c(outcome, 'dog_adolescent_or_adult_training', 'dog_attended_pre_adolescent_training','dog_was_acquired_pre_adolescent','dog_age_in_months','dog_gender')]
+
+  df_tmp2 <- df_tmp1[complete.cases(df_tmp1),]
+
+  glm_fit <- glm(f, data=df_tmp2, family = 'binomial')
+
+  print(summary(glm_fit))
+  
+  print(exp(cbind(OR=coef(glm_fit), suppressMessages(confint(glm_fit)))))
+    cat('\nVIF:\n')
+    print(car::vif(glm_fit,type="predictor"))
+    cat('\n')
+
+}
+
+
+
+```
+
+
+
+# instead of individual behaviors, use number of behavior problems as outcome
+
+```{r}
+
+
+#format variables as formula
+f <- as.formula(paste0('num_behv_problems', '~', 'dog_adolescent_or_adult_training  + dog_attended_pre_adolescent_training  + dog_adolescent_or_adult_training*dog_attended_pre_adolescent_training + dog_was_acquired_pre_adolescent + dog_was_acquired_pre_adolescent*dog_attended_pre_adolescent_training + dog_age_in_months + dog_gender'))
+  
+  df_tmp1 <- df_valid_responses[,c('num_behv_problems', 'dog_adolescent_or_adult_training', 'dog_attended_pre_adolescent_training','dog_was_acquired_pre_adolescent','dog_age_in_months','dog_gender')]
+
+df_tmp2 <- df_tmp1[complete.cases(df_tmp1),]
+
+glm_fit <- glm(f, data=df_tmp2, family = 'quasi')
+
+print(summary(glm_fit))
+
+print(exp(cbind(OR=coef(glm_fit), suppressMessages(confint(glm_fit)))))
+  cat('\nVIF:\n')
+  print(car::vif(glm_fit,type="predictor"))
+  cat('\n')
+
+```
+
+
+# Investigating Training Specifics
+
+we have identified that adolescent/adult training has a  
+significant impact on jumping.
+
+Does training style make a difference in the outcomes?
+eliminate dogs who received discipline-based or mixed training from experimental group
+
+```{r}
+
+#identify dogs who received reward only adult training
+df_reward_only_training <- subset(df_valid_responses, reward_only_adult_training==TRUE)
+
+#combine control and modified exp groups
+df_valid_responses_reward_only <- rbind(df_reward_only_training,df_control)
+
+```
+
+
+## repeat Binary Logistic Regression with reward-only experiment group
+
+
+```{r}
+
+
+for (outcome in outcomes){
+  
+  #new section
+  cat(paste(replicate(80, '-'), collapse=''))
+  cat(paste0('\n', outcome, '\n'))
+  
+  #format variables as formula
+  f <- as.formula(paste0(outcome, '~', 'dog_adolescent_or_adult_training  + dog_attended_pre_adolescent_training  + dog_adolescent_or_adult_training*dog_attended_pre_adolescent_training + dog_was_acquired_pre_adolescent + dog_was_acquired_pre_adolescent*dog_attended_pre_adolescent_training + dog_age_in_months + dog_gender'))
+  
+  df_tmp1 <- df_valid_responses_reward_only[,c(outcome, 'dog_adolescent_or_adult_training', 'dog_attended_pre_adolescent_training','dog_was_acquired_pre_adolescent','dog_age_in_months','dog_gender')]
+
+df_tmp2 <- df_tmp1[complete.cases(df_tmp1),]
+
+glm_fit <- glm(f, data=df_tmp2, family = 'binomial')
+
+print(summary(glm_fit))
+
+print(exp(cbind(OR=coef(glm_fit), suppressMessages(confint(glm_fit)))))
+  cat('\nVIF:\n')
+  print(car::vif(glm_fit,type="predictor"))
+  cat('\n')
+
+}
+
+
+
+```
+# instead of individual behaviors, use number of behavior problems as outcome
+
+```{r}
+
+
+#format variables as formula
+f <- as.formula(paste0('num_behv_problems', '~', 'dog_adolescent_or_adult_training  + dog_attended_pre_adolescent_training  + dog_adolescent_or_adult_training*dog_attended_pre_adolescent_training + dog_was_acquired_pre_adolescent + dog_was_acquired_pre_adolescent*dog_attended_pre_adolescent_training + dog_age_in_months + dog_gender'))
+  
+  df_tmp1 <- df_valid_responses_reward_only[,c('num_behv_problems', 'dog_adolescent_or_adult_training', 'dog_attended_pre_adolescent_training','dog_was_acquired_pre_adolescent','dog_age_in_months','dog_gender')]
+
+df_tmp2 <- df_tmp1[complete.cases(df_tmp1),]
+
+glm_fit <- glm(f, data=df_tmp2, family = 'quasi')
+
+print(summary(glm_fit))
+
+print(exp(cbind(OR=coef(glm_fit), suppressMessages(confint(glm_fit)))))
+  cat('\nVIF:\n')
+  print(car::vif(glm_fit,type="predictor"))
+  cat('\n')
+
+```
+
+
+
+############################################################
+
+Now we look just at the experiment group to see if style or location influenced the outcome
+
+We want to answer the following questions about the training:
+
+- Does continued training throughout dog's life produce a better outcome? 
+- Did training technique (reward vs. punishment) have an impact on the outcome?
+- Did the location of training have an impact on the outcome?
+
+We will need to expand the independent variables used for the model to answer
+these questions.
+
+(The vast majority of dogs who had adolescent/adult training received 
+reward-based training at group classes)
+
+```{r}
+
+outcomes_sig <- c(
+  'dog_was_aggressive',
+  'dog_has_fear_or_anxiety',
+  'dog_has_jumped_problematically',
+  'dog_has_barked_excessively',
+  'dog_has_coprophagia',
+  'dog_has_repetitive_behavior',
+  'dog_has_house_soiled',
+  'dog_has_rolled_in_repulsive_material',
+  'dog_was_hyperactive',
+  'dog_was_destructive',
+  'dog_has_escaped',
+  'dog_has_mounted'
+)
+outcomes_sig <- sort(outcomes_sig)
+
+#use single factor, reward or no
+training_style <- c(
+  'reward_only_puppy_training',
+  'reward_only_adult_training'
+  #'discipline',
+  #'mixed_style'
+)
+training_age <- c(
+  #'training_complete_age_6_to_18mo',
+  #'training_complete_age_18_to_36mo',
+  'training_complete_age_continue_in_home',
+  'training_complete_age_continue_classes'
+)
+training_location <- c(
+  'puppy_group_classes', 
+  'trainer_group_classes'
+)
+
+glm_attribs <- c(
+  training_style,
+  training_age,
+  training_location
+)
+
+print(glm_attribs)
+
+```
+
+# Build and evaluate models for the various outcomes.
+
+```{r}
+
+# we don't need to include adult training as variable because by definition all dogs in experimental group have had adult training
+# There will be some cases where there are less than 10 responses for a given variable/outcome combination, so eliminate those variables from the model using apply_min_xtab function defined above
+# Puppy training style and location creates problems in some models due to small sample size / multi colinearity
+
+for (outcome in outcomes){
+  
+  #new section
+  cat(paste(replicate(80, '-'), collapse=''))
+  cat(paste0('\n', outcome, '\n'))
+  
+  #format variables as formula
+  #f <- as.formula(paste0(outcome, '~', 'reward_only_puppy_training + reward_only_adult_training + puppy_group_classes + trainer_group_classes + training_complete_age_continue_in_home + training_complete_age_continue_classes'))
+  f <- as.formula(paste0(outcome, '~', '.'))
+  
+  #df_tmp1 <- df_exp[,c(outcome, 'reward_only_puppy_training', 'reward_only_adult_training','puppy_group_classes', 'trainer_group_classes', 'training_complete_age_continue_in_home', 'training_complete_age_continue_classes')]
+  df_tmp1 <- df_exp[,c(outcome, 'reward_only_adult_training','trainer_group_classes', 'training_complete_age_continue_in_home', 'training_complete_age_continue_classes')]
+
+df_tmp2 <- df_tmp1[complete.cases(df_tmp1),]
+df_tmp2 <- apply_min_xtab(df_tmp2, outcome) 
+
+glm_fit <- glm(f, data=df_tmp2, family = 'binomial')
+
+print(summary(glm_fit))
+
+print(exp(cbind(OR=coef(glm_fit), suppressMessages(confint(glm_fit)))))
+  cat('\nVIF:\n')
+  print(car::vif(glm_fit))#,type="predictor"))
+  cat('\n')
+
+}
+
+
+
+```
+
+# instead of individual behaviors, use number of behavior problems as outcome
+
+```{r}
+
+
+#format variables as formula
+
+f <- as.formula(paste0('num_behv_problems', '~', '.'))
+  
+#df_tmp1 <- df_exp[,c(outcome, 'reward_only_puppy_training', 'reward_only_adult_training','puppy_group_classes', 'trainer_group_classes', 'training_complete_age_continue_in_home', 'training_complete_age_continue_classes')]
+df_tmp1 <- df_exp[,c('num_behv_problems', 'reward_only_adult_training','trainer_group_classes', 'training_complete_age_continue_in_home', 'training_complete_age_continue_classes')]
+  
+df_tmp2 <- df_tmp1[complete.cases(df_tmp1),]
+
+glm_fit <- glm(f, data=df_tmp2, family = 'quasi')
+
+print(summary(glm_fit))
+
+print(exp(cbind(OR=coef(glm_fit), suppressMessages(confint(glm_fit)))))
+  cat('\nVIF:\n')
+  print(car::vif(glm_fit,type="predictor"))
+  cat('\n')
+
+```
+
+
+There seems to be very little impact of style of training, completion age, or training location on behavior outcomes
+
+
+
+```{r}
+
+
+```
+