From f3d8c1914188473e3947bbe7567e901fd25120df Mon Sep 17 00:00:00 2001 From: Liz Keller Date: Wed, 12 Jul 2023 16:47:01 +1200 Subject: [PATCH] Add files via upload Changes made to analysis for revised manuscript, submitted to Animals Jul 2023 --- src/notebook-1-eda-revision.Rmd | 805 ++++++++++++++++++++++++++ src/notebook-2-inference-revision.Rmd | 574 ++++++++++++++++++ 2 files changed, 1379 insertions(+) create mode 100644 src/notebook-1-eda-revision.Rmd create mode 100644 src/notebook-2-inference-revision.Rmd diff --git a/src/notebook-1-eda-revision.Rmd b/src/notebook-1-eda-revision.Rmd new file mode 100644 index 0000000..8726a36 --- /dev/null +++ b/src/notebook-1-eda-revision.Rmd @@ -0,0 +1,805 @@ +--- +title: "Exploratory Data Analysis" +date: "`r Sys.Date()`" +output: + html_document: default + pdf_document: default +--- + +setwd("C:/Users/keller/Desktop/CCBS/ccbs-study-3/") + + +```{r setup, include=FALSE} +library(tidyverse) +library(ggpubr) +library(GGally) +library(moments) +library(scales) +knitr::opts_chunk$set(echo=TRUE) + + +``` + +# Overview + +This notebook documents the exploratory data analysis undertaken for this +project. + +# Loading the Data + +Load the tidy data set into memory. If it does not exist, then run the notebook +that builds it. + +```{r} +tidy_data_filepath <- '../data/tidy.Rds' +if (!file.exists(tidy_data_filepath)) { + rmarkdown::render('./notebook-0-tidy.Rmd') +} +df <- readRDS(tidy_data_filepath) +stopifnot(identical(dim(df)+0, c(1011, 111))) +str(df) +``` +## Overview of Data Set + +# summarize data + +```{r} +summary(df) + +``` + +```{r} +# Number of unique owners after inclusion criteria. +length(unique(df$owner_id)) + +``` + +```{r} +#number of dogs per household. +summary(plyr::count(df, 'owner_id')) +``` +Median number of dogs per household = 1 (range: 1 to 21), mean 1.6. Now we can +drop the column to simply the data set. + +```{r} +df <- subset(df, select=-c(owner_id)) +``` + + +Add a column indicating number of reported behavior problems + +#how may dogs have at least one behavior issue? +#what is the average number of behavior problems? + +```{r} + +df <- df %>% + mutate(behav_problem = ifelse( + dog_was_aggressive | dog_has_fear_or_anxiety | dog_has_jumped_problematically | + dog_has_barked_excessively | dog_has_coprophagia | dog_has_repetitive_behavior | + dog_has_house_soiled | dog_has_rolled_in_repulsive_material | dog_was_hyperactive | + dog_was_destructive | dog_has_escaped | dog_has_mounted, TRUE, FALSE)) +summary(df$behav_problem) + +#create variable with number of behavior problems +df_temp <- df[,c('dog_was_aggressive', 'dog_has_fear_or_anxiety', 'dog_has_jumped_problematically', + 'dog_has_barked_excessively', 'dog_has_coprophagia', 'dog_has_repetitive_behavior', + 'dog_has_house_soiled', 'dog_has_rolled_in_repulsive_material', 'dog_was_hyperactive', + 'dog_was_destructive', 'dog_has_escaped', 'dog_has_mounted')] + +#count number of 'True' responses per row +df$num_behv_problems <- rowSums(df_temp,na.rm=TRUE) +summary(df$num_behv_problems) + +``` + +# Summary stats specific to only valid responses which meet the inclusion criteria (valid response = dog > 12 months of age, answered True or False to dog_adolescent_or_adult_training) + +```{r} + +df_valid_responses <- subset(df, dog_adolescent_or_adult_training==TRUE | dog_adolescent_or_adult_training==FALSE) +summary(df_valid_responses) + +``` + +#Break into control and experimental groups +# Hypotheses: +# 1. adolescent / adult training will reduce odds of observing a behavior problem +# 2. puppy training will reduce odds of observing a behavior problems +# 3. Having both puppy and adult training will further reduce odds of behavior problems + +```{r} + +# Experiment 1: +# control and experimental groups +# control = adult training = FALSE; exp = adult training = TRUE +df1_control_no_adult_training <- subset(df_valid_responses, dog_adolescent_or_adult_training==FALSE) +df1_exp_yes_adult_training <- subset(df_valid_responses, dog_adolescent_or_adult_training==TRUE) + +# Experiment 2: controlling for puppy training +# control and experimental groups +# 4 groupings +# first eliminate NA's from puppy training question +df_puppy_train_valid_responses <- subset(df_valid_responses, dog_attended_pre_adolescent_training==FALSE | + dog_attended_pre_adolescent_training==TRUE) + +df2_puppy_training_only <- subset(df_puppy_train_valid_responses , dog_attended_pre_adolescent_training==TRUE & dog_adolescent_or_adult_training==FALSE) +df2_puppy_and_adult_training <- subset(df_puppy_train_valid_responses , dog_attended_pre_adolescent_training==TRUE & dog_adolescent_or_adult_training==TRUE) +df2_adult_only <- subset(df_puppy_train_valid_responses , dog_attended_pre_adolescent_training==FALSE & dog_adolescent_or_adult_training==TRUE) +df2_no_training <- subset(df_puppy_train_valid_responses , dog_attended_pre_adolescent_training==FALSE & dog_adolescent_or_adult_training==FALSE) + +nrow(df2_puppy_training_only) + nrow(df2_puppy_and_adult_training) + nrow(df2_adult_only) + nrow(df2_no_training) + + +``` + + +```{r} + +#summary of number of behavior problems + +summary(df_valid_responses$num_behv_problems) +summary(df1_control_no_adult_training$num_behv_problems) +summary(df1_exp_yes_adult_training$num_behv_problems) + +``` + +```{r} +#summary(df_puppy_train_valid_responses$num_behv_problems) + +``` + + +#look at distribution of number of behavior problems, by grouping + +```{r} + +p <- ggplot(df_valid_responses, aes(x=num_behv_problems)) + geom_bar(fill='steelblue') + + scale_x_continuous(breaks=0:10) + theme(axis.text=element_text(size=14), axis.title=element_text(size=18,face="bold")) # + geom_text(aes(label = after_stat(count)), stat = "count", vjust = 1.1, colour = "white") + +p.labs <- p + labs(x = "Number of behavior problems per dog", y="Count") +p.labs + +``` + + +```{r} +#save figure +filename = "C:/Users/keller/Desktop/CCBS/ccbs-study-3/figures/fig1_number_of_behavior_problems_all_valid_responses.pdf" + +pdf(file=filename, width=12, height=8) +p.labs +dev.off() +``` + + +```{r} + +p <- ggplot(df1_control_no_adult_training, aes(x=num_behv_problems)) + geom_bar(fill='steelblue') + + scale_x_continuous(breaks=0:10) + theme(axis.text=element_text(size=14), axis.title=element_text(size=18,face="bold")) + +p.labs <- p + labs(x = "Number of behavior problems per dog", y="Count") +p.labs + +``` + +```{r} +#save figure +filename = "C:/Users/keller/Desktop/CCBS/ccbs-study-3/figures/fig1_number_of_behavior_problems_df1_control_no_adult_training.pdf" + +pdf(file=filename, width=12, height=8) +p.labs +dev.off() +``` + +```{r} + +p <- ggplot(df1_exp_yes_adult_training, aes(x=num_behv_problems)) + geom_bar(fill='steelblue') + + scale_x_continuous(breaks=0:10) + theme(axis.text=element_text(size=14), axis.title=element_text(size=18,face="bold")) + +p.labs <- p + labs(x = "Number of behavior problems per dog", y="Count") +p.labs + +``` + +```{r} +#save figure +filename = "C:/Users/keller/Desktop/CCBS/ccbs-study-3/figures/fig1_number_of_behavior_problems_df1_exp_yes_adult_training.pdf" + +pdf(file=filename, width=12, height=8) +p.labs +dev.off() +``` + +# is the mean number of behavior problems significantly different between control and exp? + +```{r} + +t.test(df1_control_no_adult_training$num_behv_problems, df1_exp_yes_adult_training$num_behv_problems, paired=FALSE, var.equal=TRUE) + +``` +# The t-test shows that the mean is not significantly different beween the two groups (p = 0.13) + + + +#calculate percentage of dogs with each behavior problem, by grouping + +```{r} + +behav_probs <- c( + 'dog_was_aggressive', + 'dog_has_fear_or_anxiety', + 'dog_has_jumped_problematically', + 'dog_has_barked_excessively', + 'dog_has_coprophagia', + 'dog_has_repetitive_behavior', + 'dog_has_house_soiled', + 'dog_has_rolled_in_repulsive_material', + 'dog_was_hyperactive', + 'dog_was_destructive', + 'dog_has_escaped', + 'dog_has_mounted' +) +behav_probs <- sort(behav_probs) + +tmp <- df_valid_responses %>% select(all_of(behav_probs)) + +``` + +```{r} + +#create table for percentage occurence + +# Define a dataframe +pct_df <- data.frame(Behavior = behav_probs, + Occurence = numeric(length(behav_probs))) + +for (behav_prob in behav_probs){ + + tbl <- plyr::count(tmp, behav_prob) / nrow(tmp) * 100 + #tmp <- tmp %>% + ## add percentage label with `sprintf()` + #perc = paste0(sprintf("%4.1f", tbl$freq[2]), "%") + perc = tbl$freq[2] + + pct_df[pct_df$Behavior==behav_prob,2] = perc + +} + +pct_df <- pct_df[order(pct_df$Occurence,decreasing = TRUE),] + + +``` + +```{r} + +my_labels <- c("Excessive Barking","Coprophagia","Escaping", "Fear or Anxiety", "House soiling", "Problematic jumping", "Mounting", "Repetitive Behavior", "Rolling in Repulsive Material", "Aggression","Destructive", "Hyperactivity" ) + +pct_labels = paste0(sprintf("%4.1f", pct_df$Occurence), "%") + +df_sorted <- arrange(pct_df, Occurence) + +p <- ggplot(pct_df, aes(x=Behavior,y=Occurence)) + geom_bar(stat="identity",fill='steelblue') + scale_x_discrete(labels=my_labels) + scale_y_continuous(limits=c(0,100)) + geom_text(aes(label = pct_labels), hjust = -0.2, colour = "black", size=5) + theme(axis.text=element_text(size=14), axis.title=element_text(size=16,face="bold")) + +p.labs <- p + coord_flip() + labs(y = "percentage of dogs", ) +p.labs + + +``` + + +```{r} +#save figure +filename="C:/Users/keller/Desktop/CCBS/ccbs-study-3/figures/behavior_problems_pct_all_valid_responses.pdf" +pdf(file=filename, width=12, height=8) +p.labs +dev.off() + +``` + + +# general notes on dataset: (all stats exclude no answer or NA's unless otherwise stated) +-- 322 (out of 1011 total obs) responses are excluded due to no response (NA's) in either training or behavior questions; many of these dogs were too young to take part in the survey (<12 mo) +-- 671 out of 689 or 97.4% reported at least one behavior problem +-- median number of behavior problems is 4, and mean is 3.7 +-- Dog gender spilt almost even, slight bias towards male (478 F / 507 M total, 325 F / 364 M with valid responses) +-- 634 unique dog owners, median 1 dog owned, mean 1.6 (all respondents) +-- Mean age (in months) is 80.9 (6.7 years) and median age is 75 (6.25 years) [Note all dogs less than 12 months excluded from analysis] +-- a majority of respondents (389/689 = 56%) acquired their dog in the pre-adolescent stage +-- of the respondents who provided an answer (559), just under half of dogs had attended pre-adolescent training (260/559 = 46.5%). The majority of these dogs (209/260 = 80.4%) had received reward-based training only. A further 15% (39/260) received a mix of reward and discipline-based training. The vast majority (229/260 = 88%) attended group classes (as opposed to private training or boot camp). +-- More than half of dogs had attended adolescent or adult training (442/689 = 64.2%). The vast majority (358/442 = 81.0%) reported receiving reward-based training only and attended group classes (387/442 = 87.6%) + +-- the majority continued some form of training post-puppyhood: 11% (77/689) completed in adolescence (defined as 6 to 18 months), 10.6% (73/689) completed between ages 18-36 months (social maturity), 39.6% (273/689) continue to train in-home, and 32.8% (226/689) continue to attend classes, with 53% (365/689) reporting continued training either in home or through classes + + +#specific behavior problems (overall survey population): +-- 56.2% reported aggression (387/689) +-- 73.9% reported fear or anxiety (509/689) +-- 19.4% reported problematic jumping (134/689) +-- 19.0% reported excessive barking (131/689) +-- 41.7% reported coprophagia (287/689) +-- 26.9% reported repetitive behavior (185/689) +-- 23.1% reported house soiling (159/689) +-- 41.1% reported rolling in repulsive material (283/689) +-- 12.5% reported hyperactivity (86/689) +-- 15.4% reported destructive behavior (106/689) +-- 26.3% reported escaping (181/689) +-- 18.4% reported mounting (127/689) + + +#split by control and exp group + +#control +```{r} + +tmp <- df1_control_no_adult_training %>% select(all_of(behav_probs)) + +#create table for percentage occurence + +# Define a dataframe +pct_df <- data.frame(Behavior = behav_probs, + Occurence = numeric(length(behav_probs))) + +for (behav_prob in behav_probs){ + + tbl <- plyr::count(tmp, behav_prob) / nrow(tmp) * 100 + + perc = tbl$freq[2] + + pct_df[pct_df$Behavior==behav_prob,2] = perc + +} + +pct_df <- pct_df[order(pct_df$Occurence,decreasing = TRUE),] + + +``` + +```{r} + +my_labels <- c("Excessive Barking","Coprophagia","Escaping", "Fear or Anxiety", "House soiling", "Problematic jumping", "Mounting", "Repetitive Behavior", "Rolling in Repulsive Material", "Aggression","Destructive", "Hyperactivity" ) + +pct_labels = paste0(sprintf("%4.1f", pct_df$Occurence), "%") + +df_sorted <- arrange(pct_df, Occurence) + +p <- ggplot(pct_df, aes(x=Behavior,y=Occurence)) + geom_bar(stat="identity",fill='steelblue') + scale_x_discrete(labels=my_labels) + scale_y_continuous(limits=c(0,100)) + geom_text(aes(label = pct_labels), hjust = -0.2, colour = "black", size=5) + theme(axis.text=element_text(size=14), axis.title=element_text(size=16,face="bold")) + +p.labs <- p + coord_flip() + labs(y = "percentage of dogs", ) +p.labs + + +``` + +```{r} +#save figure +filename="C:/Users/keller/Desktop/CCBS/ccbs-study-3/figures/behavior_problems_pct_df1_control_no_adult_training.pdf" +pdf(file=filename, width=12, height=8) +p.labs +dev.off() + +``` + +#experiment +```{r} + +tmp <- df1_exp_yes_adult_training %>% select(all_of(behav_probs)) + +#create table for percentage occurence + +# Define a dataframe +pct_df <- data.frame(Behavior = behav_probs, + Occurence = numeric(length(behav_probs))) + +for (behav_prob in behav_probs){ + + tbl <- plyr::count(tmp, behav_prob) / nrow(tmp) * 100 + + perc = tbl$freq[2] + + pct_df[pct_df$Behavior==behav_prob,2] = perc + +} + +pct_df <- pct_df[order(pct_df$Occurence,decreasing = TRUE),] + + +``` + +```{r} + +my_labels <- c("Excessive Barking","Coprophagia","Escaping", "Fear or Anxiety", "House soiling", "Problematic jumping", "Mounting", "Repetitive Behavior", "Rolling in Repulsive Material", "Aggression","Destructive", "Hyperactivity" ) + +pct_labels = paste0(sprintf("%4.1f", pct_df$Occurence), "%") + +df_sorted <- arrange(pct_df, Occurence) + +p <- ggplot(pct_df, aes(x=Behavior,y=Occurence)) + geom_bar(stat="identity",fill='steelblue') + scale_x_discrete(labels=my_labels) + scale_y_continuous(limits=c(0,100)) + geom_text(aes(label = pct_labels), hjust = -0.2, colour = "black", size=5) + theme(axis.text=element_text(size=14), axis.title=element_text(size=16,face="bold")) + +p.labs <- p + coord_flip() + labs(y = "percentage of dogs", ) +p.labs + + +``` +```{r} +#save figure +filename="C:/Users/keller/Desktop/CCBS/ccbs-study-3/figures/behavior_problems_pct_df1_exp_yes_adult_training.pdf" +pdf(file=filename, width=12, height=8) +p.labs +dev.off() + +``` +#demographics for each group separately + +```{r} + +#how many dogs in each group have at least one behavior problem? +nrow(subset(df1_control_no_adult_training,num_behv_problems > 0)) / nrow(df1_control_no_adult_training) *100 +nrow(subset(df1_exp_yes_adult_training,num_behv_problems > 0)) / nrow(df1_exp_yes_adult_training) *100 + +``` +```{r} + +#mean and median # of behavior problems +summary(df1_control_no_adult_training$num_behv_problems) +summary(df1_exp_yes_adult_training$num_behv_problems) + +``` +```{r} + +#what % of dogs in each group are female? +nrow(subset(df1_control_no_adult_training,dog_gender=="Female")) / nrow(df1_control_no_adult_training) *100 +nrow(subset(df1_exp_yes_adult_training,dog_gender=="Female")) / nrow(df1_exp_yes_adult_training) *100 + +``` +```{r} + +#mean and median age of dogs +summary(df1_control_no_adult_training$dog_age_in_months) +summary(df1_exp_yes_adult_training$dog_age_in_months) + +``` + +```{r} + +#what % of dogs in each group were acquired pre-adolescent, as puppies (6 mo or younger)? +nrow(subset(df1_control_no_adult_training,dog_was_acquired_pre_adolescent==TRUE)) / nrow(df1_control_no_adult_training) *100 +nrow(subset(df1_exp_yes_adult_training,dog_was_acquired_pre_adolescent==TRUE)) / nrow(df1_exp_yes_adult_training) *100 + +``` +```{r} + +#what % of dogs in each group attended puppy training? +nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE)) / nrow(df1_control_no_adult_training) *100 +nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE)) / nrow(df1_exp_yes_adult_training) *100 + +``` + +```{r} + +#what % of dogs in each group received reward-based puppy training? +nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE & dog_pre_adolescent_training_type=="Reward based")) / nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100 + +nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE & dog_pre_adolescent_training_type=="Reward based")) / nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100 + +``` +```{r} + +#what % of dogs in each group received discipline-based puppy training? +nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE & dog_pre_adolescent_training_type=="Discipline based")) / nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100 + +nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE & dog_pre_adolescent_training_type=="Discipline based")) / nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100 + + +#Mixed balance of both reward and discipline +nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE & dog_pre_adolescent_training_type=="Mixed balance of both reward and discipline")) / nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100 + +nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE & dog_pre_adolescent_training_type=="Mixed balance of both reward and discipline")) / nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100 + +``` + + + +```{r} + +#what % of dogs in each group received group puppy training? +nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training=="TRUE" & dog_pre_adolescent_training_location=="Myself/family member attended group classes")) / nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100 + +nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE & dog_pre_adolescent_training_location=="Myself/family member attended group classes")) / nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100 + +``` + +```{r} + +#what % of dogs in each group received private in-home trainer as puppies? +nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training=="TRUE" & dog_pre_adolescent_training_location=="Hired private in-home trainer")) / nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100 + +nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE & dog_pre_adolescent_training_location=="Hired private in-home trainer")) / nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100 + + + +#Sent to a professional training school/boot camp as puppies +nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training=="TRUE" & dog_pre_adolescent_training_location=="Sent to a professional training school/boot camp")) / nrow(subset(df1_control_no_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100 + +nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE & dog_pre_adolescent_training_location=="Sent to a professional training school/boot camp")) / nrow(subset(df1_exp_yes_adult_training,dog_attended_pre_adolescent_training==TRUE)) *100 + +``` + + + +```{r} + +# exp group only by definition + +#what % of dogs received reward-based adult training? +nrow(subset(df1_exp_yes_adult_training, reward_only_adult_training==TRUE)) / nrow(df1_exp_yes_adult_training) *100 + +nrow(subset(df1_exp_yes_adult_training, dog_adolescent_or_adult_training_style=="Discipline based")) / nrow(df1_exp_yes_adult_training) *100 + +nrow(subset(df1_exp_yes_adult_training, dog_adolescent_or_adult_training_style=="Mixed balance of both reward and discipline")) / nrow(df1_exp_yes_adult_training) *100 + + + +``` + +```{r} + +#what % of dogs in each group received group puppy training? +nrow(subset(df1_exp_yes_adult_training, trainer_group_classes==TRUE)) / nrow(df1_exp_yes_adult_training) *100 + +nrow(subset(df1_exp_yes_adult_training, trainer_home_private==TRUE)) / nrow(df1_exp_yes_adult_training) *100 + +nrow(subset(df1_exp_yes_adult_training, trainer_boot_camp==TRUE)) / nrow(df1_exp_yes_adult_training) *100 + +``` + +```{r} + +#when did dogs "complete" training? +nrow(subset(df1_exp_yes_adult_training,training_complete_age_6_to_18mo==TRUE)) / nrow(df1_exp_yes_adult_training) *100 + +nrow(subset(df1_exp_yes_adult_training,training_complete_age_18_to_36mo==TRUE)) / nrow(df1_exp_yes_adult_training) *100 + +nrow(subset(df1_exp_yes_adult_training,training_complete_age_continue_in_home==TRUE)) / nrow(df1_exp_yes_adult_training) *100 + +nrow(subset(df1_exp_yes_adult_training,training_complete_age_continue_classes==TRUE)) / nrow(df1_exp_yes_adult_training) *100 + +``` + +# make some plots to visualise data + +# examine variable distribution for all variables + +```{r} + +# all responses +ggplot(df, aes(dog_age_in_months)) + geom_bar(fill='steelblue') +skewness(df$dog_age_in_months,na.rm=TRUE) + +ggplot(df, aes(owner_age)) + geom_bar(fill='steelblue') +skewness(df$owner_age,na.rm=TRUE) + +#valid responses only +ggplot(df_valid_responses, aes(dog_age_in_months)) + geom_bar(fill='steelblue') +skewness(df_valid_responses$dog_age_in_months,na.rm=TRUE) + +ggplot(df_valid_responses, aes(owner_age)) + geom_bar(fill='steelblue') +skewness(df_valid_responses$owner_age,na.rm=TRUE) + +``` + +We see a modest right skew (younger) in the dog age plot, and a left skew (older) in the owner age plot. + + +# We want to isolate the effect of adolescent / adult training + + -- using all valid answers which meet our inclusion criteria = df_valid_responses + +```{r} + +#testing some different groupings of the data + +#subset of data with dogs that are continuing training at the time of survey +df_continue_training <- subset(df, training_complete_age_continue_in_home==TRUE | training_complete_age_continue_classes==TRUE) + +#subset of data to eliminate dogs that received some discipline-based training +df_reward_only_training <- subset(df_valid_responses, reward_only_adult_training==TRUE) +df_control_no_adult_training <- subset(df_valid_responses, dog_adolescent_or_adult_training==FALSE) + +nrow(subset(df_reward_only_training)) +nrow(subset(df_control_no_adult_training)) + +``` + +# Including the effect of pre-adolescent training: +Of the 1011 total survey responses, 452 are excluded because they did not answer whether the dog had received pre-adolescent training and/or adult training. This leaves 559 responses. The breakdown of the group: + +177 had NO training (neither pre-adolescent or adult) +37 had pre-adolescent but NO further training +223 had pre-adolescent AND adult training +122 had NO pre-adolescent training but did have adult training + +#These groups are not well-balanced, therefore they will not be used for further analysis + + +# Controlling for the adverse impact of discipline-based training + +Looking at reward-only based training vs. no training (eliminating dogs that had received discipline-based or mixed discipline and reward training): +605 responses in dataset +358 received reward-only adult training (experiment) +247 did not receive adult training (control) + + +# How many dogs with any type of training (either puppy or adult) have behavior problems? + +```{r} + +# Create column indicating whether the dog had any type of training (pre-adolescent or adolescent/adult) + +df_valid_responses <- df_valid_responses %>% + mutate(any_training = ifelse( + dog_attended_pre_adolescent_training | dog_adolescent_or_adult_training & !is.na(dog_attended_pre_adolescent_training), + TRUE, FALSE)) +summary(df_valid_responses$any_training) + +``` + + +```{r} + +df1 = subset(df_valid_responses, any_training == FALSE & behav_problem == FALSE) +df2 = subset(df_valid_responses, any_training == FALSE & behav_problem == TRUE) + +df3 = subset(df_valid_responses, any_training == TRUE & behav_problem == FALSE) +df4 = subset(df_valid_responses, any_training == TRUE & behav_problem == TRUE) + +``` + +6 dogs with no training have no behavior problems +171 dogs with no training do have behavior problems +11 dogs with some training have no behavior problems +371 dogs with some training do have behavior problems + +Most dogs in the study have at least one behavior problem so this is not very insightful + + + +the majority of dogs with behavior problems have between 2-5 separate behavior issues + +Is there a correlation between dog age and behavior problems? + +```{r} + +data<-dplyr::select(df_valid_responses,num_behv_problems,dog_age_in_months) +ggpairs(data) + +``` + +There does not appear to be a significant correlation between a dog's age and behavior problems, except perhaps at the extreme end (the dogs with 9 or 10 problems tend to be younger) + +#visualise continuous variables (dog age and num of behavior problems) + +```{r} +df_valid_responses %>% + ggpairs(columns=c('dog_age_in_months', + 'num_behv_problems'), + mapping=ggplot2::aes(color=dog_attended_pre_adolescent_training), + diag=list(discrete='barDiag', + continuous=wrap('densityDiag', alpha=0.5)), + legend=1, + progress=FALSE) + + theme(legend.position='bottom') + + +df_valid_responses %>% + ggpairs(columns=c('dog_age_in_months', + 'num_behv_problems'), + mapping=ggplot2::aes(color=dog_adolescent_or_adult_training), + diag=list(discrete='barDiag', + continuous=wrap('densityDiag', alpha=0.5)), + legend=1, + progress=FALSE) + + theme(legend.position='bottom') +``` +Older dogs are more likely to have received some adult training. + + + +## Discrete Variables + +### Independent Variables + +```{r, fig.height=5, fig.width=5} +vars <- c( + 'any_training', + 'dog_was_acquired_pre_adolescent', + 'dog_attended_pre_adolescent_training', + 'dog_adolescent_or_adult_training', + 'dog_gender', + 'trainer_group_classes', + 'trainer_boot_camp', + 'trainer_home_private', + 'training_complete_age_6_to_18mo', + 'training_complete_age_18_to_36mo', + 'training_complete_age_continue_in_home', + 'training_complete_age_continue_classes' +) + +plot_list <- list() +for (i in 1:length(vars)) { + col <- vars[i] + p <- df_valid_responses %>% + select(col) %>% + drop_na(col) %>% + ggplot(aes_string(x = col)) + + geom_bar(fill='steelblue') + plot_list[[i]] <- p +} +ggarrange(plotlist=plot_list, ncol=3, nrow=4) + +``` + + +```{r} +#save figure +png(file="/Users/ekeller/ccbs/ccbs-study-3/figures/training_variables_barplot.png", +width=12, height=8, units="in", res=300) +ggarrange(plotlist=plot_list, ncol=3, nrow=4) +dev.off() +``` + + +### Dependent Variables + +```{r, fig.height=4, fig.width=5} +outcomes <- c( + 'dog_was_aggressive', + 'dog_has_fear_or_anxiety', + 'dog_has_jumped_problematically', + 'dog_has_barked_excessively', + 'dog_has_coprophagia', + 'dog_has_repetitive_behavior', + 'dog_has_house_soiled', + 'dog_has_rolled_in_repulsive_material', + 'dog_was_hyperactive', + 'dog_was_destructive', + 'dog_has_escaped', + 'dog_has_mounted' +) + +plot_list <- list() +for (i in 1:length(outcomes)) { + col <- outcomes[i] + p <- df_valid_responses %>% + select(col) %>% + drop_na(col) %>% + ggplot(aes_string(x = col)) + + geom_bar(fill='steelblue') + plot_list[[i]] <- p +} +ggarrange(plotlist=plot_list, ncol=4, nrow=3) +``` + +```{r} +#save figure +png(file="/Users/ekeller/ccbs/ccbs-study-3/figures/behavior_problems_barplot.png", +width=12, height=8, units="in", res=300) +ggarrange(plotlist=plot_list, ncol=3, nrow=4) +dev.off() +``` + + + +```{r} + +#save all valid responses as dataset +saveRDS(df_valid_responses, '/Users/ekeller/ccbs/ccbs-study-3/data/df_valid_responses.Rds') + + +#save datasets separated by training style +saveRDS(df_reward_only_training, '/Users/ekeller/ccbs/ccbs-study-3/data/df_reward_only_training.Rds') +saveRDS(df_control_no_adult_training,'/Users/ekeller/ccbs/ccbs-study-3/data/df_control_no_adult_training.Rds') + + +``` + diff --git a/src/notebook-2-inference-revision.Rmd b/src/notebook-2-inference-revision.Rmd new file mode 100644 index 0000000..03182c6 --- /dev/null +++ b/src/notebook-2-inference-revision.Rmd @@ -0,0 +1,574 @@ +--- +title: "Inferential Data Analysis" +author: "Liz Keller" +output: html_notebook +date: "`r Sys.Date()`" + +--- + +```{r setup, include=FALSE} +library(tidyverse) +library(ggpubr) +library(GGally) +library(car) + +knitr::opts_chunk$set(echo=TRUE) + +#set.seed() +``` + +# Loading the Data + +Load the raw data and verify its dimensions and structure. + +```{r} +df <- readRDS('../data/tidy.Rds') +dim(df) +summary(df) + +``` + +# subset only valid responses +# this is defined by dogs > 12 months of age, and respondent answered question whether dog had received adolescent/adult training + +```{r} +df_valid_responses <- readRDS('../data/df_valid_responses.Rds') +dim(df_valid_responses) +summary(df_valid_responses) + +#command to derive from full dataset: +#df_valid_responses <- subset(df, dog_adolescent_or_adult_training==TRUE | dog_adolescent_or_adult_training==FALSE) + +``` + +# Control vs Experimental Groups + +# with all valid responses: +control = adult training FALSE +exp = adult training TRUE + + +# using reward-only based adult training as exp +# (dogs who received discipline-based or mixed training excluded from exp group) +control = adult training FALSE +exp = reward_only_adult_training TRUE + +```{r} + +#drawing from all valid responses +df_control <- df_valid_responses %>% + filter(dog_adolescent_or_adult_training == FALSE) +dim(df_control) + +df_exp <- df_valid_responses %>% + filter(dog_adolescent_or_adult_training == TRUE) +dim(df_exp) + + +# only dogs that received reward-based training +df_exp_reward <- readRDS('../data/df_reward_only_training.Rds') +dim(df_exp_reward) + + +``` + +## Overview + +The first question we want to answer is if training during adolescence and/or +adulthood has an impact on the likelihood of a dog having certain behavior problems. + +The behavior problems we are exploring are: + +- Aggression +- Barking excessively +- Coprophagia +- Destructive behavior +- Escaping/running away +- Fear/anxiety +- House soiling +- Hyperactivity +- Jumping problematically +- Mounting +- Repetitive behavior +- Rolling in repulsive materials + +```{r} +outcomes <- c( + 'dog_was_aggressive', + 'dog_has_fear_or_anxiety', + 'dog_has_jumped_problematically', + 'dog_has_barked_excessively', + 'dog_has_coprophagia', + 'dog_has_repetitive_behavior', + 'dog_has_house_soiled', + 'dog_has_rolled_in_repulsive_material', + 'dog_was_hyperactive', + 'dog_was_destructive', + 'dog_has_escaped', + 'dog_has_mounted' +) +outcomes <- sort(outcomes) +``` + +## Statistical Models + +We check for correlation between the predictor (attending adolescent / adult training) and +each outcome (the presence of a specific behavior problem). + +Here we perform the Fisher Exact test with a Benjamini-Hochberg correction. +(The Fischer test is useful for categorical data where the sample size is small) + +```{r} + +pred <- 'dog_adolescent_or_adult_training' + +idx <- NULL +p_values <- NULL +odds_ratios <- NULL +conf_int_low <- NULL +conf_int_high <- NULL + +for (outcome in outcomes) { + tbl <- table(df_valid_responses[, pred], df_valid_responses[, outcome], dnn=c(pred, outcome)) + fisher <- fisher.test(tbl) + idx <- c(idx, outcome) + p_values <- c(p_values, fisher$p.value) + odds_ratios <- c(odds_ratios, fisher$estimate[[1]]) + conf_int_low <- c(conf_int_low,fisher$conf.int[[1]]) + conf_int_high <- c(conf_int_high,fisher$conf.int[[2]]) +} + +# Correct for the possibility of Type I errors. +p_values <- p.adjust(p_values, method='BH') + +# Form a result data frame. +df_out <- data.frame(outcome=idx, p_value=p_values, + odds_ratio=odds_ratios, + conf_int1=conf_int_low, conf_int2=conf_int_high) + +add_sig_columns <- function(df) { + df$level <- '' + df[df$p_value <= .05, 'level'] <- '*' + df[df$p_value <= .01, 'level'] <- '**' + df[df$p_value <= .001, 'level'] <- '***' + + df$dir <- '' + df[df$odds_ratio < 1, 'dir'] <- '-' + df[df$odds_ratio > 1, 'dir'] <- '+' + + return (df) +} + +df_out <- add_sig_columns(df_out) +print(knitr::kable(df_out)) + +``` +There is a significant negative correlation between adult +training and some specific behaviors: house soiling and destructiveness + +there is a significant positive correlation with adult training and +problematic jumping + + +#check chi-squared test for comparison + +```{r} + +pred <- 'dog_adolescent_or_adult_training' + +idx <- NULL +p_values <- NULL +odds_ratios <- NULL +for (outcome in outcomes) { + tbl <- table(df_valid_responses[, pred], df_valid_responses[, outcome], dnn=c(pred, outcome)) + chisq <- chisq.test(tbl) + idx <- c(idx, outcome) + p_values <- c(p_values, chisq$p.value) + #odds_ratios <- c(odds_ratios, chisq$estimate[[1]]) +} + +# Correct for the possibility of Type I errors (false positive/discovery rate). +p_values <- p.adjust(p_values, method='BH') + +# Form a result data frame. +df_out_chisq <- data.frame(outcome=idx, p_value=p_values) + +df_out_chisq <- add_sig_columns(df_out_chisq) +print(knitr::kable(df_out_chisq)) + +``` +Result: same variables are of same order of significance with chi squared test + + +## Binary Logistic Regression + +#The simplest possible model: adult / adolescent training as the sole predictor + +```{r} +pred <- 'dog_adolescent_or_adult_training' + +for (outcome in outcomes) { + cat(paste(replicate(80, '-'), collapse='')) + cat(paste0('\n', outcome, '\n')) + f <- as.formula(paste0(outcome, '~', '.')) + + df_tmp <- df_valid_responses[,c(outcome, pred)] + + glm_fit <- glm(f, data=df_tmp, family='binomial') + print(summary(glm_fit)) + print(exp(cbind(OR=coef(glm_fit), suppressMessages(confint(glm_fit))))) + #cat('\nVIF:\n') + #print(car::vif(glm_fit)) + cat('\n') +} +``` + +Let's consider what other factors may also come in to play: + +- Dog Age +- Dog Sex +- Acquiring the dog pre-adolescent +- Pre-adolescent (puppy) training +- interaction between puppy and adult training + +We'll perform logistic regression to determine the impact of these factors. To +perform logistic regression we'll need to ensure out data subsets have enough +responses (`n >= 10`) for each possible answer to be included in the model. + +```{r} +apply_min_xtab <- function(df, outcome, cutoff=10) +{ + drops <- NULL + for (col in names(df)) { + if (col == outcome) next + if (is.integer(df[,col])) next + if (is.numeric(df[,col])) next # added in order not to exclude dog age + + xtab <- table(df[,col], df[,outcome]) + if (min(xtab) < cutoff) { + drops <- c(drops, col) + break + } + } + + if (length(drops) > 0) { + cat('\nDropped from model due to insufficient responses:\n') + cat(drops) + cat('\n') + } + + return(df[, !(names(df) %in% drops)]) +} +``` + +Now we perform the logistic regression. + +when including age (the only continuous variable) +do not apply exclusion for min number of responses + +#without interaction terms + +```{r} +pred <- 'dog_adolescent_or_adult_training' +glm_attribs <- c( + 'dog_age_in_months', + 'dog_gender', + 'dog_was_acquired_pre_adolescent', + 'dog_attended_pre_adolescent_training' +) + +for (outcome in outcomes) { + cat(paste(replicate(80, '-'), collapse='')) + cat(paste0('\n', outcome, '\n')) + f <- as.formula(paste0(outcome, '~', '.')) + + df_tmp <- df_valid_responses[,c(outcome, pred, glm_attribs)] + #do not apply min function-- this only excludes age, which is continuous + #df_tmp <- apply_min_xtab(df_tmp, outcome) + + glm_fit <- glm(f, data=df_tmp, family='binomial') + print(summary(glm_fit)) + print(exp(cbind(OR=coef(glm_fit), suppressMessages(confint(glm_fit))))) + cat('\nVIF:\n') + print(car::vif(glm_fit)) + cat('\n') +} + +# note VIF above 5 is red flag + +``` + +# now try a model with an interaction term + + +```{r} + + +for (outcome in outcomes){ + + #new section + cat(paste(replicate(80, '-'), collapse='')) + cat(paste0('\n', outcome, '\n')) + + #format variables as formula + f <- as.formula(paste0(outcome, '~', 'dog_adolescent_or_adult_training + dog_attended_pre_adolescent_training + dog_adolescent_or_adult_training*dog_attended_pre_adolescent_training + dog_was_acquired_pre_adolescent + dog_was_acquired_pre_adolescent*dog_attended_pre_adolescent_training + dog_age_in_months + dog_gender')) + + + df_tmp1 <- df_valid_responses[,c(outcome, 'dog_adolescent_or_adult_training', 'dog_attended_pre_adolescent_training','dog_was_acquired_pre_adolescent','dog_age_in_months','dog_gender')] + + df_tmp2 <- df_tmp1[complete.cases(df_tmp1),] + + glm_fit <- glm(f, data=df_tmp2, family = 'binomial') + + print(summary(glm_fit)) + + print(exp(cbind(OR=coef(glm_fit), suppressMessages(confint(glm_fit))))) + cat('\nVIF:\n') + print(car::vif(glm_fit,type="predictor")) + cat('\n') + +} + + + +``` + + + +# instead of individual behaviors, use number of behavior problems as outcome + +```{r} + + +#format variables as formula +f <- as.formula(paste0('num_behv_problems', '~', 'dog_adolescent_or_adult_training + dog_attended_pre_adolescent_training + dog_adolescent_or_adult_training*dog_attended_pre_adolescent_training + dog_was_acquired_pre_adolescent + dog_was_acquired_pre_adolescent*dog_attended_pre_adolescent_training + dog_age_in_months + dog_gender')) + + df_tmp1 <- df_valid_responses[,c('num_behv_problems', 'dog_adolescent_or_adult_training', 'dog_attended_pre_adolescent_training','dog_was_acquired_pre_adolescent','dog_age_in_months','dog_gender')] + +df_tmp2 <- df_tmp1[complete.cases(df_tmp1),] + +glm_fit <- glm(f, data=df_tmp2, family = 'quasi') + +print(summary(glm_fit)) + +print(exp(cbind(OR=coef(glm_fit), suppressMessages(confint(glm_fit))))) + cat('\nVIF:\n') + print(car::vif(glm_fit,type="predictor")) + cat('\n') + +``` + + +# Investigating Training Specifics + +we have identified that adolescent/adult training has a +significant impact on jumping. + +Does training style make a difference in the outcomes? +eliminate dogs who received discipline-based or mixed training from experimental group + +```{r} + +#identify dogs who received reward only adult training +df_reward_only_training <- subset(df_valid_responses, reward_only_adult_training==TRUE) + +#combine control and modified exp groups +df_valid_responses_reward_only <- rbind(df_reward_only_training,df_control) + +``` + + +## repeat Binary Logistic Regression with reward-only experiment group + + +```{r} + + +for (outcome in outcomes){ + + #new section + cat(paste(replicate(80, '-'), collapse='')) + cat(paste0('\n', outcome, '\n')) + + #format variables as formula + f <- as.formula(paste0(outcome, '~', 'dog_adolescent_or_adult_training + dog_attended_pre_adolescent_training + dog_adolescent_or_adult_training*dog_attended_pre_adolescent_training + dog_was_acquired_pre_adolescent + dog_was_acquired_pre_adolescent*dog_attended_pre_adolescent_training + dog_age_in_months + dog_gender')) + + df_tmp1 <- df_valid_responses_reward_only[,c(outcome, 'dog_adolescent_or_adult_training', 'dog_attended_pre_adolescent_training','dog_was_acquired_pre_adolescent','dog_age_in_months','dog_gender')] + +df_tmp2 <- df_tmp1[complete.cases(df_tmp1),] + +glm_fit <- glm(f, data=df_tmp2, family = 'binomial') + +print(summary(glm_fit)) + +print(exp(cbind(OR=coef(glm_fit), suppressMessages(confint(glm_fit))))) + cat('\nVIF:\n') + print(car::vif(glm_fit,type="predictor")) + cat('\n') + +} + + + +``` +# instead of individual behaviors, use number of behavior problems as outcome + +```{r} + + +#format variables as formula +f <- as.formula(paste0('num_behv_problems', '~', 'dog_adolescent_or_adult_training + dog_attended_pre_adolescent_training + dog_adolescent_or_adult_training*dog_attended_pre_adolescent_training + dog_was_acquired_pre_adolescent + dog_was_acquired_pre_adolescent*dog_attended_pre_adolescent_training + dog_age_in_months + dog_gender')) + + df_tmp1 <- df_valid_responses_reward_only[,c('num_behv_problems', 'dog_adolescent_or_adult_training', 'dog_attended_pre_adolescent_training','dog_was_acquired_pre_adolescent','dog_age_in_months','dog_gender')] + +df_tmp2 <- df_tmp1[complete.cases(df_tmp1),] + +glm_fit <- glm(f, data=df_tmp2, family = 'quasi') + +print(summary(glm_fit)) + +print(exp(cbind(OR=coef(glm_fit), suppressMessages(confint(glm_fit))))) + cat('\nVIF:\n') + print(car::vif(glm_fit,type="predictor")) + cat('\n') + +``` + + + +############################################################ + +Now we look just at the experiment group to see if style or location influenced the outcome + +We want to answer the following questions about the training: + +- Does continued training throughout dog's life produce a better outcome? +- Did training technique (reward vs. punishment) have an impact on the outcome? +- Did the location of training have an impact on the outcome? + +We will need to expand the independent variables used for the model to answer +these questions. + +(The vast majority of dogs who had adolescent/adult training received +reward-based training at group classes) + +```{r} + +outcomes_sig <- c( + 'dog_was_aggressive', + 'dog_has_fear_or_anxiety', + 'dog_has_jumped_problematically', + 'dog_has_barked_excessively', + 'dog_has_coprophagia', + 'dog_has_repetitive_behavior', + 'dog_has_house_soiled', + 'dog_has_rolled_in_repulsive_material', + 'dog_was_hyperactive', + 'dog_was_destructive', + 'dog_has_escaped', + 'dog_has_mounted' +) +outcomes_sig <- sort(outcomes_sig) + +#use single factor, reward or no +training_style <- c( + 'reward_only_puppy_training', + 'reward_only_adult_training' + #'discipline', + #'mixed_style' +) +training_age <- c( + #'training_complete_age_6_to_18mo', + #'training_complete_age_18_to_36mo', + 'training_complete_age_continue_in_home', + 'training_complete_age_continue_classes' +) +training_location <- c( + 'puppy_group_classes', + 'trainer_group_classes' +) + +glm_attribs <- c( + training_style, + training_age, + training_location +) + +print(glm_attribs) + +``` + +# Build and evaluate models for the various outcomes. + +```{r} + +# we don't need to include adult training as variable because by definition all dogs in experimental group have had adult training +# There will be some cases where there are less than 10 responses for a given variable/outcome combination, so eliminate those variables from the model using apply_min_xtab function defined above +# Puppy training style and location creates problems in some models due to small sample size / multi colinearity + +for (outcome in outcomes){ + + #new section + cat(paste(replicate(80, '-'), collapse='')) + cat(paste0('\n', outcome, '\n')) + + #format variables as formula + #f <- as.formula(paste0(outcome, '~', 'reward_only_puppy_training + reward_only_adult_training + puppy_group_classes + trainer_group_classes + training_complete_age_continue_in_home + training_complete_age_continue_classes')) + f <- as.formula(paste0(outcome, '~', '.')) + + #df_tmp1 <- df_exp[,c(outcome, 'reward_only_puppy_training', 'reward_only_adult_training','puppy_group_classes', 'trainer_group_classes', 'training_complete_age_continue_in_home', 'training_complete_age_continue_classes')] + df_tmp1 <- df_exp[,c(outcome, 'reward_only_adult_training','trainer_group_classes', 'training_complete_age_continue_in_home', 'training_complete_age_continue_classes')] + +df_tmp2 <- df_tmp1[complete.cases(df_tmp1),] +df_tmp2 <- apply_min_xtab(df_tmp2, outcome) + +glm_fit <- glm(f, data=df_tmp2, family = 'binomial') + +print(summary(glm_fit)) + +print(exp(cbind(OR=coef(glm_fit), suppressMessages(confint(glm_fit))))) + cat('\nVIF:\n') + print(car::vif(glm_fit))#,type="predictor")) + cat('\n') + +} + + + +``` + +# instead of individual behaviors, use number of behavior problems as outcome + +```{r} + + +#format variables as formula + +f <- as.formula(paste0('num_behv_problems', '~', '.')) + +#df_tmp1 <- df_exp[,c(outcome, 'reward_only_puppy_training', 'reward_only_adult_training','puppy_group_classes', 'trainer_group_classes', 'training_complete_age_continue_in_home', 'training_complete_age_continue_classes')] +df_tmp1 <- df_exp[,c('num_behv_problems', 'reward_only_adult_training','trainer_group_classes', 'training_complete_age_continue_in_home', 'training_complete_age_continue_classes')] + +df_tmp2 <- df_tmp1[complete.cases(df_tmp1),] + +glm_fit <- glm(f, data=df_tmp2, family = 'quasi') + +print(summary(glm_fit)) + +print(exp(cbind(OR=coef(glm_fit), suppressMessages(confint(glm_fit))))) + cat('\nVIF:\n') + print(car::vif(glm_fit,type="predictor")) + cat('\n') + +``` + + +There seems to be very little impact of style of training, completion age, or training location on behavior outcomes + + + +```{r} + + +``` +