# data-preparation-V2


###############################################################
#########RMS-to-RBM:Data Preparation###########################
###############################################################

###Install Packages

## Install the helper packages only when they are not available yet, so that
## re-running the script does not download and reinstall them every time.
## To upgrade robotoolbox, run remotes::install_github("dickoa/robotoolbox")
## manually.
if (!requireNamespace("remotes", quietly = TRUE)) {
  install.packages("remotes")
}
if (!requireNamespace("DiagrammeR", quietly = TRUE)) {
  install.packages("DiagrammeR")
}
if (!requireNamespace("robotoolbox", quietly = TRUE)) {
  remotes::install_github("dickoa/robotoolbox")
}
library(haven)
library(tidyverse)
library(readxl)
library(srvyr)
library(ggplot2)
library(robotoolbox)
library(labelled)
library(remotes)
library(dm)
library(janitor)
library(dplyr)
library(DiagrammeR)

##Clear environment, if needed
## The call is left commented out on purpose: when this script is sourced it
## would silently delete every object in the caller's workspace. Restart the R
## session for a clean start, or uncomment the line below to clear explicitly.
# rm(list = ls())

###Import data from KoBo

## Credentials are taken from environment variables when these are set (for
## example in ~/.Renviron), so that they do not have to be written into the
## script. When a variable is not set, the placeholder is used and must be
## replaced by hand, exactly as before.
kobo_token(username = Sys.getenv("KOBOTOOLBOX_USERNAME", unset = "XXXXX"),
           password = Sys.getenv("KOBOTOOLBOX_PASSWORD", unset = "XXXXX"),
           url = "https://kobo.unhcr.org")


## Register the server URL and API token used by the kobo_* calls that follow.
kobo_setup(url = "https://kobo.unhcr.org",
           token = Sys.getenv("KOBOTOOLBOX_TOKEN",
                              unset = "XXXXXXXXXXXXXXXXXXX"))

## List the assets of the account and keep the uid of the survey to analyse.
asset_list <- kobo_asset_list()
uid <- asset_list |>
  filter(name == "RMS CAPI v2") |> ## change the name accordingly
  pull(uid)

## Stop early with a clear message when the name matches no asset or several
## assets, instead of passing an empty or ambiguous uid to kobo_asset().
stopifnot(
  "Expected exactly one KoBo asset with this name; check the name filter." =
    length(uid) == 1L
)

asset <- kobo_asset(uid)
asset

## Download the submissions of the selected asset and print an overview.
df <- kobo_data(asset)
df

## Get individual tables from the DM object if needed

## Extract the four tables from `df`, keeping their key information.
main <- df |> pull_tbl(main, keyed = TRUE)
S1 <- df |> pull_tbl(S1, keyed = TRUE)
S2 <- df |> pull_tbl(S2, keyed = TRUE)
P2.S3 <- df |> pull_tbl(P2.S3, keyed = TRUE)


## Quick look at the columns of each table.
df$main |> glimpse()
df$S1 |> glimpse()
df$S2 |> glimpse()
df$P2.S3 |> glimpse()

####Data cleaning before merge

###1. Check duplicates

## Household table: row-by-row duplicate flags, then their total.
main |> duplicated()
main |> duplicated() |> sum()


## Individual table: row-by-row duplicate flags, then their total.
S1 |> duplicated()
S1 |> duplicated() |> sum()

## List the duplicated rows themselves.
main |> get_dupes()
S1 |> get_dupes()

###2. Check HH head 

## Flag one oldest member per household (`with_ties = FALSE` keeps a single
## row per `_parent_index` even when several members share the highest HH07).
s1.oldestHHM <- S1 |>
  group_by(`_parent_index`) |>
  slice_max(HH07, with_ties = FALSE) |>
  ungroup() |>
  mutate(oldestHHM = 1) |>
  select(`_index`, oldestHHM)

## Same, restricted to members reported as head of household (HH03 == "1").
s1.oldestHead <- S1 |>
  filter(HH03 == "1") |>
  group_by(`_parent_index`) |>
  slice_max(HH07, with_ties = FALSE) |>
  ungroup() |>
  mutate(oldestHead = 1) |>
  select(`_index`, oldestHead)

## Derive exactly one head per household:
##  - one reported head        -> that member
##  - no reported head         -> the oldest member
##  - several reported heads   -> the oldest of the reported heads
S1 <- S1 |>
  # join keys are spelled out so the joins cannot pick up other shared columns
  left_join(s1.oldestHHM, by = "_index") |>
  left_join(s1.oldestHead, by = "_index") |>
  group_by(`_parent_index`) |>
  mutate(
    # number of reported heads in the household, computed once; na.rm = TRUE
    # keeps one missing HH03 from turning the whole household into NA
    n_heads = sum(HH03 == "1", na.rm = TRUE),
    householdHead = case_when(
      n_heads == 1 & HH03 == "1" ~ 1,
      n_heads == 1 & HH03 != "1" ~ 0,
      n_heads == 0 & oldestHHM == 1 ~ 1,
      n_heads == 0 & is.na(oldestHHM) ~ 0,
      n_heads > 1 & HH03 == "1" & oldestHead == 1 ~ 1,
      n_heads > 1 & HH03 == "1" & is.na(oldestHead) ~ 0,
      n_heads > 1 & HH03 != "1" ~ 0
    ),
    n_heads = NULL # helper column only, not kept in S1
  ) |>
  ungroup() |>
  mutate(
    householdHead = labelled(householdHead,
                             labels = c(
                               "Head of household" = 1,
                               "Not head of household" = 0
                             ),
                             label = "Head of household"))

table(S1$householdHead) # Double check if you have any 0

###3.Check HH size

##Check HH size


## Frequency of each reported household size.
## (A stray `count()` call with no data argument, which raises an error, was
## removed from this section.)
main |>
  count(HH01)

## Raw reported household sizes.
main$HH01

## Mean and median reported household size.
main |>
  summarise(avg_hh_size = mean(HH01, na.rm = TRUE),
            med_hh_size = median(HH01, na.rm = TRUE))

##Check HH size in individual dataset
## Number of roster rows per household, shown next to each individual.
S1 |>
  add_count(`_parent_index`, name = "hhsize") |>
  select(`_index`, `_parent_index`, hhsize)


### adding it back, by zooming in the S1 table
## The result is stored in `df`; the script never defines an object called
## `raw`, and the next step looks up `S1_hhsize` inside `df`.
df <- df |>
  dm_zoom_to(S1) |>
  count(`_parent_index`, name = "hhsize") |>
  dm_insert_zoomed("S1_hhsize")

## Merge count to the main table
df <- df |>
  dm_zoom_to(main) |>
  left_join(S1_hhsize) |>
  dm_update_zoomed()


### Check: flag households whose reported size (HH01) differs from the
### number of roster rows (hhsize)
df |>
  pull_tbl(main) |>
  select(`_index`, HH01, hhsize) |>
  mutate(diff = abs(HH01 - hhsize) > 1e-6)

### Check whether we have some difference, if any (only the differing rows)
df |>
  pull_tbl(main) |>
  select(`_index`, HH01, hhsize) |>
  mutate(diff = abs(HH01 - hhsize) > 1e-6) |>
  filter(diff)
#################################
####Merge all individual datasets
#################################

#Check repeat group datasets
## Draw the tables of `df` and how they are linked.
df |> dm_draw()

## Column overview of every table.
df |> glimpse()

## Number of rows per table.
df |> dm_nrow()

## Pull plain (unkeyed) copies of all tables from `df`. Each table is pulled
## once; the second, identical set of pull_tbl() calls was redundant.
## NOTE(review): S1 is re-extracted from `df` here, which replaces the S1
## object that received `householdHead` earlier in the script, so that column
## does not reach `ind`. Confirm whether this is intended.
main <- pull_tbl(df, main, keyed = FALSE)
S1 <- pull_tbl(df, S1, keyed = FALSE)
S2 <- pull_tbl(df, S2, keyed = FALSE)
P2.S3 <- pull_tbl(df, P2.S3, keyed = FALSE)

##Merge all individual level datasets into one single individual dataset
## Rows are matched on the individual index and the household index.
ind <- S1 |>
  left_join(S2, by = c("_index", "_parent_index")) |>
  left_join(P2.S3, by = c("_index", "_parent_index"))

###Remove datasets that are not needed
## `P2.3` was dropped from the list: no object of that name is created by the
## script (rm() only warned about it), and `P2.S3` is already removed here.

rm(asset, asset_list, df, S1, s1.oldestHead, s1.oldestHHM, S2, P2.S3)


#####Calculate disaggregation variables


####1. Primary citizenship


## Primary citizenship from REF01 and REF02. Columns are referenced through
## the data mask (REF02) rather than `ind$REF02`, so the expression always
## works on the rows currently being processed by mutate().
ind <- ind |>
  mutate(
    citizenship = case_when(
      REF01 == "1" ~ "XXX", ##here enter the country code (where RMS took place)
      REF01 %in% c("0", "98") ~ as.character(REF02),
      REF01 == "99" ~ "99"
    ),
    # reuse the value labels and the variable label of REF02
    citizenship = labelled(citizenship,
                           labels = val_labels(REF02),
                           label = var_label(REF02))
  )
table(ind$citizenship)

#####2. Age groups

## Age bands from HH07: a four-band and a two-band version. Intervals are
## closed on the right, so e.g. the first band covers values above -1 up to 4.
ind <- ind |>
  mutate(
    HH07_cat = cut(
      HH07,
      breaks = c(-1, 4, 17, 59, Inf),
      labels = c("0-4", "5-17", "18-59", "60+")
    ),
    HH07_cat2 = cut(
      HH07,
      breaks = c(-1, 17, Inf),
      labels = c("0-17", "18-60+")
    )
  )

ind$HH07_cat |> table()
ind$HH07_cat2 |> table()


###3. Disability

## Disability identifier variables according to Washington Group standards,
## built from the six domain questions DIS01-DIS06.
ind <- ind |>
  mutate(
    # per-domain flags: TRUE if SOME DIFFICULTY, A LOT OF DIFFICULTY or
    # CANNOT DO AT ALL (codes "2", "3", "4")
    disaux1_234 = DIS01 %in% c("2", "3", "4"),
    disaux2_234 = DIS02 %in% c("2", "3", "4"),
    disaux3_234 = DIS03 %in% c("2", "3", "4"),
    disaux4_234 = DIS04 %in% c("2", "3", "4"),
    disaux5_234 = DIS05 %in% c("2", "3", "4"),
    disaux6_234 = DIS06 %in% c("2", "3", "4"),

    # per-domain flags: TRUE if A LOT OF DIFFICULTY or CANNOT DO AT ALL
    # (codes "3", "4")
    disaux1_34 = DIS01 %in% c("3", "4"),
    disaux2_34 = DIS02 %in% c("3", "4"),
    disaux3_34 = DIS03 %in% c("3", "4"),
    disaux4_34 = DIS04 %in% c("3", "4"),
    disaux5_34 = DIS05 %in% c("3", "4"),
    disaux6_34 = DIS06 %in% c("3", "4"),

    # number of TRUE flags over the six domains
    disSum234 = rowSums(cbind(
      disaux1_234, disaux2_234, disaux3_234,
      disaux4_234, disaux5_234, disaux6_234
    )),
    disSum34 = rowSums(cbind(
      disaux1_34, disaux2_34, disaux3_34,
      disaux4_34, disaux5_34, disaux6_34
    )),

    # helper: every one of the six answers is "98" or "99"
    dis_all_unknown =
      DIS01 %in% c("98", "99") & DIS02 %in% c("98", "99") &
      DIS03 %in% c("98", "99") & DIS04 %in% c("98", "99") &
      DIS05 %in% c("98", "99") & DIS06 %in% c("98", "99"),
    # helper: at least one answer is "4"; `==` is kept on purpose so that
    # missing answers propagate as NA
    dis_any_cannot =
      DIS01 == "4" | DIS02 == "4" | DIS03 == "4" |
      DIS04 == "4" | DIS05 == "4" | DIS06 == "4",

    # identifier 1: at least one domain coded SOME DIFFICULTY or worse
    DISABILITY1 = case_when(
      disSum234 >= 1 ~ 1,
      disSum234 == 0 & !dis_all_unknown ~ 0,
      dis_all_unknown ~ 98
    ),
    # identifier 2: at least two domains coded SOME DIFFICULTY or worse, or
    # any one domain coded A LOT OF DIFFICULTY or CANNOT DO AT ALL
    DISABILITY2 = case_when(
      disSum234 >= 2 | disSum34 >= 1 ~ 1,
      disSum234 < 2 & disSum34 == 0 & !dis_all_unknown ~ 0,
      dis_all_unknown ~ 98
    ),
    # identifier 3: at least one domain coded A LOT OF DIFFICULTY or
    # CANNOT DO AT ALL
    DISABILITY3 = case_when(
      disSum34 >= 1 ~ 1,
      disSum34 == 0 & !dis_all_unknown ~ 0,
      dis_all_unknown ~ 98
    ),
    # identifier 4: at least one domain coded CANNOT DO AT ALL
    DISABILITY4 = case_when(
      dis_any_cannot ~ 1,
      !dis_any_cannot & !dis_all_unknown ~ 0,
      dis_all_unknown ~ 98
    ),

    # the two helpers are not part of the output
    dis_all_unknown = NULL,
    dis_any_cannot = NULL,

    # attach value labels and variable labels
    DISABILITY1 = labelled(
      DISABILITY1,
      labels = c("Without disability" = 0, "With disability" = 1, "Unknown" = 98),
      label = "Washington Group disability identifier 1"
    ),
    DISABILITY2 = labelled(
      DISABILITY2,
      labels = c("Without disability" = 0, "With disability" = 1, "Unknown" = 98),
      label = "Washington Group disability identifier 2"
    ),
    DISABILITY3 = labelled(
      DISABILITY3,
      labels = c("Without disability" = 0, "With disability" = 1, "Unknown" = 98),
      label = "Washington Group disability identifier 3"
    ),
    DISABILITY4 = labelled(
      DISABILITY4,
      labels = c("Without disability" = 0, "With disability" = 1, "Unknown" = 98),
      label = "Washington Group disability identifier 4"
    )
  )
###Calculate having at least one disability identifier among 4 categories 
## Overall flag: 1 when any of the four identifiers equals 1, otherwise 0 when
## any of them equals 0, otherwise missing.
ind <- ind |>
  mutate(
    disab = case_when(
      DISABILITY1 == 1 | DISABILITY2 == 1 | DISABILITY3 == 1 | DISABILITY4 == 1 ~ 1,
      DISABILITY1 == 0 | DISABILITY2 == 0 | DISABILITY3 == 0 | DISABILITY4 == 0 ~ 0,
      TRUE ~ NA_real_
    ),
    disab = labelled(
      disab,
      labels = c("Without disability" = 0, "With disability" = 1)
    )
  )
ind$disab |> table()


###Below indicators will be used to disaggregate during the analysis.
##Country of origin : `citizenship`
##Age categories : `HH07_cat` and `HH07_cat2`
##Gender : `HH04`
##Population groups: `pop_groups`

## Frequencies of the population group (household table) and of HH04
## (individual table).
table(main[["pop_groups"]])
table(ind[["HH04"]])


########Create similar variable names to get the data on randomly selected adult

## Copy the name and age of the randomly selected adult into columns named
## like those of the individual table, and duplicate the household index as
## `_parent_index` (merge variable from kobo). mutate() is used so that a
## missing source column stops the script here, instead of `$<-` silently
## assigning NULL and the later join failing on an absent column.
main <- main |>
  mutate(
    HH02 = name_selectedadult18, # name of randomly selected adult
    HH07 = name_selectedadult18_age, # age of randomly selected adult
    `_parent_index` = `_index`
  )

##Create a new dataset with indicators for merge, below you can add all indicators you want to import from individual dataset

## Individual-level columns to bring into the household table.
ind_m <- ind |>
  select(
    `_parent_index`, HH07_cat, HH07_cat2, disab,
    citizenship, HH02, HH07, HH04
  )


## Match on name (HH02), age (HH07) and household index.
## NOTE(review): this assumes name + age identify a single member within a
## household - confirm, otherwise rows of `main` are duplicated.
main <- main |>
  left_join(
    ind_m,
    by = c("HH02", "HH07", "_index" = "_parent_index")
  )

rm(ind_m)


##Create a new dataset with the indicators that you want to import
## Household-level columns to attach to the individual data.
main_m <- main |>
  select(`_parent_index`, pop_groups, end_result) ## add variables here

## `main_m` is the left table, so every row of `main_m` is kept.
ind <- main_m |>
  left_join(ind, by = "_parent_index")

rm(main_m)