unit_testing_vid1.Rmd

---
title: "Responsible modelling: Unit testing for infectious disease epidemiology."
author: 
  - name: Tim CD Lucas
    email: timcdlucas@gmail.com
  - name: Timothy M Pollington
  - name: Emma L Davis
  - name: T Déirdre Hollingsworth
---

```{r, setup}
# Plotting (ggplot2) and reshaping to long format (tidyr::pivot_longer).
library(ggplot2)
library(tidyr)

# A package for testing
library(testthat)

# Fix the random seed so that the stochastic simulations are reproducible.
# Note: R reads 01011885 as the number 1011885 (the leading zero is dropped).
set.seed(01011885)
```


# A unit test

```{r, a_unit_test, error = TRUE}

# sqrt(4) is 2, not 10, so this expectation does not hold and testthat
#   reports a failure (presumably deliberate, to show a failing test).
expect_equal(sqrt(4), 10)
# This expectation holds, so it passes silently.
expect_equal(sqrt(4), 2)

```


# An example multi-pathogen re-infection model

- Consider a multi-pathogen system, 
- with a population of $N$ infected individuals 
- who each get infected by a new pathogen at every time step.
- individuals are infected with exactly one pathogen at a time.

Each individual $i$, at time $t$, is defined by the pathogen they are currently infected with $I_{it} \in \{a, b, c\}$ for a 3-pathogen system. 
The population is therefore defined by a length $N$ state vector $\mathbf{I}_t = (I_{it})_{i=[1,N]}$.
At each time step, every individual's infection status is updated as:
$$I_{it} \sim \text{Unif}(\mathbf{I}_{t-1}).$$


```{r modelexample}
# Show the model schematic. The call is namespace-qualified because
#   knitr is never attached with library() in this document.
knitr::include_graphics("figures/modelexample.png")
```


## What do we expect to happen?
- Firstly, we would expect an individual to be repeatedly infected with different strains.
- Secondly, we would expect the proportions of the different pathogens to stochastically drift, until all but one pathogen goes extinct.


## First attempt at implementing this model.

```{r first_code}
# Simulation settings.
N <- 12        # number of infected individuals
n_steps <- 20  # number of time steps (study length)

# One row per time step, one column per individual; filled in as we go.
I <- matrix(NA, nrow = n_steps, ncol = N)

# Time 1: cycle through the three pathogens until everyone has one.
I[1, ] <- rep(c("a", "b", "c"), length.out = N)
I[1, ]

# Every later row is drawn from the row before it.
for(t in 2:n_steps){
  I[t, ] <- sample(I[t - 1, ], size = N)
}
```

```{r firstplots}
# Trace the infection history of the first individual (column 1 of I).
# The factor levels are fixed to all three pathogens so that the numeric
#   codes always line up with the axis labels, even if this individual
#   never carries one of the pathogens.
d1 <- data.frame(time = seq(n_steps),
                 pathogen = as.numeric(factor(I[, 1],
                                              levels = c("a", "b", "c"))))
ggplot(d1, aes(time, pathogen)) +
  geom_path() +
  geom_point() +
  scale_y_continuous(breaks = seq(3), labels = c("a", "b", "c"))
```


```{r, incorrectplot}

# For each time step (row of I) count the individuals carrying each
#   pathogen, convert the counts to proportions of N, reshape to long
#   format (one row per time step and pathogen) and plot the trajectories.
t(apply(I, 1, function(state){
  table(factor(state, levels = c("a", "b", "c")))
}) / N) %>%
  data.frame() %>%
  cbind(Time = seq_len(nrow(I))) %>%
  pivot_longer(cols = -Time,
               names_to = 'Pathogen',
               values_to = 'Proportion') %>%
  ggplot(aes(x = Time, y = Proportion,
             colour = Pathogen, linetype = Pathogen)) +
    ylim(0, 1) +
    geom_line(size = 2)

```


## Let's fix the baseline code

```{r, correctcode}
# Same settings as the first attempt.
N <- 12
n_steps <- 20

# One row per time step, one column per individual.
I <- matrix(NA, nrow = n_steps, ncol = N)

# Time 1: repeating a, b, c configuration.
I[1, ] <- rep(c("a", "b", "c"), length.out = N)

for(t in 2:n_steps){
  # Sampling with replacement: one individual at t - 1 may infect
  #   several individuals (or none) at t.
  I[t, ] <- sample(I[t - 1, ], size = N, replace = TRUE)
}
```

```{r, correctplots}

# For each time step (row of I) count the individuals carrying each
#   pathogen, convert the counts to proportions of N, reshape to long
#   format (one row per time step and pathogen) and plot the trajectories.
t(apply(I, 1, function(state){
  table(factor(state, levels = c("a", "b", "c")))
}) / N) %>%
  data.frame() %>%
  cbind(Time = seq_len(nrow(I))) %>%
  pivot_longer(cols = -Time,
               names_to = 'Pathogen',
               values_to = 'Proportion') %>%
  ggplot(aes(x = Time, y = Proportion,
             colour = Pathogen, linetype = Pathogen)) +
    geom_line(size = 2)

```

# Basic unit testing
## Write small functions

```{r, compactfunctions}
# Create the simulation matrix (n_steps rows by N columns, all NA) and
#   fill the first row with the first `pathogens` letters of the
#   alphabet, repeated until all N individuals have a pathogen.
initialisePop <- function(n_steps, N, pathogens){
  pop <- matrix(NA, nrow = n_steps, ncol = N)
  pop[1, ] <- rep(letters[1:pathogens], length.out = N)
  pop
}

# Fill in row t of the simulation matrix x by drawing N pathogens, with
#   replacement, from the pathogens present in row t - 1.
updatePop <- function(x, t, N){
  previous <- x[t - 1, ]
  x[t, ] <- sample(previous, size = N, replace = TRUE)
  x
}
```

## Test simple cases first

```{r, test_simple_first}
# N equal to the number of pathogens: each pathogen appears exactly once.
pop1 <- initialisePop(n_steps = 2, N = 3, pathogens = 3) 
expect_equal(pop1[1, ], c("a", "b", "c"))

# N a multiple of the number of pathogens: the pattern repeats in full.
pop2 <- initialisePop(n_steps = 2, N = 6, pathogens = 3) 
expect_equal(pop2[1, ], c("a", "b", "c", "a", "b", "c"))

# N not a multiple of the number of pathogens: the last repeat is cut short.
pop3 <- initialisePop(n_steps = 2, N = 5, pathogens = 3) 
expect_equal(pop3[1, ], c("a", "b", "c", "a", "b"))

```

In contrast, if we had defined the `initialisePop()` function incorrectly, the test would fail and return an error.

```{r, test_error, error = TRUE}

# A deliberately broken initialiser: it builds the empty (all NA)
#   n_steps by N matrix but never writes the pathogens into row 1.
initialisePopBroken <- function(n_steps, N, pathogens){
  matrix(NA, nrow = n_steps, ncol = N)
}

# The broken initialiser returns a matrix of NAs, so the first row
#   cannot equal a, b, c and this expectation fails.
popBroken <- initialisePopBroken(n_steps = 2, N = 3, 
                                   pathogens = 3) 
expect_equal(popBroken[1, ], c("a", "b", "c"))
```


## Test all arguments

```{r, test_all_args}
# Baseline: dimensions are n_steps rows by N columns.
pop1 <- initialisePop(n_steps = 2, N = 3, pathogens = 3) 
expect_equal(dim(pop1), c(2, 3))

# Varying n_steps changes the number of rows.
pop2 <- initialisePop(n_steps = 6, N = 3, pathogens = 3) 
expect_equal(dim(pop2), c(6, 3))

# Varying N changes the number of columns.
pop3 <- initialisePop(n_steps = 2, N = 20, pathogens = 3) 
expect_equal(dim(pop3), c(2, 20))

# Varying pathogens changes the number of distinct values in row 1.
pop4 <- initialisePop(n_steps = 2, N = 10, pathogens = 5) 
expect_equal(length(unique(pop4[1, ])), 5)
```

## Does the function logic meet your expectations?

```{r, test_complex}
pop1 <- initialisePop(n_steps = 20, N = 12, pathogens = 3)

# Only the first time step is filled in; every later row is still NA.
expect_true(all(is.na(pop1[-1, ])))

# As a set, the first row is exactly the pathogens a, b and c.
expect_true(setequal(pop1[1, ], c("a", "b", "c")))

pop2 <- updatePop(pop1, t = 2, N = 12)
# One update fills in row 2 and nothing else:
#   rows 1 and 2 contain no NAs ...
expect_false(anyNA(pop2[1:2, ]))
#   ... and all remaining rows are still NA.
expect_true(all(is.na(pop2[-(1:2), ])))

```

## Combine simple functions and test them at a higher level


```{r, combine_simple_func}
# Run a whole simulation: initialise the population, then call
#   updatePop() once for each time step in seq(2, n_steps).
# Returns the filled-in simulation matrix.
fullSim <- function(n_steps, N, pathogens){
  sim <- initialisePop(n_steps, N, pathogens)
  for(step in seq(2, n_steps)){
    sim <- updatePop(sim, step, N)
  }
  sim
}

pop <- fullSim(n_steps = 12, N = 20, pathogens = 3)

# A finished simulation has no missing entries ...
expect_false(anyNA(pop))
# ... and every entry is one of the three pathogens.
expect_true(all(pop %in% c("a", "b", "c")))
```

# Stochastic code

## Split stochastic and deterministic parts


```{r, split_deter_stoch}
# For each of N individuals, draw the index of the individual that
#   infects them. Draws are with replacement, so the same infector
#   can be picked more than once.
chooseInfector <- function(N){
  infector <- sample(N, size = N, replace = TRUE)
  return(infector)
}

# Deterministic part of an update: given the vector of infector indices,
#   copy each infector's pathogen from row t - 1 into row t of x.
updateInfectionStatus <- function(x, t, infector_pathogen){
  new_status <- x[t - 1, infector_pathogen]
  x[t, ] <- new_status
  x
}

# One full update of the population at time t: a stochastic draw of
#   infectors followed by the deterministic copy of their pathogens.
updatePop <- function(x, t, N){
  infectors <- chooseInfector(N)
  updateInfectionStatus(x, t, infectors)
}
```

Now, half of `updatePop()` is deterministic so can be checked as previously discussed.
We still have `chooseInfector()` that is irreducibly stochastic.
We now examine some techniques for directly testing these stochastic parts.

## Pick a smart parameter for a deterministic result

We can often find simple cases for which stochastic functions become deterministic.

```{r, test_stoch_determin}
# With a single pathogen, every possible draw of infectors gives the
#   same result: row 2 must be a copy of row 1.
pop <- updatePop(initialisePop(n_steps = 2, N = 3, pathogens = 1),
                 t = 2, N = 3)
expect_equal(pop[1, ], pop[2, ])
```

## Test all possible answers (if few)

Working again with a simple parameter set, there are some cases where the code is stochastic, but with a small, finite set of outputs. 

```{r, test_stoch_fewvalues}
# Draw 300 infector vectors for a population of two and glue each
#   draw into one string (e.g. c(1, 2) becomes "12") so that whole
#   draws can be compared at once.
manyPops <- vapply(
  seq_len(300),
  function(i) paste(chooseInfector(N = 2), collapse = ""),
  character(1)
)

# For N = 2 there are only four possible draws; nothing else may appear.
expect_true(all(manyPops %in% c("11", "22", "12", "21")))
```

## Use very large samples for the stochastic part

Testing can be made easier by using very large numbers.

```{r, test_stoch_largenum}
set.seed(10261985)
n <- 1e3
infector_pathogen <- chooseInfector(n)

# With 1000 draws from 1000 individuals it is all but certain that
#   at least one individual is chosen as infector more than once.
expect_true(anyDuplicated(infector_pathogen) > 0)
```

In our original buggy code (the `first_code` chunk) we found that the proportions remained identical for entire simulations.
We would expect this to happen only very rarely.

```{r, returningpathogen}
set.seed(11121955)
# Run 500 independent simulations and keep each result matrix as one
#   element of a list.
manySims <- lapply(
  seq_len(500),
  function(i) fullSim(n_steps = 20, N = 40, pathogens = 3)
)

# Return TRUE if the pathogen counts (and hence the proportions) differ
#   between the first and the last time point of a simulation, and
#   FALSE otherwise.
# x: a simulation matrix with one row per time step.
# The last time point is the last row of x rather than a fixed row
#   number, so simulations of any length can be checked.
diffProportions <- function(x){
  !identical(table(x[1, ]), table(x[nrow(x), ]))
}

# vapply runs diffProportions on each element of the list manySims,
#   i.e. on each simulation, and returns one TRUE/FALSE per simulation.
# At least one simulation must have ended with proportions that
#   differ from those it started with.
expect_true(any(vapply(manySims, diffProportions, logical(1))))
```


# Further testing

## Test incorrect inputs

```{r, wrong1}
# A character string is not a valid number of pathogens, so this call
#   is expected to raise an error.
expect_error(initialisePop(n_steps = 10, N = 4, pathogens = "three"))
```

```{r, wrong1b, error = TRUE}
# `pathogens` is given as a vector here. With `initialisePop()` as
#   defined so far no error is raised, so this expectation fails.
expect_error(initialisePop(n_steps = 5, N = 4, pathogens = 1:3))
```

This test fails because the function does not throw an error.
Instead the code takes the first element of `pathogens` and ignores the rest.


```{r, wrong1c}
# Create the simulation matrix (n_steps rows by N columns, all NA) and
#   fill the first row with the first `pathogens` letters of the
#   alphabet, repeated until all N individuals have a pathogen.
# Raises an error if `pathogens` has more than one element.
initialisePop <- function(n_steps, N, pathogens){

  # Defensive argument check: reject a vector of several values
  #   instead of quietly using only part of it.
  if(length(pathogens) > 1){
    stop("pathogens must have length 1")
  }

  pop <- matrix(NA, nrow = n_steps, ncol = N)
  pop[1, ] <- rep(letters[1:pathogens], length.out = N)
  pop
}

# Match on the error message so that the test only passes for the
#   intended argument check and not for some unrelated error.
expect_error(initialisePop(n_steps = 5, N = 4, pathogens = 1:3),
             regexp = "pathogens must have length 1")
```


`initialisePop()` does not throw an error if a vector is supplied to `n_steps`. 
However, `fullSim()` does throw an error if a vector is supplied to `n_steps`.

```{r, wrong2}
# Create the simulation matrix (n_steps rows by N columns, all NA) and
#   fill the first row with the first `pathogens` letters of the
#   alphabet, repeated until all N individuals have a pathogen.
# Raises an error if `pathogens` or `n_steps` has more than one element.
initialisePop <- function(n_steps, N, pathogens){

  # Argument checks: both values must be single numbers, not vectors.
  if(length(pathogens) > 1){
    stop("pathogens must have length 1")
  }
  if(length(n_steps) > 1){
    stop("n_steps must have length 1")
  }

  pop <- matrix(NA, nrow = n_steps, ncol = N)
  pop[1, ] <- rep(letters[1:pathogens], length.out = N)
  pop
}

# fullSim() passes n_steps on to initialisePop(), so the n_steps
#   argument check there is the error expected to surface.
expect_error(
  fullSim(n_steps = 1:100, N = 4, pathogens = 3),
  regexp = "n_steps must have"
)

```

## Test special cases 

```{r, edge2, error = TRUE}
# Smallest simulation that includes an update step.
popt1 <- fullSim(n_steps = 2, N = 5, pathogens = 3) 
expect_equal(dim(popt1), c(2, 5))

# Special case: a single time step. While `fullSim()` loops over
#   seq(2, n_steps), n_steps = 1 gives the sequence 2, 1, which starts
#   with a time step beyond the last row of the matrix.
popt2 <- fullSim(n_steps = 1, N = 5, pathogens = 3)
```


```{r, edge3}
# Run a whole simulation: initialise the population, then update it
#   for time steps 2 to n_steps. Returns the simulation matrix.
fullSim <- function(n_steps, N, pathogens){
  sim <- initialisePop(n_steps, N, pathogens)

  # With fewer than two time steps there is nothing to update:
  #   the simulation is just the initial state.
  if(n_steps < 2){
    return(sim)
  }

  for(step in seq(2, n_steps)){
    sim <- updatePop(sim, step, N)
  }
  sim
}

# A one-step simulation now runs and has a single row.
popt2 <- fullSim(n_steps = 1, N = 5, pathogens = 3)
expect_equal(dim(popt2), c(1, 5))
```