serl_smart_meter_documentation.Rmd

---
title: "Smart meter consumption data:  \nTechnical documentation"
output: 
  word_document:
    reference_doc: SERL_word_template_landscape1.docx
    toc: yes
---


```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE)
#following only works for edition not monthly - and only when called from main
#release_version <- str_sub(getwd(), start = -9) 
#ed <- stringr::str_sub(sub('.*edition', '', getwd()),
#                       start = 1, end = 2)
#edition <- paste0("edition", ed)

# Deal with different working directories depending on whether file is knitted from RStudio or
#   rendered from main.R
wd <- getwd()
if(stringr::str_sub(wd, -13) == "documentation") {
  parent_dir<-stringr::str_sub(wd, 1, nchar(wd) - 14)
  release_version<-sub(".*/","",parent_dir)
} else {
  release_version<-sub(".*/","",wd)
}

if(stringr::str_sub(wd, -13) == "documentation") {
  source(paste0(stringr::str_sub(wd, 1, nchar(wd) - 13), 
                "/functions/create.info.table.R"))
} else {
  source("./functions/create.info.table.R")
}

library(data.table)
library(ggplot2)
library(knitr)
library(captioner)
library(lubridate)
library(flextable)
library(officer)
library(stringr)

# Import all filenames, locations, source function files
data_folder <- paste0("your/data/foler", release_version)
final_rdata_folder <- paste0(data_folder, "/processed_rdata/")
rt_processed_file <- paste0(final_rdata_folder, "serl_smart_meter_rt_summary_", release_version, ".RDS")

intermediary_folder <- paste0(data_folder, "/intermediary_rdata/")
sm_doc_input_file <- paste0(intermediary_folder, "sm_data_documentation_input.RData")

#source(paste("your/folder/here", 
#             ed, 
#             "/Code/setup_edition", ed, ".R", sep = ""))


```

```{r defineInputs, include = FALSE}

# load_table_data <- FALSE # TRUE if we've run this file before and don't want to re-process the data (only edit the text, formatting, etc.)

min_n_to_determine_unit_error <- 30

# Define plotting/reporting variables for the markdown output
fig.h <- 6
fig.w <- 10
fullwidth <- TRUE
font.size <- 18
geom.text.size <- 5

```

```{r loadData, include = FALSE}

#load(rt_processed_file)
rt_summary <- readRDS(rt_processed_file)
load(sm_doc_input_file)

```

```{r functions, include = FALSE}

add.simple.table.superscript <- function(flex_table,
                                         before = "m",
                                         after = "",
                                         power = "3",
                                         row_ids = c(1, 2),
                                         col_name = "Units",
                                         t_part = "body") {
  # puts superscript formatting in a flextable. E.g. with the
  #  defaults, we'd add m^3 (formatted) to rows 1 and 2 in
  #  column called units.
  
  formatted_table <- compose(
    flex_table,
    i = row_ids,
    j = col_name,
    part = t_part,
    value = as_paragraph(before,
                         as_sup(power),
                         after)
  )
  
  return(formatted_table)
}

my.flex <- function(t, autofit = TRUE) {
  ft <- flextable(t)
  ft <- theme_booktabs(ft)
  ft <- theme_zebra(ft, 
                    odd_header = rgb(84/255, 141/255, 212/255, 1),
                    even_body = "transparent",
                    odd_body = rgb(242/255, 242/255, 242/255, 1))
  ft <- color(ft, color = "white", part = "header")
  ft <- italic(ft, italic = TRUE, part = "header")
  ft <- bold(ft, bold = FALSE, part = "header")
  if(autofit == TRUE) {
    ft <- autofit(ft)
  } else{
    ft <- set_table_properties(ft, width = 1, layout = "autofit")
  }
  return(ft)
}

remove.filename.version <- function(file_id) {
  name_long <- get.serl.filename(file_id, release_version)
  name_short <- str_sub(name_long, end = -3)
}

get.daily.filename.structure <- function(file_id) {
  name_long <- get.serl.filename(file_id, release_version)
  name_short <- str_sub(name_long, end = -11)
  name_short <- paste(name_short, "_{YYYY}_edition{number}", sep = "")
  
}

```

```{r}
info_tab <- create.info.table(auth = "Ellen Zapata-Webborn, Jessica Few, Clare Hanmer")
info_tab 
```

```{r captions, include = FALSE}
fig_caps <- captioner::captioner(prefix = "Figure")
tab_caps <- captioner::captioner(prefix = "Table")

tab_caps(name = "read.type.tab", 
         caption = "Smart meter data types, defined by 'deviceType' and 'readType'. ")

tab_caps(name = "hh.cols.tab", 
         caption = "Half-hourly data fields. Error flags are defined in the Error Flags section below.")

tab_caps(name = "d.cols.tab", 
         caption = "Daily data fields. See the Error Flags section below for definitions of the error flags used in this table.")

tab_caps("relevant.flags", 
         caption = "Error flags and their meanings.") 

tab_caps("high_lim_tab", 
         caption = "Limits used to determine if a read is high enough for a 'very high read' flag (-2).") 

tab_caps("sum.match.tab", 
         caption = "Electricity and gas 'sum_match' values and their meanings.")

tab_caps("sched.tab", 
         caption = "Read-type data quality summary: data fields. Error codes are explained above in the Error Flags section. Note that statistics for valid reads exclude valid reads recorded at the wrong time.")

tab_caps("partic.tab", 
         caption = "Participant data quality summary: data fields")

```


# Introduction

This document describes the half-hourly and daily smart meter datasets available to researchers with secure access to the SERL Observatory datasets, along with two data quality summary tables. The half-hourly data is provided in monthly files and the daily data is provided in yearly files. The data were collected from the earliest date available. The datasets described in this document are:


- serl_smart_meter_daily_{YYYY}_edition{number}.csv: yearly files of daily electricity and gas readings with some additional derived columns

- serl_smart_meter_half_hourly_{YYYY}_{MM}_edition{number}: monthly files of half-hourly electricity and gas readings with some additional derived columns (note that this dataset also includes reactive readings and export readings where available). Note that the monthly cut off follows UTC time. 

- serl_smart_meter_rt_summary_edition{number}: data quality summary for each read type for each participant (such as number of errors found by type) and basic read statistics (such as mean and maximum)

where _{number}_ is the number of the data release, _e.g._ "02" (note that the first data release files are labelled with the release date rather than edition number), _{YYYY}_ and _{MM}_ are the year and month that files relate to respectively.

This document is structured as follows: we start with some basic information about how the data were collected and the different types of reading available, then we describe the two types of smart meter data tables (daily - multiple files each containing a year of daily data, and half-hourly - multiple files each containing a month of half-hourly data). Next we define the different types of error flag created, and finally we describe the data quality summary table (at the read-type level). Note that the participant summary table documentation was previously given in this file, but has been moved to its own document.  


## Data collection

Half-hourly and daily smart meter readings are stored on the smart meter, and accessed by the Smart Energy Research Lab (SERL) as follows. The SERL technical team (The University of Essex (UK Data Archive) between 2018 and August 2022, and then the Advanced Research Computing Centre (ARCC) at University College London)  uses a DCC adaptor service to communicate with Great Britain's central smart meter system (the Smart Data Communications Company (DCC)), which acts as a secure messaging service to transfer the smart meter readings (with the consent of SERL participants)  to the secure SERL technical infrastructure. Not all properties have a gas meter we are able to access - for example, if there is only a SMETS2 electricity meter but not a SMETS2 gas meter, or if the property does not have mains gas. Researchers can check the EPC and survey data to identify properties which have gas central heating but no SERL gas data if this may affect their analysis. 


## Smart meter read types

Smart meter read types are defined by two variables: 'deviceType' and 'readType'. Together they combine to define the type of smart meter data. The full list of smart meter data types are shown in the table below. Note that 'GPF' stands for 'Gas Proxy Function' (a proxy for the gas meter) and 'ESME' stands for 'Electricity Smart Metering Equipment' (the electricity meter). 


```{r readTypes, include = FALSE}

  read.type.tab[, units := "Wh"]
  read.type.tab[deviceType == "GPF", units := "m^3^"]
  
  read.type.tab[deviceType == "ESME" & readType == "DL", 
                `:=`(description = "Daily active electricity import", 
                     position = 1)]
  read.type.tab[deviceType == "ESME" & readType == "AI", 
                `:=`(description = "Half-hourly active electricity import", 
                     position = 3)]
  read.type.tab[deviceType == "ESME" & readType == "AE", 
                `:=`(description = "Half-hourly active electricity export", 
                     position = 6)]
  read.type.tab[deviceType == "ESME" & readType == "RI", 
                `:=`(description = "Half-hourly reactive electricity import", 
                     position = 4)]
  read.type.tab[deviceType == "ESME" & readType == "RE", 
                `:=`(description = "Half-hourly reactive electricity export", 
                     position = 7)]
  read.type.tab[deviceType == "GPF" & readType == "DL", 
                `:=`(description = "Daily gas import", 
                     position = 2)]
  read.type.tab[deviceType == "GPF" & readType == "AI", 
                `:=`(description = "Half-hourly gas import", 
                     position = 5)]
  
  setkey(read.type.tab, position)
  
  
  read.type.tab <-
    read.type.tab[, .(deviceType, readType, units, description)]
  setnames(
    read.type.tab,
    old = c("units", "description"),
    new = c("Units", "Description")
  )


read_type_tab <- my.flex(read.type.tab)
read_type_tab <- add.simple.table.superscript(read_type_tab,
                                                row_ids = c(2, 5))


```

`r tab_caps("read.type.tab")`
```{r}
read_type_tab
```


## Changes since the previous edition

There have been a few small changes to the data since the previous data: 

- The data now runs up to the end of December 2023 for the smart meter data and climate data.

- Participants who have withdrawn and specified full erasure have been removed from all data files. 

- We have increased the minimum threshold for flagging 'high' half hourly electricity active export readings from 2000 Wh to 5000 Wh, as domestic solar photovoltaic arrays have increased in size and some households are now exporting more than our original cut-off (the previous limit was the maximum output for a 4kW array, we have changed the limit to the maximum for a 10kW array). 

We continue to analyse and improve the SERL datasets and we aim to strike a balance between improving the data and minimising inconvenience to researcher caused by changes. We welcome feedback to improve our processes. 


# Smart meter data tables

## Half-hourly data

The half-hourly data tables have `r ncol_hh` columns. The fields are described in the table below. While the original data have not been modified, additional columns have been added to flag potential errors (see the Error Flags section below) and convert between units. This processing was done using `r paste(R.Version()$version.string)`. Code used for processing will be made available shortly on the SERL github repository [github.com/smartEnergyResearchLab](github.com/smartEnergyResearchLab).

Note that if no data were returned for any meter at a particular time then that row will be missing from the dataset rather than being an empty row. The 'class' field is the R class (e.g. R calls a Boolean a logical).  

```{r hhColumnsTable, include = FALSE}

hh.cols.tab <- data.table(
  columnNames = hh_colnames,
  description = rep(NA_character_, ncol_hh),
  units = rep(NA_character_, ncol_hh),
  class = hh_col_class,
  example = rep(NA_character_, ncol_hh,),
  type = rep("Derived", ncol_hh)
)

hh.cols.tab[columnNames == "PUPRN",
            `:=`(description = "Pseudonymised participant identifier",
                 units = "NA",
                 example = "1VUXXXF1",
                 type = "Assigned")]

hh.cols.tab[columnNames == "Read_date_effective_local",
            `:=`(description = "Date of read (same as date of Read_date_time_local unless read taken at midnight, then the previous day since data pertains to the previous day)",
                 units = "%Y-%m-%d",
                 example = "2019-11-01")]

hh.cols.tab[columnNames == "Read_date_time_local",
            `:=`(description = "Time read taken (local time: GMT or BST)",
                 units = "%Y-%m-%d %H:%M:%S tz",
                 example = "2020-07-02 00:03:30 BST")]

hh.cols.tab[columnNames == "Read_date_time_UTC",
            `:=`(description = "Time read taken in UTC",
                 units = "%Y-%m-%d %H:%M:%S",
                 example = "2020-07-02 00:02:30",
                 type = "Primary")]

hh.cols.tab[columnNames == "HH",
            `:=`(description = "Half-hour identifier between 1 and 48 (NA if not exactly on the hour or half-hour)",
                 units = "NA",
                 example = "48",
                 type = "Primary")]

hh.cols.tab[columnNames == "Elec_act_imp_hh_Wh",
            `:=`(description = "Half-hourly electricity active import read",
                 units = "Wh",
                 example = "109",
                 type = "Primary")]

hh.cols.tab[columnNames == "Elec_react_imp_hh_varh",
            `:=`(description = "Half-hourly electricity reactive import read",
                 units = "varh",
                 example = "15",
                 type = "Primary")]

hh.cols.tab[columnNames == "Elec_act_exp_hh_Wh",
            `:=`(description = "Half-hourly electricity active export read",
                 units = "Wh",
                 example = "65",
                 type = "Primary")]

hh.cols.tab[columnNames == "Gas_hh_m3",
            `:=`(description = "Half-hourly gas import read",
                 units = "m^3^",
                 example = "0.244",
                 type = "Primary")]

hh.cols.tab[columnNames == "Elec_react_exp_hh_varh",
            `:=`(description = "Half-hourly electricity reactive export read",
                 units = "varh",
                 example = "14",
                 type = "Primary")]

hh.cols.tab[columnNames == "Elec_import_exists",
            `:=`(description = "TRUE if electricity import meter exists in the inventory, otherwise FALSE",
                 units = "NA",
                 example = "TRUE")]

hh.cols.tab[columnNames == "Gas_exists",
            `:=`(description = "TRUE if gas meter exists in the inventory, otherwise FALSE",
                 units = "NA",
                 example = "TRUE")]

hh.cols.tab[columnNames == "Elec_act_imp_flag",
            `:=`(description = "Half-hourly electricity active import error flag",
                 units = "NA",
                 example = "-2")]

hh.cols.tab[columnNames == "Gas_flag",
            `:=`(description = "Half-hourly gas import error flag",
                 units = "NA",
                 example = "0")]

hh.cols.tab[columnNames == "Gas_hh_kWh",
            `:=`(description = "Half-hourly gas import read in kWh using standard conversion, assuming calorific value = 39.5",
                 units = "kWh",
                 example = "2.737835")]

hh.cols.tab[columnNames == "Gas_hh_Wh",
            `:=`(description = "Half-hourly gas import read in Wh using standard conversion, assuming calorific value = 39.5",
                 units = "Wh",
                 example = "2737.835")]

hh.cols.tab[columnNames == "Elec_export_exists",
            `:=`(description = "TRUE if electricity export meter exists in the inventory, otherwise FALSE",
                 units = "NA",
                 example = "FALSE")]

hh.cols.tab[columnNames == "Elec_act_exp_flag",
            `:=`(description = "Half-hourly electricity active export error flag",
                 units = "NA",
                 example = "-4")]

hh.cols.tab[columnNames == "Elec_react_imp_flag",
            `:=`(description = "Half-hourly electricity reactive import error flag",
                 units = "NA",
                 example = "1")]

hh.cols.tab[columnNames == "Elec_react_exp_flag",
            `:=`(description = "Half-hourly electricity reactive export error flag",
                 units = "NA",
                 example = "2")]

hh.cols.tab[columnNames == "Valid_read_time",
            `:=`(description = "FALSE if read time is not on the hour or half hour, otherwise TRUE",
                 units = "NA",
                 example = "TRUE")]

colnames(hh.cols.tab) <- c("Field", "Description", "Units", "Class", "Example value", "Variable type")


hh_cols_tab <- my.flex(hh.cols.tab, autofit = FALSE)
hh_cols_tab <- add.simple.table.superscript(hh_cols_tab,
                                            row_ids = hh.cols.tab[ , which(Field == "Gas_hh_m3")])


```

`r tab_caps("hh.cols.tab")`
```{r}
hh_cols_tab
```

## Daily data

The daily data tables have `r ncol_d` columns. The fields are described in the table below. While original data has not been modified, additional columns have been added to flag potential errors (see Error Flags section below) and to convert between units. This processing was done using `r paste(R.Version()$version.string)`.  Code used for processing is available on the SERL github repository [github.com/smartEnergyResearchLab](github.com/smartEnergyResearchLab).

Note that if no data were returned for any meter on a particular day then that row will be missing from the dataset rather than being empty. The exception is if there were no daily reads but the right number of half-hourly reads (48 unless the clocks changed) to sum to the daily total. In these instances the rows have been added to allow for easy imputation of a missing daily read with the sum of the half-hourly reads. For comparison between daily reads and daily sums, half-hourly data requires conversion (provided) to local time. A csv file is provided *(bst_dates_to_2024.csv)* which lists the start and end dates of British Summer Time (BST) for reference so the number of half-hours expected on each day is clear. Researchers may wish to avoid clock change days in their analysis, or take them into consideration. 

The daily data includes fields added for comparison between the daily readings and half-hourly readings for the same days. As described in `r tab_caps("d.cols.tab", display = "cite")` there are columns for the sum of the half-hourly readings if there were 48 valid reads taken at the right times (or 46 or 50 reads if the clocks changed), the difference between the half-hourly sum and the daily read (if both exist and are valid), and a 'sum_match' column coded to state the condition of this match. The sum match codes are defined in `r tab_caps("sum.match.tab", display = "cite")` in the Error Flags section. Note that reads are considered invalid if taken at the wrong time (not at midnight for daily reads; not on the hour or half hour for half-hourly reads). 

```{r DColumnsTable, include = FALSE}


  d.cols.tab <- data.table(columnNames = d_colnames,
                            description = rep(NA_character_, ncol_d),
                            units = rep(NA_character_, ncol_d),
                            class = d_col_class,
                            example = rep(NA_character_, ncol_d),
                            type = rep("Derived", ncol_d))
  
  d.cols.tab[columnNames == "PUPRN", `:=`(description = "Pseudonymised participant identifier",
                                          units = "NA",
                                          example = "1VUXXXF1",
                                          type = "Assigned")]
  
  # correct
  d.cols.tab[columnNames == "Read_date_effective_local", `:=`(description = "Date that read relates to (in local time): previous day, unless after midday (then same day)",
                                                    units = "%Y-%m-%d",
                                                    example = "2019-11-01")]
  
  d.cols.tab[columnNames == "Read_date_time_UTC", `:=`(description = "Time and date of read (UTC). Time not stated if at midnight",
                                                    units = "%Y-%m-%d",
                                                    example = "2019-11-02",
                                                    type = "Primary")]
  
  d.cols.tab[columnNames == "Valid_read_time", `:=`(description = "TRUE if reading was at midnight, otherwise FALSE",
                                                    units = "NA",
                                                    example = "TRUE")]
  

  d.cols.tab[columnNames == "Elec_act_imp_d_Wh", `:=`(description = "Daily electricity active import read",
                                                    units = "Wh",
                                                    example = "5839",
                                                    type = "Primary")]
  
  d.cols.tab[columnNames == "Unit_correct_elec_act_imp_d_Wh", `:=`(description = "Daily electricity active import read corrected from kWh to Wh where kWh reporting is suspected, otherwise equals Elec_act_imp_d_Wh",
                                                    units = "Wh",
                                                    example = "5839")]
  
  d.cols.tab[columnNames == "Elec_act_imp_d_kWh", `:=`(description = "Unit_correct_elec_act_imp_d_Wh divided by 1000. Note that reads originally recorded in kWh will be integers, otherwise 3 decimal places.",
                                                    units = "kWh",
                                                    example = "5.839")]
  
  d.cols.tab[columnNames == "Elec_act_imp_flag", `:=`(description = "Daily electricity active import error flag",
                                                    units = "NA",
                                                    example = "-2")]
  

  d.cols.tab[columnNames == "Elec_act_imp_hh_sum_Wh", `:=`(description = "Sum of half-hourly electricity active import reads for this date (NA if there were not 48* valid reads). *46 required when the clocks go forward, 50 when the clocks go back.",
                                                    units = "Wh",
                                                    example = "5742")]
  
  d.cols.tab[columnNames == "Elec_act_imp_sum_diff", `:=`(description = "Unit_correct_elec_act_imp_d_Wh - Elec_act_imp_hh_sum_Wh",
                                                    units = "Wh",
                                                    example = "0")]
  
  d.cols.tab[columnNames == "Elec_sum_match", `:=`(description = "Error code for whether the sum of half-hourly electricity active import matches the daily electricity read",
                                                    units = "NA",
                                                    example = "1")]
  
  d.cols.tab[columnNames == "Gas_exists", `:=`(description = "TRUE if gas meter exists in the inventory, otherwise FALSE",
                                                    units = "NA",
                                                    example = "TRUE")]
  
  d.cols.tab[columnNames == "Gas_d_m3", `:=`(description = "Daily gas read",
                                                    units = "m^3",
                                                    example = "8.214",
                                                    type = "Primary")]
  
  d.cols.tab[columnNames == "Gas_d_kWh", `:=`(description = "Daily gas import read in kWh using standard conversion, assuming calorific value = 39.5",
                                                    units = "kWh",
                                                    example = "92.16628")]
  
  d.cols.tab[columnNames == "Gas_flag", `:=`(description = "Daily gas import error flag",
                                                    units = "NA",
                                                    example = "2")]
  
  d.cols.tab[columnNames == "N_gas_hh", `:=`(description = "Number of valid half-hourly gas import readings available on this date excluding invalid read time data",
                                                    units = "NA",
                                                    example = "48")]
  
  d.cols.tab[columnNames == "Gas_hh_sum_m3", `:=`(description = "Sum of half-hourly gas reads for this date (NA if there were not 48* valid reads). *46 required when the clocks go forward, 50 when the clocks go back.",
                                                    units = "m^3",
                                                    example = "8.763")]
  
  d.cols.tab[columnNames == "Gas_sum_diff", `:=`(description = "Gas_d_m3 - Gas_hh_sum_m3",
                                                    units = "m^3",
                                                    example = "0.273")]
  
  d.cols.tab[columnNames == "Gas_sum_match", `:=`(description = "Error code for whether the sum of half-hourly gas import matches the daily gas read",
                                                    units = "NA",
                                                    example = "0")]
  
  d.cols.tab[columnNames == "Valid_hh_sum_or_daily_elec", `:=`(description = "True if a valid half-hourly sum or daily electricity read exists",
                                                    units = "NA",
                                                    example = "TRUE")]
  
  d.cols.tab[columnNames == "Valid_hh_sum_or_daily_gas", `:=`(description = "True if a valid half-hourly sum or daily gas read exists",
                                                    units = "NA",
                                                    example = "TRUE")]
  

  colnames(d.cols.tab) <- c("Field", "Description", "Units", 
                            "Class", "Example value", "Variable Type")

d.cols.tab = d.cols.tab[1:(ncol_d - 1)]

d_cols_tab <- my.flex(d.cols.tab, autofit = FALSE)
d_cols_tab <- add.simple.table.superscript(d_cols_tab,
                                           row_ids = 14:15)


```


`r tab_caps("d.cols.tab")`
```{r}
d_cols_tab
```


# Error flags

## Flags for individual reads

`r tab_caps("relevant.flags", display = "cite")` shows the meaning of each error flag value. These flags are used for both daily and half-hourly reads. The error flags have evolved over the editions so please check the documentation relating to the dataset in use to learn about the relevant error flags.  

```{r ErrorFlagTable, include = FALSE}
# -----
# Create a table called 'flags' to store the error flags, used for presenting the results in the markdown document
# -----

#   3   Ignore - Reading missing but not required as incorrect read time
#   2   No meter - or at least none we find via the DCC inventory
#   1   Valid read - a read that doesn't meet any other error flagging criteria
#   0   Missing read for an existing meter
#  -1  'Max read' - see definition in setup
#  -2  'High read' - see definition in setup
#  -3  Negative read
#  -4  Incorrect units - see 'correctUnits' code chunk
#  -5  Valid read but invalid read time

relevant.flags <- data.table(
  Flag = c(3:-6),
  Meaning = c(
    "Ignore",
    "No meter",
    "Valid",
    "Missing",
    "Max read",
    "Very high but not max",
    "Negative",
    "Elec in kWh",
    "Valid but invalid time",
    "Suspicious zero"
  ),
  Details = c(
    "Invalid read time and no read for this read type - row exists for a different read type so ignore",
    "The gas (or very rarely electricity) meter does not exist in the DCC inventory",
    "The read exists and does not meet any of the other error flag criteria, thus presumed valid",
    "The read should exist but is missing",
    "The read is (presumably) the largest storable number on the meter - details below",
    "The read is higher than plausible but not a 'Max read' - see section below for thresholds",
    "The read is negative (none found)",
    "The electricity read was reported in kWh rather than Wh - details below",
    "The read time is invalid but otherwise the read is valid",
    "The zero read is suspected to be an error - see details in section below"
  )
)


relevant.flags <- my.flex(relevant.flags, autofit = FALSE)


```

`r tab_caps("relevant.flags")`
```{r}
relevant.flags
```


### 'Max reads' (error flag -1)

We discovered that multiple participants have some electricity readings as 16777215 Wh and multiple gas readings of 16777.215 m^3^. These numbers are all 1s in binary which implies they are the maximum read the (32-bit) meter can store, and likely due to some technical error. We call this type of error 'Max read'. Note that in a few cases the number is the max read stored in 64 bits - these are replaced by the 32-bit maximum to save memory. 

### Very high reads (error flag -2)


```{r}
high_limits <- data.table(deviceType = c(rep("ESME", 3), rep("GPF", 2)),
                          readType = c("DL", "AI", "AE", "DL", "AI"),
                          lim = c(48 * 24 * 1000,
                                  24 * 1000,
                                  10000 / 2,
                                  16 * 24,
                                  16/2),
                          Units = c(rep("Wh", 3), rep("m^3", 2)),
                          Assumptions = c("Max 200A fuse, 240V",
                                          "Max 200A fuse, 240V",
                                          "10kW max PV capacity",
                                          "16m^3/hr max capacity",
                                          "16m^3/hr max capacity")
                          )

setnames(high_limits, "lim", "High Read Limit")

high_lim_tab <- my.flex(high_limits, autofit = FALSE)
high_lim_tab <- add.simple.table.superscript(high_lim_tab,
                                             row_ids = c(4, 5))
high_lim_tab <- add.simple.table.superscript(high_lim_tab,
                                             before = "16m",
                                             after = "/hr max capacity",
                                             col_name = "Assumptions",
                                             row_ids = c(4, 5))

```

For gas and active electricity reads we flag if the reading is larger than we deem plausible, attempting to be cautious with our definition of 'plausible'. The following table shows our definitions of what constitutes a 'very high' read (so long as the read is not high enough to be a 'max read'). 


`r tab_caps("high_lim_tab")`
```{r}
high_lim_tab
```


### Readings in the wrong units (error flag -4)

We also discovered that for some participants all daily electricity active import readings were all extremely low. At the time of the initial investigation (August 2020) there were no participants with a maximum read between 85 and 2285 Wh; therefore we determined that any electricity active import (or export) meter with its highest reading less than 100 was erroneously reporting in kWh rather than in Wh as specified by the Smart Energy Code. This was verified by comparing the daily readings with the sum of the half-hours for the same day.
Subsequent investigations in March 2021 revealed that some meters were correctly recording daily electricity readings in Wh up until the date of a meter replacement, at which point the daily reads became approximately 1/1000th of the sum of the half-hourly reads. By 'approximately' we mean that after dividing the half-hourly sum by 1000 and rounding down, the result is within 1 of the daily read. For example, half-hourly sums between 5000 and 599 would be classed as approximately 1000 times bigger than a daily read between 4 and 6 (in order to handle rounding errors and slight read mismatch).  In such cases, if a household has at least 5 rows with the daily and half-hourly sums in this situation (excluding daily reads of 0 which may be a different kind of error), all of such cases for the household are flagged as a unit error as above. Without checking all cases manually, a very small number of reads will be mis-flagged as unit errors or unit errors will be missed. 

It is possible that some half-hourly sums are approximately 1/100th of the daily read, indicating readings in tens of Wh. This has been found to affect just a handful of meters at present, and it is left to researchers to decide how best to deal with daily and half-hourly sum mismatches in general. In most cases we believe that the sum of half-hours is more reliable than a daily read, but it depends on the individual meter, and not all days have complete valid half-hourly reads to sum. Note that if there are not the correct number of valid half-hourly reads taken at the right times then the half-hourly sum variable will be NA. 

### 'Suspicious' Zero Reads (error flag -6)

From Edition 04 zero readings for daily electricity active import reads are flagged with -6 to indicate that they may be suspicious. Data quality analysis revealed that this read type is susceptible to errors. In particular, there are meters that start recording all zeros when the clocks go forward in the Spring until they change back, or in some cases, until they are fixed part-way through this period. No evidence has been found that this issue affects the other types of reads collected. 


## Flags for the daily and half-hourly sum match

The daily data table contains fields called "Elec_sum_match" and "Gas_sum_match" which give an error flag for how the sum of the half-hourly reads for that day compares with the daily read. They are described in the table below. It isn't always possible to compare the sum and the daily readings; if any were reported in the wrong units, if any reads were taken at the wrong time, or if any of the reads were missing. By 'any' we mean the daily read and the 48 half-hourly reads (46 or 50 on clock change days). 

```{r sumMatchTable, include = FALSE}

elec_match_limit <- 1     # (Wh)
elec_similar_limit <- 10  # (Wh)
gas_match_limit <- 0.001*1000  # (L)
gas_similar_limit <- 0.01*1000 # (L)

sum.match.tab <- data.table(Code = 3:-2,
                            Meaning = c(
                              "Daily electricity read originally recorded in kWh, match not tested due to rounding issues.",
                              "No meter: the (gas) meter does not exist in the DCC inventory",
                              paste("Daily read and half-hourly sum match: for electricity difference <= ", 
                              elec_match_limit," Wh, for gas difference <= ", gas_match_limit, " L", sep = ""),
                              "Comparison not possible: do not have 48 valid half-hourly reads or daily read isn't valid",
                              paste("Daily read and half-hourly sum are similar but don't match: for electricity ",
                              elec_match_limit," < difference <= ", elec_similar_limit ," Wh, for gas ", 
                              gas_match_limit, " < difference <= ", gas_similar_limit, " L", sep = ""), 
                              "Daily read and half-hourly read are neither similar nor match but are both valid"
                              )
                            )

sum.match.tab <- my.flex(sum.match.tab)

```

`r tab_caps("sum.match.tab")`
```{r}
sum.match.tab
```


# Read-type data quality summary

A read type is a combination of device type and schedule type, such as daily active electricity import or half-hourly reactive electricity export. There are `r ncol_reads` columns in the read-type data quality table. Each read type for each participant has its own row, which gives the number of readings with each error code, the start and end dates of the schedule (when we expect the earliest and latest readings to be), and some calculated columns for the percentage missing and valid etc. The details are given in the table below. Note that reads with error code 3 are not included in this table as they are an artefact of the data structure rather than genuine reads (see above for details). 


```{r schedules, include = FALSE}

  ncols.sched.tab <- ncol(rt_summary)
  
  sched.tab <- data.table(columnNames = rt_colnames,
                            description = rep(NA_character_, ncols.sched.tab),
                            units = rep(NA_character_, ncols.sched.tab),
                            class = rt_col_class,
                            example = rep(NA_character_, ncols.sched.tab))
  
  
  sched.tab[columnNames == "PUPRN",
            `:=`(description = "Pseudonymised participant identifier",
                 units = "NA",
                 example = "1VUXXXF1")]
  
  sched.tab[columnNames == "valid",
            `:=`(description = "Number of valid readings taken at the right time (error flag 1 and Valid_read_time = TRUE)",
                 units = "NA",
                 example = "96")]
  
  sched.tab[columnNames == "validOrHHsumValid",
            `:=`(description = "Number of valid readings taken at the right time (error flag 1 and Valid_read_time = TRUE), or days with a valid half hourly sum (all expected half hourly readings have error flag 1)",
                 units = "NA",
                 example = "96")]
  
  sched.tab[columnNames == "missing",
            `:=`(description = "Number of missing readings (error flag 0)",
                 units = "NA",
                 example = "27")]
  
  
  sched.tab[columnNames == "maxRead",
            `:=`(description = "Number of readings with the 'Max Read' error (flag -1)",
                 units = "NA",
                 example = "4")]
  
  sched.tab[columnNames == "highRead",
            `:=`(description = "Number of readings between the 'very high' and the 'max read' thresholds (error flag -2)",
                 units = "NA",
                 example = "2")]
  
  sched.tab[columnNames == "negative",
            `:=`(description = "Number of negative readings with (error flag -3)",
                 units = "NA",
                 example = "0")]
  
  sched.tab[columnNames == "wrongUnits",
            `:=`(description = "Number of readings with electricity recorded in kWh (error flag -4)",
                 units = "NA",
                 example = "3")]
  
  
  sched.tab[columnNames == "validWrongTime",   
            `:=`(description = "Number of valid readings taken at the incorrect time but that otherwise would have been given a valid flag",
                  units = "NA",
                  example = "7")]
  
  sched.tab[columnNames == "suspiciousZero",   
            `:=`(description = "Daily electricity active import readings that are zero - our analysis has shown these are likely to be errors, particularly during BST",
                  units = "NA",
                  example = "7")]
  
  sched.tab[columnNames == "deviceType",
            `:=`(description = "Device type: gas (GPF) or electricity (ESME) meter",
                 units = "NA",
                 example = "GPF")]
  
  sched.tab[columnNames == "readType",
            `:=`(description = "Defined in Table 1",
                 units = "NA",
                 example = "AI")]
  
  sched.tab[columnNames == "theoreticalStart",
            `:=`(description = "Earliest possible reading for the schedule",
                 units = "%Y-%m-%d",
                 example = "2019-11-01")]
  
  sched.tab[columnNames == "theoreticalEnd",
            `:=`(description = "Latest possible reading for the schedule",
                 units = "%Y-%m-%d",
                 example = "2020-02-29")]
  
  sched.tab[columnNames == "firstValidReadDate",
            `:=`(description = "Earliest date with a valid read (error flag 1 and Valid_read_time = TRUE)",
                 units = "%Y-%m-%d",
                 example = "2018-11-26")]
  
  sched.tab[columnNames == "lastValidReadDate",
            `:=`(description = "Latest date with a valid read (error flag 1 and Valid_read_time = TRUE)",
                 units = "%Y-%m-%d",
                 example = "2020-05-31")]
  
  sched.tab[columnNames == "daysRange",
            `:=`(description = "Schedule length = scheduleEnd - scheduleStart + 1",
                 units = "NA",
                 example = "100")]
  
  sched.tab[columnNames == "maxPossReads",
            `:=`(description = "Maximum possible reads available (= daysRange for daily data, = 48 * daysRange for half-hourly)",
                 units = "NA",
                 example = "4800")]
  
  sched.tab[columnNames == "percValid",
            `:=`(description = "Percentage of possible reads that are valid (error flag 1 and Valid_read_time = TRUE) rounded to 2 decimal places",
                 units = "NA",
                 example = "95.02")]
  
  sched.tab[columnNames == "percMissing",
            `:=`(description = "Percentage of possible reads that are missing (error flag 0) rounded to 2 decimal places",
                 units = "NA",
                 example = "2.13")]
  
  sched.tab[columnNames == "percError",
            `:=`(description = "Percentage of possible reads that are erroneous (error flag -1, -2, -3, -4 or Valid_read_time = FALSE) rounded to 2 decimal places",
                 units = "NA",
                 example = "3.04")]
  
  sched.tab[columnNames == "percValidOrUnitError",
            `:=`(description = "Percentage of possible reads that are valid or have a unit error (Valid_read_time = TRUE and error flag 1, or -4) rounded to 2 decimal places",
                 units = "NA",
                 example = "96.98")]
  
  sched.tab[columnNames == "minValidRead",
            `:=`(description = "Minimum read of the valid reads (after unit-correction if necessary)",
                 units = "Wh (elec), m^3 (gas)",
                 example = "0")]
  
  sched.tab[columnNames == "maxValidRead",
            `:=`(description = "Maximum read of the valid reads (after unit-correction if necessary)",
                 units = "Wh (elec), m^3 (gas)",
                 example = "302")]
  
  sched.tab[columnNames == "meanValidRead",
            `:=`(description = "Mean of the valid reads (after unit-correction if necessary), 2 decimal places",
                 units = "Wh (elec), m^3 (gas)",
                 example = "43.21")]
  
  
  colnames(sched.tab) <- c("Field", "Description", "Units", "Class", "Example value")
  
  
  sched_tab <- my.flex(sched.tab, autofit = FALSE)
  sched_tab <- compose(sched_tab,
                     i = 22:25,
                     j = "Units",
                     part = "body",
                     value = as_paragraph(
                       "Wh (elec), m",
                       as_sup("3"),
                       " (gas)"
                     ))


```

`r tab_caps("sched.tab")`
```{r}
sched_tab
```