-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathserl_smart_meter_documentation.Rmd
870 lines (635 loc) · 42.2 KB
/
serl_smart_meter_documentation.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
---
title: "Smart meter consumption data: \nTechnical documentation"
output:
word_document:
reference_doc: SERL_word_template_landscape1.docx
toc: yes
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE)
#following only works for edition not monthly - and only when called from main
#release_version <- str_sub(getwd(), start = -9)
#ed <- stringr::str_sub(sub('.*edition', '', getwd()),
# start = 1, end = 2)
#edition <- paste0("edition", ed)
# Deal with different working directories depending on whether file is knitted from RStudio or
# rendered from main.R
wd <- getwd()
if(stringr::str_sub(wd, -13) == "documentation") {
parent_dir<-stringr::str_sub(wd, 1, nchar(wd) - 14)
release_version<-sub(".*/","",parent_dir)
} else {
release_version<-sub(".*/","",wd)
}
if(stringr::str_sub(wd, -13) == "documentation") {
source(paste0(stringr::str_sub(wd, 1, nchar(wd) - 13),
"/functions/create.info.table.R"))
} else {
source("./functions/create.info.table.R")
}
library(data.table)
library(ggplot2)
library(knitr)
library(captioner)
library(lubridate)
library(flextable)
library(officer)
library(stringr)
# Import all filenames, locations, source function files
data_folder <- paste0("your/data/foler", release_version)
final_rdata_folder <- paste0(data_folder, "/processed_rdata/")
rt_processed_file <- paste0(final_rdata_folder, "serl_smart_meter_rt_summary_", release_version, ".RDS")
intermediary_folder <- paste0(data_folder, "/intermediary_rdata/")
sm_doc_input_file <- paste0(intermediary_folder, "sm_data_documentation_input.RData")
#source(paste("your/folder/here",
# ed,
# "/Code/setup_edition", ed, ".R", sep = ""))
```
```{r defineInputs, include = FALSE}
# load_table_data <- FALSE # TRUE if we've run this file before and don't want to re-process the data (only edit the text, formatting, etc.)
min_n_to_determine_unit_error <- 30
# Define plotting/reporting variables for the markdown output
fig.h <- 6
fig.w <- 10
fullwidth <- TRUE
font.size <- 18
geom.text.size <- 5
```
```{r loadData, include = FALSE}
#load(rt_processed_file)
rt_summary <- readRDS(rt_processed_file)
load(sm_doc_input_file)
```
```{r functions, include = FALSE}
add.simple.table.superscript <- function(flex_table,
before = "m",
after = "",
power = "3",
row_ids = c(1, 2),
col_name = "Units",
t_part = "body") {
# puts superscript formatting in a flextable. E.g. with the
# defaults, we'd add m^3 (formatted) to rows 1 and 2 in
# column called units.
formatted_table <- compose(
flex_table,
i = row_ids,
j = col_name,
part = t_part,
value = as_paragraph(before,
as_sup(power),
after)
)
return(formatted_table)
}
my.flex <- function(t, autofit = TRUE) {
ft <- flextable(t)
ft <- theme_booktabs(ft)
ft <- theme_zebra(ft,
odd_header = rgb(84/255, 141/255, 212/255, 1),
even_body = "transparent",
odd_body = rgb(242/255, 242/255, 242/255, 1))
ft <- color(ft, color = "white", part = "header")
ft <- italic(ft, italic = TRUE, part = "header")
ft <- bold(ft, bold = FALSE, part = "header")
if(autofit == TRUE) {
ft <- autofit(ft)
} else{
ft <- set_table_properties(ft, width = 1, layout = "autofit")
}
return(ft)
}
remove.filename.version <- function(file_id) {
name_long <- get.serl.filename(file_id, release_version)
name_short <- str_sub(name_long, end = -3)
}
get.daily.filename.structure <- function(file_id) {
name_long <- get.serl.filename(file_id, release_version)
name_short <- str_sub(name_long, end = -11)
name_short <- paste(name_short, "_{YYYY}_edition{number}", sep = "")
}
```
```{r}
info_tab <- create.info.table(auth = "Ellen Zapata-Webborn, Jessica Few, Clare Hanmer")
info_tab
```
```{r captions, include = FALSE}
fig_caps <- captioner::captioner(prefix = "Figure")
tab_caps <- captioner::captioner(prefix = "Table")
tab_caps(name = "read.type.tab",
caption = "Smart meter data types, defined by 'deviceType' and 'readType'. ")
tab_caps(name = "hh.cols.tab",
caption = "Half-hourly data fields. Error flags are defined in the Error Flags section below.")
tab_caps(name = "d.cols.tab",
caption = "Daily data fields. See the Error Flags section below for definitions of the error flags used in this table.")
tab_caps("relevant.flags",
caption = "Error flags and their meanings.")
tab_caps("high_lim_tab",
caption = "Limits used to determine if a read is high enough for a 'very high read' flag (-2).")
tab_caps("sum.match.tab",
caption = "Electricity and gas 'sum_match' values and their meanings.")
tab_caps("sched.tab",
caption = "Read-type data quality summary: data fields. Error codes are explained above in the Error Flags section. Note that statistics for valid reads exclude valid reads recorded at the wrong time.")
tab_caps("partic.tab",
caption = "Participant data quality summary: data fields")
```
# Introduction
This document describes the half-hourly and daily smart meter datasets available to researchers with secure access to the SERL Observatory datasets, along with two data quality summary tables. The half-hourly data is provided in monthly files and the daily data is provided in yearly files. The data were collected from the earliest date available. The datasets described in this document are:
- serl_smart_meter_daily_{YYYY}_edition{number}.csv: yearly files of daily electricity and gas readings with some additional derived columns
- serl_smart_meter_half_hourly_{YYYY}_{MM}_edition{number}: monthly files of half-hourly electricity and gas readings with some additional derived columns (note that this dataset also includes reactive readings and export readings where available). Note that the monthly cut off follows UTC time.
- serl_smart_meter_rt_summary_edition{number}: data quality summary for each read type for each participant (such as number of errors found by type) and basic read statistics (such as mean and maximum)
where _{number}_ is the number of the data release, _e.g._ "02" (note that the first data release files are labelled with the release date rather than edition number), _{YYYY}_ and _{MM}_ are the year and month that files relate to respectively.
This document is structured as follows: we start with some basic information about how the data were collected and the different types of reading available, then we describe the two types of smart meter data tables (daily - multiple files each containing a year of daily data, and half-hourly - multiple files each containing a month of half-hourly data). Next we define the different types of error flag created, and finally we describe the data quality summary table (at the read-type level). Note that the participant summary table documentation was previously given in this file, but has been moved to its own document.
## Data collection
Half-hourly and daily smart meter readings are stored on the smart meter, and accessed by the Smart Energy Research Lab (SERL) as follows. The SERL technical team (The University of Essex (UK Data Archive) between 2018 and August 2022, and then the Advanced Research Computing Centre (ARCC) at University College London) uses a DCC adaptor service to communicate with Great Britain's central smart meter system (the Smart Data Communications Company (DCC)), which acts as a secure messaging service to transfer the smart meter readings (with the consent of SERL participants) to the secure SERL technical infrastructure. Not all properties have a gas meter we are able to access - for example, if there is only a SMETS2 electricity meter but not a SMETS2 gas meter, or if the property does not have mains gas. Researchers can check the EPC and survey data to identify properties which have gas central heating but no SERL gas data if this may affect their analysis.
## Smart meter read types
Smart meter read types are defined by two variables: 'deviceType' and 'readType'. Together they combine to define the type of smart meter data. The full list of smart meter data types are shown in the table below. Note that 'GPF' stands for 'Gas Proxy Function' (a proxy for the gas meter) and 'ESME' stands for 'Electricity Smart Metering Equipment' (the electricity meter).
```{r readTypes, include = FALSE}
read.type.tab[, units := "Wh"]
read.type.tab[deviceType == "GPF", units := "m^3^"]
read.type.tab[deviceType == "ESME" & readType == "DL",
`:=`(description = "Daily active electricity import",
position = 1)]
read.type.tab[deviceType == "ESME" & readType == "AI",
`:=`(description = "Half-hourly active electricity import",
position = 3)]
read.type.tab[deviceType == "ESME" & readType == "AE",
`:=`(description = "Half-hourly active electricity export",
position = 6)]
read.type.tab[deviceType == "ESME" & readType == "RI",
`:=`(description = "Half-hourly reactive electricity import",
position = 4)]
read.type.tab[deviceType == "ESME" & readType == "RE",
`:=`(description = "Half-hourly reactive electricity export",
position = 7)]
read.type.tab[deviceType == "GPF" & readType == "DL",
`:=`(description = "Daily gas import",
position = 2)]
read.type.tab[deviceType == "GPF" & readType == "AI",
`:=`(description = "Half-hourly gas import",
position = 5)]
setkey(read.type.tab, position)
read.type.tab <-
read.type.tab[, .(deviceType, readType, units, description)]
setnames(
read.type.tab,
old = c("units", "description"),
new = c("Units", "Description")
)
read_type_tab <- my.flex(read.type.tab)
read_type_tab <- add.simple.table.superscript(read_type_tab,
row_ids = c(2, 5))
```
`r tab_caps("read.type.tab")`
```{r}
read_type_tab
```
## Changes since the previous edition
There have been a few small changes to the data since the previous data:
- The data now runs up to the end of December 2023 for the smart meter data and climate data.
- Participants who have withdrawn and specified full erasure have been removed from all data files.
- We have increased the minimum threshold for flagging 'high' half hourly electricity active export readings from 2000 Wh to 5000 Wh, as domestic solar photovoltaic arrays have increased in size and some households are now exporting more than our original cut-off (the previous limit was the maximum output for a 4kW array, we have changed the limit to the maximum for a 10kW array).
We continue to analyse and improve the SERL datasets and we aim to strike a balance between improving the data and minimising inconvenience to researcher caused by changes. We welcome feedback to improve our processes.
# Smart meter data tables
## Half-hourly data
The half-hourly data tables have `r ncol_hh` columns. The fields are described in the table below. While the original data have not been modified, additional columns have been added to flag potential errors (see the Error Flags section below) and convert between units. This processing was done using `r paste(R.Version()$version.string)`. Code used for processing will be made available shortly on the SERL github repository [github.com/smartEnergyResearchLab](github.com/smartEnergyResearchLab).
Note that if no data were returned for any meter at a particular time then that row will be missing from the dataset rather than being an empty row. The 'class' field is the R class (e.g. R calls a Boolean a logical).
```{r hhColumnsTable, include = FALSE}
hh.cols.tab <- data.table(
columnNames = hh_colnames,
description = rep(NA_character_, ncol_hh),
units = rep(NA_character_, ncol_hh),
class = hh_col_class,
example = rep(NA_character_, ncol_hh,),
type = rep("Derived", ncol_hh)
)
hh.cols.tab[columnNames == "PUPRN",
`:=`(description = "Pseudonymised participant identifier",
units = "NA",
example = "1VUXXXF1",
type = "Assigned")]
hh.cols.tab[columnNames == "Read_date_effective_local",
`:=`(description = "Date of read (same as date of Read_date_time_local unless read taken at midnight, then the previous day since data pertains to the previous day)",
units = "%Y-%m-%d",
example = "2019-11-01")]
hh.cols.tab[columnNames == "Read_date_time_local",
`:=`(description = "Time read taken (local time: GMT or BST)",
units = "%Y-%m-%d %H:%M:%S tz",
example = "2020-07-02 00:03:30 BST")]
hh.cols.tab[columnNames == "Read_date_time_UTC",
`:=`(description = "Time read taken in UTC",
units = "%Y-%m-%d %H:%M:%S",
example = "2020-07-02 00:02:30",
type = "Primary")]
hh.cols.tab[columnNames == "HH",
`:=`(description = "Half-hour identifier between 1 and 48 (NA if not exactly on the hour or half-hour)",
units = "NA",
example = "48",
type = "Primary")]
hh.cols.tab[columnNames == "Elec_act_imp_hh_Wh",
`:=`(description = "Half-hourly electricity active import read",
units = "Wh",
example = "109",
type = "Primary")]
hh.cols.tab[columnNames == "Elec_react_imp_hh_varh",
`:=`(description = "Half-hourly electricity reactive import read",
units = "varh",
example = "15",
type = "Primary")]
hh.cols.tab[columnNames == "Elec_act_exp_hh_Wh",
`:=`(description = "Half-hourly electricity active export read",
units = "Wh",
example = "65",
type = "Primary")]
hh.cols.tab[columnNames == "Gas_hh_m3",
`:=`(description = "Half-hourly gas import read",
units = "m^3^",
example = "0.244",
type = "Primary")]
hh.cols.tab[columnNames == "Elec_react_exp_hh_varh",
`:=`(description = "Half-hourly electricity reactive export read",
units = "varh",
example = "14",
type = "Primary")]
hh.cols.tab[columnNames == "Elec_import_exists",
`:=`(description = "TRUE if electricity import meter exists in the inventory, otherwise FALSE",
units = "NA",
example = "TRUE")]
hh.cols.tab[columnNames == "Gas_exists",
`:=`(description = "TRUE if gas meter exists in the inventory, otherwise FALSE",
units = "NA",
example = "TRUE")]
hh.cols.tab[columnNames == "Elec_act_imp_flag",
`:=`(description = "Half-hourly electricity active import error flag",
units = "NA",
example = "-2")]
hh.cols.tab[columnNames == "Gas_flag",
`:=`(description = "Half-hourly gas import error flag",
units = "NA",
example = "0")]
hh.cols.tab[columnNames == "Gas_hh_kWh",
`:=`(description = "Half-hourly gas import read in kWh using standard conversion, assuming calorific value = 39.5",
units = "kWh",
example = "2.737835")]
hh.cols.tab[columnNames == "Gas_hh_Wh",
`:=`(description = "Half-hourly gas import read in Wh using standard conversion, assuming calorific value = 39.5",
units = "Wh",
example = "2737.835")]
hh.cols.tab[columnNames == "Elec_export_exists",
`:=`(description = "TRUE if electricity export meter exists in the inventory, otherwise FALSE",
units = "NA",
example = "FALSE")]
hh.cols.tab[columnNames == "Elec_act_exp_flag",
`:=`(description = "Half-hourly electricity active export error flag",
units = "NA",
example = "-4")]
hh.cols.tab[columnNames == "Elec_react_imp_flag",
`:=`(description = "Half-hourly electricity reactive import error flag",
units = "NA",
example = "1")]
hh.cols.tab[columnNames == "Elec_react_exp_flag",
`:=`(description = "Half-hourly electricity reactive export error flag",
units = "NA",
example = "2")]
hh.cols.tab[columnNames == "Valid_read_time",
`:=`(description = "FALSE if read time is not on the hour or half hour, otherwise TRUE",
units = "NA",
example = "TRUE")]
colnames(hh.cols.tab) <- c("Field", "Description", "Units", "Class", "Example value", "Variable type")
hh_cols_tab <- my.flex(hh.cols.tab, autofit = FALSE)
hh_cols_tab <- add.simple.table.superscript(hh_cols_tab,
row_ids = hh.cols.tab[ , which(Field == "Gas_hh_m3")])
```
`r tab_caps("hh.cols.tab")`
```{r}
hh_cols_tab
```
## Daily data
The daily data tables have `r ncol_d` columns. The fields are described in the table below. While original data has not been modified, additional columns have been added to flag potential errors (see Error Flags section below) and to convert between units. This processing was done using `r paste(R.Version()$version.string)`. Code used for processing is available on the SERL github repository [github.com/smartEnergyResearchLab](github.com/smartEnergyResearchLab).
Note that if no data were returned for any meter on a particular day then that row will be missing from the dataset rather than being empty. The exception is if there were no daily reads but the right number of half-hourly reads (48 unless the clocks changed) to sum to the daily total. In these instances the rows have been added to allow for easy imputation of a missing daily read with the sum of the half-hourly reads. For comparison between daily reads and daily sums, half-hourly data requires conversion (provided) to local time. A csv file is provided *(bst_dates_to_2024.csv)* which lists the start and end dates of British Summer Time (BST) for reference so the number of half-hours expected on each day is clear. Researchers may wish to avoid clock change days in their analysis, or take them into consideration.
The daily data includes fields added for comparison between the daily readings and half-hourly readings for the same days. As described in `r tab_caps("d.cols.tab", display = "cite")` there are columns for the sum of the half-hourly readings if there were 48 valid reads taken at the right times (or 46 or 50 reads if the clocks changed), the difference between the half-hourly sum and the daily read (if both exist and are valid), and a 'sum_match' column coded to state the condition of this match. The sum match codes are defined in `r tab_caps("sum.match.tab", display = "cite")` in the Error Flags section. Note that reads are considered invalid if taken at the wrong time (not at midnight for daily reads; not on the hour or half hour for half-hourly reads).
```{r DColumnsTable, include = FALSE}
d.cols.tab <- data.table(columnNames = d_colnames,
description = rep(NA_character_, ncol_d),
units = rep(NA_character_, ncol_d),
class = d_col_class,
example = rep(NA_character_, ncol_d),
type = rep("Derived", ncol_d))
d.cols.tab[columnNames == "PUPRN", `:=`(description = "Pseudonymised participant identifier",
units = "NA",
example = "1VUXXXF1",
type = "Assigned")]
# correct
d.cols.tab[columnNames == "Read_date_effective_local", `:=`(description = "Date that read relates to (in local time): previous day, unless after midday (then same day)",
units = "%Y-%m-%d",
example = "2019-11-01")]
d.cols.tab[columnNames == "Read_date_time_UTC", `:=`(description = "Time and date of read (UTC). Time not stated if at midnight",
units = "%Y-%m-%d",
example = "2019-11-02",
type = "Primary")]
d.cols.tab[columnNames == "Valid_read_time", `:=`(description = "TRUE if reading was at midnight, otherwise FALSE",
units = "NA",
example = "TRUE")]
d.cols.tab[columnNames == "Elec_act_imp_d_Wh", `:=`(description = "Daily electricity active import read",
units = "Wh",
example = "5839",
type = "Primary")]
d.cols.tab[columnNames == "Unit_correct_elec_act_imp_d_Wh", `:=`(description = "Daily electricity active import read corrected from kWh to Wh where kWh reporting is suspected, otherwise equals Elec_act_imp_d_Wh",
units = "Wh",
example = "5839")]
d.cols.tab[columnNames == "Elec_act_imp_d_kWh", `:=`(description = "Unit_correct_elec_act_imp_d_Wh divided by 1000. Note that reads originally recorded in kWh will be integers, otherwise 3 decimal places.",
units = "kWh",
example = "5.839")]
d.cols.tab[columnNames == "Elec_act_imp_flag", `:=`(description = "Daily electricity active import error flag",
units = "NA",
example = "-2")]
d.cols.tab[columnNames == "Elec_act_imp_hh_sum_Wh", `:=`(description = "Sum of half-hourly electricity active import reads for this date (NA if there were not 48* valid reads). *46 required when the clocks go forward, 50 when the clocks go back.",
units = "Wh",
example = "5742")]
d.cols.tab[columnNames == "Elec_act_imp_sum_diff", `:=`(description = "Unit_correct_elec_act_imp_d_Wh - Elec_act_imp_hh_sum_Wh",
units = "Wh",
example = "0")]
d.cols.tab[columnNames == "Elec_sum_match", `:=`(description = "Error code for whether the sum of half-hourly electricity active import matches the daily electricity read",
units = "NA",
example = "1")]
d.cols.tab[columnNames == "Gas_exists", `:=`(description = "TRUE if gas meter exists in the inventory, otherwise FALSE",
units = "NA",
example = "TRUE")]
d.cols.tab[columnNames == "Gas_d_m3", `:=`(description = "Daily gas read",
units = "m^3",
example = "8.214",
type = "Primary")]
d.cols.tab[columnNames == "Gas_d_kWh", `:=`(description = "Daily gas import read in kWh using standard conversion, assuming calorific value = 39.5",
units = "kWh",
example = "92.16628")]
d.cols.tab[columnNames == "Gas_flag", `:=`(description = "Daily gas import error flag",
units = "NA",
example = "2")]
d.cols.tab[columnNames == "N_gas_hh", `:=`(description = "Number of valid half-hourly gas import readings available on this date excluding invalid read time data",
units = "NA",
example = "48")]
d.cols.tab[columnNames == "Gas_hh_sum_m3", `:=`(description = "Sum of half-hourly gas reads for this date (NA if there were not 48* valid reads). *46 required when the clocks go forward, 50 when the clocks go back.",
units = "m^3",
example = "8.763")]
d.cols.tab[columnNames == "Gas_sum_diff", `:=`(description = "Gas_d_m3 - Gas_hh_sum_m3",
units = "m^3",
example = "0.273")]
d.cols.tab[columnNames == "Gas_sum_match", `:=`(description = "Error code for whether the sum of half-hourly gas import matches the daily gas read",
units = "NA",
example = "0")]
d.cols.tab[columnNames == "Valid_hh_sum_or_daily_elec", `:=`(description = "True if a valid half-hourly sum or daily electricity read exists",
units = "NA",
example = "TRUE")]
d.cols.tab[columnNames == "Valid_hh_sum_or_daily_gas", `:=`(description = "True if a valid half-hourly sum or daily gas read exists",
units = "NA",
example = "TRUE")]
colnames(d.cols.tab) <- c("Field", "Description", "Units",
"Class", "Example value", "Variable Type")
d.cols.tab = d.cols.tab[1:(ncol_d - 1)]
d_cols_tab <- my.flex(d.cols.tab, autofit = FALSE)
d_cols_tab <- add.simple.table.superscript(d_cols_tab,
row_ids = 14:15)
```
`r tab_caps("d.cols.tab")`
```{r}
d_cols_tab
```
# Error flags
## Flags for individual reads
`r tab_caps("relevant.flags", display = "cite")` shows the meaning of each error flag value. These flags are used for both daily and half-hourly reads. The error flags have evolved over the editions so please check the documentation relating to the dataset in use to learn about the relevant error flags.
```{r ErrorFlagTable, include = FALSE}
# -----
# Create a table called 'flags' to store the error flags, used for presenting the results in the markdown document
# -----
# 3 Ignore - Reading missing but not required as incorrect read time
# 2 No meter - or at least none we find via the DCC inventory
# 1 Valid read - a read that doesn't meet any other error flagging criteria
# 0 Missing read for an existing meter
# -1 'Max read' - see definition in setup
# -2 'High read' - see definition in setup
# -3 Negative read
# -4 Incorrect units - see 'correctUnits' code chunk
# -5 Valid read but invalid read time
relevant.flags <- data.table(
Flag = c(3:-6),
Meaning = c(
"Ignore",
"No meter",
"Valid",
"Missing",
"Max read",
"Very high but not max",
"Negative",
"Elec in kWh",
"Valid but invalid time",
"Suspicious zero"
),
Details = c(
"Invalid read time and no read for this read type - row exists for a different read type so ignore",
"The gas (or very rarely electricity) meter does not exist in the DCC inventory",
"The read exists and does not meet any of the other error flag criteria, thus presumed valid",
"The read should exist but is missing",
"The read is (presumably) the largest storable number on the meter - details below",
"The read is higher than plausible but not a 'Max read' - see section below for thresholds",
"The read is negative (none found)",
"The electricity read was reported in kWh rather than Wh - details below",
"The read time is invalid but otherwise the read is valid",
"The zero read is suspected to be an error - see details in section below"
)
)
relevant.flags <- my.flex(relevant.flags, autofit = FALSE)
```
`r tab_caps("relevant.flags")`
```{r}
relevant.flags
```
### 'Max reads' (error flag -1)
We discovered that multiple participants have some electricity readings as 16777215 Wh and multiple gas readings of 16777.215 m^3^. These numbers are all 1s in binary which implies they are the maximum read the (32-bit) meter can store, and likely due to some technical error. We call this type of error 'Max read'. Note that in a few cases the number is the max read stored in 64 bits - these are replaced by the 32-bit maximum to save memory.
### Very high reads (error flag -2)
```{r}
high_limits <- data.table(deviceType = c(rep("ESME", 3), rep("GPF", 2)),
readType = c("DL", "AI", "AE", "DL", "AI"),
lim = c(48 * 24 * 1000,
24 * 1000,
10000 / 2,
16 * 24,
16/2),
Units = c(rep("Wh", 3), rep("m^3", 2)),
Assumptions = c("Max 200A fuse, 240V",
"Max 200A fuse, 240V",
"10kW max PV capacity",
"16m^3/hr max capacity",
"16m^3/hr max capacity")
)
setnames(high_limits, "lim", "High Read Limit")
high_lim_tab <- my.flex(high_limits, autofit = FALSE)
high_lim_tab <- add.simple.table.superscript(high_lim_tab,
row_ids = c(4, 5))
high_lim_tab <- add.simple.table.superscript(high_lim_tab,
before = "16m",
after = "/hr max capacity",
col_name = "Assumptions",
row_ids = c(4, 5))
```
For gas and active electricity reads we flag if the reading is larger than we deem plausible, attempting to be cautious with our definition of 'plausible'. The following table shows our definitions of what constitutes a 'very high' read (so long as the read is not high enough to be a 'max read').
`r tab_caps("high_lim_tab")`
```{r}
high_lim_tab
```
### Readings in the wrong units (error flag -4)
We also discovered that for some participants all daily electricity active import readings were all extremely low. At the time of the initial investigation (August 2020) there were no participants with a maximum read between 85 and 2285 Wh; therefore we determined that any electricity active import (or export) meter with its highest reading less than 100 was erroneously reporting in kWh rather than in Wh as specified by the Smart Energy Code. This was verified by comparing the daily readings with the sum of the half-hours for the same day.
Subsequent investigations in March 2021 revealed that some meters were correctly recording daily electricity readings in Wh up until the date of a meter replacement, at which point the daily reads became approximately 1/1000th of the sum of the half-hourly reads. By 'approximately' we mean that after dividing the half-hourly sum by 1000 and rounding down, the result is within 1 of the daily read. For example, half-hourly sums between 5000 and 599 would be classed as approximately 1000 times bigger than a daily read between 4 and 6 (in order to handle rounding errors and slight read mismatch). In such cases, if a household has at least 5 rows with the daily and half-hourly sums in this situation (excluding daily reads of 0 which may be a different kind of error), all of such cases for the household are flagged as a unit error as above. Without checking all cases manually, a very small number of reads will be mis-flagged as unit errors or unit errors will be missed.
It is possible that some half-hourly sums are approximately 1/100th of the daily read, indicating readings in tens of Wh. This has been found to affect just a handful of meters at present, and it is left to researchers to decide how best to deal with daily and half-hourly sum mismatches in general. In most cases we believe that the sum of half-hours is more reliable than a daily read, but it depends on the individual meter, and not all days have complete valid half-hourly reads to sum. Note that if there are not the correct number of valid half-hourly reads taken at the right times then the half-hourly sum variable will be NA.
### 'Suspicious' Zero Reads (error flag -6)
From Edition 04 zero readings for daily electricity active import reads are flagged with -6 to indicate that they may be suspicious. Data quality analysis revealed that this read type is susceptible to errors. In particular, there are meters that start recording all zeros when the clocks go forward in the Spring until they change back, or in some cases, until they are fixed part-way through this period. No evidence has been found that this issue affects the other types of reads collected.
## Flags for the daily and half-hourly sum match
The daily data table contains fields called "Elec_sum_match" and "Gas_sum_match" which give an error flag for how the sum of the half-hourly reads for that day compares with the daily read. They are described in the table below. It isn't always possible to compare the sum and the daily readings; if any were reported in the wrong units, if any reads were taken at the wrong time, or if any of the reads were missing. By 'any' we mean the daily read and the 48 half-hourly reads (46 or 50 on clock change days).
```{r sumMatchTable, include = FALSE}
elec_match_limit <- 1 # (Wh)
elec_similar_limit <- 10 # (Wh)
gas_match_limit <- 0.001*1000 # (L)
gas_similar_limit <- 0.01*1000 # (L)
sum.match.tab <- data.table(Code = 3:-2,
Meaning = c(
"Daily electricity read originally recorded in kWh, match not tested due to rounding issues.",
"No meter: the (gas) meter does not exist in the DCC inventory",
paste("Daily read and half-hourly sum match: for electricity difference <= ",
elec_match_limit," Wh, for gas difference <= ", gas_match_limit, " L", sep = ""),
"Comparison not possible: do not have 48 valid half-hourly reads or daily read isn't valid",
paste("Daily read and half-hourly sum are similar but don't match: for electricity ",
elec_match_limit," < difference <= ", elec_similar_limit ," Wh, for gas ",
gas_match_limit, " < difference <= ", gas_similar_limit, " L", sep = ""),
"Daily read and half-hourly read are neither similar nor match but are both valid"
)
)
sum.match.tab <- my.flex(sum.match.tab)
```
`r tab_caps("sum.match.tab")`
```{r}
sum.match.tab
```
# Read-type data quality summary
A read type is a combination of device type and schedule type, such as daily active electricity import or half-hourly reactive electricity export. There are `r ncol_reads` columns in the read-type data quality table. Each read type for each participant has its own row, which gives the number of readings with each error code, the start and end dates of the schedule (when we expect the earliest and latest readings to be), and some calculated columns for the percentage missing and valid etc. The details are given in the table below. Note that reads with error code 3 are not included in this table as they are an artefact of the data structure rather than genuine reads (see above for details).
```{r schedules, include = FALSE}
ncols.sched.tab <- ncol(rt_summary)
sched.tab <- data.table(columnNames = rt_colnames,
description = rep(NA_character_, ncols.sched.tab),
units = rep(NA_character_, ncols.sched.tab),
class = rt_col_class,
example = rep(NA_character_, ncols.sched.tab))
sched.tab[columnNames == "PUPRN",
`:=`(description = "Pseudonymised participant identifier",
units = "NA",
example = "1VUXXXF1")]
sched.tab[columnNames == "valid",
`:=`(description = "Number of valid readings taken at the right time (error flag 1 and Valid_read_time = TRUE)",
units = "NA",
example = "96")]
sched.tab[columnNames == "validOrHHsumValid",
`:=`(description = "Number of valid readings taken at the right time (error flag 1 and Valid_read_time = TRUE), or days with a valid half hourly sum (all expected half hourly readings have error flag 1)",
units = "NA",
example = "96")]
sched.tab[columnNames == "missing",
`:=`(description = "Number of missing readings (error flag 0)",
units = "NA",
example = "27")]
sched.tab[columnNames == "maxRead",
`:=`(description = "Number of readings with the 'Max Read' error (flag -1)",
units = "NA",
example = "4")]
sched.tab[columnNames == "highRead",
`:=`(description = "Number of readings between the 'very high' and the 'max read' thresholds (error flag -2)",
units = "NA",
example = "2")]
sched.tab[columnNames == "negative",
`:=`(description = "Number of negative readings with (error flag -3)",
units = "NA",
example = "0")]
sched.tab[columnNames == "wrongUnits",
`:=`(description = "Number of readings with electricity recorded in kWh (error flag -4)",
units = "NA",
example = "3")]
sched.tab[columnNames == "validWrongTime",
`:=`(description = "Number of valid readings taken at the incorrect time but that otherwise would have been given a valid flag",
units = "NA",
example = "7")]
sched.tab[columnNames == "suspiciousZero",
`:=`(description = "Daily electricity active import readings that are zero - our analysis has shown these are likely to be errors, particularly during BST",
units = "NA",
example = "7")]
sched.tab[columnNames == "deviceType",
`:=`(description = "Device type: gas (GPF) or electricity (ESME) meter",
units = "NA",
example = "GPF")]
sched.tab[columnNames == "readType",
`:=`(description = "Defined in Table 1",
units = "NA",
example = "AI")]
sched.tab[columnNames == "theoreticalStart",
`:=`(description = "Earliest possible reading for the schedule",
units = "%Y-%m-%d",
example = "2019-11-01")]
sched.tab[columnNames == "theoreticalEnd",
`:=`(description = "Latest possible reading for the schedule",
units = "%Y-%m-%d",
example = "2020-02-29")]
sched.tab[columnNames == "firstValidReadDate",
`:=`(description = "Earliest date with a valid read (error flag 1 and Valid_read_time = TRUE)",
units = "%Y-%m-%d",
example = "2018-11-26")]
sched.tab[columnNames == "lastValidReadDate",
`:=`(description = "Latest date with a valid read (error flag 1 and Valid_read_time = TRUE)",
units = "%Y-%m-%d",
example = "2020-05-31")]
sched.tab[columnNames == "daysRange",
`:=`(description = "Schedule length = scheduleEnd - scheduleStart + 1",
units = "NA",
example = "100")]
sched.tab[columnNames == "maxPossReads",
`:=`(description = "Maximum possible reads available (= daysRange for daily data, = 48 * daysRange for half-hourly)",
units = "NA",
example = "4800")]
sched.tab[columnNames == "percValid",
`:=`(description = "Percentage of possible reads that are valid (error flag 1 and Valid_read_time = TRUE) rounded to 2 decimal places",
units = "NA",
example = "95.02")]
sched.tab[columnNames == "percMissing",
`:=`(description = "Percentage of possible reads that are missing (error flag 0) rounded to 2 decimal places",
units = "NA",
example = "2.13")]
sched.tab[columnNames == "percError",
`:=`(description = "Percentage of possible reads that are erroneous (error flag -1, -2, -3, -4 or Valid_read_time = FALSE) rounded to 2 decimal places",
units = "NA",
example = "3.04")]
sched.tab[columnNames == "percValidOrUnitError",
`:=`(description = "Percentage of possible reads that are valid or have a unit error (Valid_read_time = TRUE and error flag 1, or -4) rounded to 2 decimal places",
units = "NA",
example = "96.98")]
sched.tab[columnNames == "minValidRead",
`:=`(description = "Minimum read of the valid reads (after unit-correction if necessary)",
units = "Wh (elec), m^3 (gas)",
example = "0")]
sched.tab[columnNames == "maxValidRead",
`:=`(description = "Maximum read of the valid reads (after unit-correction if necessary)",
units = "Wh (elec), m^3 (gas)",
example = "302")]
sched.tab[columnNames == "meanValidRead",
`:=`(description = "Mean of the valid reads (after unit-correction if necessary), 2 decimal places",
units = "Wh (elec), m^3 (gas)",
example = "43.21")]
colnames(sched.tab) <- c("Field", "Description", "Units", "Class", "Example value")
sched_tab <- my.flex(sched.tab, autofit = FALSE)
sched_tab <- compose(sched_tab,
i = 22:25,
j = "Units",
part = "body",
value = as_paragraph(
"Wh (elec), m",
as_sup("3"),
" (gas)"
))
```
`r tab_caps("sched.tab")`
```{r}
sched_tab
```