diff --git a/.gitignore b/.gitignore index 4452110..354f6cd 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,7 @@ tests/* *.Rproj *.tar.gz + +*.html +README_cache/ +vignettes/rdefra_vignette_cache diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..beea6db --- /dev/null +++ b/.travis.yml @@ -0,0 +1,6 @@ +language: r +cache: packages +before_install: + - sudo apt-get -qq update + - sudo apt-get install r-cran-rgdal + - cd rdefra diff --git a/README.Rmd b/README.Rmd new file mode 100644 index 0000000..4e0490a --- /dev/null +++ b/README.Rmd @@ -0,0 +1,179 @@ + + +rdefra: Interact with the UK AIR Pollution Database from DEFRA +--------------- + +[![CRAN Status Badge](http://www.r-pkg.org/badges/version/rdefra)](http://cran.r-project.org/web/packages/rdefra) +[![CRAN Total Downloads](http://cranlogs.r-pkg.org/badges/grand-total/rdefra)](http://cran.rstudio.com/web/packages/rdefra/index.html) +[![CRAN Monthly Downloads](http://cranlogs.r-pkg.org/badges/rdefra)](http://cran.rstudio.com/web/packages/rdefra/index.html) + +```{r, echo = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.path = "README-", + message = FALSE +) +``` + +
+ +[Rdefra](https://cran.r-project.org/package=rdefra) is an R package to retrieve air pollution data from the Air Information Resource (UK-AIR) of the Department for Environment, Food and Rural Affairs in the United Kingdom. UK-AIR does not provide a public API for programmatic access to data, therefore this package scrapes the HTML pages to get relevant information. + +This package follows a logic similar to other packages such as [waterData](https://cran.r-project.org/package=waterdata) and [rnrfa](https://cran.r-project.org/package=rnrfa): sites are first identified through a catalogue, data are imported via the station identification number, then data are visualised and/or used in analyses. The metadata related to the monitoring stations are accessible through the function `catalogue()`, missing stations' coordinates can be obtained using the function `EastingNorthing()`, and time series data related to different pollutants can be obtained using the function `get1Hdata()`. + +The package is designed to collect data efficiently. It allows downloading multiple years of data for a single station with one line of code and, if used with the parallel package, allows the acquisition of data from hundreds of sites in only a few minutes. + +For similar functionalities see also the [openair](https://cran.r-project.org/package=openair) package, which relies on a local copy of the data on servers at King's College (UK). + +### Dependencies +The rdefra package is dependent on a number of CRAN packages. 
Check for missing dependencies and install them: + +```R +packs <- c('RCurl', 'XML', 'plyr', 'rgdal', 'sp', 'devtools') +new.packages <- packs[!(packs %in% installed.packages()[,"Package"])] +if(length(new.packages)) install.packages(new.packages) +``` + +### Installation + +You can install this package from CRAN: + +```{r, eval=FALSE} +install.packages("rdefra") +``` + + +Or you can install the development version from Github with [devtools](https://github.com/hadley/devtools): + +```{r, eval=FALSE} +library(devtools) +install_github("cvitolo/r_rdefra", subdir = "rdefra") +``` + +Load the rdefra package: + +```{r} +library(rdefra) +``` + +### Functions +DEFRA monitoring stations can be downloaded and filtered using the function `catalogue()`. A cached version (downloaded in Feb 2016) is in `data(stations)`. + +```{r, cache = TRUE} +# Get full catalogue +stations <- catalogue() +``` + +Some of these have no coordinates but Easting (E) and Northing (N) are available on the DEFRA website. Get E and N, transform them to latitude and longitude and populate the missing coordinates using the code below. 
+ +```{r, cache = TRUE} +# Find stations with no coordinates +myRows <- which(is.na(stations$Latitude) | is.na(stations$Longitude)) +# Get the ID of stations with no coordinates +stationList <- as.character(stations$UK.AIR.ID[myRows]) +# Scrape DEFRA website to get Easting/Northing +EN <- EastingNorthing(stationList) +# Only keep non-NA Easting/Northing coordinates +noNA <- which(!is.na(EN$Easting) & !is.na(EN$Northing)) +yesNA <- which(is.na(EN$Easting) & is.na(EN$Northing)) +``` + +Create spatial points from metadata table (coordinates are in WGS84): +```{r, cache = TRUE} +require(rgdal); require(sp) +# Define spatial points +pt <- EN[noNA,] +coordinates(pt) <- ~Easting+Northing +proj4string(pt) <- CRS("+init=epsg:27700") +# Convert coordinates from British National Grid to WGS84 +pt <- data.frame(spTransform(pt, CRS("+init=epsg:4326"))@coords) +names(pt) <- c("Longitude", "Latitude") + +# Populate the catalogue with newly calculated coordinates +stations[myRows[yesNA],c("UK.AIR.ID", "Longitude", "Latitude")] +stationsNew <- stations +stationsNew$Longitude[myRows][noNA] <- pt$Longitude +stationsNew$Latitude[myRows][noNA] <- pt$Latitude + +# Keep only stations with coordinates +noCoords <- which(is.na(stationsNew$Latitude) | is.na(stationsNew$Longitude)) +stationsNew <- stationsNew[-noCoords,] +``` + +Check whether there are hourly data available +```{r, cache = TRUE} +stationsNew$SiteID <- getSiteID(as.character(stationsNew$UK.AIR.ID)) +validStations <- which(!is.na(stationsNew$SiteID)) +IDstationHdata <- stationsNew$SiteID[validStations] +``` + +There are 6563 stations with valid coordinates within the UK-AIR (Air Information Resource, blue circles) database, for 225 of them hourly data is available and their location is shown in the map below (red circle). 
+ +```{r, eval=FALSE} +library(leaflet) +leaflet(data = stationsNew) %>% addTiles() %>% + addCircleMarkers(lng = ~Longitude, lat = ~Latitude, radius = 0.5) %>% + addCircleMarkers(lng = ~Longitude[validStations], + lat = ~Latitude[validStations], + radius = 0.5, color="red", popup = ~SiteID[validStations]) +``` + +![UK-AIR monitoring stations (August 2016)](paper/MonitoringStations.png) + +How many of the above stations are in England and have hourly records? +```{r, cache = TRUE} +stationsNew <- stationsNew[!is.na(stationsNew$SiteID),] + +library(raster) +adm <- getData('GADM', country='GBR', level=1) +England <- adm[adm$NAME_1=='England',] +stationsSP <- SpatialPoints(stationsNew[, c('Longitude', 'Latitude')], + proj4string=CRS(proj4string(England))) + +library(sp) +x <- over(stationsSP, England)[,1] +x <- which(!is.na(x)) +stationsNew <- stationsNew[x,] +``` + +```{r, eval=FALSE} +library(leaflet) +leaflet(data = stationsNew) %>% addTiles() %>% + addCircleMarkers(lng = ~Longitude, lat = ~Latitude, + radius = 0.5, color="red", popup = ~SiteID) +``` + +Pollution data started to be collected in 1972, building the time series for a given station can be done in one line of code: + +```{r, cache = TRUE} +df <- get1Hdata("BAR2", years=1972:2016) +``` + +Using parallel processing, the acquisition of data from hundreds of sites takes only few minutes: + +```{r, eval=FALSE} +library(parallel) +library(plyr) + +# Calculate the number of cores +no_cores <- detectCores() - 1 + +# Initiate cluster +cl <- makeCluster(no_cores) + +system.time(myList <- parLapply(cl, IDstationHdata, +get1Hdata, years=1999:2016)) + +stopCluster(cl) + +df <- rbind.fill(myList) +``` + +## Meta + +* Please [report any issues or bugs](https://github.com/kehraProject/r_rdefra/issues). 
+* License: [GPL-3](https://opensource.org/licenses/GPL-3.0) +* Get citation information for `rdefra` in R doing `citation(package = 'rdefra')` + +[![ropensci_footer](http://ropensci.org/public_images/github_footer.png)](http://ropensci.org) diff --git a/cran-comments.md b/cran-comments.md new file mode 100644 index 0000000..0d2d225 --- /dev/null +++ b/cran-comments.md @@ -0,0 +1,20 @@ +This is a resubmission after adapting the package to make it suitable for inclusion in the ropensci framework. + +--------------------------------- + +## Release Summary + +This is the second release of rdefra. In this release, we added the following: + +* unit tests (using testthat framework), +* Travis CI integration +* a vignette +* paper for submission to JOSS +* documented the submission to the ropensci project + +## Test environment +* Ubuntu 14.04, R 3.3.1 + +## R CMD check results + +There were no ERRORs, WARNINGs or NOTEs. diff --git a/rdefra/man/regions.Rd b/extraData/regions.Rd similarity index 100% rename from rdefra/man/regions.Rd rename to extraData/regions.Rd diff --git a/extraData/regions.rda b/extraData/regions.rda new file mode 100644 index 0000000..41602b4 Binary files /dev/null and b/extraData/regions.rda differ diff --git a/paper/paper.md b/paper/paper.md index 0614d2c..aa8b057 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -1,32 +1,32 @@ --- title: 'rdefra: Interact with the UK AIR Pollution Database from DEFRA' +bibliography: paper.bib +date: "3 August 2016" tags: - - open data - - air pollution - - R +- open data +- air pollution +- R authors: - - name: Claudia Vitolo - orcid: 0000-0002-4252-1176 - affiliation: Brunel University London - - name: Andrew Russell - orcid: 0000-0001-7120-8499 - affiliation: Brunel University London - - name: Allan Tucker - orcid: 0000-0001-5105-3506 - affiliation: Brunel University London -date: 3 August 2016 -bibliography: paper.bib +- affiliation: Brunel University London + name: Claudia Vitolo + orcid: 
0000-0002-4252-1176 +- affiliation: Brunel University London + name: Andrew Russell + orcid: 0000-0001-7120-8499 +- affiliation: Brunel University London + name: Allan Tucker + orcid: 0000-0001-5105-3506 --- # Summary -The rdefra package [@rdefra-archive] is an R package [@R-base] to retrieve air pollution data from the Air Information Resource (UK-AIR) of the Department for Environment, Food and Rural Affairs in the United Kingdom. UK-AIR does not provide a public API for programmatic access to data, therefore this package scrapes the HTML pages to get relevant information. +Rdefra [@rdefra-archive] is an R package [@R-base] to retrieve air pollution data from the Air Information Resource (UK-AIR) of the Department for Environment, Food and Rural Affairs in the United Kingdom. UK-AIR does not provide a public API for programmatic access to data, therefore this package scrapes the HTML pages to get relevant information. This package follows a logic similar to other packages such as waterData[@waterdata] and rnrfa[@rnrfa]: sites are first identified through a catalogue, data are imported via the station identification number, then data are visualised and/or used in analyses. The metadata related to the monitoring stations are accessible through the function `catalogue()`, missing stations' coordinates can be obtained using the function `EastingNorthing()`, and time series data related to different pollutants can be obtained using the function `get1Hdata()`. The package is designed to collect data efficiently. It allows to download multiple years of data for a single station with one line of code and, if used with the parallel package [@R-base], allows the acquisition of data from hundreds of sites in only few minutes. -The figure below showa the 6563 stations with valid coordinates within the UK-AIR (blue circles) database, for 225 of them hourly data is available and their location is shown as red circles. 
+The figure below shows the 6563 stations with valid coordinates within the UK-AIR (blue circles) database, for 225 of them hourly data is available and their location is shown as red circles. ![UK-AIR monitoring stations (August 2016)](MonitoringStations.png) diff --git a/preparePackage.R b/preparePackage.R new file mode 100644 index 0000000..ce2bde7 --- /dev/null +++ b/preparePackage.R @@ -0,0 +1,25 @@ +# Create a compressed version for the dataset 'regions' +load("~/regions.rda") +tools::checkRdaFiles("~/regions.rda") + +save(regions, + file='~/Dropbox/Repos/r_rdefra/extraData/regions.rda', + compress='xz') +# or, to compress in place: tools::resaveRdaFiles(paths = '~/Dropbox/Repos/r_rdefra/extraData/regions.rda', compress = 'xz') + +# Create a compressed version for the dataset 'stations' +load("~/stations.rda") +tools::checkRdaFiles("~/stations.rda") + +save(stations, + file='~/Dropbox/Repos/r_rdefra/rdefra/data/stations.rda', + compress='gzip') + +# Run unit tests using testthat +devtools::test('rdefra') + +# Run R CMD check or devtools::check() +devtools::check('rdefra') + +# Generate a template for a README.Rmd +devtools::use_readme_rmd() diff --git a/rdefra/.Rbuildignore b/rdefra/.Rbuildignore new file mode 100644 index 0000000..6e6b0d3 --- /dev/null +++ b/rdefra/.Rbuildignore @@ -0,0 +1 @@ +^README\.Rmd$ diff --git a/rdefra/DESCRIPTION b/rdefra/DESCRIPTION index 2cf5d11..eb1cd09 100644 --- a/rdefra/DESCRIPTION +++ b/rdefra/DESCRIPTION @@ -1,8 +1,8 @@ Package: rdefra Type: Package Title: Interact with the UK AIR Pollution Database from DEFRA -Version: 0.1 -Date: 2016-06-09 +Version: 0.2.0 +Date: 2016-08-03 Author: Claudia Vitolo [aut, cre], Andrew Russell [aut], Allan Tucker [aut] Maintainer: Claudia Vitolo URL: https://github.com/kehraProject/r_rdefra @@ -10,6 +10,8 @@ BugReports: https://github.com/kehraProject/r_rdefra/issues Description: Get data from DEFRA's UK-AIR website. It basically scraps the HTML content. 
Depends: R (>= 2.10) Imports: RCurl, XML, plyr +Suggests: testthat +LazyData: true Encoding: UTF-8 License: GPL-3 Repository: CRAN diff --git a/rdefra/data/regions.rda b/rdefra/data/regions.rda deleted file mode 100644 index aad0ea3..0000000 Binary files a/rdefra/data/regions.rda and /dev/null differ diff --git a/rdefra/data/stations.rda b/rdefra/data/stations.rda index f146809..4d3195e 100644 Binary files a/rdefra/data/stations.rda and b/rdefra/data/stations.rda differ diff --git a/rdefra/inst/CITATION b/rdefra/inst/CITATION new file mode 100644 index 0000000..414cf12 --- /dev/null +++ b/rdefra/inst/CITATION @@ -0,0 +1,12 @@ +citHeader("To cite 'rdefra' in publications, please use:") + +citEntry(entry = "manual", + author = "Claudia Vitolo and Andrew Russell and Allan Tucker", + title = "rdefra: Interact with the UK AIR Pollution Database from DEFRA", + year = "2016", + note = "R package version 0.2.0", + url = "https://CRAN.R-project.org/package=rdefra", + doi = "http://dx.doi.org/10.5281/zenodo.55270", + textVersion = "Claudia Vitolo, Andrew Russell and Allan Tucker (2016). rdefra: Interact with the UK AIR Pollution Database from DEFRA. 
R package version 0.2.0 + https://CRAN.R-project.org/package=rdefra" +) diff --git a/rdefra/inst/tests/testthat.R b/rdefra/inst/tests/testthat.R new file mode 100644 index 0000000..88130d6 --- /dev/null +++ b/rdefra/inst/tests/testthat.R @@ -0,0 +1,4 @@ +library('testthat') +library('rdefra') + +test_check('rdefra') diff --git a/rdefra/inst/tests/testthat/test-data.R b/rdefra/inst/tests/testthat/test-data.R new file mode 100644 index 0000000..bf5bd8c --- /dev/null +++ b/rdefra/inst/tests/testthat/test-data.R @@ -0,0 +1,18 @@ +context("Data") + +test_that("Are hourly data for station BTR3 available?", { + + site_id = "BTR3" + years <- 2012:2016 + + rootURL <- "https://uk-air.defra.gov.uk/data_files/site_data/" + myURL <- paste(rootURL, site_id, "_", years, ".csv", sep = "") + + con.url <- try(url(myURL[[1]])) + + expect_that(inherits(con.url, "try-error"), equals(FALSE)) + expect_that(length(myURL), equals(5)) + + closeAllConnections() + +}) diff --git a/rdefra/inst/tests/testthat/test-metadata.R b/rdefra/inst/tests/testthat/test-metadata.R new file mode 100644 index 0000000..ccea711 --- /dev/null +++ b/rdefra/inst/tests/testthat/test-metadata.R @@ -0,0 +1,24 @@ +context("Metadata") + +test_that("Is the DEFRA server running?", { + + site_name = ""; pollutant = 9999; group_id = 9999 + closed = "true"; country_id = 9999; region_id = 9999 + location_type = 9999; search = "Search+Network" + view = "advanced"; action = "results" + + rootURL <- "http://uk-air.defra.gov.uk/networks/find-sites?" 
+ + myURL <- paste(rootURL, "&site_name=", site_name, "&pollutant=", pollutant, + "&group_id=", group_id, "&closed=", closed, "&country_id=", + country_id, "&region_id=", region_id, "&location_type=", + location_type, "&search=", search, "&view=", + view, "&action=", action, sep = "") + + con.url <- try(url(myURL)) + + expect_that(inherits(con.url, "try-error"), equals(FALSE)) + + closeAllConnections() + +}) diff --git a/vignettes/rdefra_vignette.Rmd b/vignettes/rdefra_vignette.Rmd new file mode 100644 index 0000000..7cf6474 --- /dev/null +++ b/vignettes/rdefra_vignette.Rmd @@ -0,0 +1,129 @@ +--- +title: "rdefra_vignette" +author: "Claudia Vitolo" +date: "3 August 2016" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +# Introduction +Rdefra is an R package to retrieve air pollution data from the Air Information Resource (UK-AIR) of the Department for Environment, Food and Rural Affairs in the United Kingdom. UK-AIR does not provide a public API for programmatic access to data, therefore this package scrapes the HTML pages to get relevant information. + +This package follows a logic similar to other packages such as waterData and rnrfa: sites are first identified through a catalogue, data are imported via the station identification number, then data are visualised and/or used in analyses. The metadata related to the monitoring stations are accessible through the function `catalogue()`, missing stations' coordinates can be obtained using the function `EastingNorthing()`, and time series data related to different pollutants can be obtained using the function `get1Hdata()`. + +The package is designed to collect data efficiently. It allows to download multiple years of data for a single station with one line of code and, if used with the parallel package, allows the acquisition of data from hundreds of sites in only few minutes. 
+ +# Dependencies +The rdefra package and the examples in this vignette are dependent on a number of CRAN packages. Check for missing dependencies and install them: + +```{r, warning=FALSE, message=FALSE, cache=TRUE} +packs <- c('RCurl', 'XML', 'plyr', 'rgdal', 'sp', 'devtools') +new.packages <- packs[!(packs %in% installed.packages()[,"Package"])] +if(length(new.packages)) install.packages(new.packages) + +library(devtools) +``` + +# Installation +This package is currently under development and available via devtools: + +```{r, warning=FALSE, message=FALSE, cache=TRUE} +install_github("cvitolo/r_rdefra", subdir = "rdefra") +``` + +Now, load the rdefra package: + +```{r, warning=FALSE, message=FALSE, cache=TRUE} +library(rdefra) +``` + +# Functions +DEFRA monitoring stations can be downloaded and filtered using the function `catalogue()`. A cached version (downloaded in Feb 2016) is in `data(stations)`. + +```{r, warning=FALSE, message=FALSE, cache=TRUE} +# Get full catalogue +stations <- catalogue() +``` + +Some of these have no coordinates but Easting (E) and Northing (N) are available on the DEFRA website. Get E and N, transform them to latitude and longitude and populate the missing coordinates using the code below. 
+ +```{r, warning=FALSE, message=FALSE, cache=TRUE} +# Find stations with no coordinates +myRows <- which(is.na(stations$Latitude) | is.na(stations$Longitude)) +# Get the ID of stations with no coordinates +stationList <- as.character(stations$UK.AIR.ID[myRows]) +# Scrape DEFRA website to get Easting/Northing +EN <- EastingNorthing(stationList) +# Only keep non-NA Easting/Northing coordinates +noNA <- which(!is.na(EN$Easting) & !is.na(EN$Northing)) +yesNA <- which(is.na(EN$Easting) & is.na(EN$Northing)) + +require(rgdal); require(sp) +# Define spatial points +pt <- EN[noNA,] +coordinates(pt) <- ~Easting+Northing +proj4string(pt) <- CRS("+init=epsg:27700") +# Convert coordinates from British National Grid to WGS84 +pt <- data.frame(spTransform(pt, CRS("+init=epsg:4326"))@coords) +names(pt) <- c("Longitude", "Latitude") + +# Populate the catalogue with newly calculated coordinates +stations[myRows[yesNA],c("UK.AIR.ID", "Longitude", "Latitude")] +stationsNew <- stations +stationsNew$Longitude[myRows][noNA] <- pt$Longitude +stationsNew$Latitude[myRows][noNA] <- pt$Latitude + +# Keep only stations with coordinates +noCoords <- which(is.na(stationsNew$Latitude) | is.na(stationsNew$Longitude)) +stationsNew <- stationsNew[-noCoords,] +``` + +Check whether there are hourly data available +```{r, warning=FALSE, message=FALSE, cache=TRUE} +stationsNew$SiteID <- getSiteID(as.character(stationsNew$UK.AIR.ID)) +validStations <- which(!is.na(stationsNew$SiteID)) +IDstationHdata <- stationsNew$SiteID[validStations] +``` + +There are 6563 stations with valid coordinates within the UK-AIR (Air Information Resource, blue circles) database, for 225 of them hourly data is available and their location is shown in the map below (red circle). 
+ +```{r, warning=FALSE, message=FALSE, cache=TRUE} +library(leaflet) +leaflet(data = stationsNew) %>% addTiles() %>% + addCircleMarkers(lng = ~Longitude, lat = ~Latitude, radius = 0.5) %>% + addCircleMarkers(lng = ~Longitude[validStations], + lat = ~Latitude[validStations], + radius = 0.5, color="red", popup = ~SiteID[validStations]) +``` + +How many of the above stations are in England and have hourly records? +```{r, warning=FALSE, message=FALSE, cache=TRUE} +stationsNew <- stationsNew[!is.na(stationsNew$SiteID),] + +library(raster) +adm <- getData('GADM', country='GBR', level=1) +England <- adm[adm$NAME_1=='England',] +stationsSP <- SpatialPoints(stationsNew[, c('Longitude', 'Latitude')], + proj4string=CRS(proj4string(England))) + +library(sp) +x <- over(stationsSP, England)[,1] +x <- which(!is.na(x)) +stationsNew <- stationsNew[x,] + +library(leaflet) +leaflet(data = stationsNew) %>% addTiles() %>% + addCircleMarkers(lng = ~Longitude, lat = ~Latitude, + radius = 0.5, color="red", popup = ~SiteID) +``` + +Pollution data started to be collected in 1972, building the time series for a given station can be done in one line of code: + +```{r, warning=FALSE, message=FALSE, cache=TRUE} +df <- get1Hdata("BAR2", years=1972:2016) +``` + +Data retrieval can also be performed in parallel, using the parallel package (see example in the README file).