diff --git a/.gitignore b/.gitignore
index 4452110..354f6cd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,7 @@
tests/*
*.Rproj
*.tar.gz
+
+*.html
+README_cache/
+vignettes/rdefra_vignette_cache
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..beea6db
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,6 @@
+language: r
+cache: packages
+before_install:
+ - sudo apt-get -qq update
+ - sudo apt-get install -y r-cran-rgdal # -y: non-interactive CI cannot answer the apt confirmation prompt
+ - cd rdefra # the package sources (DESCRIPTION etc.) live in the rdefra/ subdirectory
diff --git a/README.Rmd b/README.Rmd
new file mode 100644
index 0000000..4e0490a
--- /dev/null
+++ b/README.Rmd
@@ -0,0 +1,179 @@
+
+
+rdefra: Interact with the UK AIR Pollution Database from DEFRA
+---------------
+
+[![CRAN Status Badge](http://www.r-pkg.org/badges/version/rdefra)](http://cran.r-project.org/web/packages/rdefra)
+[![CRAN Total Downloads](http://cranlogs.r-pkg.org/badges/grand-total/rdefra)](http://cran.rstudio.com/web/packages/rdefra/index.html)
+[![CRAN Monthly Downloads](http://cranlogs.r-pkg.org/badges/rdefra)](http://cran.rstudio.com/web/packages/rdefra/index.html)
+
+```{r, echo = FALSE}
+knitr::opts_chunk$set(
+ collapse = TRUE,
+ comment = "#>",
+ fig.path = "README-",
+ message = FALSE
+)
+```
+
+
+
+[Rdefra](https://cran.r-project.org/package=rdefra) is an R package to retrieve air pollution data from the Air Information Resource (UK-AIR) of the Department for Environment, Food and Rural Affairs in the United Kingdom. UK-AIR does not provide a public API for programmatic access to data, therefore this package scrapes the HTML pages to get relevant information.
+
+This package follows a logic similar to other packages such as [waterData](https://cran.r-project.org/package=waterData) and [rnrfa](https://cran.r-project.org/package=rnrfa): sites are first identified through a catalogue, data are imported via the station identification number, then data are visualised and/or used in analyses. The metadata related to the monitoring stations are accessible through the function `catalogue()`, missing stations' coordinates can be obtained using the function `EastingNorthing()`, and time series data related to different pollutants can be obtained using the function `get1Hdata()`.
+
+The package is designed to collect data efficiently. It allows downloading multiple years of data for a single station with one line of code and, if used with the parallel package, allows the acquisition of data from hundreds of sites in only a few minutes.
+
+For similar functionalities see also the [openair](https://cran.r-project.org/package=openair) package, which relies on a local copy of the data on servers at King's College (UK).
+
+### Dependencies
+The rdefra package is dependent on a number of CRAN packages. Check for missing dependencies and install them:
+
+```R
+packs <- c('RCurl', 'XML', 'plyr', 'rgdal', 'sp', 'devtools')
+new.packages <- packs[!(packs %in% installed.packages()[,"Package"])]
+if(length(new.packages)) install.packages(new.packages)
+```
+
+### Installation
+
+You can install this package from CRAN:
+
+```{r, eval=FALSE}
+install.packages("rdefra")
+```
+
+
+Or you can install the development version from Github with [devtools](https://github.com/hadley/devtools):
+
+```{r, eval=FALSE}
+library(devtools)
+install_github("cvitolo/r_rdefra", subdir = "rdefra")
+```
+
+Load the rdefra package:
+
+```{r}
+library(rdefra)
+```
+
+### Functions
+DEFRA monitoring stations can be downloaded and filtered using the function `catalogue()`. A cached version (downloaded in Feb 2016) is in `data(stations)`.
+
+```{r, cache = TRUE}
+# Get full catalogue
+stations <- catalogue()
+```
+
+Some of these have no coordinates but Easting (E) and Northing (N) are available on the DEFRA website. Get E and N, transform them to latitude and longitude and populate the missing coordinates using the code below.
+
+```{r, cache = TRUE}
+# Find stations with no coordinates
+myRows <- which(is.na(stations$Latitude) | is.na(stations$Longitude))
+# Get the ID of stations with no coordinates
+stationList <- as.character(stations$UK.AIR.ID[myRows])
+# Scrape DEFRA website to get Easting/Northing
+EN <- EastingNorthing(stationList)
+# Only keep non-NA Easting/Northing coordinates
+noNA <- which(!is.na(EN$Easting) & !is.na(EN$Northing))
+yesNA <- which(is.na(EN$Easting) & is.na(EN$Northing))
+```
+
+Create spatial points from metadata table (coordinates are in WGS84):
+```{r, cache = TRUE}
+library(rgdal)
+library(sp)
+# Define spatial points from the stations with usable Easting/Northing
+pt <- EN[noNA,]
+coordinates(pt) <- ~Easting+Northing
+proj4string(pt) <- CRS("+init=epsg:27700")
+# Convert coordinates from British National Grid to WGS84
+pt <- data.frame(spTransform(pt, CRS("+init=epsg:4326"))@coords)
+names(pt) <- c("Longitude", "Latitude")
+# Show the stations whose coordinates are still missing after scraping
+stations[myRows[yesNA],c("UK.AIR.ID", "Longitude", "Latitude")]
+# Populate a copy of the catalogue with the newly calculated coordinates
+stationsNew <- stations
+stationsNew$Longitude[myRows][noNA] <- pt$Longitude
+stationsNew$Latitude[myRows][noNA] <- pt$Latitude
+# Keep only stations with coordinates; a logical mask stays correct when nothing is missing (x[-integer(0),] would drop every row)
+hasCoords <- !is.na(stationsNew$Latitude) & !is.na(stationsNew$Longitude)
+stationsNew <- stationsNew[hasCoords,]
+```
+
+Check whether there are hourly data available
+```{r, cache = TRUE}
+stationsNew$SiteID <- getSiteID(as.character(stationsNew$UK.AIR.ID))
+validStations <- which(!is.na(stationsNew$SiteID))
+IDstationHdata <- stationsNew$SiteID[validStations]
+```
+
+There are 6563 stations with valid coordinates within the UK-AIR (Air Information Resource) database (blue circles); for 225 of them hourly data are available, and their locations are shown in the map below (red circles).
+
+```{r, eval=FALSE}
+library(leaflet)
+leaflet(data = stationsNew) %>% addTiles() %>%
+ addCircleMarkers(lng = ~Longitude, lat = ~Latitude, radius = 0.5) %>%
+ addCircleMarkers(lng = ~Longitude[validStations],
+ lat = ~Latitude[validStations],
+ radius = 0.5, color="red", popup = ~SiteID[validStations])
+```
+
+![UK-AIR monitoring stations (August 2016)](paper/MonitoringStations.png)
+
+How many of the above stations are in England and have hourly records?
+```{r, cache = TRUE}
+stationsNew <- stationsNew[!is.na(stationsNew$SiteID),]
+
+library(raster)
+adm <- getData('GADM', country='GBR', level=1)
+England <- adm[adm$NAME_1=='England',]
+stationsSP <- SpatialPoints(stationsNew[, c('Longitude', 'Latitude')],
+ proj4string=CRS(proj4string(England)))
+
+library(sp)
+x <- over(stationsSP, England)[,1]
+x <- which(!is.na(x))
+stationsNew <- stationsNew[x,]
+```
+
+```{r, eval=FALSE}
+library(leaflet)
+leaflet(data = stationsNew) %>% addTiles() %>%
+ addCircleMarkers(lng = ~Longitude, lat = ~Latitude,
+ radius = 0.5, color="red", popup = ~SiteID)
+```
+
+Pollution data started to be collected in 1972. Building the time series for a given station can be done in one line of code:
+
+```{r, cache = TRUE}
+df <- get1Hdata("BAR2", years=1972:2016)
+```
+
+Using parallel processing, the acquisition of data from hundreds of sites takes only a few minutes:
+
+```{r, eval=FALSE}
+library(parallel)
+library(plyr)
+
+# Use all cores but one; fall back to 1 on single-core machines or when the core count is unknown (NA)
+no_cores <- max(1, detectCores() - 1, na.rm = TRUE)
+
+# Initiate cluster
+cl <- makeCluster(no_cores)
+# Download the 1999-2016 hourly data of every station in parallel (timed)
+system.time(myList <- parLapply(cl, IDstationHdata,
+get1Hdata, years=1999:2016))
+
+stopCluster(cl)
+# Stack the per-station data frames into a single one
+df <- rbind.fill(myList)
+```
+
+## Meta
+
+* Please [report any issues or bugs](https://github.com/kehraProject/r_rdefra/issues).
+* License: [GPL-3](https://opensource.org/licenses/GPL-3.0)
+* Get citation information for `rdefra` in R doing `citation(package = 'rdefra')`
+
+[![ropensci_footer](http://ropensci.org/public_images/github_footer.png)](http://ropensci.org)
diff --git a/cran-comments.md b/cran-comments.md
new file mode 100644
index 0000000..0d2d225
--- /dev/null
+++ b/cran-comments.md
@@ -0,0 +1,20 @@
+This is a resubmission after adapting the package to make it suitable for inclusion in the ropensci framework.
+
+---------------------------------
+
+## Release Summary
+
+This is the second release of rdefra. In this release, we added the following:
+
+* unit tests (using testthat framework),
+* Travis CI integration
+* a vignette
+* paper for submission to JOSS
+* documented the submission to the ropensci project
+
+## Test environment
+* Ubuntu 14.04, R 3.3.1
+
+## R CMD check results
+
+There were no ERRORs, WARNINGs or NOTEs.
diff --git a/rdefra/man/regions.Rd b/extraData/regions.Rd
similarity index 100%
rename from rdefra/man/regions.Rd
rename to extraData/regions.Rd
diff --git a/extraData/regions.rda b/extraData/regions.rda
new file mode 100644
index 0000000..41602b4
Binary files /dev/null and b/extraData/regions.rda differ
diff --git a/paper/paper.md b/paper/paper.md
index 0614d2c..aa8b057 100644
--- a/paper/paper.md
+++ b/paper/paper.md
@@ -1,32 +1,32 @@
---
title: 'rdefra: Interact with the UK AIR Pollution Database from DEFRA'
+bibliography: paper.bib
+date: "3 August 2016"
tags:
- - open data
- - air pollution
- - R
+- open data
+- air pollution
+- R
authors:
- - name: Claudia Vitolo
- orcid: 0000-0002-4252-1176
- affiliation: Brunel University London
- - name: Andrew Russell
- orcid: 0000-0001-7120-8499
- affiliation: Brunel University London
- - name: Allan Tucker
- orcid: 0000-0001-5105-3506
- affiliation: Brunel University London
-date: 3 August 2016
-bibliography: paper.bib
+- affiliation: Brunel University London
+ name: Claudia Vitolo
+ orcid: 0000-0002-4252-1176
+- affiliation: Brunel University London
+ name: Andrew Russell
+ orcid: 0000-0001-7120-8499
+- affiliation: Brunel University London
+ name: Allan Tucker
+ orcid: 0000-0001-5105-3506
---
# Summary
-The rdefra package [@rdefra-archive] is an R package [@R-base] to retrieve air pollution data from the Air Information Resource (UK-AIR) of the Department for Environment, Food and Rural Affairs in the United Kingdom. UK-AIR does not provide a public API for programmatic access to data, therefore this package scrapes the HTML pages to get relevant information.
+Rdefra [@rdefra-archive] is an R package [@R-base] to retrieve air pollution data from the Air Information Resource (UK-AIR) of the Department for Environment, Food and Rural Affairs in the United Kingdom. UK-AIR does not provide a public API for programmatic access to data, therefore this package scrapes the HTML pages to get relevant information.
This package follows a logic similar to other packages such as waterData[@waterdata] and rnrfa[@rnrfa]: sites are first identified through a catalogue, data are imported via the station identification number, then data are visualised and/or used in analyses. The metadata related to the monitoring stations are accessible through the function `catalogue()`, missing stations' coordinates can be obtained using the function `EastingNorthing()`, and time series data related to different pollutants can be obtained using the function `get1Hdata()`.
The package is designed to collect data efficiently. It allows to download multiple years of data for a single station with one line of code and, if used with the parallel package [@R-base], allows the acquisition of data from hundreds of sites in only few minutes.
-The figure below showa the 6563 stations with valid coordinates within the UK-AIR (blue circles) database, for 225 of them hourly data is available and their location is shown as red circles.
+The figure below shows the 6563 stations with valid coordinates within the UK-AIR database (blue circles); for 225 of them hourly data are available, and their locations are shown as red circles.
![UK-AIR monitoring stations (August 2016)](MonitoringStations.png)
diff --git a/preparePackage.R b/preparePackage.R
new file mode 100644
index 0000000..ce2bde7
--- /dev/null
+++ b/preparePackage.R
@@ -0,0 +1,25 @@
+# Re-save the dataset 'regions' with xz compression (NOTE: paths below are specific to the maintainer's machine)
+load("~/regions.rda")
+tools::checkRdaFiles("~/regions.rda") # report how the source file is currently stored
+
+save(regions,
+ file='~/Dropbox/Repos/r_rdefra/extraData/regions.rda',
+ compress='xz')
+# or, to compress in place: tools::resaveRdaFiles(paths = '~/Dropbox/Repos/r_rdefra/extraData/regions.rda', compress = 'xz')
+
+# Re-save the dataset 'stations' with gzip compression into the package's data/ directory
+load("~/stations.rda")
+tools::checkRdaFiles("~/stations.rda") # report how the source file is currently stored
+
+save(stations,
+ file='~/Dropbox/Repos/r_rdefra/rdefra/data/stations.rda',
+ compress='gzip')
+
+# Run unit tests using testthat ('rdefra' is a relative path: run from the repository root)
+devtools::test('rdefra')
+
+# Run R CMD check or devtools::check()
+devtools::check('rdefra')
+
+# Generate a template for a README.Rmd
+devtools::use_readme_rmd()
diff --git a/rdefra/.Rbuildignore b/rdefra/.Rbuildignore
new file mode 100644
index 0000000..6e6b0d3
--- /dev/null
+++ b/rdefra/.Rbuildignore
@@ -0,0 +1 @@
+^README\.Rmd$
diff --git a/rdefra/DESCRIPTION b/rdefra/DESCRIPTION
index 2cf5d11..eb1cd09 100644
--- a/rdefra/DESCRIPTION
+++ b/rdefra/DESCRIPTION
@@ -1,8 +1,8 @@
Package: rdefra
Type: Package
Title: Interact with the UK AIR Pollution Database from DEFRA
-Version: 0.1
-Date: 2016-06-09
+Version: 0.2.0
+Date: 2016-08-03
Author: Claudia Vitolo [aut, cre], Andrew Russell [aut], Allan Tucker [aut]
Maintainer: Claudia Vitolo
URL: https://github.com/kehraProject/r_rdefra
@@ -10,6 +10,8 @@ BugReports: https://github.com/kehraProject/r_rdefra/issues
Description: Get data from DEFRA's UK-AIR website. It basically scraps the HTML content.
Depends: R (>= 2.10)
Imports: RCurl, XML, plyr
+Suggests: testthat
+LazyData: true
Encoding: UTF-8
License: GPL-3
Repository: CRAN
diff --git a/rdefra/data/regions.rda b/rdefra/data/regions.rda
deleted file mode 100644
index aad0ea3..0000000
Binary files a/rdefra/data/regions.rda and /dev/null differ
diff --git a/rdefra/data/stations.rda b/rdefra/data/stations.rda
index f146809..4d3195e 100644
Binary files a/rdefra/data/stations.rda and b/rdefra/data/stations.rda differ
diff --git a/rdefra/inst/CITATION b/rdefra/inst/CITATION
new file mode 100644
index 0000000..414cf12
--- /dev/null
+++ b/rdefra/inst/CITATION
@@ -0,0 +1,12 @@
+citHeader("To cite 'rdefra' in publications, please use:")
+
+citEntry(entry = "manual",
+ author = "Claudia Vitolo and Andrew Russell and Allan Tucker",
+ title = "rdefra: Interact with the UK AIR Pollution Database from DEFRA",
+ year = "2016",
+ note = "R package version 0.2.0",
+ url = "https://CRAN.R-project.org/package=rdefra",
+ doi = "10.5281/zenodo.55270", # bare DOI only: the citation formatter adds the resolver prefix itself
+ textVersion = "Claudia Vitolo, Andrew Russell and Allan Tucker (2016). rdefra: Interact with the UK AIR Pollution Database from DEFRA. R package version 0.2.0
+ https://CRAN.R-project.org/package=rdefra"
+)
diff --git a/rdefra/inst/tests/testthat.R b/rdefra/inst/tests/testthat.R
new file mode 100644
index 0000000..88130d6
--- /dev/null
+++ b/rdefra/inst/tests/testthat.R
@@ -0,0 +1,4 @@
+library(testthat) # unit-testing framework
+library(rdefra) # package under test
+
+test_check("rdefra") # run the rdefra test suite
diff --git a/rdefra/inst/tests/testthat/test-data.R b/rdefra/inst/tests/testthat/test-data.R
new file mode 100644
index 0000000..bf5bd8c
--- /dev/null
+++ b/rdefra/inst/tests/testthat/test-data.R
@@ -0,0 +1,18 @@
+context("Data")
+
+test_that("Are hourly data for station BTR3 available?", {
+ skip_on_cran() # this test needs network access to the DEFRA server
+ site_id <- "BTR3"
+ years <- 2012:2016
+
+ rootURL <- "https://uk-air.defra.gov.uk/data_files/site_data/"
+ myURL <- paste0(rootURL, site_id, "_", years, ".csv")
+
+ # url() alone only builds a connection object; open = "r" performs the request
+ con.url <- try(url(myURL[[1]], open = "r"), silent = TRUE)
+
+ expect_false(inherits(con.url, "try-error"))
+ expect_equal(length(myURL), 5)
+ # Close only the connection opened here, not every connection in the session
+ if (inherits(con.url, "connection")) close(con.url)
+})
diff --git a/rdefra/inst/tests/testthat/test-metadata.R b/rdefra/inst/tests/testthat/test-metadata.R
new file mode 100644
index 0000000..ccea711
--- /dev/null
+++ b/rdefra/inst/tests/testthat/test-metadata.R
@@ -0,0 +1,24 @@
+context("Metadata")
+
+test_that("Is the DEFRA server running?", {
+ skip_on_cran() # this test needs network access to the DEFRA server
+
+ site_name <- ""; pollutant <- 9999; group_id <- 9999
+ closed <- "true"; country_id <- 9999; region_id <- 9999
+ location_type <- 9999; search <- "Search+Network"
+ view <- "advanced"; action <- "results"
+
+ rootURL <- "http://uk-air.defra.gov.uk/networks/find-sites?"
+
+ myURL <- paste0(rootURL, "&site_name=", site_name, "&pollutant=", pollutant,
+ "&group_id=", group_id, "&closed=", closed, "&country_id=",
+ country_id, "&region_id=", region_id, "&location_type=",
+ location_type, "&search=", search, "&view=",
+ view, "&action=", action)
+
+ # url() alone only builds a connection object; open = "r" performs the request
+ con.url <- try(url(myURL, open = "r"), silent = TRUE)
+
+ expect_false(inherits(con.url, "try-error"))
+ if (inherits(con.url, "connection")) close(con.url) # close only our own connection
+})
diff --git a/vignettes/rdefra_vignette.Rmd b/vignettes/rdefra_vignette.Rmd
new file mode 100644
index 0000000..7cf6474
--- /dev/null
+++ b/vignettes/rdefra_vignette.Rmd
@@ -0,0 +1,129 @@
+---
+title: "rdefra_vignette"
+author: "Claudia Vitolo"
+date: "3 August 2016"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+# Introduction
+Rdefra is an R package to retrieve air pollution data from the Air Information Resource (UK-AIR) of the Department for Environment, Food and Rural Affairs in the United Kingdom. UK-AIR does not provide a public API for programmatic access to data, therefore this package scrapes the HTML pages to get relevant information.
+
+This package follows a logic similar to other packages such as waterData and rnrfa: sites are first identified through a catalogue, data are imported via the station identification number, then data are visualised and/or used in analyses. The metadata related to the monitoring stations are accessible through the function `catalogue()`, missing stations' coordinates can be obtained using the function `EastingNorthing()`, and time series data related to different pollutants can be obtained using the function `get1Hdata()`.
+
+The package is designed to collect data efficiently. It allows downloading multiple years of data for a single station with one line of code and, if used with the parallel package, allows the acquisition of data from hundreds of sites in only a few minutes.
+
+# Dependencies
+The rdefra package and the examples in this vignette are dependent on a number of CRAN packages. Check for missing dependencies and install them:
+
+```{r, eval=FALSE}
+# Not evaluated: building the vignette must not install packages as a side effect
+packs <- c('RCurl', 'XML', 'plyr', 'rgdal', 'sp', 'devtools')
+new.packages <- packs[!(packs %in% installed.packages()[,"Package"])]
+if(length(new.packages)) install.packages(new.packages)
+library(devtools)
+```
+
+# Installation
+This package is currently under development and available via devtools:
+
+```{r, eval=FALSE}
+devtools::install_github("cvitolo/r_rdefra", subdir = "rdefra")
+```
+
+Now, load the rdefra package:
+
+```{r, warning=FALSE, message=FALSE, cache=TRUE}
+library(rdefra)
+```
+
+# Functions
+DEFRA monitoring stations can be downloaded and filtered using the function `catalogue()`. A cached version (downloaded in Feb 2016) is in `data(stations)`.
+
+```{r, warning=FALSE, message=FALSE, cache=TRUE}
+# Get full catalogue
+stations <- catalogue()
+```
+
+Some of these have no coordinates but Easting (E) and Northing (N) are available on the DEFRA website. Get E and N, transform them to latitude and longitude and populate the missing coordinates using the code below.
+
+```{r, warning=FALSE, message=FALSE, cache=TRUE}
+# Find stations with no coordinates
+myRows <- which(is.na(stations$Latitude) | is.na(stations$Longitude))
+# Get the ID of stations with no coordinates
+stationList <- as.character(stations$UK.AIR.ID[myRows])
+# Scrape DEFRA website to get Easting/Northing
+EN <- EastingNorthing(stationList)
+# Only keep non-NA Easting/Northing coordinates
+noNA <- which(!is.na(EN$Easting) & !is.na(EN$Northing))
+yesNA <- which(is.na(EN$Easting) & is.na(EN$Northing))
+
+library(rgdal)
+library(sp)
+# Define spatial points from the stations with usable Easting/Northing
+pt <- EN[noNA,]
+coordinates(pt) <- ~Easting+Northing
+proj4string(pt) <- CRS("+init=epsg:27700")
+# Convert coordinates from British National Grid to WGS84
+pt <- data.frame(spTransform(pt, CRS("+init=epsg:4326"))@coords)
+names(pt) <- c("Longitude", "Latitude")
+# Show the stations whose coordinates are still missing after scraping
+stations[myRows[yesNA],c("UK.AIR.ID", "Longitude", "Latitude")]
+# Populate a copy of the catalogue with the newly calculated coordinates
+stationsNew <- stations
+stationsNew$Longitude[myRows][noNA] <- pt$Longitude
+stationsNew$Latitude[myRows][noNA] <- pt$Latitude
+# Keep only stations with coordinates; a logical mask stays correct when nothing is missing (x[-integer(0),] would drop every row)
+hasCoords <- !is.na(stationsNew$Latitude) & !is.na(stationsNew$Longitude)
+stationsNew <- stationsNew[hasCoords,]
+```
+
+Check whether there are hourly data available
+```{r, warning=FALSE, message=FALSE, cache=TRUE}
+stationsNew$SiteID <- getSiteID(as.character(stationsNew$UK.AIR.ID))
+validStations <- which(!is.na(stationsNew$SiteID))
+IDstationHdata <- stationsNew$SiteID[validStations]
+```
+
+There are 6563 stations with valid coordinates within the UK-AIR (Air Information Resource) database (blue circles); for 225 of them hourly data are available, and their locations are shown in the map below (red circles).
+
+```{r, warning=FALSE, message=FALSE, cache=TRUE}
+library(leaflet)
+leaflet(data = stationsNew) %>% addTiles() %>%
+ addCircleMarkers(lng = ~Longitude, lat = ~Latitude, radius = 0.5) %>%
+ addCircleMarkers(lng = ~Longitude[validStations],
+ lat = ~Latitude[validStations],
+ radius = 0.5, color="red", popup = ~SiteID[validStations])
+```
+
+How many of the above stations are in England and have hourly records?
+```{r, warning=FALSE, message=FALSE, cache=TRUE}
+stationsNew <- stationsNew[!is.na(stationsNew$SiteID),]
+
+library(raster)
+adm <- getData('GADM', country='GBR', level=1)
+England <- adm[adm$NAME_1=='England',]
+stationsSP <- SpatialPoints(stationsNew[, c('Longitude', 'Latitude')],
+ proj4string=CRS(proj4string(England)))
+
+library(sp)
+x <- over(stationsSP, England)[,1]
+x <- which(!is.na(x))
+stationsNew <- stationsNew[x,]
+
+library(leaflet)
+leaflet(data = stationsNew) %>% addTiles() %>%
+ addCircleMarkers(lng = ~Longitude, lat = ~Latitude,
+ radius = 0.5, color="red", popup = ~SiteID)
+```
+
+Pollution data started to be collected in 1972. Building the time series for a given station can be done in one line of code:
+
+```{r, warning=FALSE, message=FALSE, cache=TRUE}
+df <- get1Hdata("BAR2", years=1972:2016)
+```
+
+Data retrieval can also be performed in parallel, using the parallel package (see example in the README file).