From 863b756b149d1a62fb9339d46bbb3b92a73060d6 Mon Sep 17 00:00:00 2001 From: Oisin Date: Wed, 16 Oct 2024 10:38:29 +0100 Subject: [PATCH 1/7] Ignoring cleaned_data directory --- .dockerignore | 1 + .gitignore | 1 + 2 files changed, 2 insertions(+) diff --git a/.dockerignore b/.dockerignore index 5bf512a..4591f18 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,5 +1,6 @@ data/Met_Eireann/arch data/Met_Eireann/scraped_data +data/Met_Eireann/cleaned_data *__pycache__ *.ipynb_checkpoints *.xlsx diff --git a/.gitignore b/.gitignore index e78ee81..57a5a11 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # ignore sub repos data/Met_Eireann/arch data/Met_Eireann/scraped_data +data/Met_Eireann/cleaned_data *__pycache__ *.ipynb_checkpoints *.xlsx From 3ec524c3381477d2f3ae9a1f11c56928a6a26503 Mon Sep 17 00:00:00 2001 From: Oisin Date: Wed, 16 Oct 2024 10:39:20 +0100 Subject: [PATCH 2/7] Addded s3 constants. Credentials, cleaned and scraped data directories. --- webscraper/cons.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/webscraper/cons.py b/webscraper/cons.py index 546bdfb..dc5e3e9 100644 --- a/webscraper/cons.py +++ b/webscraper/cons.py @@ -10,6 +10,7 @@ sys.path.append(root_dir) # set directories data_dir = os.path.join(root_dir, 'data') +creds_data = os.path.join(root_dir, '.creds') gis_dir = os.path.join(data_dir, "gis") met_eireann_dir = os.path.join(data_dir, 'Met_Eireann') bokeh_ref_data_dir = os.path.join(data_dir, "bokeh", "ref") @@ -23,11 +24,13 @@ map_data_fpath = os.path.join(gis_dir, "map_data.pickle") points_data_fpath = os.path.join(gis_dir, "points_data.pickle") scraped_data_dir = os.path.join(met_eireann_dir, 'scraped_data') +cleaned_data_dir = os.path.join(met_eireann_dir, 'cleaned_data') stations_fpath = os.path.join(met_eireann_dir, 'ref', 'StationDetails.csv') unittest_normal_dists_fpath = os.path.join(bokeh_ref_data_dir, "unittest_normal_dists.json") col_options_fpath = os.path.join(bokeh_ref_data_dir, "col_options.json") stat_options_fpath = os.path.join(bokeh_ref_data_dir, "stat_options.json") agg_level_strftime_fpath = os.path.join(bokeh_ref_data_dir, "agg_level_strftime.json") +session_token_fpath = os.path.join(creds_data, "sessionToken.json") # load bokeh reference data with open(col_options_fpath) as json_file: @@ -35,4 +38,10 @@ with open(stat_options_fpath) as json_file: stat_options = json.load(json_file) with open(agg_level_strftime_fpath) as json_file: - date_strftime_dict = json.load(json_file) \ No newline at end of file + date_strftime_dict = json.load(json_file) + +# aws s3 constants +s3_bucket = "irishclimatedashboard" +s3_scraped_directory = "data/Met_Eireann/scraped_data" +s3_clean_directory = "data/Met_Eireann/cleaned_data" +s3_fname = "dly{station_id}.csv" \ No newline at end of file From 073f5aa35c2e3ed0ba23a43b4eb70e8f89875d2b Mon Sep 17 00:00:00 2001 From: Oisin Date: Wed, 16 Oct 2024 10:40:15 +0100 Subject: [PATCH 3/7] Converted / created a Boto3 s3 client object class --- webscraper/arch/gen_boto3_excel.py | 55 ------------------- webscraper/utilities/S3Client.py | 84 ++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 55 deletions(-) delete mode 100644 webscraper/arch/gen_boto3_excel.py create mode 100644 webscraper/utilities/S3Client.py diff --git a/webscraper/arch/gen_boto3_excel.py b/webscraper/arch/gen_boto3_excel.py deleted file mode 100644 index 3c5b74d..0000000 --- a/webscraper/arch/gen_boto3_excel.py +++ /dev/null @@ -1,55 +0,0 @@ -import io -import boto3 -import json -from beartype 
import beartype
-
-@beartype
-def gen_boto3_excel(
-    sessionToken:str,
-    bucket:str="irishclimateapp",
-    prefix:str="data/Met_Eireann"
-    ) -> list:
-    """Retrieves the raw Met Eireann data from AWS s3
-
-    Parameters
-    ----------
-    sessionToken : str
-        The file path to an active aws session token
-    bucket : str
-        The s3 bucket containing the Met Eireann data files
-    prefix : str
-        The s3 directory containing the Met Eireann data files
-
-    Returns
-    -------
-    list
-        The raw Met Eireann data
-    """
-    # load aws config
-    with open(sessionToken, "r") as j:
-        aws_config = json.loads(j.read())
-    # connect to aws boto3
-    session = boto3.Session(
-        aws_access_key_id=aws_config['Credentials']["AccessKeyId"],
-        aws_secret_access_key=aws_config['Credentials']["SecretAccessKey"],
-        aws_session_token=aws_config['Credentials']["SessionToken"],
-        region_name="eu-west-1"
-    )
-    # generate boto3 s3 connection
-    client = session.client("s3")
-    # create a paginator to list all objects
-    paginator = client.get_paginator("list_objects_v2")
-    # apply the paginator to list all files in the irishclimateapp bucket with key data/Met_Eireann
-    operation_parameters = {"Bucket": bucket, "Prefix": prefix}
-    page_iterator = paginator.paginate(**operation_parameters)
-    # filter down contents keys with .xlsx
-    filtered_iterator = page_iterator.search("Contents[?contains(Key,'.xlsx')].Key")
-    # extract out the file keys
-    file_keys = [content_key for content_key in filtered_iterator]
-    # load s3 objects into list
-    objs_list = [
-        client.get_object(Bucket=bucket, Key=file_key) for file_key in file_keys
-    ]
-    # decode xlsx files in body
-    data_list = [io.BytesIO(obj["Body"].read()) for obj in objs_list]
-    return data_list
diff --git a/webscraper/utilities/S3Client.py b/webscraper/utilities/S3Client.py
new file mode 100644
index 0000000..5202865
--- /dev/null
+++ b/webscraper/utilities/S3Client.py
@@ -0,0 +1,84 @@
+import io
+import boto3
+import json
+import logging
+import pandas as pd
+from typing import Union
+from beartype import beartype
+
+class S3Client():
+
+    @beartype
+    def __init__(self, sessionToken:str):
+        # load aws config
+        with open(sessionToken, "r") as j:
+            aws_config = json.loads(j.read())
+        # connect to aws boto3
+        self.session = boto3.Session(
+            aws_access_key_id=aws_config['Credentials']["AccessKeyId"],
+            aws_secret_access_key=aws_config['Credentials']["SecretAccessKey"],
+            aws_session_token=aws_config['Credentials']["SessionToken"],
+            region_name="eu-west-1"
+        )
+        # generate boto3 s3 connection
+        self.client = self.session.client("s3")
+
+    @beartype
+    def store(
+        self,
+        data:pd.DataFrame,
+        key:str,
+        bucket:str="irishclimateapp"
+        ):
+        """Stores a Met Eireann data file on s3 as a .csv object.
+
+        Parameters
+        ----------
+        data : pd.DataFrame
+            The Met Eireann data to store on s3
+        key : str
+            The s3 key to store the Met Eireann data file under
+        bucket : str
+            The s3 bucket storing the Met Eireann data files
+
+        Returns
+        -------
+        """
+        try:
+            logging.info(f"Storing data to S3://{bucket}/{key}")
+            csv_buf = io.StringIO()
+            data.to_csv(csv_buf, header=True, index=False)
+            csv_buf.seek(0)
+            self.client.put_object(Bucket=bucket, Body=csv_buf.getvalue(), Key=key)
+        except Exception as e:
+            logging.info(str(e))
+
+    @beartype
+    def retrieve(
+        self,
+        key:str,
+        bucket:str="irishclimateapp"
+        ):
+        """Retrieves a Met Eireann data file from AWS s3.
+
+        Parameters
+        ----------
+        key : str
+            The s3 key containing the Met Eireann data file
+        bucket : str
+            The s3 bucket containing the Met Eireann data file
+
+        Returns
+        -------
+        pd.DataFrame
+            The retrieved Met Eireann data, or None if retrieval failed
+        """
+        data = None
+        try:
+            logging.info(f"Retrieving data from S3://{bucket}/{key}")
+            # load the s3 object
+            obj = self.client.get_object(Bucket=bucket, Key=key)
+            # read the .csv body into a dataframe
+            data = pd.read_csv(obj["Body"])
+        except Exception as e:
+            logging.info(str(e))
+        return data
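For reference, a minimal usage sketch of the new S3Client wrapper (illustrative, not part of the commit): it assumes a valid AWS session token JSON exists at .creds/sessionToken.json, that the target bucket already exists, and uses a made-up station id (532) purely for the key name; the bucket and key values mirror the constants added to cons.py.

    import pandas as pd
    from utilities.S3Client import S3Client

    # build a client from a local AWS session token file (mirrors cons.session_token_fpath)
    client = S3Client(sessionToken=".creds/sessionToken.json")
    # store a small cleaned frame and read it straight back from the backup bucket
    frame = pd.DataFrame({"date": ["2020-01-01"], "maxtp": [11.2]})
    client.store(data=frame, key="data/Met_Eireann/cleaned_data/dly532.csv", bucket="irishclimatedashboard")
    roundtrip = client.retrieve(key="data/Met_Eireann/cleaned_data/dly532.csv", bucket="irishclimatedashboard")
    if roundtrip is not None:
        print(roundtrip.head())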
From 0e6e6486730e0cdf9e0f17e26d9e765c813a6eb8 Mon Sep 17 00:00:00 2001
From: Oisin
Date: Wed, 16 Oct 2024 10:40:54 +0100
Subject: [PATCH 4/7] Extracted out cleaning logic to a separate script. Added additional writing of cleaned files to disk and s3

---
 webscraper/utilities/clean_data.py      | 41 +++++++++++++++++++++++++
 webscraper/utilities/gen_master_data.py |  3 +-
 2 files changed, 42 insertions(+), 2 deletions(-)
 create mode 100644 webscraper/utilities/clean_data.py

diff --git a/webscraper/utilities/clean_data.py b/webscraper/utilities/clean_data.py
new file mode 100644
index 0000000..e805660
--- /dev/null
+++ b/webscraper/utilities/clean_data.py
@@ -0,0 +1,41 @@
+import logging
+import os
+from beartype import beartype
+from typing import Union
+import cons
+from webscraper.utilities.load_data import load_data
+from utilities.S3Client import S3Client
+
+@beartype
+def clean_data(
+    scraped_data_dir:str=cons.scraped_data_dir,
+    cleaned_data_dir:str=cons.cleaned_data_dir,
+    store_on_s3:bool=False
+    ):
+    """Generates the master data from the individual raw Met Eireann .xlsx files
+
+    Parameters
+    ----------
+    scraped_data_dir : str
+        The local directory to load the raw Met Eireann .csv files from
+    cleaned_data_dir : str
+        The local directory to write the cleaned Met Eireann .csv files to
+
+    Returns
+    -------
+    """
+    # load data files from file directory
+    scraped_data_fpaths = [os.path.join(scraped_data_dir, fname) for fname in os.listdir(scraped_data_dir)]
+    logging.info("Reading, cleaning and storing files ...")
+    s3client = S3Client(sessionToken=cons.session_token_fpath)
+    for fpath in scraped_data_fpaths:
+        # extract basename
+        fname = os.path.basename(fpath)
+        # load data
+        clean_data = load_data(fpath)
+        # write data to clean data directory
+        cleaned_data_fpath = os.path.join(cleaned_data_dir, fname)
+        clean_data.to_csv(cleaned_data_fpath, header=True, index=False)
+        if store_on_s3:
+            # store data on s3 back up repository
+            s3client.store(data=clean_data, bucket=cons.s3_bucket, key=f"{cons.s3_clean_directory}/{fname}")
\ No newline at end of file
diff --git a/webscraper/utilities/gen_master_data.py b/webscraper/utilities/gen_master_data.py
index 02081a9..5fe25ab 100644
--- a/webscraper/utilities/gen_master_data.py
+++ b/webscraper/utilities/gen_master_data.py
@@ -4,7 +4,6 @@
 import cons
 from beartype import beartype
 from typing import Union
-from webscraper.utilities.load_data import load_data
 
 @beartype
 def gen_master_data(
@@ -30,7 +29,7 @@ def gen_master_data(
     met_eireann_fpaths = [os.path.join(cons.scraped_data_dir, fname) for fname in os.listdir(cons.scraped_data_dir)]
     logging.info("Reading, concatenating and cleaning .xlsx files ...")
     # load and concatenate data files together
-    data_list = [load_data(fpath) for fpath in met_eireann_fpaths]
+    data_list = [pd.read_csv(fpath) for fpath in met_eireann_fpaths]
     data = pd.concat(objs=data_list, ignore_index=True, axis=0)
     # order results by county, id and date alphabetically
     data = data.sort_values(by=["county", "id", 
"date"]).reset_index(drop=True) From 352cc25db3b58d8cfe08f88e8c24db212c6f03df Mon Sep 17 00:00:00 2001 From: Oisin Date: Wed, 16 Oct 2024 10:41:47 +0100 Subject: [PATCH 5/7] Calling cleaning function --- webscraper/prg_webscrape_data.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/webscraper/prg_webscrape_data.py b/webscraper/prg_webscrape_data.py index a5c9848..1f0b317 100644 --- a/webscraper/prg_webscrape_data.py +++ b/webscraper/prg_webscrape_data.py @@ -9,6 +9,7 @@ from utilities.gen_preaggregate_data import gen_preaggregate_data from utilities.gen_counties_data import gen_counties_data from utilities.gen_stations_data import gen_stations_data +from utilities.clean_data import clean_data @beartype def webscrape_data( @@ -44,6 +45,8 @@ def webscrape_data( stations = load_stations_data(stations_fpath=cons.stations_fpath, filter_open=True) # run webscraper resp_log = retrieve_station_data(stations=stations, scraped_data_dir=cons.scraped_data_dir, data_level="dly") + # run data cleaning + clean_data(scraped_data_dir=cons.scraped_data_dir, cleaned_data_dir=cons.cleaned_data_dir) if generate_master_data: logging.info('~~~~~ Generating master data file ...') # generate master data file From 49702370ec2076323fd2c124f9386526cd281072 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 17 Oct 2024 08:48:46 +0100 Subject: [PATCH 6/7] #27 Revised webscraping pipeline. Added missing doc strings and updated beartyping. Combined scripts within common subprocesses. Updated programme parameters in .bat and .py files. Added missing default cons values. --- exeWebscrapeData.bat | 2 +- webscraper/prg_webscrape_data.py | 110 ++++++++++++------ webscraper/utilities/clean_data.py | 41 ------- webscraper/utilities/commandline_interface.py | 28 ++--- .../{load_data.py => gen_clean_data.py} | 48 +++++++- .../{gen_counties_data.py => gen_map_data.py} | 50 ++++---- webscraper/utilities/gen_master_data.py | 37 +++--- webscraper/utilities/gen_met_data.py | 82 +++++++++++++ ...en_stations_data.py => gen_points_data.py} | 36 +++--- ...reaggregate_data.py => gen_preagg_data.py} | 35 +++--- webscraper/utilities/load_stations_data.py | 35 ------ webscraper/utilities/retrieve_station_data.py | 36 ------ webscraper/utilities/url_retrieve.py | 34 ------ 13 files changed, 290 insertions(+), 284 deletions(-) delete mode 100644 webscraper/utilities/clean_data.py rename webscraper/utilities/{load_data.py => gen_clean_data.py} (56%) rename webscraper/utilities/{gen_counties_data.py => gen_map_data.py} (60%) create mode 100644 webscraper/utilities/gen_met_data.py rename webscraper/utilities/{gen_stations_data.py => gen_points_data.py} (54%) rename webscraper/utilities/{gen_preaggregate_data.py => gen_preagg_data.py} (54%) delete mode 100644 webscraper/utilities/load_stations_data.py delete mode 100644 webscraper/utilities/retrieve_station_data.py delete mode 100644 webscraper/utilities/url_retrieve.py diff --git a/exeWebscrapeData.bat b/exeWebscrapeData.bat index 4e1eb1e..d51547f 100644 --- a/exeWebscrapeData.bat +++ b/exeWebscrapeData.bat @@ -1 +1 @@ -call python webscraper\prg_webscrape_data.py --retrieve_data --generate_master_data --generate_preaggregated_data --generate_counties_data --generate_stations_data \ No newline at end of file +call python webscraper\prg_webscrape_data.py --run_met_data --run_clean_data --run_master_data --run_preagg_data --run_map_data --run_points_data \ No newline at end of file diff --git a/webscraper/prg_webscrape_data.py b/webscraper/prg_webscrape_data.py index 1f0b317..35703e2 100644 --- 
a/webscraper/prg_webscrape_data.py +++ b/webscraper/prg_webscrape_data.py @@ -3,35 +3,37 @@ import time from beartype import beartype from utilities.commandline_interface import commandline_interface -from utilities.load_stations_data import load_stations_data -from utilities.retrieve_station_data import retrieve_station_data +from utilities.gen_met_data import gen_met_data +from utilities.gen_clean_data import gen_clean_data from utilities.gen_master_data import gen_master_data -from utilities.gen_preaggregate_data import gen_preaggregate_data -from utilities.gen_counties_data import gen_counties_data -from utilities.gen_stations_data import gen_stations_data -from utilities.clean_data import clean_data +from utilities.gen_preagg_data import gen_preagg_data +from utilities.gen_map_data import gen_map_data +from utilities.gen_points_data import gen_points_data @beartype def webscrape_data( - retrieve_data:bool, - generate_master_data:bool, - generate_preaggregated_data:bool, - generate_counties_data:bool, - generate_stations_data:bool + run_met_data:bool, + run_clean_data:bool, + run_master_data:bool, + run_preagg_data:bool, + run_map_data:bool, + run_points_data:bool ): """Webscrape and process met data into dashboard files Parameters ---------- - retrieve_data : bool + run_met_data : bool Retrieves / web scrapes the historical met data - generate_master_data : bool + run_clean_data : bool + Cleans and processes the scraped met data + run_master_data : bool Generates the master data file from the retrieved / web scraped met data files - generate_preaggregated_data : bool + run_preagg_data : bool Preaggreates the master data file into various date levels for the bokeh dashboard app - generate_counties_data : bool - Generates the counties gis file for the bokeh dashboard app - generate_stations_data : bool + run_map_data : bool + Generates the map gis file for the bokeh dashboard app + run_points_data : bool Generates the stations gis file for the bokeh dashboard app Returns @@ -39,30 +41,61 @@ def webscrape_data( """ # start timer t0 = time.time() - if retrieve_data: + + if run_met_data: logging.info('~~~~~ Retrieving data for met stations ...') - # load stations data - stations = load_stations_data(stations_fpath=cons.stations_fpath, filter_open=True) # run webscraper - resp_log = retrieve_station_data(stations=stations, scraped_data_dir=cons.scraped_data_dir, data_level="dly") + gen_met_data( + stations_fpath=cons.stations_fpath, + filter_open=True, + topn_stations=5, + scraped_data_dir=cons.scraped_data_dir, data_level="dly" + ) + + if run_clean_data: + logging.info('~~~~~ Cleaning met stations data ...') # run data cleaning - clean_data(scraped_data_dir=cons.scraped_data_dir, cleaned_data_dir=cons.cleaned_data_dir) - if generate_master_data: + gen_clean_data( + scraped_data_dir=cons.scraped_data_dir, + cleaned_data_dir=cons.cleaned_data_dir, + store_on_s3=False + ) + + if run_master_data: logging.info('~~~~~ Generating master data file ...') # generate master data file - gen_master_data(master_data_fpath = cons.master_data_fpath) - if generate_preaggregated_data: + gen_master_data( + cleaned_data_dir=cons.cleaned_data_dir, + master_data_fpath=cons.master_data_fpath + ) + + if run_preagg_data: logging.info('~~~~~ Generating preaggregated data file ...') # generate the preaggregate data - gen_preaggregate_data(preaggregate_data_fpath = cons.preaggregate_data_fpath) - if generate_counties_data: - logging.info('~~~~~ Generating geospatial counties data file ...') + gen_preagg_data( + 
master_data_fpath=cons.master_data_fpath, + preaggregate_data_fpath=cons.preaggregate_data_fpath + ) + + if run_map_data: + logging.info('~~~~~ Generating geospatial map data file ...') # generate counties data - gen_counties_data(map_data_fpath = cons.map_data_fpath) - if generate_stations_data: - logging.info('~~~~~ Generating geospatial stations data file ...') + gen_map_data( + rep_counties_fpath=cons.rep_counties_fpath, + ni_counties_fpath=cons.ni_counties_fpath, + preaggregate_data_fpath=cons.preaggregate_data_fpath, + map_data_fpath=cons.map_data_fpath + ) + + if run_points_data: + logging.info('~~~~~ Generating geospatial points data file ...') # generate wheather station points data - gen_stations_data(points_data_fpath = cons.points_data_fpath) + gen_points_data( + master_data_fpath=cons.master_data_fpath, + stations_fpath=cons.stations_fpath, + points_data_fpath=cons.points_data_fpath + ) + # end timer and log result t1 = time.time() tres = t1 - t0 @@ -74,13 +107,16 @@ def webscrape_data( # set up logging lgr = logging.getLogger() lgr.setLevel(logging.INFO) + # handle input parameters input_params_dict = commandline_interface() + # call webscrape data webscrape_data( - retrieve_data=input_params_dict['retrieve_data'], - generate_master_data=input_params_dict['generate_master_data'], - generate_preaggregated_data=input_params_dict['generate_preaggregated_data'], - generate_counties_data=input_params_dict['generate_counties_data'], - generate_stations_data=input_params_dict['generate_stations_data'] + run_met_data=input_params_dict['run_met_data'], + run_clean_data=input_params_dict['run_clean_data'], + run_master_data=input_params_dict['run_master_data'], + run_preagg_data=input_params_dict['run_preagg_data'], + run_map_data=input_params_dict['run_map_data'], + run_points_data=input_params_dict['run_points_data'] ) \ No newline at end of file diff --git a/webscraper/utilities/clean_data.py b/webscraper/utilities/clean_data.py deleted file mode 100644 index e805660..0000000 --- a/webscraper/utilities/clean_data.py +++ /dev/null @@ -1,41 +0,0 @@ -import logging -import os -from beartype import beartype -from typing import Union -import cons -from webscraper.utilities.load_data import load_data -from utilities.S3Client import S3Client - -@beartype -def clean_data( - scraped_data_dir:str=cons.scraped_data_dir, - cleaned_data_dir:str=cons.cleaned_data_dir, - store_on_s3:bool=False - ): - """Generates the master data from the individual raw Met Eireann .xlsx files - - Parameters - ---------- - scraped_data_dir : str - The local directory to load the raw Met Eireann .csv files from - cleaned_data_dir : str - The local directory to write the cleaned Met Eireann .csv files to - - Returns - ------- - """ - # load data files from file directory - scraped_data_fpaths = [os.path.join(scraped_data_dir, fname) for fname in os.listdir(scraped_data_dir)] - logging.info("Reading, cleaning and storing files ...") - s3client = S3Client(sessionToken=cons.session_token_fpath) - for fpath in scraped_data_fpaths: - # extract basename - fname = os.path.basename(fpath) - # load data - clean_data = load_data(fpath) - # write data to clean data directory - cleaned_data_fpath = os.path.join(cleaned_data_dir, fname) - clean_data.to_csv(cleaned_data_fpath, header=True, index=False) - if store_on_s3: - # store data on s3 back up repository - s3client.store(data=clean_data, bucket=cons.s3_bucket, key=f"{cons.s3_clean_directory}/{fname}") \ No newline at end of file diff --git 
a/webscraper/utilities/commandline_interface.py b/webscraper/utilities/commandline_interface.py
index b9efe59..f1cc605 100644
--- a/webscraper/utilities/commandline_interface.py
+++ b/webscraper/utilities/commandline_interface.py
@@ -4,12 +4,6 @@ def commandline_interface():
     """A commandline interface for parsing input parameters with
 
-    Windows
-        python IrishClimateDashboard\\webscraper\\prg_webscraper_data.py --retrieve_data --generate_master_data --generate_preaggregated_data --generate_counties_data --generate_stations_data
-
-    Linux
-        python3 IrishClimateDashboard/webscraper/prg_webscraper_data.py --retrieve_data --generate_master_data --generate_preaggregated_data --generate_counties_data --generate_stations_data
-
     Parameters
     ----------
@@ -21,19 +15,21 @@
     # define argument parser object
     parser = argparse.ArgumentParser(description="Execute Random TeleCom Data Programme.")
     # add input arguments
-    parser.add_argument("--retrieve_data", action=argparse.BooleanOptionalAction, dest="retrieve_data", type=bool, default=False, help="Boolean, retrieves / web scrapes the historical met data",)
-    parser.add_argument("--generate_master_data", action=argparse.BooleanOptionalAction, dest="generate_master_data", type=bool, default=False, help="Boolean, generates the master data file from the retrieved / web scraped met data files",)
-    parser.add_argument("--generate_preaggregated_data", action=argparse.BooleanOptionalAction, dest="generate_preaggregated_data", type=bool, default=False, help="Boolean, preaggreates the master data file into various date levels for the bokeh dashboard app",)
-    parser.add_argument("--generate_counties_data", action=argparse.BooleanOptionalAction, dest="generate_counties_data", type=bool, default=False, help="Boolean, generates the counties gis file for the bokeh dashboard app",)
-    parser.add_argument("--generate_stations_data", action=argparse.BooleanOptionalAction, dest="generate_stations_data", type=bool, default=False, help="Boolean, generates the stations gis file for the bokeh dashboard app",)
+    parser.add_argument("--run_met_data", action=argparse.BooleanOptionalAction, dest="run_met_data", type=bool, default=False, help="Boolean, retrieves / web scrapes the historical met data",)
+    parser.add_argument("--run_clean_data", action=argparse.BooleanOptionalAction, dest="run_clean_data", type=bool, default=False, help="Boolean, cleans and processes the scraped met data",)
+    parser.add_argument("--run_master_data", action=argparse.BooleanOptionalAction, dest="run_master_data", type=bool, default=False, help="Boolean, generates the master data file from the retrieved / web scraped met data files",)
+    parser.add_argument("--run_preagg_data", action=argparse.BooleanOptionalAction, dest="run_preagg_data", type=bool, default=False, help="Boolean, preaggregates the master data file into various date levels for the bokeh dashboard app",)
+    parser.add_argument("--run_map_data", action=argparse.BooleanOptionalAction, dest="run_map_data", type=bool, default=False, help="Boolean, generates the map gis file for the bokeh dashboard app",)
+    parser.add_argument("--run_points_data", action=argparse.BooleanOptionalAction, dest="run_points_data", type=bool, default=False, help="Boolean, generates the stations gis file for the bokeh dashboard app",)
     # create an output dictionary to hold the results
     input_params_dict = {}
     # extract input arguments
     args = parser.parse_args()
     # map input arguments into output dictionary
-    input_params_dict["retrieve_data"] = args.retrieve_data
-    input_params_dict["generate_master_data"] = args.generate_master_data
-    input_params_dict["generate_preaggregated_data"] = args.generate_preaggregated_data
-    input_params_dict["generate_counties_data"] = args.generate_counties_data
-    input_params_dict["generate_stations_data"] = args.generate_stations_data
+    input_params_dict["run_met_data"] = args.run_met_data
+    input_params_dict["run_clean_data"] = args.run_clean_data
+    input_params_dict["run_master_data"] = args.run_master_data
+    input_params_dict["run_preagg_data"] = args.run_preagg_data
+    input_params_dict["run_map_data"] = args.run_map_data
+    input_params_dict["run_points_data"] = args.run_points_data
     return input_params_dict
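As a usage note (illustrative, not part of the commit): the renamed flags are independent argparse.BooleanOptionalAction booleans that default to False, so individual stages can be rerun on their own as well as end to end. The command lines below mirror the updated exeWebscrapeData.bat:

    # full pipeline, as wired up in exeWebscrapeData.bat
    python webscraper/prg_webscrape_data.py --run_met_data --run_clean_data --run_master_data --run_preagg_data --run_map_data --run_points_data
    # rebuild only the dashboard artefacts from previously scraped and cleaned data
    python webscraper/prg_webscrape_data.py --run_master_data --run_preagg_data --run_map_data --run_points_data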
diff --git a/webscraper/utilities/load_data.py b/webscraper/utilities/gen_clean_data.py
similarity index 56%
rename from webscraper/utilities/load_data.py
rename to webscraper/utilities/gen_clean_data.py
index 0b84b68..0a9296a 100644
--- a/webscraper/utilities/load_data.py
+++ b/webscraper/utilities/gen_clean_data.py
@@ -1,9 +1,10 @@
-
-import re
+import logging
 import os
+import re
 import pandas as pd
-import cons
 from beartype import beartype
+import cons
+from utilities.S3Client import S3Client
 
 @beartype
 def load_data(
@@ -17,8 +18,7 @@
     fpath : str
         The file path to load the webscraped met data from disk
     stations_fpath : str
-        The file path to load the reference station data from disk
-
+        The file path to load the reference station data from disk, default is cons.stations_fpath
     Returns
     -------
@@ -56,3 +56,41 @@
     dataframe["county"] = dataframe["county"].str.title()
     dataframe["date"] = pd.to_datetime(dataframe["date"], format='%d-%b-%Y')
     return dataframe
+
+
+@beartype
+def gen_clean_data(
+    scraped_data_dir:str=cons.scraped_data_dir,
+    cleaned_data_dir:str=cons.cleaned_data_dir,
+    store_on_s3:bool=False
+    ):
+    """Cleans the individual raw Met Eireann .csv files and writes the results to the cleaned data directory
+
+    Parameters
+    ----------
+    scraped_data_dir : str
+        The local directory to load the raw Met Eireann .csv files from, default is cons.scraped_data_dir
+    cleaned_data_dir : str
+        The local directory to write the cleaned Met Eireann .csv files to, default is cons.cleaned_data_dir
+    store_on_s3 : bool
+        Whether to back up the cleaned data files on s3, default is False
+
+    Returns
+    -------
+    """
+    # load data files from file directory
+    scraped_data_fpaths = [os.path.join(scraped_data_dir, fname) for fname in os.listdir(scraped_data_dir)]
+    logging.info("Reading, cleaning and storing files ...")
+    s3client = S3Client(sessionToken=cons.session_token_fpath)
+    for fpath in scraped_data_fpaths:
+        # extract basename
+        fname = os.path.basename(fpath)
+        # load data
+        clean_data = load_data(fpath)
+        # write data to clean data directory
+        cleaned_data_fpath = os.path.join(cleaned_data_dir, fname)
+        logging.info(f"Writing cleaned data file {cleaned_data_fpath} to disk")
+        clean_data.to_csv(cleaned_data_fpath, header=True, index=False)
+        if store_on_s3:
+            # store data on s3 back up repository
+            s3client.store(data=clean_data, bucket=cons.s3_bucket, key=f"{cons.s3_clean_directory}/{fname}")
\ No newline at end of file
diff --git a/webscraper/utilities/gen_counties_data.py b/webscraper/utilities/gen_map_data.py
similarity index 60%
rename from webscraper/utilities/gen_counties_data.py
rename to webscraper/utilities/gen_map_data.py
index 631acc4..e7bc019 100644
--- a/webscraper/utilities/gen_counties_data.py
+++ b/webscraper/utilities/gen_map_data.py
@@ -8,31 +8,36 @@
 from typing import Union
 @beartype
-def gen_counties_data(
-    pre_agg_data_dict:Union[dict,None]=None,
-    map_data_fpath:Union[str,None]=None,
+def gen_map_data(
+    rep_counties_fpath:str=cons.rep_counties_fpath,
+    ni_counties_fpath:str=cons.ni_counties_fpath,
+    preaggregate_data_fpath:str=cons.preaggregate_data_fpath,
+    map_data_fpath:str=cons.map_data_fpath
     ):
     """Generates counties map data for the bokeh map dashboard
 
     Parameters
    ----------
-    pre_agg_data_dict : None or dict
-        Either the preaggregated data dictionary or loads the preaggregated data dictionary from disk when None, default is None
-    map_data_fpath : None or str
-        The file location to write the map data to disk, default is None
+    rep_counties_fpath : str
+        The file path to the Republic of Ireland counties .shp file on disk, default is cons.rep_counties_fpath
+    ni_counties_fpath : str
+        The file path to the Northern Ireland counties .shp file on disk, default is cons.ni_counties_fpath
+    preaggregate_data_fpath : str
+        The file path to the preaggregated data on disk, default is cons.preaggregate_data_fpath
+    map_data_fpath : str
+        The file location to write the map data to disk, default is cons.map_data_fpath
 
     Returns
     -------
     """
     logging.info("Loading rep / ni counties shape files ...")
     # load in county shape files
-    rep_counties = (gpd.read_file(cons.rep_counties_fpath)[["ENGLISH", "geometry"]].rename(columns={"ENGLISH": "county"}).to_crs(epsg=2157))
-    ni_counties = gpd.read_file(cons.ni_counties_fpath)[["county", "geometry"]].to_crs(epsg=2157)
-    if type(pre_agg_data_dict) == type(None):
-        logging.info("Loading preaggregated data dictionary ...")
-        # load preaggregated data
-        with open(cons.preaggregate_data_fpath, "rb") as f:
-            pre_agg_data_dict = pickle.load(f)
+    rep_counties = (gpd.read_file(rep_counties_fpath)[["ENGLISH", "geometry"]].rename(columns={"ENGLISH": "county"}).to_crs(epsg=2157))
+    ni_counties = gpd.read_file(ni_counties_fpath)[["county", "geometry"]].to_crs(epsg=2157)
+    logging.info("Loading preaggregated data dictionary ...")
+    # load preaggregated data
+    with open(preaggregate_data_fpath, "rb") as f:
+        pre_agg_data_dict = pickle.load(f)
     logging.info("Concatenating counties geopandas dataframes ...")
     # concatenate county shape files
     counties = gpd.GeoDataFrame(pd.concat([rep_counties, ni_counties], ignore_index=True), crs="EPSG:2157")
@@ -61,19 +66,16 @@
         county_data = pre_agg_data.groupby(group_cols, as_index=False).agg(agg_dict)
         county_data['stat'] = stat
         map_data_list.append(county_data)
-    #
     map_data = pd.concat(objs=map_data_list,axis=0,ignore_index=True)
     # join county level data to map data
     map_geodata = gpd.GeoDataFrame(
         data=pd.merge(left=counties, right=map_data, on="county", how="left"),
         crs="EPSG:2157",
     )
-    # if the output
-    if map_data_fpath != None:
-        if os.path.exists(map_data_fpath):
-            logging.info("Writing counties data to disk as pickle file ...")
-            # pickle the preaggregated data dictionary to disk
-            with open(map_data_fpath, "wb") as f:
-                pickle.dump(map_geodata, f, protocol=pickle.HIGHEST_PROTOCOL)
-        else:
-            raise ValueError(f"{map_data_fpath} does not exist")
+    if os.path.exists(map_data_fpath):
+        logging.info("Writing counties data to disk as pickle file ...")
+        # pickle the map geodata to disk
+        with open(map_data_fpath, "wb") as f:
+            pickle.dump(map_geodata, f, protocol=pickle.HIGHEST_PROTOCOL)
+    else:
+        raise ValueError(f"{map_data_fpath} does not exist")
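A quick illustration of how the regenerated map artefact can be inspected (a sketch, not part of the patch; it assumes geopandas is installed, since the pickle holds a GeoDataFrame, and uses the default path from cons.map_data_fpath):

    import pickle

    # load the pickled GeoDataFrame produced by gen_map_data
    with open("data/gis/map_data.pickle", "rb") as f:
        map_geodata = pickle.load(f)
    # one row per county per statistic; the geometry column carries the county polygons
    print(map_geodata[["county", "stat"]].drop_duplicates().head())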
diff --git a/webscraper/utilities/gen_master_data.py b/webscraper/utilities/gen_master_data.py
index 5fe25ab..6caef31 100644
--- a/webscraper/utilities/gen_master_data.py
+++ b/webscraper/utilities/gen_master_data.py
@@ -7,37 +7,36 @@
 @beartype
 def gen_master_data(
-    met_eireann_fpaths:Union[list,None]=None,
-    master_data_fpath:Union[str,None]=None,
+    cleaned_data_dir:str=cons.cleaned_data_dir,
+    master_data_fpath:str=cons.master_data_fpath,
     ):
     """Generates the master data from the individual raw Met Eireann .xlsx files
 
     Parameters
     ----------
-    met_eireann_fpaths : None or list
-        The raw Met Eireann .xlsx file paths, default is None
-    master_data_fpath : None or str
-        The file location to write the master data to disk, default is None
+    cleaned_data_dir : str
+        The local directory containing the cleaned Met Eireann .csv files, default is cons.cleaned_data_dir
+    master_data_fpath : str
+        The file location to write the master data to disk, default is cons.master_data_fpath
 
     Returns
     -------
     """
-    # if load data locally
-    if met_eireann_fpaths == None:
-        logging.info("Retrieving raw met eireann .xlsx file paths from disk ...")
-        # load data files from file directory
-        met_eireann_fpaths = [os.path.join(cons.scraped_data_dir, fname) for fname in os.listdir(cons.scraped_data_dir)]
-    logging.info("Reading, concatenating and cleaning .xlsx files ...")
+    logging.info("Retrieving cleaned file paths from disk ...")
+    # load data files from file directory
+    met_eireann_fpaths = [os.path.join(cleaned_data_dir, fname) for fname in os.listdir(cleaned_data_dir)]
+    logging.info("Reading and concatenating files ...")
     # load and concatenate data files together
     data_list = [pd.read_csv(fpath) for fpath in met_eireann_fpaths]
     data = pd.concat(objs=data_list, ignore_index=True, axis=0)
+    # convert date to datetime
+    data["date"] = pd.to_datetime(data["date"], format="%Y-%m-%d")
     # order results by county, id and date alphabetically
     data = data.sort_values(by=["county", "id", "date"]).reset_index(drop=True)
     # if the output
-    if master_data_fpath != None:
-        if os.path.exists(master_data_fpath):
-            logging.info("Writing master file to disk as .feather file ...")
-            # save concatenated data to disk
-            data.to_feather(master_data_fpath)
-        else:
-            raise ValueError(f"{master_data_fpath} does not exist")
\ No newline at end of file
+    if os.path.exists(master_data_fpath):
+        logging.info("Writing master file to disk as .feather file ...")
+        # save concatenated data to disk
+        data.to_feather(master_data_fpath)
+    else:
+        raise ValueError(f"{master_data_fpath} does not exist")
\ No newline at end of file
diff --git a/webscraper/utilities/gen_met_data.py b/webscraper/utilities/gen_met_data.py
new file mode 100644
index 0000000..a0c8fa0
--- /dev/null
+++ b/webscraper/utilities/gen_met_data.py
@@ -0,0 +1,82 @@
+import logging
+import os
+import pandas as pd
+import urllib.request
+from beartype import beartype
+from typing import Union
+import cons
+
+@beartype
+def url_retrieve(
+    stationid:int,
+    scraped_data_dir:str=cons.scraped_data_dir,
+    data_level:str="dly"
+    ):
+    """Retrieves met data for a given station id
+
+    Parameters
+    ----------
+    stationid : int
+        The station id to retrieve data for
+    scraped_data_dir : str
+        The file directory to write the scraped met data to, default is cons.scraped_data_dir
+    data_level : str
+        The time level of the met data to scrape, default is "dly"
+
+    Returns
+    -------
+    urllib.request.urlretrieve, Exception
+        A retrieval response
+    """
+    data_fname = f"{data_level}{stationid}.csv"
+    data_url = f"http://cli.fusio.net/cli/climate_data/webdata/{data_fname}"
+    download_data_fpath = os.path.join(scraped_data_dir, data_fname)
+    try:
+        resp = 
urllib.request.urlretrieve(data_url, download_data_fpath) + except Exception as e: + resp = e + return resp + +@beartype +def gen_met_data( + stations_fpath:str=cons.stations_fpath, + filter_open:bool=True, + topn_stations:Union[int, None]=None, + scraped_data_dir:str=cons.scraped_data_dir, + data_level:str="dly" + ): + """Webscrapes the met data for all station ids in a given stations dataframe + + Parameters + ---------- + stations_fpath : pd.DataFrame + The file path to the met eireann stations reference data, default is cons.stations_fpath + filter_open : bool + Whether to only filter for only open weather stations in the met eireann stations reference data, default is True + topn_stations : int + The number of stations to sample from the head of the met eireann stations reference data, default is None + scraped_data_dir : str + The file directory to write the scraped met data to, default is cons.scraped_data_dir + data_level : str + The time level of the met data to scrape, default is "dly" + + + Returns + ------- + """ + # load stations data + stations = pd.read_csv(stations_fpath) + if filter_open: + # only consider open stations for now + open_stations_filter = stations['close_year'].isnull() + stations = stations.loc[open_stations_filter, :].reset_index(drop=True) + if topn_stations != None: + stations = stations.head(topn_stations) + # iterate over each station and pull daily level data using using stationid + resp_log =[] + for idx, row in stations.iterrows(): + logging.info(f"{idx} {row['county']} {row['station_id']} {row['name']}") + resp = url_retrieve(stationid=row['station_id'], scraped_data_dir=scraped_data_dir, data_level=data_level) + logging.info(resp) + resp_log.append(resp) + \ No newline at end of file diff --git a/webscraper/utilities/gen_stations_data.py b/webscraper/utilities/gen_points_data.py similarity index 54% rename from webscraper/utilities/gen_stations_data.py rename to webscraper/utilities/gen_points_data.py index 8f1cf5b..dc4b211 100644 --- a/webscraper/utilities/gen_stations_data.py +++ b/webscraper/utilities/gen_points_data.py @@ -8,31 +8,35 @@ from typing import Union @beartype -def gen_stations_data( - points_data_fpath:Union[str,None]=None +def gen_points_data( + master_data_fpath:str=cons.master_data_fpath, + stations_fpath:str=cons.stations_fpath, + points_data_fpath:str=cons.points_data_fpath ): """Generates gis points data for Met Eireann stations Parameters ---------- + master_data_fpath : str + The file path to the master data on disk, default is cons.master_data_fpath + station_fpath : str + The file path to the stations reference data on disk, default is cons.stations_fpath points_data_fpath : str - The file location to write the gis points data to disk, default is None + The file location to write the gis points data to disk, default is cons.points_data_fpath Returns ------- """ logging.info("Loading master and stations data from disk ...") # load master and station data - master_data = pd.read_feather(cons.master_data_fpath) - stations_data = pd.read_csv(cons.stations_fpath) + master_data = pd.read_feather(master_data_fpath) + stations_data = pd.read_csv(stations_fpath) logging.info("Identifying master station ids ...") # extract out station ids from mater file master_station_ids = master_data["id"].unique() logging.info("Filtering corresponding station data ...") # filter master data with station ids - master_stations = stations_data.loc[ - stations_data["station_id"].isin(master_station_ids), : - ].copy() + master_stations = 
stations_data.loc[stations_data["station_id"].isin(master_station_ids), :].copy() master_stations["county"] = master_stations["county"].str.title() master_stations["name"] = master_stations["name"].str.title() logging.info("Creating geopandas DataFrame of station data ...") @@ -42,12 +46,10 @@ def gen_stations_data( geometry=gpd.points_from_xy(master_stations.longitude, master_stations.latitude), crs="EPSG:4326", ).to_crs(epsg=2157) - # if the output - if points_data_fpath != None: - if os.path.exists(points_data_fpath): - logging.info("Writing gis stations data to disk as .pickle file ...") - # pickle the gis stations data - with open(points_data_fpath, "wb") as f: - pickle.dump(geo_master_stations, f, protocol=pickle.HIGHEST_PROTOCOL) - else: - raise ValueError(f"{points_data_fpath} does not exist") + if os.path.exists(points_data_fpath): + logging.info("Writing gis stations data to disk as .pickle file ...") + # pickle the gis stations data + with open(points_data_fpath, "wb") as f: + pickle.dump(geo_master_stations, f, protocol=pickle.HIGHEST_PROTOCOL) + else: + raise ValueError(f"{points_data_fpath} does not exist") diff --git a/webscraper/utilities/gen_preaggregate_data.py b/webscraper/utilities/gen_preagg_data.py similarity index 54% rename from webscraper/utilities/gen_preaggregate_data.py rename to webscraper/utilities/gen_preagg_data.py index ef04595..5088b23 100644 --- a/webscraper/utilities/gen_preaggregate_data.py +++ b/webscraper/utilities/gen_preagg_data.py @@ -7,26 +7,25 @@ from typing import Union @beartype -def gen_preaggregate_data( - master_data:Union[pd.DataFrame,None]=None, - preaggregate_data_fpath:Union[str,None]=None +def gen_preagg_data( + master_data_fpath:str=cons.master_data_fpath, + preaggregate_data_fpath:str=cons.preaggregate_data_fpath ): """Generates preaggregate data for bokeh dashboard app Parameters ---------- - master_data : None or pd.DataFrame - Either the master data as a pandas.DataFrame or loads the master data from disk when None, default is None + master_data_fpath : None or pd.DataFrame + The file location to write the master data to disk, default is cons.master_data_fpath preaggregate_data_fpath : str - The file location to write the preaggregated data to disk, default is None + The file location to write the preaggregated data to disk, default is cons.preaggregate_data_fpath Returns ------- """ - if type(master_data) == type(None): - logging.info("Loading master data from disk ...") - # load master data - master_data = pd.read_feather(cons.master_data_fpath) + logging.info("Loading master data from disk ...") + # load master data + master_data = pd.read_feather(master_data_fpath) logging.info("Performing initial data aggregation to year-month level ...") # preaggregate the data to year-month level for each available stat pre_agg_data_dict = {} @@ -41,12 +40,10 @@ def gen_preaggregate_data( agg_dict = {col: stat for col in cons.col_options} tmp_agg_data = agg_data.groupby(group_cols, as_index=False).agg(agg_dict) pre_agg_data_dict[stat] = tmp_agg_data - # if the output - if preaggregate_data_fpath != None: - if os.path.exists(preaggregate_data_fpath): - logging.info("Writing preaggregated data to disk as .pickle file ...") - # pickle the preaggregated data dictionary to disk - with open(cons.preaggregate_data_fpath, "wb") as f: - pickle.dump(pre_agg_data_dict, f, protocol=pickle.HIGHEST_PROTOCOL) - else: - raise ValueError(f"{preaggregate_data_fpath} does not exist") + if os.path.exists(preaggregate_data_fpath): + logging.info("Writing 
preaggregated data to disk as .pickle file ...") + # pickle the preaggregated data dictionary to disk + with open(cons.preaggregate_data_fpath, "wb") as f: + pickle.dump(pre_agg_data_dict, f, protocol=pickle.HIGHEST_PROTOCOL) + else: + raise ValueError(f"{preaggregate_data_fpath} does not exist") diff --git a/webscraper/utilities/load_stations_data.py b/webscraper/utilities/load_stations_data.py deleted file mode 100644 index 0bf0d82..0000000 --- a/webscraper/utilities/load_stations_data.py +++ /dev/null @@ -1,35 +0,0 @@ -import pandas as pd -from beartype import beartype - -@beartype -def load_stations_data( - stations_fpath:str, - filter_open:bool=True, - topn:int=None - ) -> pd.DataFrame: - """Loads the station reference data file - - Parameters - ---------- - stations_fpath : str - The file path to load the reference station data from disk - filter_open : bool - Whether to only consider open stations and not closed stations - topn : int - The number of rows to filter from the head of the loaded stations data - - - Returns - ------- - pd.DataFrame - The loaded stations reference data - """ - # load stations data - stations = pd.read_csv(stations_fpath) - if filter_open: - # only consider open stations for now - open_stations_filter = stations['close_year'].isnull() - stations = stations.loc[open_stations_filter, :].reset_index(drop=True) - if topn != None: - stations = stations.head(topn) - return stations \ No newline at end of file diff --git a/webscraper/utilities/retrieve_station_data.py b/webscraper/utilities/retrieve_station_data.py deleted file mode 100644 index 87d2c0c..0000000 --- a/webscraper/utilities/retrieve_station_data.py +++ /dev/null @@ -1,36 +0,0 @@ -import logging -import pandas as pd -from utilities.url_retrieve import url_retrieve -from beartype import beartype - -@beartype -def retrieve_station_data( - stations:pd.DataFrame, - scraped_data_dir:str, - data_level:str="dly" - ) -> list: - """Webscrapes the met data for all station ids in a given stations dataframe - - Parameters - ---------- - stations : pd.DataFrame - The loaded reference stations data - scraped_data_dir : str - The file directory to write the scraped met data to - data_level : str - The time level of the met data to scrape, default is "dly" - - - Returns - ------- - list - A log of the webscrape responses - """ - # iterate over each station and pull daily level data using using stationid - resp_log =[] - for idx, row in stations.iterrows(): - logging.info(f"{idx} {row['county']} {row['station_id']} {row['name']}") - resp = url_retrieve(stationid=row['station_id'], scraped_data_dir=scraped_data_dir, data_level=data_level) - logging.info(resp) - resp_log.append(resp) - return resp_log \ No newline at end of file diff --git a/webscraper/utilities/url_retrieve.py b/webscraper/utilities/url_retrieve.py deleted file mode 100644 index b1de553..0000000 --- a/webscraper/utilities/url_retrieve.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -import urllib.request -from beartype import beartype - -@beartype -def url_retrieve( - stationid:int, - scraped_data_dir:str, - data_level:str="dly" - ): - """Retrieves met data for a given station id - - Parameters - ---------- - stationid : int - The station id to retrieve data for - scraped_data_dir : str - The file directory to write the scraped met data to - data_level : str - The time level of the met data to scrape, default is "dly" - - Returns - ------- - urllib.request.urlretrieve, Exception - A retrieval response - """ - data_fname = f"{data_level}{stationid}.csv" 
-    data_url = f"http://cli.fusio.net/cli/climate_data/webdata/{data_fname}"
-    download_data_fpath = os.path.join(scraped_data_dir, data_fname)
-    try:
-        resp = urllib.request.urlretrieve(data_url, download_data_fpath)
-    except Exception as e:
-        resp = e
-    return resp
\ No newline at end of file

From 7b753a61bb3dabda241ee9dcdc951b701d0686e9 Mon Sep 17 00:00:00 2001
From: Oisin
Date: Thu, 17 Oct 2024 08:49:09 +0100
Subject: [PATCH 7/7] Recast date to datetime

---
 data/master.feather | Bin 33814562 -> 33737714 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/data/master.feather b/data/master.feather
index 8a15ed4583c62a59e608df77c35d1989068d7435..0c403c95a75b6e5536273848c48b26c465a77311 100644
GIT binary patch
[binary deltas (5346 and 6184 bytes) for data/master.feather omitted]
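After the final patch, a quick way to sanity check the rebuilt master file (an illustrative sketch; data/master.feather is the file touched by patch 7 and is assumed to match cons.master_data_fpath):

    import pandas as pd

    # confirm the recast date column survived the feather round trip as a datetime dtype
    master = pd.read_feather("data/master.feather")
    print(master["date"].dtype)                      # expected: datetime64[ns]
    print(master[["county", "id", "date"]].head())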