From aefd8c8c8bc865b92f62f6bb51b30dcbed57e148 Mon Sep 17 00:00:00 2001
From: Oisin
Date: Wed, 9 Oct 2024 21:40:39 +0100
Subject: [PATCH 1/4] add git tag v0.0.0

---
 Dockerfile    | 2 +-
 exeDocker.bat | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 12dccf1..f4f3b29 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,7 +14,7 @@ RUN useradd ${user}
 RUN mkdir -p /home/${user} && chown -R ${user}: /home/${user}
 
 # clone git repo
-RUN git clone https://github.com/oislen/IrishClimateDashboard.git /home/ubuntu/IrishClimateDashboard
+RUN git clone https://github.com/oislen/IrishClimateDashboard.git --branch ${GIT_BRANCH} /home/ubuntu/IrishClimateDashboard
 
 # install required python packages
 COPY requirements.txt /tmp/
diff --git a/exeDocker.bat b/exeDocker.bat
index d623fec..fc60a15 100644
--- a/exeDocker.bat
+++ b/exeDocker.bat
@@ -4,12 +4,13 @@ SET DOCKER_REPO=irishclimatedashboard
 SET DOCKER_TAG=latest
 SET DOCKER_IMAGE=%DOCKER_USER%/%DOCKER_REPO%:%DOCKER_TAG%
 SET DOCKER_CONTAINER_NAME=icd
+SET GIT_BRANCH=v0.0.0
 
 :: remove existing docker containers and images
 docker image rm -f %DOCKER_IMAGE%
 
 :: build docker image
-call docker build --no-cache -t %DOCKER_IMAGE% .
+call docker build --no-cache -t %DOCKER_IMAGE% . --build-arg GIT_BRANCH=%GIT_BRANCH%
 ::call docker build -t %DOCKER_IMAGE% .
 
 :: run docker container

From b95e5b26753ab6950f447b6de88115a013def9ca Mon Sep 17 00:00:00 2001
From: Oisin
Date: Wed, 9 Oct 2024 21:40:55 +0100
Subject: [PATCH 2/4] add git tag v0.0.0

---
 Dockerfile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index f4f3b29..9f27d41 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,6 +4,9 @@ FROM ubuntu:20.04
 # set environment variables
 ENV user=ubuntu
 ENV DEBIAN_FRONTEND=noninteractive
+# set git branch for cloning
+ARG GIT_BRANCH
+ENV GIT_BRANCH=${GIT_BRANCH}
 
 # install required software and programmes for development environment
 RUN apt-get update

From 5a0858c085e8e601d1c8cab91ddcf2a899294c90 Mon Sep 17 00:00:00 2001
From: Oisin
Date: Wed, 9 Oct 2024 21:51:20 +0100
Subject: [PATCH 3/4] Replaced prints with logging.info

---
 scripts/PreProcessData/gen_counties_data.py | 19 ++++++++++---------
 scripts/PreProcessData/gen_master_data.py   | 11 ++++++-----
 .../PreProcessData/gen_preaggregate_data.py | 11 ++++++-----
 scripts/PreProcessData/gen_stations_data.py | 11 ++++++-----
 scripts/prg_preprocess_data.py              | 17 +++++++++++------
 5 files changed, 39 insertions(+), 30 deletions(-)

diff --git a/scripts/PreProcessData/gen_counties_data.py b/scripts/PreProcessData/gen_counties_data.py
index d436f9e..910d6b4 100644
--- a/scripts/PreProcessData/gen_counties_data.py
+++ b/scripts/PreProcessData/gen_counties_data.py
@@ -1,5 +1,6 @@
 import cons
 import os
+import logging
 import pickle
 import pandas as pd
 import geopandas as gpd
@@ -29,7 +30,7 @@ def gen_counties_data(
     0, pandas.DataFrame
         Depending on return_data parameter, either return zero or map data
     """
-    print("Loading rep / ni counties shape files ...")
+    logging.info("Loading rep / ni counties shape files ...")
     # load in county shape files
     rep_counties = (
         gpd.read_file(cons.rep_counties_fpath)[["ENGLISH", "geometry"]]
@@ -40,32 +41,32 @@ def gen_counties_data(
         epsg=2157
     )
     if type(pre_agg_data_dict) == type(None):
-        print("Loading preaggregated data dictionary ...")
+        logging.info("Loading preaggregated data dictionary ...")
        # load preaggregated data
        with open(cons.preaggregate_data_fpath, "rb") as f:
            pre_agg_data_dict = pickle.load(f)
-    print("Concatenating counties geopandas dataframes ...")
geopandas dataframes ...") + logging.info("Concatenating counties geopandas dataframes ...") # concatenate county shape files counties = gpd.GeoDataFrame( pd.concat([rep_counties, ni_counties], ignore_index=True), crs="EPSG:2157" ) - print("Simplifiying counties geometries ...") + logging.info("Simplifiying counties geometries ...") # simplify the granularity of the geometry column counties["geometry"] = counties["geometry"].simplify(tolerance=1000) - print("Standardising county names to title case ...") + logging.info("Standardising county names to title case ...") # clean up county column counties["county"] = ( counties["county"].str.title().str.replace(pat="County ", repl="", regex=False) ) - print("Ordering results by county name ...") + logging.info("Ordering results by county name ...") # sort data by county counties = counties.sort_values(by="county") - print("Calculating county level statistics ...") + logging.info("Calculating county level statistics ...") # create a dictionary to contain map data map_data_dict = {} # iterate over statistic and pre aggregated data for stat, pre_agg_data in pre_agg_data_dict.items(): - print(f"{stat} ...") + logging.info(f"{stat} ...") # aggregate data to county level group_cols = ["county"] agg_dict = {col: stat for col in cons.col_options} @@ -84,7 +85,7 @@ def gen_counties_data( # if the output if map_data_fpath != None: if os.path.exists(map_data_fpath): - print("Writing counties data to disk as pickle file ...") + logging.info("Writing counties data to disk as pickle file ...") # pickle the preaggregated data dictionary to disk with open(map_data_fpath, "wb") as f: pickle.dump(map_data_dict, f, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/scripts/PreProcessData/gen_master_data.py b/scripts/PreProcessData/gen_master_data.py index abdae51..6c4d352 100644 --- a/scripts/PreProcessData/gen_master_data.py +++ b/scripts/PreProcessData/gen_master_data.py @@ -1,5 +1,6 @@ import os import pandas as pd +import logging import cons from utilities.gen_boto3_excel import gen_boto3_excel from beartype import beartype @@ -36,7 +37,7 @@ def gen_master_data( # if load data locally if not aws_s3: if met_eireann_fpaths == None: - print("Retrieving raw met eireann .xlsx file paths from disk ...") + logging.info("Retrieving raw met eireann .xlsx file paths from disk ...") # load data files from file directory met_eireann_fpaths = [ os.path.join(cons.met_eireann_dir, fpath) @@ -45,11 +46,11 @@ def gen_master_data( ] # otherwise if loading data from aws s3 else: - print("Retrieving raw met eireann .xlsx file paths from aws s3 ...") + logging.info("Retrieving raw met eireann .xlsx file paths from aws s3 ...") met_eireann_fpaths = gen_boto3_excel( bucket="irishclimateapp", prefix="data/Met_Eireann" ) - print("Reading, concatenating and cleaning .xlsx files ...") + logging.info("Reading, concatenating and cleaning .xlsx files ...") # load and concatenate data files together data_list = [ pd.read_excel(fpath, dtype=dtypes, na_values=[" "]) @@ -59,13 +60,13 @@ def gen_master_data( data = data[data.columns[~data.columns.str.contains("ind")]] data["date"] = pd.to_datetime(data["date"]) data["county"] = data["county"].str.title() - print("Sorting master file by county and station names ...") + logging.info("Sorting master file by county and station names ...") # order results by county and station alphabetically data = data.sort_values(by=["county", "station"]).reset_index(drop=True) # if the output if master_data_fpath != None: if os.path.exists(master_data_fpath): - 
print("Writing master file to disk as .feather file ...") + logging.info("Writing master file to disk as .feather file ...") # save concatenated data to disk data.to_feather(master_data_fpath) else: diff --git a/scripts/PreProcessData/gen_preaggregate_data.py b/scripts/PreProcessData/gen_preaggregate_data.py index bb2be99..7426348 100644 --- a/scripts/PreProcessData/gen_preaggregate_data.py +++ b/scripts/PreProcessData/gen_preaggregate_data.py @@ -1,5 +1,6 @@ import cons import os +import logging import pickle import pandas as pd from beartype import beartype @@ -29,10 +30,10 @@ def gen_preaggregate_data( Depending on return_data parameter, either return zero or preaggregated data """ if type(master_data) == type(None): - print("Loading master data from disk ...") + logging.info("Loading master data from disk ...") # load master data master_data = pd.read_feather(cons.master_data_fpath) - print("Performing initial data aggregation to year-month level ...") + logging.info("Performing initial data aggregation to year-month level ...") # preaggregate the data to year-month level for each available stat pre_agg_data_dict = {} strftime = cons.date_strftime_dict["year-month"] @@ -40,16 +41,16 @@ def gen_preaggregate_data( agg_data["date_str"] = agg_data["date"].dt.strftime(strftime) agg_data["date"] = pd.to_datetime(agg_data["date_str"], format=strftime) group_cols = ["county", "date", "date_str"] - print("Performing final data aggregation to desired statistics ...") + logging.info("Performing final data aggregation to desired statistics ...") for stat in cons.stat_options: - print(f"{stat} ...") + logging.info(f"{stat} ...") agg_dict = {col: stat for col in cons.col_options} tmp_agg_data = agg_data.groupby(group_cols, as_index=False).agg(agg_dict) pre_agg_data_dict[stat] = tmp_agg_data # if the output if preaggregate_data_fpath != None: if os.path.exists(preaggregate_data_fpath): - print("Writing preaggregated data to disk as .pickle file ...") + logging.info("Writing preaggregated data to disk as .pickle file ...") # pickle the preaggregated data dictionary to disk with open(cons.preaggregate_data_fpath, "wb") as f: pickle.dump(pre_agg_data_dict, f, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/scripts/PreProcessData/gen_stations_data.py b/scripts/PreProcessData/gen_stations_data.py index ce0faee..178d7b2 100644 --- a/scripts/PreProcessData/gen_stations_data.py +++ b/scripts/PreProcessData/gen_stations_data.py @@ -1,4 +1,5 @@ import os +import logging import pickle import pandas as pd import geopandas as gpd @@ -26,21 +27,21 @@ def gen_stations_data( 0, pandas.DataFrame Depending on return_data parameter, either return zero or gis points data """ - print("Loading master and stations data from disk ...") + logging.info("Loading master and stations data from disk ...") # load master and station data master_data = pd.read_feather(cons.master_data_fpath) stations_data = pd.read_csv(cons.stations_fpath) - print("Identifying master station ids ...") + logging.info("Identifying master station ids ...") # extract out station ids from mater file master_station_ids = master_data["id"].unique() - print("Filtering corresponding station data ...") + logging.info("Filtering corresponding station data ...") # filter master data with station ids master_stations = stations_data.loc[ stations_data["station_id"].isin(master_station_ids), : ].copy() master_stations["county"] = master_stations["county"].str.title() master_stations["name"] = master_stations["name"].str.title() - print("Creating geopandas DataFrame 
of station data ...") + logging.info("Creating geopandas DataFrame of station data ...") # create gis data geo_master_stations = gpd.GeoDataFrame( master_stations, @@ -52,7 +53,7 @@ def gen_stations_data( # if the output if points_data_fpath != None: if os.path.exists(points_data_fpath): - print("Writing gis stations data to disk as .pickle file ...") + logging.info("Writing gis stations data to disk as .pickle file ...") # pickle the gis stations data with open(points_data_fpath, "wb") as f: pickle.dump(geo_master_stations, f, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/scripts/prg_preprocess_data.py b/scripts/prg_preprocess_data.py index 085c8a7..907d60f 100644 --- a/scripts/prg_preprocess_data.py +++ b/scripts/prg_preprocess_data.py @@ -1,5 +1,6 @@ import time import cons +import logging from PreProcessData.gen_master_data import gen_master_data from PreProcessData.gen_preaggregate_data import gen_preaggregate_data from PreProcessData.gen_counties_data import gen_counties_data @@ -7,27 +8,31 @@ if __name__ == '__main__': + # set up logging + lgr = logging.getLogger() + lgr.setLevel(logging.INFO) + # start timer t0 = time.time() - print('~~~~~ Generating master data file ...') + logging.info('~~~~~ Generating master data file ...') # generate master data file gen_master_data(master_data_fpath = cons.master_data_fpath, return_data = False) - print('~~~~~ Generating preaggregated data file ...') + logging.info('~~~~~ Generating preaggregated data file ...') # generate the preaggregate data gen_preaggregate_data(preaggregate_data_fpath = cons.preaggregate_data_fpath, return_data = False) - print('~~~~~ Generating geospatial counties data file ...') + logging.info('~~~~~ Generating geospatial counties data file ...') # generate counties data gen_counties_data(map_data_fpath = cons.map_data_fpath, return_data = False) - print('~~~~~ Generating geospatial stations data file ...') + logging.info('~~~~~ Generating geospatial stations data file ...') # generate wheather station points data gen_stations_data(points_data_fpath = cons.points_data_fpath, return_data = False) - # end timer and print result + # end timer and log result t1 = time.time() tres = t1 - t0 eres = round(tres, 2) - print(f'Total Execution Time: {eres} seconds') \ No newline at end of file + logging.info(f'Total Execution Time: {eres} seconds') \ No newline at end of file From a8c3534edf4dd76a0eafc23911ce8eebc69211a0 Mon Sep 17 00:00:00 2001 From: Oisin Date: Wed, 9 Oct 2024 21:51:46 +0100 Subject: [PATCH 4/4] Created executable .bat script for generating preprocessed data --- exePreProcessData.bat | 1 + 1 file changed, 1 insertion(+) create mode 100644 exePreProcessData.bat diff --git a/exePreProcessData.bat b/exePreProcessData.bat new file mode 100644 index 0000000..9a2d7ec --- /dev/null +++ b/exePreProcessData.bat @@ -0,0 +1 @@ +call python scripts\prg_preprocess_data.py \ No newline at end of file