Merge pull request #17 from oislen/dev
Dev
oislen authored Oct 9, 2024
2 parents 03bb574 + a8c3534 commit 7882ab2
Showing 8 changed files with 46 additions and 32 deletions.
5 changes: 4 additions & 1 deletion Dockerfile
@@ -4,6 +4,9 @@ FROM ubuntu:20.04
# set environment variables
ENV user=ubuntu
ENV DEBIAN_FRONTEND=noninteractive
# set git branch for cloning
ARG GIT_BRANCH
ENV GIT_BRANCH=${GIT_BRANCH}

# install required software and programmes for development environment
RUN apt-get update
@@ -14,7 +17,7 @@ RUN useradd ${user}
RUN mkdir -p /home/${user} && chown -R ${user}: /home/${user}

# clone git repo
RUN git clone https://github.com/oislen/IrishClimateDashboard.git /home/ubuntu/IrishClimateDashboard
RUN git clone https://github.com/oislen/IrishClimateDashboard.git --branch ${GIT_BRANCH} /home/ubuntu/IrishClimateDashboard

# install required python packages
COPY requirements.txt /tmp/
3 changes: 2 additions & 1 deletion exeDocker.bat
@@ -4,12 +4,13 @@ SET DOCKER_REPO=irishclimatedashboard
SET DOCKER_TAG=latest
SET DOCKER_IMAGE=%DOCKER_USER%/%DOCKER_REPO%:%DOCKER_TAG%
SET DOCKER_CONTAINER_NAME=icd
SET GIT_BRANCH=v0.0.0

:: remove existing docker containers and images
docker image rm -f %DOCKER_IMAGE%

:: build docker image
call docker build --no-cache -t %DOCKER_IMAGE% .
call docker build --no-cache -t %DOCKER_IMAGE% . --build-arg GIT_BRANCH=%GIT_BRANCH%
::call docker build -t %DOCKER_IMAGE% .

:: run docker container
1 change: 1 addition & 0 deletions exePreProcessData.bat
@@ -0,0 +1 @@
call python scripts\prg_preprocess_data.py
19 changes: 10 additions & 9 deletions scripts/PreProcessData/gen_counties_data.py
@@ -1,5 +1,6 @@
import cons
import os
import logging
import pickle
import pandas as pd
import geopandas as gpd
@@ -29,7 +30,7 @@ def gen_counties_data(
0, pandas.DataFrame
Depending on return_data parameter, either return zero or map data
"""
print("Loading rep / ni counties shape files ...")
logging.info("Loading rep / ni counties shape files ...")
# load in county shape files
rep_counties = (
gpd.read_file(cons.rep_counties_fpath)[["ENGLISH", "geometry"]]
@@ -40,32 +41,32 @@
epsg=2157
)
if type(pre_agg_data_dict) == type(None):
print("Loading preaggregated data dictionary ...")
logging.info("Loading preaggregated data dictionary ...")
# load preaggregated data
with open(cons.preaggregate_data_fpath, "rb") as f:
pre_agg_data_dict = pickle.load(f)
print("Concatenating counties geopandas dataframes ...")
logging.info("Concatenating counties geopandas dataframes ...")
# concatenate county shape files
counties = gpd.GeoDataFrame(
pd.concat([rep_counties, ni_counties], ignore_index=True), crs="EPSG:2157"
)
print("Simplifiying counties geometries ...")
logging.info("Simplifiying counties geometries ...")
# simplify the granularity of the geometry column
counties["geometry"] = counties["geometry"].simplify(tolerance=1000)
print("Standardising county names to title case ...")
logging.info("Standardising county names to title case ...")
# clean up county column
counties["county"] = (
counties["county"].str.title().str.replace(pat="County ", repl="", regex=False)
)
print("Ordering results by county name ...")
logging.info("Ordering results by county name ...")
# sort data by county
counties = counties.sort_values(by="county")
print("Calculating county level statistics ...")
logging.info("Calculating county level statistics ...")
# create a dictionary to contain map data
map_data_dict = {}
# iterate over statistic and pre aggregated data
for stat, pre_agg_data in pre_agg_data_dict.items():
print(f"{stat} ...")
logging.info(f"{stat} ...")
# aggregate data to county level
group_cols = ["county"]
agg_dict = {col: stat for col in cons.col_options}
@@ -84,7 +85,7 @@
# if the output file path is specified
if map_data_fpath != None:
if os.path.exists(map_data_fpath):
print("Writing counties data to disk as pickle file ...")
logging.info("Writing counties data to disk as pickle file ...")
# pickle the preaggregated data dictionary to disk
with open(map_data_fpath, "wb") as f:
pickle.dump(map_data_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
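
The concatenate-and-simplify pattern above can be sketched in isolation. A minimal example using toy polygons in place of the real county shape files (the county names and coordinates here are purely illustrative):

    import pandas as pd
    import geopandas as gpd
    from shapely.geometry import Polygon

    # toy stand-ins for the rep / ni county shape files
    rep_counties = gpd.GeoDataFrame(
        {"county": ["Dublin"]},
        geometry=[Polygon([(0, 0), (0, 10000), (10000, 10000)])],
        crs="EPSG:2157",
    )
    ni_counties = gpd.GeoDataFrame(
        {"county": ["Antrim"]},
        geometry=[Polygon([(20000, 20000), (20000, 30000), (30000, 30000)])],
        crs="EPSG:2157",
    )
    # concatenate into a single GeoDataFrame with a shared CRS
    counties = gpd.GeoDataFrame(
        pd.concat([rep_counties, ni_counties], ignore_index=True), crs="EPSG:2157"
    )
    # coarsen the polygons; tolerance is in CRS units, i.e. metres for EPSG:2157
    counties["geometry"] = counties["geometry"].simplify(tolerance=1000)
    counties = counties.sort_values(by="county")
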
11 changes: 6 additions & 5 deletions scripts/PreProcessData/gen_master_data.py
@@ -1,5 +1,6 @@
import os
import pandas as pd
import logging
import cons
from utilities.gen_boto3_excel import gen_boto3_excel
from beartype import beartype
@@ -36,7 +37,7 @@ def gen_master_data(
# if load data locally
if not aws_s3:
if met_eireann_fpaths == None:
print("Retrieving raw met eireann .xlsx file paths from disk ...")
logging.info("Retrieving raw met eireann .xlsx file paths from disk ...")
# load data files from file directory
met_eireann_fpaths = [
os.path.join(cons.met_eireann_dir, fpath)
@@ -45,11 +46,11 @@
]
# otherwise if loading data from aws s3
else:
print("Retrieving raw met eireann .xlsx file paths from aws s3 ...")
logging.info("Retrieving raw met eireann .xlsx file paths from aws s3 ...")
met_eireann_fpaths = gen_boto3_excel(
bucket="irishclimateapp", prefix="data/Met_Eireann"
)
print("Reading, concatenating and cleaning .xlsx files ...")
logging.info("Reading, concatenating and cleaning .xlsx files ...")
# load and concatenate data files together
data_list = [
pd.read_excel(fpath, dtype=dtypes, na_values=[" "])
@@ -59,13 +60,13 @@
data = data[data.columns[~data.columns.str.contains("ind")]]
data["date"] = pd.to_datetime(data["date"])
data["county"] = data["county"].str.title()
print("Sorting master file by county and station names ...")
logging.info("Sorting master file by county and station names ...")
# order results by county and station alphabetically
data = data.sort_values(by=["county", "station"]).reset_index(drop=True)
# if the output file path is specified
if master_data_fpath != None:
if os.path.exists(master_data_fpath):
print("Writing master file to disk as .feather file ...")
logging.info("Writing master file to disk as .feather file ...")
# save concatenated data to disk
data.to_feather(master_data_fpath)
else:
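
The read-and-concatenate step follows a common pandas pattern. A minimal sketch under assumed inputs (the directory path and column names are illustrative, not the project's actual cons values):

    import os
    import pandas as pd

    # hypothetical directory; the real script takes this from cons
    met_eireann_dir = "data/Met_Eireann"
    met_eireann_fpaths = [
        os.path.join(met_eireann_dir, fpath)
        for fpath in os.listdir(met_eireann_dir)
        if fpath.endswith(".xlsx")
    ]
    # read each workbook, treating stray blanks as missing values
    data_list = [pd.read_excel(fpath, na_values=[" "]) for fpath in met_eireann_fpaths]
    data = pd.concat(data_list, ignore_index=True)
    # drop indicator columns and standardise types
    data = data[data.columns[~data.columns.str.contains("ind")]]
    data["date"] = pd.to_datetime(data["date"])
    data["county"] = data["county"].str.title()
    data = data.sort_values(by=["county", "station"]).reset_index(drop=True)
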
11 changes: 6 additions & 5 deletions scripts/PreProcessData/gen_preaggregate_data.py
@@ -1,5 +1,6 @@
import cons
import os
import logging
import pickle
import pandas as pd
from beartype import beartype
@@ -29,27 +30,27 @@ def gen_preaggregate_data(
Depending on return_data parameter, either return zero or preaggregated data
"""
if type(master_data) == type(None):
print("Loading master data from disk ...")
logging.info("Loading master data from disk ...")
# load master data
master_data = pd.read_feather(cons.master_data_fpath)
print("Performing initial data aggregation to year-month level ...")
logging.info("Performing initial data aggregation to year-month level ...")
# preaggregate the data to year-month level for each available stat
pre_agg_data_dict = {}
strftime = cons.date_strftime_dict["year-month"]
agg_data = master_data.copy()
agg_data["date_str"] = agg_data["date"].dt.strftime(strftime)
agg_data["date"] = pd.to_datetime(agg_data["date_str"], format=strftime)
group_cols = ["county", "date", "date_str"]
print("Performing final data aggregation to desired statistics ...")
logging.info("Performing final data aggregation to desired statistics ...")
for stat in cons.stat_options:
print(f"{stat} ...")
logging.info(f"{stat} ...")
agg_dict = {col: stat for col in cons.col_options}
tmp_agg_data = agg_data.groupby(group_cols, as_index=False).agg(agg_dict)
pre_agg_data_dict[stat] = tmp_agg_data
# if the output file path is specified
if preaggregate_data_fpath != None:
if os.path.exists(preaggregate_data_fpath):
print("Writing preaggregated data to disk as .pickle file ...")
logging.info("Writing preaggregated data to disk as .pickle file ...")
# pickle the preaggregated data dictionary to disk
with open(cons.preaggregate_data_fpath, "wb") as f:
pickle.dump(pre_agg_data_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
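
The two-stage aggregation (first collapsing dates to year-month, then computing each statistic) can be sketched as follows; the column names and statistic list are assumptions for illustration rather than the project's cons values:

    import pandas as pd

    # toy master data
    master_data = pd.DataFrame({
        "county": ["Dublin", "Dublin", "Cork"],
        "date": pd.to_datetime(["2020-01-03", "2020-01-20", "2020-01-05"]),
        "maxtp": [9.1, 10.4, 11.0],
    })
    strftime = "%Y-%m"
    agg_data = master_data.copy()
    # collapse each date to its year-month, keeping both string and datetime forms
    agg_data["date_str"] = agg_data["date"].dt.strftime(strftime)
    agg_data["date"] = pd.to_datetime(agg_data["date_str"], format=strftime)
    group_cols = ["county", "date", "date_str"]
    # one aggregated frame per statistic
    pre_agg_data_dict = {
        stat: agg_data.groupby(group_cols, as_index=False).agg({"maxtp": stat})
        for stat in ["mean", "max"]
    }
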
11 changes: 6 additions & 5 deletions scripts/PreProcessData/gen_stations_data.py
@@ -1,4 +1,5 @@
import os
import logging
import pickle
import pandas as pd
import geopandas as gpd
@@ -26,21 +27,21 @@ def gen_stations_data(
0, pandas.DataFrame
Depending on return_data parameter, either return zero or gis points data
"""
print("Loading master and stations data from disk ...")
logging.info("Loading master and stations data from disk ...")
# load master and station data
master_data = pd.read_feather(cons.master_data_fpath)
stations_data = pd.read_csv(cons.stations_fpath)
print("Identifying master station ids ...")
logging.info("Identifying master station ids ...")
# extract station ids from the master file
master_station_ids = master_data["id"].unique()
print("Filtering corresponding station data ...")
logging.info("Filtering corresponding station data ...")
# filter master data with station ids
master_stations = stations_data.loc[
stations_data["station_id"].isin(master_station_ids), :
].copy()
master_stations["county"] = master_stations["county"].str.title()
master_stations["name"] = master_stations["name"].str.title()
print("Creating geopandas DataFrame of station data ...")
logging.info("Creating geopandas DataFrame of station data ...")
# create gis data
geo_master_stations = gpd.GeoDataFrame(
master_stations,
@@ -52,7 +53,7 @@
# if the output file path is specified
if points_data_fpath != None:
if os.path.exists(points_data_fpath):
print("Writing gis stations data to disk as .pickle file ...")
logging.info("Writing gis stations data to disk as .pickle file ...")
# pickle the gis stations data
with open(points_data_fpath, "wb") as f:
pickle.dump(geo_master_stations, f, protocol=pickle.HIGHEST_PROTOCOL)
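
Building the GIS points data presumably hinges on geopandas' points_from_xy. A minimal sketch with made-up station records (the coordinate column names and CRS handling are assumptions):

    import pandas as pd
    import geopandas as gpd

    # made-up station records for illustration
    master_stations = pd.DataFrame({
        "station_id": [532, 1875],
        "name": ["Dublin Airport", "Cork Airport"],
        "longitude": [-6.24, -8.49],
        "latitude": [53.43, 51.84],
    })
    # build point geometries from lon / lat, then project to Irish Transverse Mercator
    geo_master_stations = gpd.GeoDataFrame(
        master_stations,
        geometry=gpd.points_from_xy(master_stations["longitude"], master_stations["latitude"]),
        crs="EPSG:4326",
    ).to_crs(epsg=2157)
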
17 changes: 11 additions & 6 deletions scripts/prg_preprocess_data.py
@@ -1,33 +1,38 @@
import time
import cons
import logging
from PreProcessData.gen_master_data import gen_master_data
from PreProcessData.gen_preaggregate_data import gen_preaggregate_data
from PreProcessData.gen_counties_data import gen_counties_data
from PreProcessData.gen_stations_data import gen_stations_data

if __name__ == '__main__':

# set up logging
lgr = logging.getLogger()
lgr.setLevel(logging.INFO)

# start timer
t0 = time.time()

print('~~~~~ Generating master data file ...')
logging.info('~~~~~ Generating master data file ...')
# generate master data file
gen_master_data(master_data_fpath = cons.master_data_fpath, return_data = False)

print('~~~~~ Generating preaggregated data file ...')
logging.info('~~~~~ Generating preaggregated data file ...')
# generate the preaggregate data
gen_preaggregate_data(preaggregate_data_fpath = cons.preaggregate_data_fpath, return_data = False)

print('~~~~~ Generating geospatial counties data file ...')
logging.info('~~~~~ Generating geospatial counties data file ...')
# generate counties data
gen_counties_data(map_data_fpath = cons.map_data_fpath, return_data = False)

print('~~~~~ Generating geospatial stations data file ...')
logging.info('~~~~~ Generating geospatial stations data file ...')
# generate weather station points data
gen_stations_data(points_data_fpath = cons.points_data_fpath, return_data = False)

# end timer and print result
# end timer and log result
t1 = time.time()
tres = t1 - t0
eres = round(tres, 2)
print(f'Total Execution Time: {eres} seconds')
logging.info(f'Total Execution Time: {eres} seconds')
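
The script configures the root logger with a bare setLevel call, which relies on logging's implicit basicConfig when the first record is emitted. A slightly fuller setup (a sketch, not the project's actual configuration) would set the format and timestamps explicitly:

    import logging

    # explicit basicConfig with a timestamped format, instead of
    # relying on the implicit handler added on first use
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    logging.info("~~~~~ Generating master data file ...")
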
