Merge pull request #17 from oislen/dev
Dev
oislen authored Oct 9, 2024
2 parents 03bb574 + a8c3534 commit 7882ab2
Showing 8 changed files with 46 additions and 32 deletions.
5 changes: 4 additions & 1 deletion Dockerfile
@@ -4,6 +4,9 @@ FROM ubuntu:20.04
# set environment variables
ENV user=ubuntu
ENV DEBIAN_FRONTEND=noninteractive
# set git branch for cloning
ARG GIT_BRANCH
ENV GIT_BRANCH=${GIT_BRANCH}

# install required software and programmes for development environment
RUN apt-get update
@@ -14,7 +17,7 @@ RUN useradd ${user}
RUN mkdir -p /home/${user} && chown -R ${user}: /home/${user}

# clone git repo
RUN git clone https://github.com/oislen/IrishClimateDashboard.git /home/ubuntu/IrishClimateDashboard
RUN git clone https://github.com/oislen/IrishClimateDashboard.git --branch ${GIT_BRANCH} /home/ubuntu/IrishClimateDashboard

# install required python packages
COPY requirements.txt /tmp/
3 changes: 2 additions & 1 deletion exeDocker.bat
@@ -4,12 +4,13 @@ SET DOCKER_REPO=irishclimatedashboard
SET DOCKER_TAG=latest
SET DOCKER_IMAGE=%DOCKER_USER%/%DOCKER_REPO%:%DOCKER_TAG%
SET DOCKER_CONTAINER_NAME=icd
SET GIT_BRANCH=v0.0.0

:: remove existing docker containers and images
docker image rm -f %DOCKER_IMAGE%

:: build docker image
call docker build --no-cache -t %DOCKER_IMAGE% .
call docker build --no-cache -t %DOCKER_IMAGE% . --build-arg GIT_BRANCH=%GIT_BRANCH%
::call docker build -t %DOCKER_IMAGE% .

:: run docker container
1 change: 1 addition & 0 deletions exePreProcessData.bat
@@ -0,0 +1 @@
call python scripts\prg_preprocess_data.py
19 changes: 10 additions & 9 deletions scripts/PreProcessData/gen_counties_data.py
@@ -1,5 +1,6 @@
import cons
import os
import logging
import pickle
import pandas as pd
import geopandas as gpd
@@ -29,7 +30,7 @@ def gen_counties_data(
0, pandas.DataFrame
Depending on return_data parameter, either return zero or map data
"""
print("Loading rep / ni counties shape files ...")
logging.info("Loading rep / ni counties shape files ...")
# load in county shape files
rep_counties = (
gpd.read_file(cons.rep_counties_fpath)[["ENGLISH", "geometry"]]
@@ -40,32 +41,32 @@
epsg=2157
)
if type(pre_agg_data_dict) == type(None):
print("Loading preaggregated data dictionary ...")
logging.info("Loading preaggregated data dictionary ...")
# load preaggregated data
with open(cons.preaggregate_data_fpath, "rb") as f:
pre_agg_data_dict = pickle.load(f)
print("Concatenating counties geopandas dataframes ...")
logging.info("Concatenating counties geopandas dataframes ...")
# concatenate county shape files
counties = gpd.GeoDataFrame(
pd.concat([rep_counties, ni_counties], ignore_index=True), crs="EPSG:2157"
)
print("Simplifiying counties geometries ...")
logging.info("Simplifiying counties geometries ...")
# simplify the granularity of the geometry column
counties["geometry"] = counties["geometry"].simplify(tolerance=1000)
print("Standardising county names to title case ...")
logging.info("Standardising county names to title case ...")
# clean up county column
counties["county"] = (
counties["county"].str.title().str.replace(pat="County ", repl="", regex=False)
)
print("Ordering results by county name ...")
logging.info("Ordering results by county name ...")
# sort data by county
counties = counties.sort_values(by="county")
print("Calculating county level statistics ...")
logging.info("Calculating county level statistics ...")
# create a dictionary to contain map data
map_data_dict = {}
# iterate over statistic and pre aggregated data
for stat, pre_agg_data in pre_agg_data_dict.items():
print(f"{stat} ...")
logging.info(f"{stat} ...")
# aggregate data to county level
group_cols = ["county"]
agg_dict = {col: stat for col in cons.col_options}
@@ -84,7 +85,7 @@
# if the output file path is specified
if map_data_fpath != None:
if os.path.exists(map_data_fpath):
print("Writing counties data to disk as pickle file ...")
logging.info("Writing counties data to disk as pickle file ...")
# pickle the preaggregated data dictionary to disk
with open(map_data_fpath, "wb") as f:
pickle.dump(map_data_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
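
The concatenate-and-simplify pattern above can be sketched in isolation. A minimal example using toy polygons in place of the real county shape files (the county names and coordinates here are purely illustrative):

    import pandas as pd
    import geopandas as gpd
    from shapely.geometry import Polygon

    # toy stand-ins for the rep / ni county shape files
    rep_counties = gpd.GeoDataFrame(
        {"county": ["Dublin"]},
        geometry=[Polygon([(0, 0), (0, 10000), (10000, 10000)])],
        crs="EPSG:2157",
    )
    ni_counties = gpd.GeoDataFrame(
        {"county": ["Antrim"]},
        geometry=[Polygon([(20000, 20000), (20000, 30000), (30000, 30000)])],
        crs="EPSG:2157",
    )
    # concatenate into a single GeoDataFrame with a shared CRS
    counties = gpd.GeoDataFrame(
        pd.concat([rep_counties, ni_counties], ignore_index=True), crs="EPSG:2157"
    )
    # coarsen the polygons; tolerance is in CRS units, i.e. metres for EPSG:2157
    counties["geometry"] = counties["geometry"].simplify(tolerance=1000)
    counties = counties.sort_values(by="county")
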
11 changes: 6 additions & 5 deletions scripts/PreProcessData/gen_master_data.py
@@ -1,5 +1,6 @@
import os
import pandas as pd
import logging
import cons
from utilities.gen_boto3_excel import gen_boto3_excel
from beartype import beartype
@@ -36,7 +37,7 @@ def gen_master_data(
# if load data locally
if not aws_s3:
if met_eireann_fpaths == None:
print("Retrieving raw met eireann .xlsx file paths from disk ...")
logging.info("Retrieving raw met eireann .xlsx file paths from disk ...")
# load data files from file directory
met_eireann_fpaths = [
os.path.join(cons.met_eireann_dir, fpath)
@@ -45,11 +46,11 @@
]
# otherwise if loading data from aws s3
else:
print("Retrieving raw met eireann .xlsx file paths from aws s3 ...")
logging.info("Retrieving raw met eireann .xlsx file paths from aws s3 ...")
met_eireann_fpaths = gen_boto3_excel(
bucket="irishclimateapp", prefix="data/Met_Eireann"
)
print("Reading, concatenating and cleaning .xlsx files ...")
logging.info("Reading, concatenating and cleaning .xlsx files ...")
# load and concatenate data files together
data_list = [
pd.read_excel(fpath, dtype=dtypes, na_values=[" "])
@@ -59,13 +60,13 @@
data = data[data.columns[~data.columns.str.contains("ind")]]
data["date"] = pd.to_datetime(data["date"])
data["county"] = data["county"].str.title()
print("Sorting master file by county and station names ...")
logging.info("Sorting master file by county and station names ...")
# order results by county and station alphabetically
data = data.sort_values(by=["county", "station"]).reset_index(drop=True)
# if the output file path is specified
if master_data_fpath != None:
if os.path.exists(master_data_fpath):
print("Writing master file to disk as .feather file ...")
logging.info("Writing master file to disk as .feather file ...")
# save concatenated data to disk
data.to_feather(master_data_fpath)
else:
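
The read-and-concatenate step follows a common pandas pattern. A minimal sketch under assumed inputs (the directory path and column names are illustrative, not the project's actual cons values):

    import os
    import pandas as pd

    # hypothetical directory; the real script takes this from cons
    met_eireann_dir = "data/Met_Eireann"
    met_eireann_fpaths = [
        os.path.join(met_eireann_dir, fpath)
        for fpath in os.listdir(met_eireann_dir)
        if fpath.endswith(".xlsx")
    ]
    # read each workbook, treating stray blanks as missing values
    data_list = [pd.read_excel(fpath, na_values=[" "]) for fpath in met_eireann_fpaths]
    data = pd.concat(data_list, ignore_index=True)
    # drop indicator columns and standardise types
    data = data[data.columns[~data.columns.str.contains("ind")]]
    data["date"] = pd.to_datetime(data["date"])
    data["county"] = data["county"].str.title()
    data = data.sort_values(by=["county", "station"]).reset_index(drop=True)
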
11 changes: 6 additions & 5 deletions scripts/PreProcessData/gen_preaggregate_data.py
@@ -1,5 +1,6 @@
import cons
import os
import logging
import pickle
import pandas as pd
from beartype import beartype
@@ -29,27 +30,27 @@ def gen_preaggregate_data(
Depending on return_data parameter, either return zero or preaggregated data
"""
if type(master_data) == type(None):
print("Loading master data from disk ...")
logging.info("Loading master data from disk ...")
# load master data
master_data = pd.read_feather(cons.master_data_fpath)
print("Performing initial data aggregation to year-month level ...")
logging.info("Performing initial data aggregation to year-month level ...")
# preaggregate the data to year-month level for each available stat
pre_agg_data_dict = {}
strftime = cons.date_strftime_dict["year-month"]
agg_data = master_data.copy()
agg_data["date_str"] = agg_data["date"].dt.strftime(strftime)
agg_data["date"] = pd.to_datetime(agg_data["date_str"], format=strftime)
group_cols = ["county", "date", "date_str"]
print("Performing final data aggregation to desired statistics ...")
logging.info("Performing final data aggregation to desired statistics ...")
for stat in cons.stat_options:
print(f"{stat} ...")
logging.info(f"{stat} ...")
agg_dict = {col: stat for col in cons.col_options}
tmp_agg_data = agg_data.groupby(group_cols, as_index=False).agg(agg_dict)
pre_agg_data_dict[stat] = tmp_agg_data
# if the output file path is specified
if preaggregate_data_fpath != None:
if os.path.exists(preaggregate_data_fpath):
print("Writing preaggregated data to disk as .pickle file ...")
logging.info("Writing preaggregated data to disk as .pickle file ...")
# pickle the preaggregated data dictionary to disk
with open(cons.preaggregate_data_fpath, "wb") as f:
pickle.dump(pre_agg_data_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
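
The two-stage aggregation (first collapsing dates to year-month, then computing each statistic) can be sketched as follows; the column names and statistic list are assumptions for illustration rather than the project's cons values:

    import pandas as pd

    # toy master data
    master_data = pd.DataFrame({
        "county": ["Dublin", "Dublin", "Cork"],
        "date": pd.to_datetime(["2020-01-03", "2020-01-20", "2020-01-05"]),
        "maxtp": [9.1, 10.4, 11.0],
    })
    strftime = "%Y-%m"
    agg_data = master_data.copy()
    # collapse each date to its year-month, keeping both string and datetime forms
    agg_data["date_str"] = agg_data["date"].dt.strftime(strftime)
    agg_data["date"] = pd.to_datetime(agg_data["date_str"], format=strftime)
    group_cols = ["county", "date", "date_str"]
    # one aggregated frame per statistic
    pre_agg_data_dict = {
        stat: agg_data.groupby(group_cols, as_index=False).agg({"maxtp": stat})
        for stat in ["mean", "max"]
    }
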
11 changes: 6 additions & 5 deletions scripts/PreProcessData/gen_stations_data.py
@@ -1,4 +1,5 @@
import os
import logging
import pickle
import pandas as pd
import geopandas as gpd
@@ -26,21 +27,21 @@ def gen_stations_data(
0, pandas.DataFrame
Depending on return_data parameter, either return zero or gis points data
"""
print("Loading master and stations data from disk ...")
logging.info("Loading master and stations data from disk ...")
# load master and station data
master_data = pd.read_feather(cons.master_data_fpath)
stations_data = pd.read_csv(cons.stations_fpath)
print("Identifying master station ids ...")
logging.info("Identifying master station ids ...")
# extract station ids from the master file
master_station_ids = master_data["id"].unique()
print("Filtering corresponding station data ...")
logging.info("Filtering corresponding station data ...")
# filter master data with station ids
master_stations = stations_data.loc[
stations_data["station_id"].isin(master_station_ids), :
].copy()
master_stations["county"] = master_stations["county"].str.title()
master_stations["name"] = master_stations["name"].str.title()
print("Creating geopandas DataFrame of station data ...")
logging.info("Creating geopandas DataFrame of station data ...")
# create gis data
geo_master_stations = gpd.GeoDataFrame(
master_stations,
@@ -52,7 +53,7 @@
# if the output file path is specified
if points_data_fpath != None:
if os.path.exists(points_data_fpath):
print("Writing gis stations data to disk as .pickle file ...")
logging.info("Writing gis stations data to disk as .pickle file ...")
# pickle the gis stations data
with open(points_data_fpath, "wb") as f:
pickle.dump(geo_master_stations, f, protocol=pickle.HIGHEST_PROTOCOL)
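
Building the GIS points data presumably hinges on geopandas' points_from_xy. A minimal sketch with made-up station records (the coordinate column names and CRS handling are assumptions):

    import pandas as pd
    import geopandas as gpd

    # made-up station records for illustration
    master_stations = pd.DataFrame({
        "station_id": [532, 1875],
        "name": ["Dublin Airport", "Cork Airport"],
        "longitude": [-6.24, -8.49],
        "latitude": [53.43, 51.84],
    })
    # build point geometries from lon / lat, then project to Irish Transverse Mercator
    geo_master_stations = gpd.GeoDataFrame(
        master_stations,
        geometry=gpd.points_from_xy(master_stations["longitude"], master_stations["latitude"]),
        crs="EPSG:4326",
    ).to_crs(epsg=2157)
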
17 changes: 11 additions & 6 deletions scripts/prg_preprocess_data.py
@@ -1,33 +1,38 @@
import time
import cons
import logging
from PreProcessData.gen_master_data import gen_master_data
from PreProcessData.gen_preaggregate_data import gen_preaggregate_data
from PreProcessData.gen_counties_data import gen_counties_data
from PreProcessData.gen_stations_data import gen_stations_data

if __name__ == '__main__':

# set up logging
lgr = logging.getLogger()
lgr.setLevel(logging.INFO)

# start timer
t0 = time.time()

print('~~~~~ Generating master data file ...')
logging.info('~~~~~ Generating master data file ...')
# generate master data file
gen_master_data(master_data_fpath = cons.master_data_fpath, return_data = False)

print('~~~~~ Generating preaggregated data file ...')
logging.info('~~~~~ Generating preaggregated data file ...')
# generate the preaggregate data
gen_preaggregate_data(preaggregate_data_fpath = cons.preaggregate_data_fpath, return_data = False)

print('~~~~~ Generating geospatial counties data file ...')
logging.info('~~~~~ Generating geospatial counties data file ...')
# generate counties data
gen_counties_data(map_data_fpath = cons.map_data_fpath, return_data = False)

print('~~~~~ Generating geospatial stations data file ...')
logging.info('~~~~~ Generating geospatial stations data file ...')
# generate weather station points data
gen_stations_data(points_data_fpath = cons.points_data_fpath, return_data = False)

# end timer and print result
# end timer and log result
t1 = time.time()
tres = t1 - t0
eres = round(tres, 2)
print(f'Total Execution Time: {eres} seconds')
logging.info(f'Total Execution Time: {eres} seconds')
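
The script configures the root logger with a bare setLevel call, which relies on logging's implicit basicConfig when the first record is emitted. A slightly fuller setup (a sketch, not the project's actual configuration) would set the format and timestamps explicitly:

    import logging

    # explicit basicConfig with a timestamped format, instead of
    # relying on the implicit handler added on first use
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    logging.info("~~~~~ Generating master data file ...")
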
