Dev #17 (Merged)
4 commits merged on Oct 9, 2024

Dockerfile: 4 additions & 1 deletion
@@ -4,6 +4,9 @@ FROM ubuntu:20.04
# set environment variables
ENV user=ubuntu
ENV DEBIAN_FRONTEND=noninteractive
+# set git branch for cloning
+ARG GIT_BRANCH
+ENV GIT_BRANCH=${GIT_BRANCH}

# install required software and programmes for development environment
RUN apt-get update
@@ -14,7 +17,7 @@ RUN useradd ${user}
RUN mkdir -p /home/${user} && chown -R ${user}: /home/${user}

# clone git repo
-RUN git clone https://github.com/oislen/IrishClimateDashboard.git /home/ubuntu/IrishClimateDashboard
+RUN git clone https://github.com/oislen/IrishClimateDashboard.git --branch ${GIT_BRANCH} /home/ubuntu/IrishClimateDashboard

# install required python packages
COPY requirements.txt /tmp/
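The new ARG/ENV pair makes the cloned branch selectable at build time; exeDocker.bat below passes it with --build-arg. As a minimal sketch, the same build could also be scripted with the Docker SDK for Python (the tag and branch value here are illustrative, not taken from the repo):

    import docker

    client = docker.from_env()
    # pass GIT_BRANCH through to the Dockerfile's ARG at build time
    image, build_logs = client.images.build(
        path=".",
        tag="irishclimatedashboard:latest",
        buildargs={"GIT_BRANCH": "v0.0.0"},
        nocache=True,
    )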
exeDocker.bat: 2 additions & 1 deletion
@@ -4,12 +4,13 @@ SET DOCKER_REPO=irishclimatedashboard
SET DOCKER_TAG=latest
SET DOCKER_IMAGE=%DOCKER_USER%/%DOCKER_REPO%:%DOCKER_TAG%
SET DOCKER_CONTAINER_NAME=icd
+SET GIT_BRANCH=v0.0.0

:: remove existing docker containers and images
docker image rm -f %DOCKER_IMAGE%

:: build docker image
-call docker build --no-cache -t %DOCKER_IMAGE% .
+call docker build --no-cache -t %DOCKER_IMAGE% . --build-arg GIT_BRANCH=%GIT_BRANCH%
::call docker build -t %DOCKER_IMAGE% .

:: run docker container
exePreProcessData.bat: 1 addition
@@ -0,0 +1 @@
+call python scripts\prg_preprocess_data.py
scripts/PreProcessData/gen_counties_data.py: 10 additions & 9 deletions
@@ -1,5 +1,6 @@
import cons
import os
+import logging
import pickle
import pandas as pd
import geopandas as gpd
@@ -29,7 +30,7 @@ def gen_counties_data(
0, pandas.DataFrame
Depending on return_data parameter, either return zero or map data
"""
print("Loading rep / ni counties shape files ...")
logging.info("Loading rep / ni counties shape files ...")
# load in county shape files
rep_counties = (
gpd.read_file(cons.rep_counties_fpath)[["ENGLISH", "geometry"]]
@@ -40,32 +41,32 @@
epsg=2157
)
if type(pre_agg_data_dict) == type(None):
print("Loading preaggregated data dictionary ...")
logging.info("Loading preaggregated data dictionary ...")
# load preaggregated data
with open(cons.preaggregate_data_fpath, "rb") as f:
pre_agg_data_dict = pickle.load(f)
print("Concatenating counties geopandas dataframes ...")
logging.info("Concatenating counties geopandas dataframes ...")
# concatenate county shape files
counties = gpd.GeoDataFrame(
pd.concat([rep_counties, ni_counties], ignore_index=True), crs="EPSG:2157"
)
print("Simplifiying counties geometries ...")
logging.info("Simplifiying counties geometries ...")
# simplify the granularity of the geometry column
counties["geometry"] = counties["geometry"].simplify(tolerance=1000)
print("Standardising county names to title case ...")
logging.info("Standardising county names to title case ...")
# clean up county column
counties["county"] = (
counties["county"].str.title().str.replace(pat="County ", repl="", regex=False)
)
print("Ordering results by county name ...")
logging.info("Ordering results by county name ...")
# sort data by county
counties = counties.sort_values(by="county")
print("Calculating county level statistics ...")
logging.info("Calculating county level statistics ...")
# create a dictionary to contain map data
map_data_dict = {}
# iterate over statistic and pre aggregated data
for stat, pre_agg_data in pre_agg_data_dict.items():
print(f"{stat} ...")
logging.info(f"{stat} ...")
# aggregate data to county level
group_cols = ["county"]
agg_dict = {col: stat for col in cons.col_options}
@@ -84,7 +85,7 @@
# if the output file path is specified
if map_data_fpath != None:
if os.path.exists(map_data_fpath):
print("Writing counties data to disk as pickle file ...")
logging.info("Writing counties data to disk as pickle file ...")
# pickle the preaggregated data dictionary to disk
with open(map_data_fpath, "wb") as f:
pickle.dump(map_data_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
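For reference, the per-statistic loop above follows a simple pattern: one groupby per statistic, applying that statistic to every measurement column. A self-contained sketch, where the rain and temp columns are illustrative stand-ins for cons.col_options:

    import pandas as pd

    # toy master data: one row per county observation (column names are assumptions)
    data = pd.DataFrame({
        "county": ["Dublin", "Dublin", "Cork"],
        "rain": [1.2, 0.4, 2.0],
        "temp": [9.5, 10.1, 8.7],
    })
    map_data_dict = {}
    for stat in ["mean", "max"]:
        # apply the same statistic to every measurement column at county level
        agg_dict = {col: stat for col in ["rain", "temp"]}
        map_data_dict[stat] = data.groupby(["county"], as_index=False).agg(agg_dict)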
scripts/PreProcessData/gen_master_data.py: 6 additions & 5 deletions
@@ -1,5 +1,6 @@
import os
import pandas as pd
+import logging
import cons
from utilities.gen_boto3_excel import gen_boto3_excel
from beartype import beartype
@@ -36,7 +37,7 @@ def gen_master_data(
# if load data locally
if not aws_s3:
if met_eireann_fpaths == None:
print("Retrieving raw met eireann .xlsx file paths from disk ...")
logging.info("Retrieving raw met eireann .xlsx file paths from disk ...")
# load data files from file directory
met_eireann_fpaths = [
os.path.join(cons.met_eireann_dir, fpath)
@@ -45,11 +46,11 @@
]
# otherwise if loading data from aws s3
else:
print("Retrieving raw met eireann .xlsx file paths from aws s3 ...")
logging.info("Retrieving raw met eireann .xlsx file paths from aws s3 ...")
met_eireann_fpaths = gen_boto3_excel(
bucket="irishclimateapp", prefix="data/Met_Eireann"
)
print("Reading, concatenating and cleaning .xlsx files ...")
logging.info("Reading, concatenating and cleaning .xlsx files ...")
# load and concatenate data files together
data_list = [
pd.read_excel(fpath, dtype=dtypes, na_values=[" "])
@@ -59,13 +60,13 @@
data = data[data.columns[~data.columns.str.contains("ind")]]
data["date"] = pd.to_datetime(data["date"])
data["county"] = data["county"].str.title()
print("Sorting master file by county and station names ...")
logging.info("Sorting master file by county and station names ...")
# order results by county and station alphabetically
data = data.sort_values(by=["county", "station"]).reset_index(drop=True)
# if the output file path is specified
if master_data_fpath != None:
if os.path.exists(master_data_fpath):
print("Writing master file to disk as .feather file ...")
logging.info("Writing master file to disk as .feather file ...")
# save concatenated data to disk
data.to_feather(master_data_fpath)
else:
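gen_boto3_excel is imported from utilities but its body is not part of this diff; a minimal sketch of what such a helper might look like, assuming it returns s3:// URLs that pandas can read directly (pagination omitted for brevity):

    import boto3

    def gen_boto3_excel(bucket, prefix):
        # list objects under the prefix and keep only .xlsx keys
        s3 = boto3.client("s3")
        response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
        return [
            f"s3://{bucket}/{obj['Key']}"
            for obj in response.get("Contents", [])
            if obj["Key"].endswith(".xlsx")
        ]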
scripts/PreProcessData/gen_preaggregate_data.py: 6 additions & 5 deletions
@@ -1,5 +1,6 @@
import cons
import os
+import logging
import pickle
import pandas as pd
from beartype import beartype
@@ -29,27 +30,27 @@ def gen_preaggregate_data(
Depending on return_data parameter, either return zero or preaggregated data
"""
if type(master_data) == type(None):
print("Loading master data from disk ...")
logging.info("Loading master data from disk ...")
# load master data
master_data = pd.read_feather(cons.master_data_fpath)
print("Performing initial data aggregation to year-month level ...")
logging.info("Performing initial data aggregation to year-month level ...")
# preaggregate the data to year-month level for each available stat
pre_agg_data_dict = {}
strftime = cons.date_strftime_dict["year-month"]
agg_data = master_data.copy()
agg_data["date_str"] = agg_data["date"].dt.strftime(strftime)
agg_data["date"] = pd.to_datetime(agg_data["date_str"], format=strftime)
group_cols = ["county", "date", "date_str"]
print("Performing final data aggregation to desired statistics ...")
logging.info("Performing final data aggregation to desired statistics ...")
for stat in cons.stat_options:
print(f"{stat} ...")
logging.info(f"{stat} ...")
agg_dict = {col: stat for col in cons.col_options}
tmp_agg_data = agg_data.groupby(group_cols, as_index=False).agg(agg_dict)
pre_agg_data_dict[stat] = tmp_agg_data
# if the output file path is specified
if preaggregate_data_fpath != None:
if os.path.exists(preaggregate_data_fpath):
print("Writing preaggregated data to disk as .pickle file ...")
logging.info("Writing preaggregated data to disk as .pickle file ...")
# pickle the preaggregated data dictionary to disk
with open(preaggregate_data_fpath, "wb") as f:
pickle.dump(pre_agg_data_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
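The strftime round-trip above (datetime to string and back) snaps every date to the first day of its month before grouping. A self-contained sketch, assuming the "year-month" format string is "%Y-%m":

    import pandas as pd

    df = pd.DataFrame({
        "county": ["Dublin", "Dublin", "Cork"],
        "date": pd.to_datetime(["2024-01-03", "2024-01-20", "2024-02-05"]),
        "rain": [1.0, 3.0, 2.0],
    })
    strftime = "%Y-%m"  # assumed value of cons.date_strftime_dict["year-month"]
    df["date_str"] = df["date"].dt.strftime(strftime)
    df["date"] = pd.to_datetime(df["date_str"], format=strftime)  # month start
    monthly = df.groupby(["county", "date", "date_str"], as_index=False).agg({"rain": "mean"})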
scripts/PreProcessData/gen_stations_data.py: 6 additions & 5 deletions
@@ -1,4 +1,5 @@
import os
+import logging
import pickle
import pandas as pd
import geopandas as gpd
@@ -26,21 +27,21 @@ def gen_stations_data(
0, pandas.DataFrame
Depending on return_data parameter, either return zero or gis points data
"""
print("Loading master and stations data from disk ...")
logging.info("Loading master and stations data from disk ...")
# load master and station data
master_data = pd.read_feather(cons.master_data_fpath)
stations_data = pd.read_csv(cons.stations_fpath)
print("Identifying master station ids ...")
logging.info("Identifying master station ids ...")
# extract out station ids from master file
master_station_ids = master_data["id"].unique()
print("Filtering corresponding station data ...")
logging.info("Filtering corresponding station data ...")
# filter master data with station ids
master_stations = stations_data.loc[
stations_data["station_id"].isin(master_station_ids), :
].copy()
master_stations["county"] = master_stations["county"].str.title()
master_stations["name"] = master_stations["name"].str.title()
print("Creating geopandas DataFrame of station data ...")
logging.info("Creating geopandas DataFrame of station data ...")
# create gis data
geo_master_stations = gpd.GeoDataFrame(
master_stations,
@@ -52,7 +53,7 @@
# if the output file path is specified
if points_data_fpath != None:
if os.path.exists(points_data_fpath):
print("Writing gis stations data to disk as .pickle file ...")
logging.info("Writing gis stations data to disk as .pickle file ...")
# pickle the gis stations data
with open(points_data_fpath, "wb") as f:
pickle.dump(geo_master_stations, f, protocol=pickle.HIGHEST_PROTOCOL)
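The GeoDataFrame construction is truncated in this diff; the usual pattern for turning a stations table into GIS points looks like the sketch below (the column names and the EPSG:4326 source CRS are assumptions):

    import pandas as pd
    import geopandas as gpd

    stations = pd.DataFrame({
        "station_id": [101, 202],
        "name": ["Phoenix Park", "Cork Airport"],
        "latitude": [53.36, 51.84],
        "longitude": [-6.32, -8.49],
    })
    geo_stations = gpd.GeoDataFrame(
        stations,
        geometry=gpd.points_from_xy(stations["longitude"], stations["latitude"]),
        crs="EPSG:4326",  # assumed source CRS; reproject with .to_crs(epsg=2157) if needed
    )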
scripts/prg_preprocess_data.py: 11 additions & 6 deletions
@@ -1,33 +1,38 @@
import time
import cons
+import logging
from PreProcessData.gen_master_data import gen_master_data
from PreProcessData.gen_preaggregate_data import gen_preaggregate_data
from PreProcessData.gen_counties_data import gen_counties_data
from PreProcessData.gen_stations_data import gen_stations_data

if __name__ == '__main__':

+# set up logging
+lgr = logging.getLogger()
+lgr.setLevel(logging.INFO)

# start timer
t0 = time.time()

-print('~~~~~ Generating master data file ...')
+logging.info('~~~~~ Generating master data file ...')
# generate master data file
gen_master_data(master_data_fpath = cons.master_data_fpath, return_data = False)

-print('~~~~~ Generating preaggregated data file ...')
+logging.info('~~~~~ Generating preaggregated data file ...')
# generate the preaggregate data
gen_preaggregate_data(preaggregate_data_fpath = cons.preaggregate_data_fpath, return_data = False)

-print('~~~~~ Generating geospatial counties data file ...')
+logging.info('~~~~~ Generating geospatial counties data file ...')
# generate counties data
gen_counties_data(map_data_fpath = cons.map_data_fpath, return_data = False)

-print('~~~~~ Generating geospatial stations data file ...')
+logging.info('~~~~~ Generating geospatial stations data file ...')
# generate weather station points data
gen_stations_data(points_data_fpath = cons.points_data_fpath, return_data = False)

-# end timer and print result
+# end timer and log result
t1 = time.time()
tres = t1 - t0
eres = round(tres, 2)
-print(f'Total Execution Time: {eres} seconds')
+logging.info(f'Total Execution Time: {eres} seconds')
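One caveat on the setup above: setting the root logger's level without attaching a handler leaves records to Python's last-resort handler, which only emits WARNING and above, so these logging.info calls may produce no output. A minimal alternative (the format string is an assumption):

    import logging

    # basicConfig attaches a StreamHandler to the root logger, so INFO records are emitted
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    logging.info("logging configured")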