diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c86af76e..21c7bc01 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,4 +3,4 @@ repos:
     rev: stable
     hooks:
       - id: black
-        language_version: python3.7
+        language_version: python3.8
diff --git a/.readthedocs.yml b/.readthedocs.yml
index cc1afa13..f3bd2dc1 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -3,7 +3,7 @@ sphinx:
   configuration: docs/source/conf.py
   fail_on_warning: false
 python:
-  version: 3.7
+  version: 3.8
   install:
     - method: setuptools
       path: package
diff --git a/.travis.yml b/.travis.yml
index c1239293..7263ea19 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,12 +9,12 @@ cache:
 # run tests and linting separately
 matrix:
   include:
-    - name: "3.7 lint"
-      python: 3.7
+    - name: "3.8 lint"
+      python: 3.8
       env:
         - TEST_CMD="pre-commit run --all-files"
-    - name: "3.7 tests"
-      python: 3.7
+    - name: "3.8 tests"
+      python: 3.8
       env:
         - TEST_CMD="pytest --cov=./"
 
diff --git a/README.md b/README.md
index d1750ab2..a2bf09ee 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ Mapping Scouts data to UK administrative regions.
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 
 ## Prerequisites:
-This is written and tested in Python 3.7.
+This is written and tested in Python 3.8.
 
 This project is largely dependent on `geopandas` and `pandas`, along with `folium`, `dash`, and `shapely`.
 
@@ -45,7 +45,7 @@ To install geopandas and its dependencies, follow below
 It is highly recommended to use conda to install geopandas.
 
 However, to install geopandas using pip on windows, follow the following steps:
-* Download the wheels for [GDAL](http://www.lfd.uci.edu/~gohlke/pythonlibs/#gdal), [Fiona](http://www.lfd.uci.edu/~gohlke/pythonlibs/#fiona), and [Rtree](http://www.lfd.uci.edu/~gohlke/pythonlibs/#rtree). Choose the correct python version (currently 3.7) and platform
+* Download the wheels for [GDAL](http://www.lfd.uci.edu/~gohlke/pythonlibs/#gdal), [Fiona](http://www.lfd.uci.edu/~gohlke/pythonlibs/#fiona), and [Rtree](http://www.lfd.uci.edu/~gohlke/pythonlibs/#rtree). Choose the correct python version (currently 3.8) and platform
 * Install any prerequisites listed on Gohlke's site (e.g. C++ redistributables)
 * `pip install` the wheels in the following order (preferably in a Virtual Environment)
   1. [GDAL](http://www.lfd.uci.edu/~gohlke/pythonlibs/#gdal)
diff --git a/environment.yml b/environment.yml
index 6e9862bb..ef7d88b2 100644
--- a/environment.yml
+++ b/environment.yml
@@ -3,7 +3,7 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - python>=3.7
+  - python>=3.8
   - pip
   # Core requirements
   - branca
@@ -15,6 +15,7 @@ dependencies:
   - pyarrow
   # Development requirements:
   - pytest
+  - hypothesis
   - pytest-cov
   - pre-commit
   - black
diff --git a/pyproject.toml b/pyproject.toml
index 04456503..c6545d48 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,18 +1,2 @@
 [tool.black]
 line-length = 180
-exclude = '''
-/(
-    \.eggs
-  | \.git
-  | \.hg
-  | \.mypy_cache
-  | \.tox
-  | \.venv
-  | venv
-  | _build
-  | buck-out
-  | build
-  | dist
-  | setup.py
-)/
-'''
diff --git a/pytest.ini b/pytest.ini
deleted file mode 100644
index 25e73fa4..00000000
--- a/pytest.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-[pytest]
-python_files=test_*.py
diff --git a/scripts/settings.json b/scripts/settings.json
index 04e65970..7e45d6ee 100644
--- a/scripts/settings.json
+++ b/scripts/settings.json
@@ -1,9 +1,9 @@
 {
   "settings": {
-    "Raw Census Extract location": "Scout Census Data/Census 2020 Extract (1).csv",
-    "Scout Census location": "Scout Census Data/Census 2020 Extract (1) with May 2019 fields.csv",
-    "Full ONS PD location": "ONSPD_MAY_2019_UK/Data/ONSPD_MAY_2019_UK.csv",
+    "Raw Census Extract location": "Scout Census Data/Census 2020 Extract (1).csv",
+    "Scout Census location": "Scout Census Data/Census 2020 Extract (1) with May 2019 fields.feather",
+    "Full ONS PD location": "ONSPD_MAY_2019_UK/Data/ONSPD_MAY_2019_UK.csv",
     "Reduced ONS PD location": "ONSPD_MAY_2019_UK/Data/ONSPD_MAY_2019_UK reduced.csv",
     "ONS Names and codes folder": "ONSPD_MAY_2019_UK/Documents/",
     "National Statistical folder": "National Statistical data/",
@@ -17,10 +17,7 @@
       {
         "name": "D_ID",
         "codes": {"path": "../data/Scout Census Data/district_id_mapping.csv", "key": "D_ID", "key_type": "Int32", "name": "D_name"},
-        "boundary":
-          {
-            "shapefiles": ["../src/scripts/districts_buffered.geojson"], "key": "id", "name": "name"
-          },
+        "boundary": {"shapefile": "../scripts/districts_buffered.geojson", "key": "id", "name": "name"},
         "age_profile": null,
         "age_profile_code_col": null
       },
diff --git a/scripts/setup_reduce_onspd.py b/scripts/setup_reduce_onspd.py
index b66ac755..37b4350b 100644
--- a/scripts/setup_reduce_onspd.py
+++ b/scripts/setup_reduce_onspd.py
@@ -1,15 +1,14 @@
 import json
 
 import src.utility as utility
-from src.utility import SCRIPTS_ROOT, DATA_ROOT
 from src.data.ons_pd_may_19 import ONSPostcodeDirectoryMay19
 
 if __name__ == "__main__":
-    with open(SCRIPTS_ROOT.joinpath("settings.json"), "r") as read_file:
+    with open(utility.SCRIPTS_ROOT.joinpath("settings.json"), "r") as read_file:
         settings = json.load(read_file)["settings"]
 
     print("Starting")
-    ons_pd_location = DATA_ROOT / settings["Full ONS PD location"]
+    ons_pd_location = utility.DATA_ROOT / settings["Full ONS PD location"]
 
     # Load Full ONS Postcode Directory
     ons_pd = ONSPostcodeDirectoryMay19(ons_pd_location, load_data=True)
diff --git a/setup.py b/setup.py
index ad396589..03e7ff43 100644
--- a/setup.py
+++ b/setup.py
@@ -6,6 +6,6 @@
     version="0.2.0",
     packages=find_namespace_packages(),
     install_requires=["pandas", "numpy", "folium", "branca", "geopandas", "shapely", "dash", "pyarrow"],
-    extras_require={"dev": ["pytest", "pytest-cov", "pre-commit", "black"]},
-    python_requires=">=3.7",
+    extras_require={"dev": ["pytest", "hypothesis", "pytest-cov", "pre-commit", "black"]},
+    python_requires=">=3.8",
 )
diff --git a/src/data/ons_pd_may_18.py b/src/data/ons_pd_may_18.py
index 2fc7fe56..61e76fee 100644
--- a/src/data/ons_pd_may_18.py
+++ b/src/data/ons_pd_may_18.py
@@ -1,4 +1,3 @@
-from src.utility import DATA_ROOT
 from src.data.ons_pd import ONSPostcodeDirectory
 
 
@@ -50,13 +49,11 @@ class ONSPostcodeDirectoryMay18(ONSPostcodeDirectory):
     }
 
     def __init__(self, ons_pd_csv_path, load_data=True):
-        ONSPostcodeDirectory.__init__(
-            self, ons_pd_csv_path, load_data, ONSPostcodeDirectoryMay18.index_column, ONSPostcodeDirectoryMay18.fields, ONSPostcodeDirectoryMay18.data_types,
-        )
+        super().__init__(ons_pd_csv_path, load_data, ONSPostcodeDirectoryMay18.index_column, ONSPostcodeDirectoryMay18.fields, ONSPostcodeDirectoryMay18.data_types)
 
         # Folder within the ONS Postcode Directory archive holding names and codes files
-        names_codes_root = DATA_ROOT / self.settings["ONS Names and codes folder"]
-        boundaries_root = DATA_ROOT / self.settings["Boundaries folder"]
+        names_codes_root = self.settings["ONS Names and codes folder"]
+        boundaries_root = self.settings["Boundaries folder"]
 
         # Paths to all shapefiles within the Boundaries folder
         # fmt: off
diff --git a/src/data/ons_pd_may_19.py b/src/data/ons_pd_may_19.py
index 69ffee76..070a6ac4 100644
--- a/src/data/ons_pd_may_19.py
+++ b/src/data/ons_pd_may_19.py
@@ -1,4 +1,3 @@
-from src.utility import DATA_ROOT
 from src.data.ons_pd import ONSPostcodeDirectory
 from pathlib import Path
 
@@ -50,13 +49,11 @@ class ONSPostcodeDirectoryMay19(ONSPostcodeDirectory):
     }
 
     def __init__(self, ons_pd_csv_path, load_data=True):
-        ONSPostcodeDirectory.__init__(
-            self, ons_pd_csv_path, load_data, ONSPostcodeDirectoryMay19.index_column, ONSPostcodeDirectoryMay19.fields, ONSPostcodeDirectoryMay19.data_types,
-        )
+        super().__init__(ons_pd_csv_path, load_data, self.index_column, ONSPostcodeDirectoryMay19.fields, ONSPostcodeDirectoryMay19.data_types)
 
         # Folder within the ONS Postcode Directory archive holding names and codes files
-        names_codes_root = DATA_ROOT / Path(self.settings["ONS Names and codes folder"]).resolve()
-        boundaries_dir = DATA_ROOT / Path(self.settings["Boundaries folder"]).resolve()
+        names_codes_root = Path(self.settings["ONS Names and codes folder"]).resolve()
+        boundaries_dir = Path(self.settings["Boundaries folder"]).resolve()
 
         # Paths to all shapefiles within the Boundaries folder
         # fmt: off
diff --git a/src/data/scout_census.py b/src/data/scout_census.py
index e254703a..507db6d8 100644
--- a/src/data/scout_census.py
+++ b/src/data/scout_census.py
@@ -97,7 +97,11 @@ class ScoutCensus:
     UNIT_LEVEL_GROUP = "Group"
     UNIT_LEVEL_DISTRICT = "District"
 
-    def __init__(self, census_file_path: Path):
+    def __init__(self, census_file_path: Path, load_data=True):
+        if not load_data:
+            self.data = pd.DataFrame()
+            return
+
         cols_int_32 = ["Object_ID", "G_ID", "D_ID", "C_ID", "R_ID", "X_ID", "imd"]
         cols_categorical = ["compass", "type", "name", "G_name", "D_name", "C_name", "R_name", "X_name", "postcode", "Young_Leader_Unit"]
         # fmt: off
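The new `load_data` flag gives `ScoutCensus` (and `ScoutData`, below) a cheap test-only construction path. A minimal sketch of the intended use, with a hypothetical file path:

```python
from pathlib import Path

from src.data.scout_census import ScoutCensus

# load_data=False makes the constructor return early with an empty frame,
# so no census file needs to exist at the given (hypothetical) path.
census = ScoutCensus(Path("data/census.feather"), load_data=False)
assert census.data.empty
```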
diff --git a/src/data/scout_data.py b/src/data/scout_data.py
index eb460762..61ec2f18 100644
--- a/src/data/scout_data.py
+++ b/src/data/scout_data.py
@@ -2,6 +2,7 @@ from datetime import datetime
 from pathlib import Path
 
 import pandas as pd
+import geopandas as gpd
 import time
 
 from typing import TYPE_CHECKING
@@ -15,6 +16,8 @@
 if TYPE_CHECKING:
     from src.data.ons_pd import ONSPostcodeDirectory
 
+WGS_84 = 4326
+
 
 class ScoutData(Base):
     """Provides access to manipulate and process data
@@ -32,7 +35,7 @@ def columns(self):
 
     DEFAULT_VALUE = ScoutCensus.DEFAULT_VALUE
 
-    def __init__(self, merged_csv=True, load_ons_pd_data=False, census_path=None):
+    def __init__(self, merged_csv=True, load_ons_pd_data=False, census_path=None, load_census_data=True):
         super().__init__(settings=True, log_path=str(utility.LOGS_ROOT.joinpath("geo_mapping.log")))
         self.logger.info(f"Starting at {datetime.now().time()}")
         self.logger.finished(f"Logging setup", start_time=self.start_time)
@@ -40,9 +43,10 @@ def __init__(self, merged_csv=True, load_ons_pd_data=False, census_path=None):
         self.logger.info("Loading Scout Census data")
         # Loads Scout Census Data from a path to a .csv file that contains Scout Census data
         # We assume no custom path has been passed, but allow for one to be used
-        census_path = utility.DATA_ROOT / self.settings["Scout Census location"] if not census_path else census_path
-        self.scout_census: ScoutCensus = ScoutCensus(utility.DATA_ROOT / census_path)
+        census_path = self.settings["Scout Census location"] if not census_path else census_path
+        self.scout_census: ScoutCensus = ScoutCensus(utility.DATA_ROOT / census_path, load_data=load_census_data)
         self.data: pd.DataFrame = self.scout_census.data
+        self.points_data: gpd.GeoDataFrame = gpd.GeoDataFrame()
         self.logger.finished(f"Loading Scout Census data", start_time=self.start_time)
 
         if merged_csv:
@@ -131,7 +135,7 @@ def save_merged_data(self, ons_pd_publication_date: str):
         self.data.to_csv(output_path.with_suffix(".csv"), index=False, encoding="utf-8-sig")
         self.data.to_feather(output_path.with_suffix(".feather"))
 
-    def filter_records(self: ScoutDataInterface, field: str, value_list: list, mask: bool = False, exclusion_analysis: bool = False):
+    def filter_records(self, field: str, value_list: list, mask: bool = False, exclusion_analysis: bool = False):
         """Filters the Census records by any field in ONS PD.
 
         :param str field: The field on which to filter
@@ -142,3 +146,20 @@ def filter_records(self: ScoutDataInterface, field: str, value_list: list, mask:
         :returns None: Nothing
         """
         self.data = utility.filter_records(self.data, field, value_list, self.logger, mask, exclusion_analysis)
+
+    def add_shape_data(self, shapes_key: str, path: Path = None, gdf: gpd.GeoDataFrame = None):
+        if self.points_data.empty:
+            self.points_data = gpd.GeoDataFrame(geometry=gpd.points_from_xy(self.data.long, self.data.lat))
+            self.points_data.crs = WGS_84
+
+        if path:
+            shapes = gpd.GeoDataFrame.from_file(path)
+        elif gdf is not None:
+            shapes = gdf
+        else:
+            raise ValueError("A path to a shapefile or a GeoDataFrame must be provided")
+
+        geo_merged = gpd.sjoin(self.points_data, shapes.to_crs(f"epsg:{WGS_84}"), how="left", op="intersects")
+        merged = self.data.merge(geo_merged[[shapes_key]], how="left", left_index=True, right_index=True)
+        assert self.data.equals(merged[self.data.columns])
+        self.data = merged
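Together, `points_data` and `add_shape_data` let callers tag every census record with the shape it falls in via a left spatial join. A usage sketch (the GeoJSON path mirrors the one in scripts/settings.json and is illustrative):

```python
from pathlib import Path

from src.data.scout_data import ScoutData

scout_data = ScoutData()
# Adds the "id" column of each intersecting district shape to the census
# records; the assert in add_shape_data checks that existing columns and
# row order are untouched by the merge.
scout_data.add_shape_data("id", path=Path("../scripts/districts_buffered.geojson"))
```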
diff --git a/src/geographies/geography.py b/src/geographies/geography.py
index 0033c942..e1c114fb 100644
--- a/src/geographies/geography.py
+++ b/src/geographies/geography.py
@@ -5,6 +5,7 @@
 from typing import TYPE_CHECKING
 
 from src.base import Base
+from src.utility import DATA_ROOT
 
 # For type hints
 if TYPE_CHECKING:
@@ -39,6 +40,14 @@ def type(self) -> str:
     def codes_map_key(self) -> str:
         return self.geography_metadata_dict["codes"]["key"]
 
+    @property
+    def codes_map_key_type(self) -> str:
+        return self.geography_metadata_dict["codes"]["key_type"]
+
+    @property
+    def codes_map_path(self) -> Path:
+        return DATA_ROOT / self.geography_metadata_dict["codes"].get("path")
+
     @property
     def codes_map_name(self) -> str:
         return self.geography_metadata_dict["codes"]["name"]
@@ -53,15 +62,19 @@ def shapefile_name(self) -> str:
 
     @property
     def shapefile_path(self) -> Path:
-        return self.geography_metadata_dict["boundary"]["shapefile"]
+        return DATA_ROOT / self.geography_metadata_dict["boundary"]["shapefile"]
 
     @property
     def age_profile_path(self) -> Path:
-        return self.geography_metadata_dict["age_profile"]["path"]
+        return DATA_ROOT / self.settings["National Statistical folder"] / self.geography_metadata_dict["age_profile"].get("path")
 
     @property
     def age_profile_key(self) -> str:
-        return self.geography_metadata_dict["age_profile"]["key"]
+        return self.geography_metadata_dict["age_profile"].get("key")
+
+    @property
+    def age_profile_pivot(self) -> str:
+        return self.geography_metadata_dict["age_profile"].get("pivot_key")
 
     def _set_boundary(self, geography_name: str, ons_pd: ONSPostcodeDirectory):
         """Sets the geography_metadata_dict and geography_region_ids_mapping members
@@ -80,11 +93,9 @@ def _set_boundary(self, geography_name: str, ons_pd: ONSPostcodeDirectory):
 
         boundaries_dict = {**ons_pd.BOUNDARIES, **self.settings["Scout Mappings"]}
         if geography_name in boundaries_dict.keys():
             self.geography_metadata_dict = boundaries_dict[geography_name]
-            boundary_codes_dict = self.geography_metadata_dict["codes"]
-            self.geography_region_ids_mapping = pd.read_csv(
-                boundary_codes_dict.get("path"), dtype={boundary_codes_dict["key"]: boundary_codes_dict["key_type"], boundary_codes_dict["name"]: "object",},
-            )  # Names & Codes file path
+            # Names & Codes file path
+            self.geography_region_ids_mapping = pd.read_csv(self.codes_map_path, dtype={self.codes_map_key: self.codes_map_key_type, self.codes_map_name: "string"})
         else:
             raise Exception(f"{geography_name} is an invalid boundary.\nValid boundaries include: {boundaries_dict.keys()}")
 
diff --git a/src/maps/map_plotter.py b/src/maps/map_plotter.py
index 5be52956..e2f3a7c4 100644
--- a/src/maps/map_plotter.py
+++ b/src/maps/map_plotter.py
@@ -9,8 +9,9 @@
 from src.reports.reports import Reports
 from src.base import Base
 
-# noinspection PyUnreachableCode
-if False:
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
     from pathlib import Path
     from branca import colormap
 
@@ -41,9 +42,9 @@ def __init__(self, out_file: Path):
         self.SCORE_COL: dict = {}
         self.layers: dict = {}
 
-        self.score_col_label: str = None
-        self.code_name: str = None
-        self.CODE_COL: str = None
+        self.score_col_label: str = ""
+        self.code_name: str = ""
+        self.CODE_COL: str = ""
         self.map_data: pd.DataFrame = pd.DataFrame()
 
         self.geo_data = None
@@ -109,7 +110,7 @@ def _filter_shape_file(self, shape_file_path: Path):
         self.logger.info(f"Filtering {original_number_of_shapes} shapes by {self.code_name} being in the {self.CODE_COL} of the map_data")
         self.logger.debug(f"Filtering {original_number_of_shapes} shapes by {self.code_name} being in \n{self.map_data[self.CODE_COL]}")
 
-        list_codes = self.map_data[self.CODE_COL].astype(str).to_list()
+        list_codes = self.map_data[self.CODE_COL].drop_duplicates().astype(str).to_list()
         all_shapes = all_shapes.loc[all_shapes[self.code_name].isin(list_codes)]
         self.logger.info(f"Resulting in {len(all_shapes.index)} shapes")
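The `if False:` blocks that map_plotter.py (and the two reports modules below) used for import-cycle-free type hints are replaced with the standard `typing.TYPE_CHECKING` constant, which is `False` at runtime but treated as `True` by static analysers. The idiom in isolation:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by type checkers and IDEs, never at runtime,
    # so circular imports are avoided.
    from src.data.scout_data import ScoutData


def summarise(scout_data: "ScoutData") -> None:
    ...  # the quoted annotation resolves for tooling without a runtime import
```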
diff --git a/src/reports/history_summary.py b/src/reports/history_summary.py
index 8f2ff921..0c95d53e 100644
--- a/src/reports/history_summary.py
+++ b/src/reports/history_summary.py
@@ -5,8 +5,9 @@
 from src.data.scout_census import ScoutCensus
 import src.utility as utility
 
-# noinspection PyUnreachableCode
-if False:
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
     from src.data.scout_data import ScoutData
 
 
diff --git a/src/reports/reports.py b/src/reports/reports.py
index 86474b88..cce1a58d 100644
--- a/src/reports/reports.py
+++ b/src/reports/reports.py
@@ -8,8 +8,9 @@
 from src.data.scout_census import ScoutCensus
 import src.utility as utility
 
-# noinspection PyUnreachableCode
-if False:
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
     from pathlib import Path
     from src.data.ons_pd import ONSPostcodeDirectory
 
@@ -51,6 +52,10 @@ def __init__(self, geography_name: str, scout_data_object: ScoutData, ons_pd_obj
         "Explorers": {"ages": ["14", "15", "16", "17"]},
     }
 
+    def add_shapefile_data(self, shapefile_key):
+        self.scout_data.add_shape_data(shapefile_key, path=self.shapefile_path)
+        self.scout_data.data = self.scout_data.data.rename(columns={shapefile_key: self.geography_type})
+
     @time_function
     def filter_boundaries(self, field: str, value_list: list, boundary: str = "", distance: int = 3000, near: bool = False):
 
@@ -94,8 +99,9 @@ def _ons_to_district_mapping(self, ons_code: str) -> dict:
 
         count_by_district_by_region = count_by_district_by_region.set_index([region_type, district_id_column])
 
+        count_col: pd.Series = count_by_district_by_region["count"]
         nested_dict = collections.defaultdict(dict)
-        for keys, value in count_by_district_by_region["count"].iteritems():
+        for keys, value in count_col.iteritems():
             nested_dict[keys[0]][keys[1]] = value
 
         self.logger.debug("Finished mapping from ons boundary to district")
@@ -133,7 +139,7 @@ def create_boundary_report(self, options: list = None, historical: bool = False,
             True if "waiting list total" in options else False
         # fmt: on
 
-        geog_name = self.geography.type  # e.g oslaua osward pcon lsoa11
+        geog_name = self.geography_type  # e.g oslaua osward pcon lsoa11
         if not geog_name:
             raise Exception("Geography type has not been set. Try calling _set_boundary")
 
@@ -195,7 +201,7 @@ def _year_groupby(group_df: pd.DataFrame) -> dict:
             return output
 
         def _awards_groupby(group_df: pd.DataFrame, awards_data: pd.DataFrame) -> dict:
-            summed = group_df[[award_name, award_eligible,]].sum()
+            summed = group_df[[award_name, award_eligible]].sum()
             output = summed.to_dict()
             if summed[award_eligible] > 0:
                 output[f"%-{award_name}"] = (summed[award_name] * 100) / summed[award_eligible]
@@ -204,7 +210,7 @@ def _awards_groupby(group_df: pd.DataFrame, awards_data: pd.DataFrame) -> dict:
             # Divides total # of awards by the number of Scout Districts that the ONS Region is in
             code = group_df.name
             district_ids = awards_mapping.get(code, {}) if not geog_name == "D_ID" else {code: 1}
-            awards_regions_data = awards_data.loc[[id for id in district_ids.keys()]].sum()
+            awards_regions_data = awards_data.loc[[d_id for d_id in district_ids.keys()]].sum()
 
             output["QSA"] = awards_regions_data["QSA"]
             if awards_regions_data["qsa_eligible"] > 0:
@@ -289,10 +295,10 @@ def create_uptake_report(self, report_name: str = None) -> pd.DataFrame:
 
         :returns pd.DataFrame: Uptake data of Scouts in the boundary
         """
-        geog_name: str = self.geography.type
+        geog_name = self.geography_type
         try:
-            age_profile_path: str = self.geography.age_profile_path
-            age_profile_key: str = self.geography.age_profile_key
+            age_profile_path = self.geography.age_profile_path
+            age_profile_key = self.geography.age_profile_key
         except KeyError:
             raise AttributeError(f"Population by age data not present for this {geog_name}")
 
@@ -303,22 +309,38 @@ def create_uptake_report(self, report_name: str = None) -> pd.DataFrame:
 
         data_types = {str(key): "Int16" for key in range(5, 26)}
         try:
-            full_age_profile_path = utility.DATA_ROOT / self.settings["National Statistical folder"] / age_profile_path
-            age_profile_pd = pd.read_csv(full_age_profile_path, dtype=data_types)
+            age_profile_pd = pd.read_csv(age_profile_path, dtype=data_types)
         except TypeError:
             self.logger.error("Age profiles must be integers in each age category")
             raise
 
         # population data
         for section, ages in Reports.SECTION_AGES.items():
-            age_profile_pd[f"Pop_{section}"] = age_profile_pd[ages["ages"]].sum(axis=1)
-            age_profile_pd[f"Pop_{section}"] += age_profile_pd[ages["halves"]].sum(axis=1) // 2 if ages.get("halves") else 0
-        age_profile_pd["Pop_All"] = age_profile_pd[[f"{age}" for age in range(6, 17 + 1)]].sum(axis=1)
+            section_population = age_profile_pd[ages["ages"]].sum(axis=1)
+            section_population += age_profile_pd[ages["halves"]].sum(axis=1) // 2 if ages.get("halves") else 0
+            age_profile_pd[f"Pop_{section}"] = section_population.astype("UInt32")
+        age_profile_pd["Pop_All"] = age_profile_pd[[f"{age}" for age in range(6, 17 + 1)]].sum(axis=1).astype("UInt32")
 
         # merge population data
-        cols = [f"Pop_{section}" for section in Reports.SECTION_AGES.keys()] + ["Pop_All"] + [age_profile_key]
-        uptake_report = boundary_report.merge(age_profile_pd[cols], how="left", left_on=geog_name, right_on=age_profile_key, sort=False)
-        del uptake_report[age_profile_key]
+        cols = [age_profile_key] + [f"Pop_{section}" for section in Reports.SECTION_AGES.keys()] + ["Pop_All"]
+        reduced_age_profile_pd = age_profile_pd[cols]
+
+        # Pivot age profile to current geography type if needed
+        if self.geography.age_profile_pivot and self.geography.age_profile_pivot != geog_name:
+            pivot_key = self.geography.age_profile_pivot
+
+            ons_data_subset = self.ons_pd.data[[geog_name, pivot_key]]
+            merged_age_profile = reduced_age_profile_pd.merge(ons_data_subset, how="left", left_on=age_profile_key, right_on=pivot_key).drop(pivot_key, axis=1)
+            merged_age_profile_no_na = merged_age_profile.dropna(subset=[geog_name])
+            pivoted_age_profile = merged_age_profile_no_na.groupby(geog_name).sum().astype("UInt32")
+
+            # Check we did not accidentally expand the population!
+            assert merged_age_profile["Pop_All"].sum() == reduced_age_profile_pd["Pop_All"].sum()  # this will fail
+            assert pivoted_age_profile["Pop_All"].sum() == merged_age_profile_no_na["Pop_All"].sum()
+            uptake_report = boundary_report.merge(pivoted_age_profile, how="left", left_on=geog_name, right_index=True, sort=False)
+        else:
+            uptake_report = boundary_report.merge(reduced_age_profile_pd, how="left", left_on=geog_name, right_on=age_profile_key, sort=False)
+            del uptake_report[age_profile_key]
 
         years = self.scout_data.data["Year"].drop_duplicates().dropna().sort_values()
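The new pivot step in create_uptake_report re-keys an age profile published for one geography onto another via the ONS Postcode Directory, then sums per target area. Below is a toy illustration of that merge-and-groupby with made-up frames and codes. Note that the first in-diff assertion is flagged `# this will fail`: the real merge runs against the postcode-level ONS data, which repeats each source area once per postcode row and so inflates the summed totals, whereas a deduplicated area-to-area lookup like the one sketched here preserves them.

```python
import pandas as pd

# Age profile keyed on local authority ("oslaua"; hypothetical codes/counts)
age_profile = pd.DataFrame({"oslaua": ["A", "B"], "Pop_All": [100, 50]})
# One-row-per-area lookup from local authority to constituency ("pcon")
lookup = pd.DataFrame({"oslaua": ["A", "B"], "pcon": ["X", "X"]})

# Re-key the population onto the target geography and aggregate
pivoted = age_profile.merge(lookup, on="oslaua", how="left").groupby("pcon")["Pop_All"].sum()
print(pivoted["X"])  # 150 -- totals survive a deduplicated 1:1 lookup
```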
diff --git a/src/utility.py b/src/utility.py
index 8dd2e4b3..693b614f 100644
--- a/src/utility.py
+++ b/src/utility.py
@@ -56,6 +56,10 @@ def filter_records(data: pd.DataFrame, field: str, value_list: list, logger: log
     logger.info(f"Resulting in {remaining_records} records remaining.")
 
     if exclusion_analysis:
+        cols = [ScoutCensus.column_labels["UNIT_TYPE"]] + [sections_dict[section]["total"] for section in sections_dict.keys()]
+        if not all([col in data.columns for col in cols]):
+            raise ValueError("Required columns are not in dataset!\n" f"Required columns are: {cols}.\n" f"Your columns are: {data.columns.to_list()}")
+
         # Calculate the number of records that have been filtered out
         excluded_records = original_records - remaining_records
         logger.info(f"{excluded_records} records were removed ({excluded_records / original_records * 100}% of total)")
diff --git a/tests/test_base.py b/tests/test_base.py
index 569719dd..a1d37385 100644
--- a/tests/test_base.py
+++ b/tests/test_base.py
@@ -1,24 +1,25 @@
 import json
 import logging
+from numbers import Real
 
 import pytest
 
 from src.base import Base, time_function
 from src.utility import LOGS_ROOT, SCRIPTS_ROOT
 
 
-def example_function(number1, number2):
+def add(number1: Real, number2: Real) -> Real:
     return number1 + number2
 
 
 class ExampleClassLogger(Base):
-    def __init__(self, path=True):
+    def __init__(self, path: bool = True):
         if path:
-            super().__init__(log_path=str(LOGS_ROOT.joinpath("tests.log")))
+            super().__init__(log_path=str(LOGS_ROOT / "tests.log"))
         else:
             super().__init__()
 
     @time_function
-    def example_function(self, number1, number2):
+    def add(self, number1: Real, number2: Real) -> Real:
         self.logger.info("Example Function")
         return number1 + number2
@@ -32,7 +33,7 @@ def __init__(self):
 def ec_logger():
     """Returns an ExampleClassLogger instance"""
 
-    def _instantiator(path=True):
+    def _instantiator(path: bool = True) -> ExampleClassLogger:
         return ExampleClassLogger(path)
 
     return _instantiator
@@ -45,7 +46,7 @@ def ec_settings():
 
 
 def test_time_function_wraps_function():
-    assert time_function(example_function)(2, 2) == example_function(2, 2)
+    assert time_function(add)(2, 2) == add(2, 2)
 
 
 # noinspection PyTypeChecker
@@ -67,17 +68,17 @@ def test_base_settings_are_accurate(ec_settings):
 
 def test_time_function_no_logger_entity():
     try:
-        time_function(example_function)(2, 2)
+        time_function(add)(2, 2)
     except AttributeError:
         pytest.fail(f"Unexpected AttributeError in base.test_function")
 
 
 def test_time_function_logger_output(caplog, ec_logger):
     caplog.set_level(logging.INFO)
-    ec_logger().example_function(2, 2)
+    ec_logger().add(2, 2)
 
-    assert "Calling function example_function" in caplog.text
"example_function took 0.0" in caplog.text + assert "Calling function add" in caplog.text + assert "add took 0.0" in caplog.text def test_base_logger_creation(ec_logger): diff --git a/tests/test_scout_data.py b/tests/test_scout_data.py index 0b2f0575..34f007be 100644 --- a/tests/test_scout_data.py +++ b/tests/test_scout_data.py @@ -1,35 +1,99 @@ -from src.data.scout_data import ScoutData -from src.base import Base -import src.utility as utility import pandas as pd +import geopandas as gpd + +import pytest +import hypothesis +import hypothesis.strategies as st +from hypothesis.extra.pandas import data_frames, column, range_indexes + +from data.scout_census import ScoutCensus +from src.data.scout_data import ScoutData + + +COLUMN_NAME = "ctry" + + +@pytest.fixture(scope="module") +def scout_data_factory(): + """Returns a ScoutData factory""" + + def _scout_data_factory(data_df: pd.DataFrame): + sd = ScoutData(load_census_data=False, load_ons_pd_data=False, merged_csv=False) + sd.data = data_df + return sd + + return _scout_data_factory + + +@pytest.fixture(scope="module") +def blank_geo_data_frame(): + gdf = gpd.GeoDataFrame(geometry=gpd.points_from_xy(*zip([0] * 2))) + gdf["id"] = 0 + gdf.crs = 4326 + return gdf + + +CountryDataFrame = data_frames(columns=[column(name=COLUMN_NAME, elements=st.from_regex(r"^[A-Za-z]{2}[0-9]{8}\Z"))], index=range_indexes(min_size=2),) + +LocationDataFrame = data_frames( + columns=[column(name="lat", elements=st.floats(min_value=-85, max_value=85)), column(name="long", elements=st.floats(min_value=-180, max_value=180)),], + index=range_indexes(min_size=2), +) + + +def test_scout_data_columns(scout_data_factory): + scout_data_stub = scout_data_factory(pd.DataFrame()) + + column_labels = ScoutCensus.column_labels + columns = [*column_labels["id"].values(), *column_labels["name"].values()] + + assert scout_data_stub.columns == columns + + +@hypothesis.given(CountryDataFrame) +def test_filter_records_inclusion(scout_data_factory, data): + first_country_code = data.loc[0, COLUMN_NAME] + scout_data_stub = scout_data_factory(data) + scout_data_stub.filter_records(field=COLUMN_NAME, value_list=[first_country_code], mask=True, exclusion_analysis=False) + + expected_outcome = data.loc[~(data[COLUMN_NAME] == first_country_code)] + assert scout_data_stub.data.equals(expected_outcome) + + +@hypothesis.given(CountryDataFrame) +def test_filter_records_exclusion(scout_data_factory, data): + first_country_code = data.loc[0, COLUMN_NAME] + scout_data_stub = scout_data_factory(data) + scout_data_stub.filter_records(field=COLUMN_NAME, value_list=[first_country_code], mask=False, exclusion_analysis=False) + + expected_outcome = data.loc[data[COLUMN_NAME] == first_country_code] + assert scout_data_stub.data.equals(expected_outcome) + + +@hypothesis.given(CountryDataFrame) +def test_filter_records_exclusion_analysis_with_incorrect_columns(scout_data_factory, data): + first_country_code = data.loc[0, COLUMN_NAME] + scout_data_stub = scout_data_factory(data) + + with pytest.raises(ValueError): + scout_data_stub.filter_records(field=COLUMN_NAME, value_list=[first_country_code], mask=False, exclusion_analysis=True) + scout_data_stub.filter_records(field=COLUMN_NAME, value_list=[first_country_code], mask=True, exclusion_analysis=True) + + +@hypothesis.given(LocationDataFrame) +def test_add_shape_data_points_data(scout_data_factory, blank_geo_data_frame, data): + sd = scout_data_factory(data) + sd.add_shape_data("id", gdf=blank_geo_data_frame) + + points_data = 
+    points_data = gpd.GeoDataFrame(geometry=gpd.points_from_xy(data.long, data.lat))
+    merged = data.merge(gpd.sjoin(points_data, blank_geo_data_frame, how="left", op="intersects")[["id"]], how="left", left_index=True, right_index=True)
+    assert sd.data.equals(merged)
diff --git a/tests/test_utility.py b/tests/test_utility.py
index c618d5fb..a511e629 100644
--- a/tests/test_utility.py
+++ b/tests/test_utility.py
@@ -1,9 +1,11 @@
 import src.utility as utility
+import src.data.ons_pd as ons_pd
 import pandas as pd
 
 
-class ONSPDStub:
+class ONSPostcodeDirectoryStub(ons_pd.ONSPostcodeDirectory):
     def __init__(self):
+        super().__init__(load_data=False, ons_pd_csv_path="")
         self.IMD_MAX = {"England": 32844, "Wales": 1909, "Scotland": 6976, "Northern Ireland": 890}
         self.COUNTRY_CODES = {"E92000001": "England", "W92000004": "Wales", "S92000003": "Scotland", "N92000002": "Northern Ireland"}
 
@@ -12,7 +14,7 @@ def test_calc_imd_decile():
     data = {"row_1": [1, "E92000001", 32844], "row_2": [2, "W92000004", 1]}
     frame = pd.DataFrame.from_dict(data, orient="index", columns=["id", "ctry", "imd"])
 
-    imd_decile_data: pd.Series = utility.calc_imd_decile(frame["imd"], frame["ctry"], ONSPDStub())
+    imd_decile_data: pd.Series = utility.calc_imd_decile(frame["imd"], frame["ctry"], ONSPostcodeDirectoryStub())
 
     predicted_result = pd.Series(data=[10, 1], index=["row_1", "row_2"], name="imd_decile")
     assert isinstance(imd_decile_data, pd.Series)
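As a closing note on the new test style: the suites above draw whole DataFrames from hypothesis strategies instead of fixed fixtures. An illustrative snippet (not part of the change) for inspecting what the CountryDataFrame strategy generates:

```python
import hypothesis.strategies as st
from hypothesis.extra.pandas import column, data_frames, range_indexes

# Same strategy as in tests/test_scout_data.py: frames of 2+ rows of
# ONS-style country codes (two letters followed by eight digits).
country_frames = data_frames(
    columns=[column(name="ctry", elements=st.from_regex(r"^[A-Za-z]{2}[0-9]{8}\Z"))],
    index=range_indexes(min_size=2),
)
print(country_frames.example())  # draw one sample frame for inspection
```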