Testing and reports #54

Merged · 25 commits · Mar 8, 2021
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -3,4 +3,4 @@ repos:
     rev: stable
     hooks:
       - id: black
-        language_version: python3.7
+        language_version: python3.8
2 changes: 1 addition & 1 deletion .readthedocs.yml
@@ -3,7 +3,7 @@ sphinx:
   configuration: docs/source/conf.py
   fail_on_warning: false
 python:
-  version: 3.7
+  version: 3.8
 install:
   - method: setuptools
     path: package
8 changes: 4 additions & 4 deletions .travis.yml
@@ -9,12 +9,12 @@ cache:
 # run tests and linting separately
 matrix:
   include:
-    - name: "3.7 lint"
-      python: 3.7
+    - name: "3.8 lint"
+      python: 3.8
       env:
         - TEST_CMD="pre-commit run --all-files"
-    - name: "3.7 tests"
-      python: 3.7
+    - name: "3.8 tests"
+      python: 3.8
       env:
         - TEST_CMD="pytest --cov=./"
4 changes: 2 additions & 2 deletions README.md
@@ -6,7 +6,7 @@ Mapping Scouts data to UK administrative regions.
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

 ## Prerequisites:
-This is written and tested in Python 3.7.
+This is written and tested in Python 3.8.

 This project is largely dependent on `geopandas` and `pandas`, along with `folium`, `dash`, and `shapely`.

@@ -45,7 +45,7 @@ To install geopandas and its dependencies, follow below
 It is highly recommended to use conda to install geopandas.

 However, to install geopandas using pip on windows, follow the following steps:
-* Download the wheels for [GDAL](http://www.lfd.uci.edu/~gohlke/pythonlibs/#gdal), [Fiona](http://www.lfd.uci.edu/~gohlke/pythonlibs/#fiona), and [Rtree](http://www.lfd.uci.edu/~gohlke/pythonlibs/#rtree). Choose the correct python version (currently 3.7) and platform
+* Download the wheels for [GDAL](http://www.lfd.uci.edu/~gohlke/pythonlibs/#gdal), [Fiona](http://www.lfd.uci.edu/~gohlke/pythonlibs/#fiona), and [Rtree](http://www.lfd.uci.edu/~gohlke/pythonlibs/#rtree). Choose the correct python version (currently 3.8) and platform
 * Install any prerequisites listed on Gohlke's site (e.g. C++ redistributables)
 * `pip install` the wheels in the following order (preferably in a Virtual Environment)
   1. [GDAL](http://www.lfd.uci.edu/~gohlke/pythonlibs/#gdal)
   2. [Fiona](http://www.lfd.uci.edu/~gohlke/pythonlibs/#fiona)
   3. [Rtree](http://www.lfd.uci.edu/~gohlke/pythonlibs/#rtree)
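Once the wheels and geopandas are installed, a quick import check confirms the stack is usable (a minimal sketch; it assumes the dependencies above are installed in the active environment):

```python
# Sanity check for the geospatial stack; an ImportError points to a missing package
import pandas as pd
import geopandas as gpd
import folium
import shapely
import dash

print("pandas", pd.__version__, "| geopandas", gpd.__version__)
```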
3 changes: 2 additions & 1 deletion environment.yml
@@ -3,7 +3,7 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - python>=3.7
+  - python>=3.8
   - pip
   # Core requirements
   - branca
@@ -15,6 +15,7 @@ dependencies:
   - pyarrow
   # Development requirements:
   - pytest
+  - hypothesis
   - pytest-cov
   - pre-commit
   - black
16 changes: 0 additions & 16 deletions pyproject.toml
@@ -1,18 +1,2 @@
 [tool.black]
 line-length = 180
-exclude = '''
-/(
-    \.eggs
-  | \.git
-  | \.hg
-  | \.mypy_cache
-  | \.tox
-  | \.venv
-  | venv
-  | _build
-  | buck-out
-  | build
-  | dist
-  | setup.py
-)/
-'''
2 changes: 0 additions & 2 deletions pytest.ini

This file was deleted.

11 changes: 4 additions & 7 deletions scripts/settings.json
@@ -1,9 +1,9 @@
 {
   "settings":
   {
-    "Raw Census Extract location": "Scout Census Data/Census 2020 Extract (1).csv",
-    "Scout Census location": "Scout Census Data/Census 2020 Extract (1) with May 2019 fields.csv",
-    "Full ONS PD location": "ONSPD_MAY_2019_UK/Data/ONSPD_MAY_2019_UK.csv",
+    "Raw Census Extract location": "Scout Census Data/Census 2020 Extract (1).csv",
+    "Scout Census location": "Scout Census Data/Census 2020 Extract (1) with May 2019 fields.feather",
+    "Full ONS PD location": "ONSPD_MAY_2019_UK/Data/ONSPD_MAY_2019_UK.csv",
     "Reduced ONS PD location": "ONSPD_MAY_2019_UK/Data/ONSPD_MAY_2019_UK reduced.csv",
     "ONS Names and codes folder": "ONSPD_MAY_2019_UK/Documents/",
     "National Statistical folder": "National Statistical data/",
@@ -17,10 +17,7 @@
     {
       "name": "D_ID",
       "codes": {"path": "../data/Scout Census Data/district_id_mapping.csv", "key": "D_ID", "key_type": "Int32", "name": "D_name"},
-      "boundary":
-      {
-        "shapefiles": ["../src/scripts/districts_buffered.geojson"], "key": "id", "name": "name"
-      },
+      "boundary": {"shapefile": "../scripts/districts_buffered.geojson", "key": "id", "name": "name"},
       "age_profile": null,
       "age_profile_code_col": null
     },
5 changes: 2 additions & 3 deletions scripts/setup_reduce_onspd.py
@@ -1,15 +1,14 @@
 import json

 import src.utility as utility
-from src.utility import SCRIPTS_ROOT, DATA_ROOT
 from src.data.ons_pd_may_19 import ONSPostcodeDirectoryMay19

 if __name__ == "__main__":
-    with open(SCRIPTS_ROOT.joinpath("settings.json"), "r") as read_file:
+    with open(utility.SCRIPTS_ROOT.joinpath("settings.json"), "r") as read_file:
         settings = json.load(read_file)["settings"]
     print("Starting")

-    ons_pd_location = DATA_ROOT / settings["Full ONS PD location"]
+    ons_pd_location = utility.DATA_ROOT / settings["Full ONS PD location"]

     # Load Full ONS Postcode Directory
     ons_pd = ONSPostcodeDirectoryMay19(ons_pd_location, load_data=True)
4 changes: 2 additions & 2 deletions setup.py
@@ -6,6 +6,6 @@
     version="0.2.0",
     packages=find_namespace_packages(),
     install_requires=["pandas", "numpy", "folium", "branca", "geopandas", "shapely", "dash", "pyarrow"],
-    extras_require={"dev": ["pytest", "pytest-cov", "pre-commit", "black"]},
-    python_requires=">=3.7",
+    extras_require={"dev": ["pytest", "hypothesis", "pytest-cov", "pre-commit", "black"]},
+    python_requires=">=3.8",
 )
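With `hypothesis` added to the dev extras alongside pytest, property-based tests can run under the existing test command. A minimal sketch (illustrative only; this test and the property it checks are not taken from the repository):

```python
import pandas as pd
from hypothesis import given, strategies as st


@given(st.lists(st.integers(), min_size=1))
def test_value_list_filter_keeps_only_requested_values(values):
    # Mirrors the contract of a value-list filter such as utility.filter_records:
    # after filtering, every surviving row's value is in the requested list.
    df = pd.DataFrame({"field": values})
    wanted = values[: len(values) // 2 + 1]
    filtered = df[df["field"].isin(wanted)]
    assert filtered["field"].isin(wanted).all()
```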
9 changes: 3 additions & 6 deletions src/data/ons_pd_may_18.py
@@ -1,4 +1,3 @@
-from src.utility import DATA_ROOT
 from src.data.ons_pd import ONSPostcodeDirectory


@@ -50,13 +49,11 @@ class ONSPostcodeDirectoryMay18(ONSPostcodeDirectory):
     }

     def __init__(self, ons_pd_csv_path, load_data=True):
-        ONSPostcodeDirectory.__init__(
-            self, ons_pd_csv_path, load_data, ONSPostcodeDirectoryMay18.index_column, ONSPostcodeDirectoryMay18.fields, ONSPostcodeDirectoryMay18.data_types,
-        )
+        super().__init__(ons_pd_csv_path, load_data, ONSPostcodeDirectoryMay18.index_column, ONSPostcodeDirectoryMay18.fields, ONSPostcodeDirectoryMay18.data_types)

         # Folder within the ONS Postcode Directory archive holding names and codes files
-        names_codes_root = DATA_ROOT / self.settings["ONS Names and codes folder"]

Review comment from a Collaborator on the deleted line above: did you mean to remove DATA_ROOT?

-        boundaries_root = DATA_ROOT / self.settings["Boundaries folder"]
+        names_codes_root = self.settings["ONS Names and codes folder"]
+        boundaries_root = self.settings["Boundaries folder"]

         # Paths to all shapefiles within the Boundaries folder
         # fmt: off
9 changes: 3 additions & 6 deletions src/data/ons_pd_may_19.py
@@ -1,4 +1,3 @@
-from src.utility import DATA_ROOT
 from src.data.ons_pd import ONSPostcodeDirectory
 from pathlib import Path

@@ -50,13 +49,11 @@ class ONSPostcodeDirectoryMay19(ONSPostcodeDirectory):
     }

     def __init__(self, ons_pd_csv_path, load_data=True):
-        ONSPostcodeDirectory.__init__(
-            self, ons_pd_csv_path, load_data, ONSPostcodeDirectoryMay19.index_column, ONSPostcodeDirectoryMay19.fields, ONSPostcodeDirectoryMay19.data_types,
-        )
+        super().__init__(ons_pd_csv_path, load_data, self.index_column, ONSPostcodeDirectoryMay19.fields, ONSPostcodeDirectoryMay19.data_types)

         # Folder within the ONS Postcode Directory archive holding names and codes files
-        names_codes_root = DATA_ROOT / Path(self.settings["ONS Names and codes folder"]).resolve()

Review comment from a Collaborator on the deleted line above: did you mean to remove DATA_ROOT?

-        boundaries_dir = DATA_ROOT / Path(self.settings["Boundaries folder"]).resolve()
+        names_codes_root = Path(self.settings["ONS Names and codes folder"]).resolve()
+        boundaries_dir = Path(self.settings["Boundaries folder"]).resolve()

         # Paths to all shapefiles within the Boundaries folder
         # fmt: off
6 changes: 5 additions & 1 deletion src/data/scout_census.py
@@ -97,7 +97,11 @@ class ScoutCensus:
     UNIT_LEVEL_GROUP = "Group"
     UNIT_LEVEL_DISTRICT = "District"

-    def __init__(self, census_file_path: Path):
+    def __init__(self, census_file_path: Path, load_data=True):
+        if not load_data:
+            self.data = pd.DataFrame()
+            return
+
         cols_int_32 = ["Object_ID", "G_ID", "D_ID", "C_ID", "R_ID", "X_ID", "imd"]
         cols_categorical = ["compass", "type", "name", "G_name", "D_name", "C_name", "R_name", "X_name", "postcode", "Young_Leader_Unit"]
         # fmt: off
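The new `load_data` flag makes it cheap to construct a `ScoutCensus` without reading the underlying census file. A short sketch of the behaviour this hunk adds (the file path is a placeholder):

```python
from pathlib import Path

from src.data.scout_census import ScoutCensus

# load_data=False returns before any file I/O, leaving .data as an empty DataFrame
census = ScoutCensus(Path("placeholder.feather"), load_data=False)
assert census.data.empty
```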
29 changes: 25 additions & 4 deletions src/data/scout_data.py
@@ -2,6 +2,7 @@
 from datetime import datetime
 from pathlib import Path
 import pandas as pd
+import geopandas as gpd
 import time
 from typing import TYPE_CHECKING

@@ -15,6 +16,8 @@
 if TYPE_CHECKING:
     from src.data.ons_pd import ONSPostcodeDirectory

+WGS_84 = 4326
+

 class ScoutData(Base):
     """Provides access to manipulate and process data

@@ -32,17 +35,18 @@ def columns(self):
     DEFAULT_VALUE = ScoutCensus.DEFAULT_VALUE

-    def __init__(self, merged_csv=True, load_ons_pd_data=False, census_path=None):
+    def __init__(self, merged_csv=True, load_ons_pd_data=False, census_path=None, load_census_data=True):
         super().__init__(settings=True, log_path=str(utility.LOGS_ROOT.joinpath("geo_mapping.log")))
         self.logger.info(f"Starting at {datetime.now().time()}")
         self.logger.finished(f"Logging setup", start_time=self.start_time)

         self.logger.info("Loading Scout Census data")
         # Loads Scout Census Data from a path to a .csv file that contains Scout Census data
         # We assume no custom path has been passed, but allow for one to be used
-        census_path = utility.DATA_ROOT / self.settings["Scout Census location"] if not census_path else census_path
-        self.scout_census: ScoutCensus = ScoutCensus(utility.DATA_ROOT / census_path)
+        census_path = self.settings["Scout Census location"] if not census_path else census_path
+        self.scout_census: ScoutCensus = ScoutCensus(utility.DATA_ROOT / census_path, load_data=load_census_data)
         self.data: pd.DataFrame = self.scout_census.data
+        self.points_data: gpd.GeoDataFrame = gpd.GeoDataFrame()
         self.logger.finished(f"Loading Scout Census data", start_time=self.start_time)

         if merged_csv:

@@ -131,7 +135,7 @@ def save_merged_data(self, ons_pd_publication_date: str):
         self.data.to_csv(output_path.with_suffix(".csv"), index=False, encoding="utf-8-sig")
         self.data.to_feather(output_path.with_suffix(".feather"))

-    def filter_records(self: ScoutDataInterface, field: str, value_list: list, mask: bool = False, exclusion_analysis: bool = False):
+    def filter_records(self, field: str, value_list: list, mask: bool = False, exclusion_analysis: bool = False):
         """Filters the Census records by any field in ONS PD.

         :param str field: The field on which to filter
@@ -142,3 +146,20 @@ def filter_records(self, field: str, value_list: list, mask: bool = False, exclusion_analysis: bool = False):
         :returns None: Nothing
         """
         self.data = utility.filter_records(self.data, field, value_list, self.logger, mask, exclusion_analysis)

+    def add_shape_data(self, shapes_key: str, path: Path = None, gdf: gpd.GeoDataFrame = None):
+        if self.points_data.empty:
+            self.points_data = gpd.GeoDataFrame(geometry=gpd.points_from_xy(self.data.long, self.data.lat))
+            self.points_data.crs = WGS_84
+
+        if path:
+            shapes = gpd.GeoDataFrame.from_file(path)
+        elif gdf is not None:
+            shapes = gdf
+        else:
+            raise ValueError("A path to a shapefile or a GeoDataFrame must be provided")
+
+        geo_merged = gpd.sjoin(self.points_data, shapes.to_crs(f"epsg:{WGS_84}"), how="left", op="intersects")
+        merged = self.data.merge(geo_merged[[shapes_key]], how="left", left_index=True, right_index=True)
+        assert self.data.equals(merged[self.data.columns])
+        self.data = merged
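Together, `filter_records` and the new `add_shape_data` let shape attributes be joined onto census records by point location. A usage sketch (the filter field and value are illustrative; the GeoJSON path and its "id" key follow the settings file shown above):

```python
from pathlib import Path

from src.data.scout_data import ScoutData

scout_data = ScoutData(merged_csv=True)

# Illustrative filter: keep records whose ONS PD country code is England's
scout_data.filter_records("ctry", ["E92000001"])

# Spatial join: tags each record with the "id" of the district shape containing it
scout_data.add_shape_data("id", path=Path("scripts/districts_buffered.geojson"))
```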
25 changes: 18 additions & 7 deletions src/geographies/geography.py
@@ -5,6 +5,7 @@
 from typing import TYPE_CHECKING

 from src.base import Base
+from src.utility import DATA_ROOT

 # For type hints
 if TYPE_CHECKING:

@@ -39,6 +40,14 @@ def type(self) -> str:
     @property
     def codes_map_key(self) -> str:
         return self.geography_metadata_dict["codes"]["key"]

+    @property
+    def codes_map_key_type(self) -> str:
+        return self.geography_metadata_dict["codes"]["key_type"]
+
+    @property
+    def codes_map_path(self) -> Path:
+        return DATA_ROOT / self.geography_metadata_dict["codes"].get("path")
+
     @property
     def codes_map_name(self) -> str:
         return self.geography_metadata_dict["codes"]["name"]

@@ -53,15 +62,19 @@ def shapefile_name(self) -> str:
     @property
     def shapefile_path(self) -> Path:
-        return self.geography_metadata_dict["boundary"]["shapefile"]
+        return DATA_ROOT / self.geography_metadata_dict["boundary"]["shapefile"]

     @property
     def age_profile_path(self) -> Path:
-        return self.geography_metadata_dict["age_profile"]["path"]
+        return DATA_ROOT / self.settings["National Statistical folder"] / self.geography_metadata_dict["age_profile"].get("path")

     @property
     def age_profile_key(self) -> str:
-        return self.geography_metadata_dict["age_profile"]["key"]
+        return self.geography_metadata_dict["age_profile"].get("key")
+
+    @property
+    def age_profile_pivot(self) -> str:
+        return self.geography_metadata_dict["age_profile"].get("pivot_key")

     def _set_boundary(self, geography_name: str, ons_pd: ONSPostcodeDirectory):
         """Sets the geography_metadata_dict and geography_region_ids_mapping members
@@ -80,11 +93,9 @@ def _set_boundary(self, geography_name: str, ons_pd: ONSPostcodeDirectory):
         boundaries_dict = {**ons_pd.BOUNDARIES, **self.settings["Scout Mappings"]}
         if geography_name in boundaries_dict.keys():
             self.geography_metadata_dict = boundaries_dict[geography_name]
-            boundary_codes_dict = self.geography_metadata_dict["codes"]

-            self.geography_region_ids_mapping = pd.read_csv(
-                boundary_codes_dict.get("path"), dtype={boundary_codes_dict["key"]: boundary_codes_dict["key_type"], boundary_codes_dict["name"]: "object",},
-            )  # Names & Codes file path
+            # Names & Codes file path
+            self.geography_region_ids_mapping = pd.read_csv(self.codes_map_path, dtype={self.codes_map_key: self.codes_map_key_type, self.codes_map_name: "string"})
         else:
             raise Exception(f"{geography_name} is an invalid boundary.\nValid boundaries include: {boundaries_dict.keys()}")
13 changes: 7 additions & 6 deletions src/maps/map_plotter.py
@@ -9,8 +9,9 @@
 from src.reports.reports import Reports
 from src.base import Base

-# noinspection PyUnreachableCode
-if False:
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
     from pathlib import Path
     from branca import colormap
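This replaces the `if False:` import guard with the standard `typing.TYPE_CHECKING` idiom: the constant is true for static type checkers but false at runtime, so the guarded imports cost nothing and avoid circular-import problems. The general pattern, as a generic sketch rather than code from this PR:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated by mypy and IDEs only; never imported at runtime
    from pathlib import Path


def log_path_name(path: "Path") -> str:
    # The annotation is a string, so Path need not exist at runtime
    return path.name
```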

@@ -41,9 +42,9 @@ def __init__(self, out_file: Path):
         self.SCORE_COL: dict = {}
         self.layers: dict = {}

-        self.score_col_label: str = None
-        self.code_name: str = None
-        self.CODE_COL: str = None
+        self.score_col_label: str = ""
+        self.code_name: str = ""
+        self.CODE_COL: str = ""
         self.map_data: pd.DataFrame = pd.DataFrame()

         self.geo_data = None

@@ -109,7 +110,7 @@ def _filter_shape_file(self, shape_file_path: Path):
         self.logger.info(f"Filtering {original_number_of_shapes} shapes by {self.code_name} being in the {self.CODE_COL} of the map_data")
         self.logger.debug(f"Filtering {original_number_of_shapes} shapes by {self.code_name} being in \n{self.map_data[self.CODE_COL]}")

-        list_codes = self.map_data[self.CODE_COL].astype(str).to_list()
+        list_codes = self.map_data[self.CODE_COL].drop_duplicates().astype(str).to_list()
         all_shapes = all_shapes.loc[all_shapes[self.code_name].isin(list_codes)]
         self.logger.info(f"Resulting in {len(all_shapes.index)} shapes")
5 changes: 3 additions & 2 deletions src/reports/history_summary.py
@@ -5,8 +5,9 @@
 from src.data.scout_census import ScoutCensus
 import src.utility as utility

-# noinspection PyUnreachableCode
-if False:
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
     from src.data.scout_data import ScoutData
Expand Down