Merge branch '#146-and-#125-gis-functions-in-separate-file-and-ground…

…-floor-height-updating' into fiat_integrator
Deltares · Oct 10, 2023 · fb2670e · fb2670e
2 parents bce3b93 + 59e6b54
commit fb2670e
Show file tree

Hide file tree

Showing 4 changed files with 318 additions and 30 deletions.
diff --git a/hydromt_fiat/fiat.py b/hydromt_fiat/fiat.py
@@ -697,6 +697,7 @@ def read_tables(self):
                 crs=self.get_config("exposure.geom.crs"),
                 logger=self.logger,
                 unit=self.get_config("exposure.geom.unit"),
+                data_catalog=self.data_catalog,  # TODO: See if this works also when no data catalog is provided
             )
             self.exposure.read_table(exposure_fn)
             self._tables["exposure"] = self.exposure.exposure_db

diff --git a/hydromt_fiat/workflows/exposure_vector.py b/hydromt_fiat/workflows/exposure_vector.py
@@ -7,7 +7,6 @@
 import numpy as np
 import pandas as pd
 from hydromt.data_catalog import DataCatalog
-from hydromt.gis_utils import utm_crs
 from pyproj import CRS
 
 from hydromt_fiat.data_apis.national_structure_inventory import get_assets_from_nsi
@@ -20,6 +19,7 @@
 from hydromt_fiat.workflows.exposure import Exposure
 from hydromt_fiat.workflows.utils import detect_delimiter
 from hydromt_fiat.workflows.vulnerability import Vulnerability
+from hydromt_fiat.workflows.gis import get_area, sjoin_largest_area, get_crs_str_from_gdf, join_spatial_data
 
 
 class ExposureVector(Exposure):
@@ -229,8 +229,7 @@ def setup_asset_locations(self, asset_locations: str) -> None:
             )
 
         # Set the CRS of the exposure data
-        source_data_authority = assets.crs.to_authority()
-        self.crs = source_data_authority[0] + ":" + source_data_authority[1]
+        self.crs = get_crs_str_from_gdf(assets.crs)
 
         # Check if the 'Object ID' column exists and if so, is unique
         if "Object ID" not in assets.columns:
@@ -299,13 +298,7 @@ def setup_occupancy_type(self, occupancy_source: str, occupancy_attr: str) -> No
             # If there is only one exposure geom, do the spatial join with the
             # occupancy_map. Only take the largest overlapping object from the
             # occupancy_map.
-            gdf = gpd.overlay(
-                self.exposure_geoms[0], occupancy_map[to_keep], how="intersection"
-            )
-            gdf["area"] = gdf.geometry.area
-            gdf.sort_values(by="area", inplace=True)
-            gdf.drop_duplicates(subset="Object ID", keep="last", inplace=True)
-            gdf.drop(columns=["area"], inplace=True)
+            gdf = sjoin_largest_area(self.exposure_geoms[0], occupancy_map[to_keep])
 
             # Remove the objects that do not have a Primary Object Type, that were not
             # overlapping with the land use map, or that had a land use type of 'nan'.
@@ -419,18 +412,50 @@ def setup_aggregation_labels(self):
 
     def setup_ground_floor_height(
         self,
-        ground_floor_height: Union[int, float, None, str, Path],
+        ground_floor_height: Union[int, float, None, str, Path, List[str], List[Path]],
+        attr_name: Union[str, List[str], None] = None,
+        method: Union[str, List[str], None] = "nearest",
     ) -> None:
-        # Set the ground floor height column.
-        # If the Ground Floor Height is input as a number, assign all objects with
-        # the same Ground Floor Height.
+        """Set the ground floor height of the exposure data. This function overwrites
+        the existing Ground Floor Height column if it already exists.
+
+        Parameters
+        ----------
+        ground_floor_height : Union[int, float, None, str, Path, List[str], List[Path]]
+            A number to set the Ground Floor Height of all assets to the same value, a
+            path to a file that contains the Ground Floor Height of each asset, or a
+            list of paths to files that contain the Ground Floor Height of each asset,
+            in the order of preference (the first item in the list gets the highest
+            priority in assigning the values).
+        attr_name : Union[str, List[str]], optional
+            The name of the attribute that contains the Ground Floor Height in the
+            file(s) that are submitted. If multiple `ground_floor_height` files are
+            submitted, the attribute names are linked to the files in the same order as
+            the files are submitted. By default None.
+        method : Union[str, List[str]], optional
+            The method to use to assign the Ground Floor Height to the assets. If
+            multiple `ground_floor_height` files are submitted, the methods are linked
+            to the files in the same order as the files are submitted. The method can
+            be either 'nearest' (nearest neighbor) or 'intersection'. By default
+            'nearest'.
+        """
         if ground_floor_height:
             if isinstance(ground_floor_height, int) or isinstance(
                 ground_floor_height, float
             ):
+                # If the Ground Floor Height is input as a number, assign all objects with
+                # the same Ground Floor Height.
                 self.exposure_db["Ground Floor Height"] = ground_floor_height
-            elif isinstance(ground_floor_height, str):
-                # TODO: implement the option to add the ground floor height from a file.
+            elif isinstance(ground_floor_height, str) or isinstance(
+                ground_floor_height, Path
+            ):
+                # A single file is used to assign the ground floor height to the assets
+                gfh = self.data_catalog.get_geodataframe(ground_floor_height)
+                gdf = self.get_full_gdf(self.exposure_db)
+                gdf = join_spatial_data(gdf, gfh, attr_name, method)
+                gdf = self._set_values_from_other_column(gdf, "Ground Floor Height", attr_name)
+            elif isinstance(ground_floor_height, list):
+                # Multiple files are used to assign the ground floor height to the assets
                 NotImplemented
         else:
             # Set the Ground Floor Height to 0 if the user did not specify any
@@ -463,14 +488,7 @@ def setup_max_potential_damage(
 
         # Calculate the area of each object
         gdf = self.get_full_gdf(self.exposure_db)[["Primary Object Type", "geometry"]]
-        if gdf.crs.is_geographic:
-            # If the CRS is geographic, reproject to the nearest UTM zone
-            nearest_utm = utm_crs(gdf.total_bounds)
-            gdf_utm = gdf.to_crs(nearest_utm)
-            gdf["area"] = gdf_utm["geometry"].area
-        elif gdf.crs.is_projected:
-            # If the CRS is projected, calculate the area in the same CRS
-            gdf["area"] = gdf["geometry"].area
+        gdf = get_area(gdf)
 
         # Set the damage values to the exposure data
         if damage_types is None:
@@ -910,17 +928,13 @@ def link_exposure_vulnerability(
             len_diff_primary_linking_types = 100000
             len_diff_secondary_linking_types = 100000
             if "Primary Object Type" in self.exposure_db.columns:
-                unique_types_primary = set(
-                    self.exposure_db["Primary Object Type"].unique()
-                )
+                unique_types_primary = set(self.get_primary_object_type())
                 diff_primary_linking_types = unique_types_primary - unique_linking_types
                 len_diff_primary_linking_types = len(diff_primary_linking_types)
 
             unique_types_secondary = set()
             if "Secondary Object Type" in self.exposure_db.columns:
-                unique_types_secondary = set(
-                    self.exposure_db["Secondary Object Type"].unique()
-                )
+                unique_types_secondary = set(self.get_secondary_object_type())
                 diff_secondary_linking_types = (
                     unique_types_secondary - unique_linking_types
                 )
@@ -1281,3 +1295,16 @@ def set_height_relative_to_reference(
         )
 
         return exposure_to_modify.reset_index(drop=True)
+
+    @staticmethod
+    def _set_values_from_other_column(
+        df: Union[pd.DataFrame, gpd.GeoDataFrame], col_to_set: str, col_to_copy: str
+    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
+        """Sets the values of <col_to_set> to where the values of <col_to_copy> are 
+        nan and deletes <col_to_copy>.
+        """
+        df.loc[df[col_to_copy].notna(), col_to_set] = df.loc[
+            df[col_to_copy].notna(), col_to_copy
+        ]
+        del df[col_to_copy]
+        return df
diff --git a/hydromt_fiat/workflows/gis.py b/hydromt_fiat/workflows/gis.py
@@ -0,0 +1,217 @@
+import geopandas as gpd
+from hydromt.gis_utils import utm_crs, nearest_merge
+from typing import List
+
+
+def get_area(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
+    """Adds an area column to a GeoDataFrame.
+
+    Parameters
+    ----------
+    gdf : gpd.GeoDataFrame
+        The GeoDataFrame to which the area column will be added.
+
+    Returns
+    -------
+    gpd.GeoDataFrame
+        The GeoDataFrame with an additional column "area".
+    """
+    # Calculate the area of each object
+    if gdf.crs.is_geographic:
+        # If the CRS is geographic, reproject to the nearest UTM zone
+        nearest_utm = utm_crs(gdf.total_bounds)
+        gdf_utm = gdf.to_crs(nearest_utm)
+        gdf["area"] = gdf_utm["geometry"].area
+    elif gdf.crs.is_projected:
+        # If the CRS is projected, calculate the area in the same CRS
+        gdf["area"] = gdf["geometry"].area
+    return gdf
+
+
+def sjoin_largest_area(
+    left_gdf: gpd.GeoDataFrame, right_gdf: gpd.GeoDataFrame, id_col: str = "Object ID"
+) -> gpd.GeoDataFrame:
+    """Spatial join of two GeoDataFrames, keeping only the joined data from the largest
+    intersection per object.
+
+    Parameters
+    ----------
+    left_gdf : gpd.GeoDataFrame
+        The GeoDataFrame to which the data from the right GeoDataFrame will be joined.
+    right_gdf : gpd.GeoDataFrame
+        The GeoDataFrame from which the data will be joined to the left GeoDataFrame.
+    id_col : str, optional
+        The ID column that will be used to drop the duplicates from overlapping
+        geometries, by default "Object ID"
+
+    Returns
+    -------
+    gpd.GeoDataFrame
+        Resulting GeoDataFrame with the joined data from the largest intersection per
+        object.
+    """
+    gdf = gpd.overlay(left_gdf, right_gdf, how="intersection")
+    gdf["area"] = gdf.geometry.area
+    gdf.sort_values(by="area", inplace=True)
+    gdf.drop_duplicates(subset=id_col, keep="last", inplace=True)
+    gdf.drop(columns=["area"], inplace=True)
+    return gdf
+
+
+def check_geometry_type(gdf: gpd.GeoDataFrame) -> str:
+    """Check if the geometry type of a GeoDataFrame is homogeneous.
+
+    Parameters
+    ----------
+    gdf : gpd.GeoDataFrame
+        The GeoDataFrame to check.
+
+    Raises
+    ------
+    ValueError
+        If the geometry type of the GeoDataFrame is not homogeneous.
+    """
+    if len(gdf.geom_type.unique()) > 1:
+        raise ValueError(
+            "The geometry type of the GeoDataFrame is not homogeneous. "
+            f"Geometry types found: {gdf.geom_type.unique()}"
+        )
+    else:
+        return gdf.geom_type.unique()[0]
+
+
+def get_crs_str_from_gdf(gdf_crs: gpd.GeoDataFrame.crs) -> str:
+    source_data_authority = gdf_crs.to_authority()
+    return source_data_authority[0] + ":" + source_data_authority[1]
+
+
+def clean_up_gdf(gdf: gpd.GeoDataFrame, columns: List[str]) -> gpd.GeoDataFrame:
+    """Clean up a GeoDataFrame by removing unnecessary columns.
+
+    Parameters
+    ----------
+    gdf : gpd.GeoDataFrame
+        The GeoDataFrame to clean up.
+
+    Returns
+    -------
+    gpd.GeoDataFrame
+        The cleaned up GeoDataFrame.
+    """
+    # Remove unnecessary columns
+    for col in columns:
+        del gdf[col]
+
+    return gdf
+
+
+def join_nearest_points(left_gdf, right_gdf, attribute_name, max_dist) -> gpd.GeoDataFrame:
+    """Join two GeoDataFrames based on the nearest distance between their points.
+    
+    Parameters
+    ----------
+    left_gdf : gpd.GeoDataFrame
+        The GeoDataFrame to which the data from the right GeoDataFrame will be joined.
+    right_gdf : gpd.GeoDataFrame
+        The GeoDataFrame from which the data will be joined to the left GeoDataFrame.
+    attribute_name : str
+        The name of the attribute that will be joined.
+    max_dist : float
+        The maximum distance for the nearest join measured in meters.
+
+    Returns
+    -------
+    gpd.GeoDataFrame
+        The joined GeoDataFrame.
+    """    
+    # Use the HydroMT function nearest_merge to join the geodataframes
+    gdf_merged = nearest_merge(
+        gdf1=left_gdf, gdf2=right_gdf, columns=[attribute_name], max_dist=max_dist
+    )
+
+    # Clean up the geodataframe (remove unnecessary columns)
+    gdf_merged = clean_up_gdf(gdf_merged, ["distance_right", "index_right"])
+
+    return gdf_merged
+
+
+def intersect_points_polygons(left_gdf, right_gdf, attribute_name) -> gpd.GeoDataFrame:
+    gdf_merged = gpd.sjoin(left_gdf, right_gdf, how="left", predicate="intersects")
+
+    # Clean up the geodataframe (remove unnecessary columns)
+    gdf_merged = clean_up_gdf(gdf_merged, ["index_right"])
+
+    return gdf_merged
+
+
+def join_spatial_data(
+    left_gdf: gpd.GeoDataFrame,
+    right_gdf: gpd.GeoDataFrame,
+    attribute_name: str,
+    method: str,
+    max_dist: float = 10,
+) -> gpd.GeoDataFrame:
+    """Join two GeoDataFrames based on their spatial relationship.
+
+    Parameters
+    ----------
+    left_gdf : gpd.GeoDataFrame
+        The GeoDataFrame to which the data from the right GeoDataFrame will be joined.
+    right_gdf : gpd.GeoDataFrame
+        The GeoDataFrame from which the data will be joined to the left GeoDataFrame.
+    attribute_name : str
+        The name of the attribute that will be joined.
+    method : str
+        The method that will be used to join the data. Either "nearest" or
+        "intersection".
+    max_dist : float, optional
+        The maximum distance for the nearest join measured in meters, by default 
+        10 (meters).
+
+    Returns
+    -------
+    gpd.GeoDataFrame
+        The joined GeoDataFrame.
+    """
+    assert left_gdf.crs == right_gdf.crs, (
+            "The CRS of the GeoDataFrames to join do not match. "
+            f"Left CRS: {get_crs_str_from_gdf(left_gdf.crs)}, "
+            f"Right CRS: {get_crs_str_from_gdf(right_gdf.crs)}"
+        )
+
+    left_gdf_type = check_geometry_type(left_gdf)
+    right_gdf_type = check_geometry_type(right_gdf)
+
+    assert (left_gdf_type == "Polygon") or (left_gdf_type == "Point"), (
+            "The left GeoDataFrame should contain either polygons or points."
+        )
+
+    assert (right_gdf_type == "Polygon") or (right_gdf_type == "Point"), (
+        "The right GeoDataFrame should contain either polygons or points."
+    )
+
+    if method == "nearest":
+        if left_gdf_type == "Polygon":
+            left_gdf_geom_original = left_gdf.geometry
+            left_gdf.geometry = left_gdf.geometry.centroid
+
+        if right_gdf_type == "Polygon":
+            right_gdf.geometry = right_gdf.geometry.centroid
+
+        gdf = join_nearest_points(left_gdf, right_gdf, attribute_name, max_dist)
+        left_gdf.geometry = left_gdf_geom_original # TODO: Check if this works well
+
+    elif method == "intersection":
+        if (left_gdf_type == "Polygon") and (right_gdf_type == "Polygon"):
+            gdf = sjoin_largest_area(left_gdf, right_gdf)
+        elif (left_gdf_type == "Point") and (right_gdf_type == "Polygon"):
+            gdf = intersect_points_polygons(left_gdf, right_gdf, attribute_name)
+        else:
+            raise NotImplementedError(
+                f"Join method {method} is not implemented for joining data of geometry "
+                f"type {left_gdf_type} and {right_gdf_type}"
+            )
+    else:
+        raise NotImplementedError(f"Join method {method} is not implemented")
+
+    return gdf