Skip to content

Commit 9e94055

Browse files
committed
Add second pass for times calculation
1 parent 44f99d2 commit 9e94055

File tree

3 files changed

+50
-9
lines changed

3 files changed

+50
-9
lines changed

data/src/calculate_times.py

Lines changed: 45 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,11 @@
1010
import yaml
1111
from utils.constants import DOCKER_INTERNAL_PATH
1212
from utils.logging import create_logger
13-
from utils.times import TravelTimeCalculator, TravelTimeConfig
13+
from utils.times import (
14+
TravelTimeCalculator,
15+
TravelTimeConfig,
16+
TravelTimeInputs,
17+
)
1418
from utils.utils import format_time, get_md5_hash
1519

1620
logger = create_logger(__name__)
@@ -75,15 +79,48 @@ def main() -> None:
7579

7680
# Extract missing pairs to a separate DataFrame
7781
missing_pairs_df = results_df[results_df["duration_sec"].isnull()]
78-
missing_pairs_df = (
79-
pd.DataFrame(missing_pairs_df)
80-
.drop(columns=["duration_sec", "distance_km"])
81-
.sort_values(by=["origin_id", "destination_id"])
82-
)
82+
83+
# If there are missing pairs, rerun the routing for only those pairs
84+
# using a more aggressive (but time consuming) second pass approach
85+
if len(missing_pairs_df) > 0:
86+
logger.info(
87+
"Found %s missing pairs. Rerouting with a more aggressive method",
88+
len(missing_pairs_df),
89+
)
90+
actor_sp = valhalla.Actor((Path.cwd() / "valhalla_sp.json").as_posix())
91+
92+
# Create a new input class, keeping only pairs that were unroutable
93+
inputs_sp = TravelTimeInputs(
94+
origins=inputs.origins[
95+
inputs.origins["id"].isin(
96+
missing_pairs_df.index.get_level_values("origin_id")
97+
)
98+
].reset_index(drop=True),
99+
destinations=inputs.destinations[
100+
inputs.destinations["id"].isin(
101+
missing_pairs_df.index.get_level_values("destination_id")
102+
)
103+
].reset_index(drop=True),
104+
chunk=None,
105+
max_split_size_origins=inputs.max_split_size_origins,
106+
max_split_size_destinations=inputs.max_split_size_destinations,
107+
)
108+
109+
# Route using the more aggressive settings and update the results
110+
tt_calc_sp = TravelTimeCalculator(actor_sp, config, inputs_sp)
111+
results_df.update(tt_calc_sp.get_times())
112+
113+
# Extract the missing pairs again since they may have changed
114+
missing_pairs_df = results_df[results_df["duration_sec"].isnull()]
83115

84116
# Drop missing pairs and sort for more efficient compression
85-
results_df = results_df.dropna(subset=["duration_sec"]).sort_values(
86-
by=["origin_id", "destination_id"]
117+
missing_pairs_df = (
118+
missing_pairs_df.drop(columns=["duration_sec", "distance_km"])
119+
.sort_index()
120+
.reset_index()
121+
)
122+
results_df = (
123+
results_df.dropna(subset=["duration_sec"]).sort_index().reset_index()
87124
)
88125

89126
# Loop through files and write to both local and remote paths

data/src/utils/times.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,9 @@ def get_times(self) -> pd.DataFrame:
507507
)
508508
)
509509

510-
results_df = pd.concat(results, ignore_index=True)
510+
results_df = pd.concat(results, ignore_index=True).set_index(
511+
["origin_id", "destination_id"]
512+
)
511513
del results
512514

513515
return results_df

docker-compose.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ services:
4545
- ./data/build/:/custom_files:rw
4646
- ./data/valhalla.json:/custom_files/valhalla.json:rw
4747
- ./data/valhalla.json:/data/valhalla.json:rw
48+
- ./data/valhalla_sp.json:/data/valhalla_sp.json:rw
49+
- ./data/valhalla_sp.json:/custom_files/valhalla_sp.json:rw
4850
- ./data/output:/data/output:rw
4951
# Read only volumes to serve data to the container
5052
- $HOME/.aws/credentials:/home/valhalla/.aws/credentials:ro

0 commit comments

Comments (0)