Merge pull request #109 from IGNF/yaml-thresholds
Dump optimized thresholds for buildings as a yaml file
leavauchier authored Apr 18, 2024
2 parents 0c01034 + 1313577 commit 969019c
Showing 8 changed files with 245 additions and 201 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,7 @@
# main

- Save optimized thresholds as yaml instead of pickle to make them easier to read

### 1.10.2
- Add support for metadata propagation through compound pdal pipelines:
  - fix epsg propagation
2 changes: 1 addition & 1 deletion configs/building_validation/optimization/default.yaml
@@ -13,7 +13,7 @@ paths:
  group_info_pickle_path: ${.results_output_dir}/group_info.pickle
  prepared_las_dir: ${.results_output_dir}/prepared/
  updated_las_dir: ${.results_output_dir}/updated/
-  building_validation_thresholds_pickle: ${.results_output_dir}/optimized_thresholds.pickle # Wher
+  building_validation_thresholds: ${.results_output_dir}/optimized_thresholds.yaml # Wher

# CLASSIFICATION CODES of a dataset which was inspected
# and labeled post TerraSolid macro
6 changes: 3 additions & 3 deletions docs/source/guides/thresholds_optimization.md
@@ -37,7 +37,7 @@ building_validation.optimization.paths.results_output_dir=[path/to/save/results]

### Evaluation of optimized thresholds on a test set

-Once an optimal solution was found, you may want to evaluate the decision process on unseen data to evaluate generalization capability. For that, you will need another test folder of corrected data in the same format as before (a different `input_las_dir`). You need to specify that no optimization is required using the `todo` params. You also need to give the path to the pickled decision thresholds from the previous step, and specify a different `results_output_dir` so that prepared data of test and val test are not pooled together.
+Once an optimal solution has been found, you may want to evaluate the decision process on unseen data to assess generalization capability. For that, you will need another test folder of corrected data in the same format as before (a different `input_las_dir`). You need to specify that no optimization is required using the `todo` params. You also need to give the path to the decision thresholds file (a yaml file) from the previous step, and specify a different `results_output_dir` so that prepared data of the test and validation sets are not pooled together.


```bash
@@ -48,7 +48,7 @@ python lidar_prod/run.py \
building_validation.optimization.todo='prepare+evaluate+update' \
building_validation.optimization.paths.input_las_dir=[path/to/labelled/test/dataset/] \
building_validation.optimization.paths.results_output_dir=[path/to/save/results] \
-building_validation.optimization.paths.building_validation_thresholds_pickle=[path/to/optimized_thresholds.pickle]
+building_validation.optimization.paths.building_validation_thresholds=[path/to/optimized_thresholds.yaml]
```
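After the run, the saved thresholds can be loaded back in Python for inspection via the `thresholds.load` method added in this PR. A minimal sketch, where the path is a hypothetical stand-in for the `results_output_dir` placeholder above:

```python
from lidar_prod.tasks.building_validation import thresholds

best = thresholds.load("path/to/save/results/optimized_thresholds.yaml")
print(best.min_frac_confirmation)  # each optimized threshold is a plain attribute
```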

### Utils
@@ -57,4 +57,4 @@ Debug mode: to run on a single file during development, add a `+building_validat


Reference:
-- [Deb et al. (2002) - A fast and elitist multiobjective genetic algorithm\: NSGA-II](https://ieeexplore.ieee.org/document/996017)).
+- [Deb et al. (2002) - A fast and elitist multiobjective genetic algorithm\: NSGA-II](https://ieeexplore.ieee.org/document/996017).
12 changes: 12 additions & 0 deletions lidar_prod/tasks/building_validation.py
@@ -9,6 +9,7 @@
import geopandas
import numpy as np
import pdal
import yaml
from tqdm import tqdm

from lidar_prod.tasks.utils import (
@@ -378,3 +379,14 @@ class thresholds:
    min_frac_refutation: float
    min_entropy_uncertainty: float
    min_frac_entropy_uncertain: float

    def dump(self, filename: str):
        with open(filename, "w") as f:
            yaml.safe_dump(self.__dict__, f)

    @staticmethod
    def load(filename: str):
        with open(filename, "r") as f:
            data = yaml.safe_load(f)

        return thresholds(**data)
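For reference, a sketch of what the dumped file looks like: `yaml.safe_dump` writes the instance's `__dict__` as flat key/value pairs, sorted alphabetically by default. The field names below are the ones used in the tests; the values are purely hypothetical:

```python
import yaml

# Hypothetical optimized values, for illustration only.
params = {
    "min_confidence_confirmation": 0.65,
    "min_frac_confirmation": 0.7,
    "min_frac_confirmation_factor_if_bd_uni_overlay": 0.9,
    "min_uni_db_overlay_frac": 0.5,
    "min_confidence_refutation": 0.6,
    "min_frac_refutation": 0.75,
    "min_entropy_uncertainty": 0.9,
    "min_frac_entropy_uncertain": 0.4,
}
print(yaml.safe_dump(params), end="")
# min_confidence_confirmation: 0.65
# min_confidence_refutation: 0.6
# min_entropy_uncertainty: 0.9
# min_frac_confirmation: 0.7
# min_frac_confirmation_factor_if_bd_uni_overlay: 0.9
# min_frac_entropy_uncertain: 0.4
# min_frac_refutation: 0.75
# min_uni_db_overlay_frac: 0.5
```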
23 changes: 11 additions & 12 deletions lidar_prod/tasks/building_validation_optimization.py
@@ -185,22 +185,22 @@ def evaluate(self) -> dict:
"""
clusters = self._load_clusters()
self._set_thresholds_from_pickle_if_available()
self._set_thresholds_from_file_if_available()
decisions = np.array([self.bv._make_group_decision(c) for c in clusters])
mts_gt = np.array([c.target for c in clusters])
metrics_dict = self.evaluate_decisions(mts_gt, decisions)
log.info(f"\n Results:\n{self._get_results_logs_str(metrics_dict)}")
return metrics_dict

def _set_thresholds_from_pickle_if_available(self):
def _set_thresholds_from_file_if_available(self):
try:
with open(self.paths.building_validation_thresholds_pickle, "rb") as f:
self.bv.thresholds = pickle.load(f)
self.bv.thresholds = thresholds.load(self.paths.building_validation_thresholds)

except FileNotFoundError:
warnings.warn(
"Using default thresholds from hydra config to perform decisions. "
"You may want to specify different thresholds via a pickled object by specifying "
"building_validation.optimization.paths.building_validation_thresholds_pickle",
"You may want to specify different thresholds via a yaml file by specifying "
"building_validation.optimization.paths.building_validation_thresholds",
UserWarning,
)

@@ -213,7 +213,7 @@ def update(self):
"""
log.info(f"Updated las will be saved in {self.paths.results_output_dir}")
self._set_thresholds_from_pickle_if_available()
self._set_thresholds_from_file_if_available()
for prepared_las_path, target_las_path in tqdm(
zip(self.prepared_las_filepaths, self.out_las_filepaths),
total=len(self.prepared_las_filepaths),
@@ -354,11 +354,10 @@ def _select_best_rules(self, study):
        best_rules = thresholds(**best.params)
        return best_rules

-    def _dump_best_rules(self, best_trial_params):
-        """Serializes best thresholds."""
-        with open(self.paths.building_validation_thresholds_pickle, "wb") as f:
-            pickle.dump(best_trial_params, f)
-        log.info(f"Pickled best params to {self.paths.building_validation_thresholds_pickle}")
+    def _dump_best_rules(self, best_trial_params: thresholds):
+        """Saves best thresholds to a yaml file."""
+        best_trial_params.dump(self.paths.building_validation_thresholds)
+        log.info(f"Saved best params to {self.paths.building_validation_thresholds}")

    def _dump_clusters(self, clusters):
        """Serializes the list of cluster-level information objects."""
23 changes: 22 additions & 1 deletion tests/lidar_prod/tasks/test_building_validation.py
Expand Up @@ -5,7 +5,7 @@
import numpy as np
import pytest

-from lidar_prod.tasks.building_validation import BuildingValidator
+from lidar_prod.tasks.building_validation import BuildingValidator, thresholds
from lidar_prod.tasks.utils import BDUniConnectionParams, get_las_data_from_las
from tests.conftest import (
    check_expected_classification,
@@ -171,3 +171,24 @@ def test_run(hydra_cfg):
            dims.candidate_buildings_flag,
        ],
    )


def test_thresholds():
    dump_file = str(TMP_DIR / "threshold_dump.yml")

    th = thresholds(
        min_confidence_confirmation=0.1,
        min_frac_confirmation=0.2,
        min_frac_confirmation_factor_if_bd_uni_overlay=0.3,
        min_uni_db_overlay_frac=0.4,
        min_confidence_refutation=0.5,
        min_frac_refutation=0.6,
        min_entropy_uncertainty=0.7,
        min_frac_entropy_uncertain=0.8,
    )

    th.dump(dump_file)

    th1 = th.load(dump_file)

    assert th1 == th
194 changes: 194 additions & 0 deletions tests/lidar_prod/tasks/test_building_validation_optimization.py
@@ -0,0 +1,194 @@
import os
import os.path as osp
import shutil
from pathlib import Path

import hydra
import numpy as np
import pytest

from lidar_prod.tasks.building_validation import thresholds
from lidar_prod.tasks.building_validation_optimization import (
    BuildingValidationOptimizer,
)
from lidar_prod.tasks.utils import BDUniConnectionParams
from tests.conftest import pdal_read_las_array

"""We test the building validation optimizer against two LAS:
These datasets must have the right classification codes, i.e. the ones defined in
buildings_correction_labels.
WARNING: The large LAS cannot be versionned by git. If it is absent from environment,
pytest expects the test to fail.
This is to enable a shallower run of these tests without the file.
"""

TMP_DIR = Path("tmp/lidar_prod/tasks/building_validation_optimization")


# Small LAS, for which we optimize thresholds and reach perfect validation,
# to quickly check optimization logic.
LAS_SUBSET_FILE = "tests/files/870000_6618000.subset.postIA.corrected.las"
SUBSET_EXPECTED_METRICS = {
    "exact": {
        "groups_count": 15,
        "group_no_buildings": 0.4,
    },
    "min": {
        "p_auto": 1.0,
        "recall": 1.0,
        "precision": 1.0,
    },
}
# Large LAS, for which we evaluate performance, to control that there was no regression in terms of
# automation/precision/recall of building validation.
LAS_LARGE_FILE = "tests/files/large/V0.5_792000_6272000.las"
LARGE_EXPECTED_METRICS = {
    "exact": {
        "groups_count": 1493,
        "group_no_buildings": 0.149,
        "group_building": 0.847,
    },
    "min": {
        "p_auto": 0.94,
        "recall": 0.99,
        "precision": 0.94,
    },
}

# Relative tolerance when comparing metrics to their expected value for large LAS.
# i.e. resulting metrics are >= (1-tolerance) * expected metrics for performance indicators.
RELATIVE_MIN_TOLERANCE_OF_EXPECTED_METRICS = 0.05
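# For example, with a tolerance of 0.05 and an expected minimum recall of 0.99, the
# large-file test below passes only if metrics_dict["recall"] >= 0.95 * 0.99 = 0.9405.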


def test_BVOptimization_on_subset(hydra_cfg):
    out_dir = str(TMP_DIR / "subset")
    # Optimization output (thresholds and prepared/updated LAS files) is saved to out_dir
    hydra_cfg.building_validation.optimization.paths.results_output_dir = out_dir

    # We isolate the input file in a subdir, and prepare it for optimization
    input_las_dir = osp.join(out_dir, "inputs/")
    hydra_cfg.building_validation.optimization.paths.input_las_dir = input_las_dir
    os.makedirs(input_las_dir, exist_ok=False)
    src_las_copy_path = osp.join(input_las_dir, "copy.las")
    shutil.copy(LAS_SUBSET_FILE, src_las_copy_path)

    # Check that a full optimization run can pass successfully
    bvo: BuildingValidationOptimizer = hydra.utils.instantiate(
        hydra_cfg.building_validation.optimization
    )
    bd_uni_connection_params: BDUniConnectionParams = hydra.utils.instantiate(
        hydra_cfg.bd_uni_connection_params
    )
    bvo.bv.bd_uni_connection_params = bd_uni_connection_params
    bvo.run()

    # Check that the thresholds are saved to a yaml file successfully
    th_yaml = hydra_cfg.building_validation.optimization.paths.building_validation_thresholds
    assert os.path.isfile(th_yaml)
    assert isinstance(thresholds.load(th_yaml), thresholds)

    # Assert that a prepared and an updated file are generated in the temporary dir
    # in subfolders.
    assert os.path.isfile(osp.join(out_dir, "prepared", osp.basename(src_las_copy_path)))
    updated_las_path = osp.join(out_dir, "updated", osp.basename(src_las_copy_path))
    assert os.path.isfile(updated_las_path)

    # Check the output of the evaluate method. Note that it uses the
    # prepared data and the thresholds from the previous run
    metrics_dict = bvo.evaluate()
    print(metrics_dict)
    # Assert inclusion of the exact expected metrics
    assert SUBSET_EXPECTED_METRICS["exact"].items() <= metrics_dict.items()
    # Assert that each expected minimum is reached
    for k, v in SUBSET_EXPECTED_METRICS["min"].items():
        assert v <= metrics_dict[k]
    # Update classification dimension and check if the codes are the expected ones.
    bvo.bv.use_final_classification_codes = True
    bvo.update()
    assert os.path.isfile(updated_las_path)
    arr, _ = pdal_read_las_array(updated_las_path, hydra_cfg.data_format.epsg)
    # Check that we have either 1/2 (ground/unclassified), or one of
    # the final classification codes of the module.
    final_codes = hydra_cfg.data_format.codes.building.final
    expected_codes = {
        1,
        2,
        final_codes.building,
        final_codes.not_building,
        final_codes.unsure,
    }
    actual_codes = {*np.unique(arr["Classification"])}
    assert actual_codes.issubset(expected_codes)


@pytest.mark.slow()
def test_BVOptimization_on_large_file(hydra_cfg):

    if not os.path.isfile(LAS_LARGE_FILE):
        pytest.xfail(reason=f"File {LAS_LARGE_FILE} is not present in the environment.")

    out_dir = str(TMP_DIR / "large_file")

    # Optimization output (thresholds and prepared/updated LAS files) is saved to out_dir
    hydra_cfg.building_validation.optimization.paths.results_output_dir = out_dir

    # We isolate the input file in a subdir, and prepare it for optimization
    input_las_dir = osp.join(out_dir, "inputs/")
    hydra_cfg.building_validation.optimization.paths.input_las_dir = input_las_dir
    os.makedirs(input_las_dir, exist_ok=False)
    src_las_copy_path = osp.join(input_las_dir, "copy.las")
    shutil.copy(LAS_LARGE_FILE, src_las_copy_path)

    # Check that the optimizer can be instantiated and evaluated successfully
    bvo: BuildingValidationOptimizer = hydra.utils.instantiate(
        hydra_cfg.building_validation.optimization
    )

    bd_uni_connection_params: BDUniConnectionParams = hydra.utils.instantiate(
        hydra_cfg.bd_uni_connection_params
    )
    bvo.bv.bd_uni_connection_params = bd_uni_connection_params

    bvo.prepare()
    metrics_dict = bvo.evaluate()
    print(metrics_dict)

    exact_expected_val = LARGE_EXPECTED_METRICS["exact"]
    for k in exact_expected_val:
        assert (
            pytest.approx(exact_expected_val[k], RELATIVE_MIN_TOLERANCE_OF_EXPECTED_METRICS)
            == metrics_dict[k]
        )
    min_expected_val = LARGE_EXPECTED_METRICS["min"]
    for k in min_expected_val:
        assert (
            (1 - RELATIVE_MIN_TOLERANCE_OF_EXPECTED_METRICS) * min_expected_val[k]
        ) <= metrics_dict[k]


# All expected metrics for reference:
"""
groups_count=1493
group_unsure=0.00402
group_no_buildings=0.149
group_building=0.847
p_auto=0.889
p_unsure=0.111
p_refute=0.0924
p_confirm=0.797
a_refute=0.899
a_confirm=0.976
precision=0.98
recall=0.99
Confusion Matrix
[[ 2 1 3]
[ 74 124 25]
[ 89 13 1162]]
Confusion Matrix (normalized)
[[0.333 0.167 0.5 ]
[0.332 0.556 0.112]
[0.07 0.01 0.919]]
"""