From b092fd730f1b95bfab02ec76955add91e4e7695b Mon Sep 17 00:00:00 2001 From: Justin Johnson Date: Wed, 26 Nov 2025 09:41:21 -0600 Subject: [PATCH 1/6] Checkpoint from VS Code for coding agent session --- seals/seals_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/seals/seals_utils.py b/seals/seals_utils.py index 44ca65b..c0984a0 100644 --- a/seals/seals_utils.py +++ b/seals/seals_utils.py @@ -1529,6 +1529,8 @@ def convert_regional_change_to_coarse(regional_change_vector_path, regional_chan ### Define the allocation of the total to individual cells + # TODO If the user provides a regional_change_classes_path that has each year as a column, instead of the current format which has + # Creates a dict for each zone_id: to_allocate, which will be reclassified onto the zone ids. for year_c, year in enumerate(years): allocate_per_zone_dict = {} @@ -1539,6 +1541,7 @@ def convert_regional_change_to_coarse(regional_change_vector_path, regional_chan hb.log('Processing ' + column + ' for ' + scenario_label + ', writing to ' + output_path) regions_column_id = regions_column_id.replace('label', 'id') + # TODO Right here, we see that the merged dataframes have years in the columns (instead of a single column with year as a variable). But we actually want landuse class in the columns. Pivot this table so it works. for i, change in merged[column].items(): zone_id = int(merged[regions_column_id][i]) From ca8b299299f5a4f2dcb03ddc63fc7061711d41b5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 26 Nov 2025 15:46:20 +0000 Subject: [PATCH 2/6] Implement support for year-based CSV format in convert_regional_change_to_coarse Co-authored-by: jandrewjohnson <23742350+jandrewjohnson@users.noreply.github.com> --- seals/seals_utils.py | 145 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 115 insertions(+), 30 deletions(-) diff --git a/seals/seals_utils.py b/seals/seals_utils.py index c0984a0..d662097 100644 --- a/seals/seals_utils.py +++ b/seals/seals_utils.py @@ -1476,13 +1476,16 @@ def load_blocks_list(p, input_dir): def convert_regional_change_to_coarse(regional_change_vector_path, regional_change_classes_path, coarse_ha_per_cell_path, scenario_label, output_dir, output_filename_end, columns_to_process, regions_column_label, years, region_ids_raster_path=None, distribution_algorithm='proportional', coarse_change_raster_path=None): # Converts a regional vector change map to a coarse gridded representation. - # - Regiona_change_vector_path is a gpkg with the boundaries of the regions that are changing. + # - regional_change_vector_path is a gpkg with the boundaries of the regions that are changing. # - regional_change_classes_path is a csv with columns for each changing landuse class (in columns to process) that reports net change in hectares per polygon. - # - Coarse_ha_per_cell_path is a raster that reports the number of hectares per cell in the coarse grid. Necessary to work with unprojected data, which is our assumed form. - # - Output path is the path to save the output raster. - # - Columns to process is a list of the columns headers in the gpkg that represent net changes in LU classes. + # This CSV can be in two formats: + # 1. Class-based format (original): columns are land-use classes (e.g., 'cropland', 'forest'), rows are regions + # 2. 
Year-based format (new): columns are years (e.g., '2030', '2050'), rows are regions + class combinations with a 'class' column + # - coarse_ha_per_cell_path is a raster that reports the number of hectares per cell in the coarse grid. Necessary to work with unprojected data, which is our assumed form. + # - output_dir is the directory where output rasters will be saved. + # - columns_to_process is a list of the column headers that represent net changes in LU classes. # - distribution_algorithm is a string that indicates how to distribute the change across the cells. default is "proportional". - # - Coarse_change_raster_path is an optional path that, if provided, will be combined with the regional change to produce a combined change raster. + # - coarse_change_raster_path is an optional path that, if provided, will be combined with the regional change to produce a combined change raster. # Read both inputs regional_change_vector = gpd.read_file(regional_change_vector_path) @@ -1529,40 +1532,122 @@ def convert_regional_change_to_coarse(regional_change_vector_path, regional_chan ### Define the allocation of the total to individual cells - # TODO If the user provides a regional_change_classes_path that has each year as a column, instead of the current format which has + # Detect if the CSV has years as columns (year-based format) or classes as columns (class-based format) + # Year-based format: columns are years (e.g., '2030', '2050'), rows have region + class + # Class-based format: columns are classes (e.g., 'cropland', 'forest'), rows have region only + + # Check if any of the columns_to_process appear in the merged dataframe columns + # If not, we assume it's a year-based format that needs pivoting + has_class_columns = any(col in merged.columns for col in columns_to_process) + + # Check if any year appears as a column (as string or int) + year_columns = [col for col in merged.columns if str(col).replace('.', '').replace('-', '').isdigit()] + has_year_columns = len(year_columns) > 0 + + if not has_class_columns and has_year_columns: + # Year-based format detected - need to pivot the data + # Expected format: region_id/label, class_label, year1, year2, ..., yearN + # We need to transform it to: region_id/label, class1, class2, ..., classN (for each year) + + # Find the class column name (usually 'class', 'lulc_class', 'class_label', etc.) + class_column_candidates = ['class', 'lulc_class', 'class_label', 'land_class', 'landuse_class'] + class_column = None + for candidate in class_column_candidates: + if candidate in merged.columns: + class_column = candidate + break + + if class_column is None: + # Try to find any column that might contain class information + for col in merged.columns: + if 'class' in col.lower() and col != regions_column_label: + class_column = col + break + + if class_column is None: + raise NameError('Could not detect class column in year-based format CSV. Expected columns like "class", "lulc_class", or "class_label".') + + hb.log('Detected year-based format in CSV. Pivoting data from class column: ' + class_column) # Creates a dict for each zone_id: to_allocate, which will be reclassified onto the zone ids. 
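For intuition, the detection heuristic added above can be exercised on a toy table. This is a minimal sketch, not SEALS code: the frame, the 'nuts_label' and 'class' column names, and the values are illustrative, and columns_to_process stands in for the function argument of the same name.

    import pandas as pd

    # Toy year-based table: one row per region + class combination, one column per year.
    merged = pd.DataFrame({
        'nuts_label': ['CZ01', 'CZ01', 'CZ02', 'CZ02'],
        'class': ['cropland', 'forest', 'cropland', 'forest'],
        '2030': [10.0, -10.0, 5.0, -5.0],
        '2050': [20.0, -20.0, 8.0, -8.0],
    })
    columns_to_process = ['cropland', 'forest']

    # Same tests as the hunk above: none of the expected classes appear as
    # columns, but numeric-looking column names do, which signals the
    # year-based format that needs pivoting.
    has_class_columns = any(col in merged.columns for col in columns_to_process)
    year_columns = [col for col in merged.columns
                    if str(col).replace('.', '').replace('-', '').isdigit()]
    print(has_class_columns, year_columns)  # -> False ['2030', '2050']
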
for year_c, year in enumerate(years): allocate_per_zone_dict = {} - for column in columns_to_process: - output_path = os.path.join(output_dir, column + output_filename_end) + + # If year-based format, we need to work with data for this specific year + if not has_class_columns and has_year_columns: + # Find the column for this year (could be string or int) + year_col = None + for col in year_columns: + if str(col) == str(year) or int(col) == int(year): + year_col = col + break + + if year_col is None: + hb.log('Warning: Year ' + str(year) + ' not found in CSV columns. Available years: ' + str(year_columns)) + continue - if not hb.path_exists(output_path): - hb.log('Processing ' + column + ' for ' + scenario_label + ', writing to ' + output_path) - regions_column_id = regions_column_id.replace('label', 'id') + # For each class we need to process, extract the data from the pivoted structure + for column in columns_to_process: + output_path = os.path.join(output_dir, column + output_filename_end) - # TODO Right here, we see that the merged dataframes have years in the columns (instead of a single column with year as a variable). But we actually want landuse class in the columns. Pivot this table so it works. - - for i, change in merged[column].items(): - zone_id = int(merged[regions_column_id][i]) + if not hb.path_exists(output_path): + hb.log('Processing ' + column + ' for year ' + str(year) + ' in ' + scenario_label + ', writing to ' + output_path) + regions_column_id = regions_column_id.replace('label', 'id') + + # Filter merged dataframe for this specific class + class_data = merged[merged[class_column] == column] - if int(zone_id) in n_cells_per_zone: - n_cells = n_cells_per_zone[int(zone_id)] # BAD HACK, should be generalized to know ahead of time if it's an int or string - else: - n_cells = 0 + # Now iterate over the filtered data to get regional values for this year and class + for idx, row in class_data.iterrows(): + zone_id = int(row[regions_column_id]) + change = row[year_col] + + if int(zone_id) in n_cells_per_zone: + n_cells = n_cells_per_zone[int(zone_id)] + else: + n_cells = 0 + + if n_cells > 0 and change != 0 and not pd.isna(change): + result = change / n_cells + else: + result = 0.0 + + if 'nan' in str(result).lower() or pd.isna(result): + allocate_per_zone_dict[zone_id] = 0.0 + else: + allocate_per_zone_dict[zone_id] = result + + print('Allocate per zone dict for ' + column + ' (year ' + str(year) + '): ' + str(allocate_per_zone_dict)) + hb.reclassify_raster_hb(region_ids_raster_path, allocate_per_zone_dict, output_path, output_data_type=7, match_path=None, invoke_full_callback=False, existing_values='zero', verbose=False) + else: + # Original class-based format logic + for column in columns_to_process: + output_path = os.path.join(output_dir, column + output_filename_end) + + if not hb.path_exists(output_path): + hb.log('Processing ' + column + ' for ' + scenario_label + ', writing to ' + output_path) + regions_column_id = regions_column_id.replace('label', 'id') + + for i, change in merged[column].items(): + zone_id = int(merged[regions_column_id][i]) - if n_cells > 0 and change != 0: - result = change / n_cells - else: - result = 0.0 - - if 'nan' in str(result).lower(): - allocate_per_zone_dict[zone_id] = 0.0 - else: - allocate_per_zone_dict[zone_id] = result + if int(zone_id) in n_cells_per_zone: + n_cells = n_cells_per_zone[int(zone_id)] # BAD HACK, should be generalized to know ahead of time if it's an int or string + else: + n_cells = 0 - print('Allocate per zone dict 
for ' + column + ': ' + str(allocate_per_zone_dict)) - hb.reclassify_raster_hb(region_ids_raster_path, allocate_per_zone_dict, output_path, output_data_type=7, match_path=None, invoke_full_callback=False, existing_values='zero', verbose=False) + if n_cells > 0 and change != 0: + result = change / n_cells + else: + result = 0.0 + + if 'nan' in str(result).lower(): + allocate_per_zone_dict[zone_id] = 0.0 + else: + allocate_per_zone_dict[zone_id] = result + + print('Allocate per zone dict for ' + column + ': ' + str(allocate_per_zone_dict)) + hb.reclassify_raster_hb(region_ids_raster_path, allocate_per_zone_dict, output_path, output_data_type=7, match_path=None, invoke_full_callback=False, existing_values='zero', verbose=False) def combine_coarsified_regional_with_coarse_estimate(coarsified_path, coarse_estimate_path, combination_algorithm, output_path): ### ABANDONED? From d03ec65d506b7e75c447bcf71ad6ea72f9691bf0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 26 Nov 2025 15:50:38 +0000 Subject: [PATCH 3/6] Address code review feedback: improve error handling and consistency Co-authored-by: jandrewjohnson <23742350+jandrewjohnson@users.noreply.github.com> --- seals/seals_utils.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/seals/seals_utils.py b/seals/seals_utils.py index d662097..089a57e 100644 --- a/seals/seals_utils.py +++ b/seals/seals_utils.py @@ -1578,9 +1578,15 @@ def convert_regional_change_to_coarse(regional_change_vector_path, regional_chan # Find the column for this year (could be string or int) year_col = None for col in year_columns: - if str(col) == str(year) or int(col) == int(year): - year_col = col - break + try: + if str(col) == str(year) or int(col) == int(year): + year_col = col + break + except (ValueError, TypeError): + # Skip columns that can't be converted to int + if str(col) == str(year): + year_col = col + break if year_col is None: hb.log('Warning: Year ' + str(year) + ' not found in CSV columns. 
Available years: ' + str(year_columns))
@@ -1607,17 +1613,17 @@ def convert_regional_change_to_coarse(regional_change_vector_path, regional_chan
                     else:
                         n_cells = 0
 
-                    if n_cells > 0 and change != 0 and not pd.isna(change):
+                    if n_cells > 0 and not pd.isna(change) and change != 0:
                         result = change / n_cells
                     else:
                         result = 0.0
 
-                    if 'nan' in str(result).lower() or pd.isna(result):
+                    if pd.isna(result):
                         allocate_per_zone_dict[zone_id] = 0.0
                     else:
                         allocate_per_zone_dict[zone_id] = result
 
-                print('Allocate per zone dict for ' + column + ' (year ' + str(year) + '): ' + str(allocate_per_zone_dict))
+                hb.log('Allocate per zone dict for ' + column + ' (year ' + str(year) + '): ' + str(allocate_per_zone_dict))
                 hb.reclassify_raster_hb(region_ids_raster_path, allocate_per_zone_dict, output_path, output_data_type=7, match_path=None, invoke_full_callback=False, existing_values='zero', verbose=False)
     else:
         # Original class-based format logic
@@ -1641,12 +1647,12 @@ def convert_regional_change_to_coarse(regional_change_vector_path, regional_chan
                     else:
                         result = 0.0
 
-                    if 'nan' in str(result).lower():
+                    if pd.isna(result):
                         allocate_per_zone_dict[zone_id] = 0.0
                     else:
                         allocate_per_zone_dict[zone_id] = result
 
-                print('Allocate per zone dict for ' + column + ': ' + str(allocate_per_zone_dict))
+                hb.log('Allocate per zone dict for ' + column + ': ' + str(allocate_per_zone_dict))
                 hb.reclassify_raster_hb(region_ids_raster_path, allocate_per_zone_dict, output_path, output_data_type=7, match_path=None, invoke_full_callback=False, existing_values='zero', verbose=False)
 
 def combine_coarsified_regional_with_coarse_estimate(coarsified_path, coarse_estimate_path, combination_algorithm, output_path):
     ### ABANDONED?

From f3cf2912fe4956bad58765d4842cedb8bb631a35 Mon Sep 17 00:00:00 2001
From: Justin Johnson
Date: Wed, 26 Nov 2025 10:27:48 -0600
Subject: [PATCH 4/6] Revert "Merge pull request #10 from
 jandrewjohnson/copilot/swift-bee"

This reverts commit f9f30301af777bbf2296079a7a7134642600ed6a, reversing
changes made to fce2da7a8a248a22b3604ac6359816557aad8efd.

---
 seals/seals_utils.py | 150 ++++++++----------------------------------
 1 file changed, 28 insertions(+), 122 deletions(-)

diff --git a/seals/seals_utils.py b/seals/seals_utils.py
index 089a57e..44ca65b 100644
--- a/seals/seals_utils.py
+++ b/seals/seals_utils.py
@@ -1476,16 +1476,13 @@ def load_blocks_list(p, input_dir):
 
 def convert_regional_change_to_coarse(regional_change_vector_path, regional_change_classes_path, coarse_ha_per_cell_path, scenario_label, output_dir, output_filename_end, columns_to_process, regions_column_label, years, region_ids_raster_path=None, distribution_algorithm='proportional', coarse_change_raster_path=None):
     # Converts a regional vector change map to a coarse gridded representation.
-    # - regional_change_vector_path is a gpkg with the boundaries of the regions that are changing.
+    # - Regiona_change_vector_path is a gpkg with the boundaries of the regions that are changing.
     # - regional_change_classes_path is a csv with columns for each changing landuse class (in columns to process) that reports net change in hectares per polygon.
-    # This CSV can be in two formats:
-    # 1. Class-based format (original): columns are land-use classes (e.g., 'cropland', 'forest'), rows are regions
-    # 2. 
Year-based format (new): columns are years (e.g., '2030', '2050'), rows are regions + class combinations with a 'class' column - # - coarse_ha_per_cell_path is a raster that reports the number of hectares per cell in the coarse grid. Necessary to work with unprojected data, which is our assumed form. - # - output_dir is the directory where output rasters will be saved. - # - columns_to_process is a list of the column headers that represent net changes in LU classes. + # - Coarse_ha_per_cell_path is a raster that reports the number of hectares per cell in the coarse grid. Necessary to work with unprojected data, which is our assumed form. + # - Output path is the path to save the output raster. + # - Columns to process is a list of the columns headers in the gpkg that represent net changes in LU classes. # - distribution_algorithm is a string that indicates how to distribute the change across the cells. default is "proportional". - # - coarse_change_raster_path is an optional path that, if provided, will be combined with the regional change to produce a combined change raster. + # - Coarse_change_raster_path is an optional path that, if provided, will be combined with the regional change to produce a combined change raster. # Read both inputs regional_change_vector = gpd.read_file(regional_change_vector_path) @@ -1532,128 +1529,37 @@ def convert_regional_change_to_coarse(regional_change_vector_path, regional_chan ### Define the allocation of the total to individual cells - # Detect if the CSV has years as columns (year-based format) or classes as columns (class-based format) - # Year-based format: columns are years (e.g., '2030', '2050'), rows have region + class - # Class-based format: columns are classes (e.g., 'cropland', 'forest'), rows have region only - - # Check if any of the columns_to_process appear in the merged dataframe columns - # If not, we assume it's a year-based format that needs pivoting - has_class_columns = any(col in merged.columns for col in columns_to_process) - - # Check if any year appears as a column (as string or int) - year_columns = [col for col in merged.columns if str(col).replace('.', '').replace('-', '').isdigit()] - has_year_columns = len(year_columns) > 0 - - if not has_class_columns and has_year_columns: - # Year-based format detected - need to pivot the data - # Expected format: region_id/label, class_label, year1, year2, ..., yearN - # We need to transform it to: region_id/label, class1, class2, ..., classN (for each year) - - # Find the class column name (usually 'class', 'lulc_class', 'class_label', etc.) - class_column_candidates = ['class', 'lulc_class', 'class_label', 'land_class', 'landuse_class'] - class_column = None - for candidate in class_column_candidates: - if candidate in merged.columns: - class_column = candidate - break - - if class_column is None: - # Try to find any column that might contain class information - for col in merged.columns: - if 'class' in col.lower() and col != regions_column_label: - class_column = col - break - - if class_column is None: - raise NameError('Could not detect class column in year-based format CSV. Expected columns like "class", "lulc_class", or "class_label".') - - hb.log('Detected year-based format in CSV. Pivoting data from class column: ' + class_column) - # Creates a dict for each zone_id: to_allocate, which will be reclassified onto the zone ids. 
for year_c, year in enumerate(years): allocate_per_zone_dict = {} - - # If year-based format, we need to work with data for this specific year - if not has_class_columns and has_year_columns: - # Find the column for this year (could be string or int) - year_col = None - for col in year_columns: - try: - if str(col) == str(year) or int(col) == int(year): - year_col = col - break - except (ValueError, TypeError): - # Skip columns that can't be converted to int - if str(col) == str(year): - year_col = col - break - - if year_col is None: - hb.log('Warning: Year ' + str(year) + ' not found in CSV columns. Available years: ' + str(year_columns)) - continue + for column in columns_to_process: + output_path = os.path.join(output_dir, column + output_filename_end) - # For each class we need to process, extract the data from the pivoted structure - for column in columns_to_process: - output_path = os.path.join(output_dir, column + output_filename_end) + if not hb.path_exists(output_path): + hb.log('Processing ' + column + ' for ' + scenario_label + ', writing to ' + output_path) + regions_column_id = regions_column_id.replace('label', 'id') - if not hb.path_exists(output_path): - hb.log('Processing ' + column + ' for year ' + str(year) + ' in ' + scenario_label + ', writing to ' + output_path) - regions_column_id = regions_column_id.replace('label', 'id') - - # Filter merged dataframe for this specific class - class_data = merged[merged[class_column] == column] - - # Now iterate over the filtered data to get regional values for this year and class - for idx, row in class_data.iterrows(): - zone_id = int(row[regions_column_id]) - change = row[year_col] - - if int(zone_id) in n_cells_per_zone: - n_cells = n_cells_per_zone[int(zone_id)] - else: - n_cells = 0 - - if n_cells > 0 and not pd.isna(change) and change != 0: - result = change / n_cells - else: - result = 0.0 - - if pd.isna(result): - allocate_per_zone_dict[zone_id] = 0.0 - else: - allocate_per_zone_dict[zone_id] = result - - hb.log('Allocate per zone dict for ' + column + ' (year ' + str(year) + '): ' + str(allocate_per_zone_dict)) - hb.reclassify_raster_hb(region_ids_raster_path, allocate_per_zone_dict, output_path, output_data_type=7, match_path=None, invoke_full_callback=False, existing_values='zero', verbose=False) - else: - # Original class-based format logic - for column in columns_to_process: - output_path = os.path.join(output_dir, column + output_filename_end) - - if not hb.path_exists(output_path): - hb.log('Processing ' + column + ' for ' + scenario_label + ', writing to ' + output_path) - regions_column_id = regions_column_id.replace('label', 'id') - for i, change in merged[column].items(): - zone_id = int(merged[regions_column_id][i]) + for i, change in merged[column].items(): + zone_id = int(merged[regions_column_id][i]) + + if int(zone_id) in n_cells_per_zone: + n_cells = n_cells_per_zone[int(zone_id)] # BAD HACK, should be generalized to know ahead of time if it's an int or string + else: + n_cells = 0 - if int(zone_id) in n_cells_per_zone: - n_cells = n_cells_per_zone[int(zone_id)] # BAD HACK, should be generalized to know ahead of time if it's an int or string - else: - n_cells = 0 + if n_cells > 0 and change != 0: + result = change / n_cells + else: + result = 0.0 + + if 'nan' in str(result).lower(): + allocate_per_zone_dict[zone_id] = 0.0 + else: + allocate_per_zone_dict[zone_id] = result - if n_cells > 0 and change != 0: - result = change / n_cells - else: - result = 0.0 - - if pd.isna(result): - 
allocate_per_zone_dict[zone_id] = 0.0 - else: - allocate_per_zone_dict[zone_id] = result - - hb.log('Allocate per zone dict for ' + column + ': ' + str(allocate_per_zone_dict)) - hb.reclassify_raster_hb(region_ids_raster_path, allocate_per_zone_dict, output_path, output_data_type=7, match_path=None, invoke_full_callback=False, existing_values='zero', verbose=False) + print('Allocate per zone dict for ' + column + ': ' + str(allocate_per_zone_dict)) + hb.reclassify_raster_hb(region_ids_raster_path, allocate_per_zone_dict, output_path, output_data_type=7, match_path=None, invoke_full_callback=False, existing_values='zero', verbose=False) def combine_coarsified_regional_with_coarse_estimate(coarsified_path, coarse_estimate_path, combination_algorithm, output_path): ### ABANDONED? From 42626aac3bfa286ba28394a27ffc7b38f58de4d4 Mon Sep 17 00:00:00 2001 From: Justin Johnson Date: Wed, 26 Nov 2025 15:01:47 -0600 Subject: [PATCH 5/6] Enhance regional change functionality: add support for input override path and improve AOI handling in project_aoi --- seals/run_seals_bonn.py | 149 +++++++++++++++++++++++ seals/seals_process_coarse_timeseries.py | 4 + seals/seals_tasks.py | 21 +++- 3 files changed, 169 insertions(+), 5 deletions(-) create mode 100644 seals/run_seals_bonn.py diff --git a/seals/run_seals_bonn.py b/seals/run_seals_bonn.py new file mode 100644 index 0000000..a5d6e66 --- /dev/null +++ b/seals/run_seals_bonn.py @@ -0,0 +1,149 @@ +import os +import sys + +import hazelbean as hb +import pandas as pd + +from seals import seals_generate_base_data, seals_initialize_project, seals_main, seals_process_coarse_timeseries, seals_tasks, seals_visualization_tasks + + + +def convert_cgebox_output_to_seals_regional_projections_input(p): + + input_path = p.regional_projections_input_path + output_path = os.path.join(p.cur_dir, 'regional_projections_input_pivoted.csv') + p.regional_projections_input_override_path = output_path + + if p.run_this: + + if not hb.path_exists(output_path): + df = hb.df_read(input_path) + + # Step 1: Melt the DataFrame to convert year columns into rows. + + # Get the list of columns to unpivot (years) + years_to_unpivot = [col for col in df.columns if col.isdigit()] + # years_to_unpivot = [] + melted = df.melt( + id_vars=[p.regions_column_label, 'LandCover'], # Assumes the land cover column is named 'LandCover' + value_vars=years_to_unpivot, + var_name='year', + value_name='value' + ) + + # Step 2: Pivot the melted DataFrame. + # We set the region and year as the new index, and create new columns from 'LandCover' categories. 
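+            # The pivoted result has one row per (region, year) pair and one
+            # numeric column per LandCover category, i.e. the class-based layout
+            # that convert_regional_change_to_coarse already understands.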
+ merged_pivoted = melted.pivot_table( + index=[p.regions_column_label, 'year'], + columns='LandCover', + values='value' + ).reset_index() + + + # Now add nuts_id + merged_pivoted['nuts_id'], unique_countries = pd.factorize(merged_pivoted[p.regions_column_label]) + + # Write a new file in the task dir and reassign the project attribute to the new csv + hb.df_write(merged_pivoted, output_path) + + +def build_bonn_task_tree(p): + + # Define the project AOI + p.project_aoi_task = p.add_task(seals_tasks.project_aoi) + p.convert_cgebox_output_to_seals_regional_projections_input_task = p.add_task(convert_cgebox_output_to_seals_regional_projections_input) + + + ##### FINE PROCESSED INPUTS ##### + p.fine_processed_inputs_task = p.add_task(seals_generate_base_data.fine_processed_inputs) + p.generated_kernels_task = p.add_task(seals_generate_base_data.generated_kernels, parent=p.fine_processed_inputs_task, creates_dir=False) + p.lulc_clip_task = p.add_task(seals_generate_base_data.lulc_clip, parent=p.fine_processed_inputs_task, creates_dir=False) + p.lulc_simplifications_task = p.add_task(seals_generate_base_data.lulc_simplifications, parent=p.fine_processed_inputs_task, creates_dir=False) + p.lulc_binaries_task = p.add_task(seals_generate_base_data.lulc_binaries, parent=p.fine_processed_inputs_task, creates_dir=False) + p.lulc_convolutions_task = p.add_task(seals_generate_base_data.lulc_convolutions, parent=p.fine_processed_inputs_task, creates_dir=False) + + ##### COARSE CHANGE ##### + p.coarse_change_task = p.add_task(seals_process_coarse_timeseries.coarse_change, skip_existing=0) + p.extraction_task = p.add_task(seals_process_coarse_timeseries.coarse_extraction, parent=p.coarse_change_task, run=1, skip_existing=0) + p.coarse_simplified_task = p.add_task(seals_process_coarse_timeseries.coarse_simplified_proportion, parent=p.coarse_change_task, skip_existing=0) + p.coarse_simplified_ha_task = p.add_task(seals_process_coarse_timeseries.coarse_simplified_ha, parent=p.coarse_change_task, skip_existing=0) + p.coarse_simplified_ha_difference_from_previous_year_task = p.add_task(seals_process_coarse_timeseries.coarse_simplified_ha_difference_from_previous_year, parent=p.coarse_change_task, skip_existing=0) + + ##### REGIONAL + p.regional_change_task = p.add_task(seals_process_coarse_timeseries.regional_change) + + ##### ALLOCATION ##### + p.allocations_task = p.add_iterator(seals_main.allocations, skip_existing=0) + p.allocation_zones_task = p.add_iterator(seals_main.allocation_zones, run_in_parallel=p.run_in_parallel, parent=p.allocations_task, skip_existing=0) + p.allocation_task = p.add_task(seals_main.allocation, parent=p.allocation_zones_task, skip_existing=0) + + ##### STITCH ZONES ##### + p.stitched_lulc_simplified_scenarios_task = p.add_task(seals_main.stitched_lulc_simplified_scenarios) + + ##### VIZUALIZE EXISTING DATA ##### + p.visualization_task = p.add_task(seals_visualization_tasks.visualization) + p.lulc_pngs_task = p.add_task(seals_visualization_tasks.lulc_pngs, parent=p.visualization_task) + + + + +main = '' +if __name__ == '__main__': + + ### ------- ENVIRONMENT SETTINGS ------------------------------- + # Users should only need to edit lines in this section + + # Create a ProjectFlow Object to organize directories and enable parallel processing. + p = hb.ProjectFlow() + + # Assign project-level attributes to the p object (such as in p.base_data_dir = ... below) + # including where the project_dir and base_data are located. 
+ # The project_name is used to name the project directory below. If the directory exists, each task will not recreate + # files that already exist. + p.user_dir = os.path.expanduser('~') + p.extra_dirs = ['Files', 'seals', 'projects'] + p.project_name = 'bonn_esm_cgebox_seals' + # p.project_name = p.project_name + '_' + hb.pretty_time() # If don't you want to recreate everything each time, comment out this line. + + # Based on the paths above, set the project_dir. All files will be created in this directory. + p.project_dir = os.path.join(p.user_dir, os.sep.join(p.extra_dirs), p.project_name) + p.set_project_dir(p.project_dir) + + p.run_in_parallel = 1 # Must be set before building the task tree if the task tree has parralel iterator tasks. + + # Build the task tree via a building function and assign it to p. IF YOU WANT TO LOOK AT THE MODEL LOGIC, INSPECT THIS FUNCTION + build_bonn_task_tree(p) + + # Set the base data dir. The model will check here to see if it has everything it needs to run. + # If anything is missing, it will download it. You can use the same base_data dir across multiple projects. + # Additionally, if you're clever, you can move files generated in your tasks to the right base_data_dir + # directory so that they are available for future projects and avoids redundant processing. + # The final directory has to be named base_data to match the naming convention on the google cloud bucket. + p.base_data_dir = os.path.join(p.user_dir, 'Files/base_data') + + # ProjectFlow downloads all files automatically via the p.get_path() function. If you want it to download from a different + # bucket than default, provide the name and credentials here. Otherwise uses default public data 'gtap_invest_seals_2023_04_21'. + p.data_credentials_path = None + p.input_bucket_name = None + + ## Set defaults and generate the scenario_definitions.csv if it doesn't exist. + # SEALS will run based on the scenarios defined in a scenario_definitions.csv + # If you have not run SEALS before, SEALS will generate it in your project's input_dir. + # A useful way to get started is to to run SEALS on the test data without modification + # and then edit the scenario_definitions.csv to your project needs. + p.scenario_definitions_filename = 'standard_scenarios.csv' + p.scenario_definitions_path = os.path.join(p.input_dir, p.scenario_definitions_filename) + seals_initialize_project.initialize_scenario_definitions(p) + + # Set processing resolution: determines how large of a chunk should be processed at a time. 4 deg is about max for 64gb memory systems + p.processing_resolution = 1.0 # In degrees. Must be in pyramid_compatible_resolutions + + seals_initialize_project.set_advanced_options(p) + + p.L = hb.get_logger('test_run_seals') + hb.log('Created ProjectFlow object at ' + p.project_dir + '\n from script ' + p.calling_script + '\n with base_data set at ' + p.base_data_dir) + + p.execute() + + result = 'Done!' 
+ diff --git a/seals/seals_process_coarse_timeseries.py b/seals/seals_process_coarse_timeseries.py index 14a0d3d..13ab182 100644 --- a/seals/seals_process_coarse_timeseries.py +++ b/seals/seals_process_coarse_timeseries.py @@ -60,6 +60,10 @@ def regional_change(p): scenario_label = p.scenario_label output_filename_end = '_' + str(year) + '_' + str(previous_year) + '_ha_diff_' + p.exogenous_label + '_' + p.climate_label + '_' + p.model_label + '_' + p.counterfactual_label + '_regional_coarsified.tif' regions_column_label = p.regions_column_label + + if getattr(p, 'regional_projections_input_override_path', None): + regional_change_classes_path = p.regional_projections_input_override_path + # TODOO I should have iterated on class label here, then called the util file only on that one seals_utils.convert_regional_change_to_coarse(regional_change_vector_path, regional_change_classes_path, diff --git a/seals/seals_tasks.py b/seals/seals_tasks.py index e1fa90e..a47f0b0 100644 --- a/seals/seals_tasks.py +++ b/seals/seals_tasks.py @@ -35,11 +35,22 @@ def project_aoi(p): if not p.regions_column_label in gdf.columns: raise ValueError(f"The column '{p.regions_column_label}' is not found in the regions vector file: {p.regions_vector_path}. Please check the column name or the vector file.") - p.aoi_path = os.path.join(p.cur_dir, 'aoi_' + str(p.aoi) + '.gpkg') - p.aoi_label = p.aoi - - filter_column = p.regions_column_label # if it's exactly 3 characters, assume it's an ISO3 code. - filter_value = p.aoi + + if p.aoi == 'from_regional_projections_input_path': + # Read the csv to get the unique AOI values + df = hb.df_read(p.regional_projections_input_path) + unique_aois = list(df[p.regions_column_label].unique()) + filter_column = p.regions_column_label + filter_value = unique_aois + p.aoi_path = os.path.join(p.cur_dir, 'aoi_' + str(p.aoi) + '.gpkg') + p.aoi_label = p.aoi + else: + + p.aoi_path = os.path.join(p.cur_dir, 'aoi_' + str(p.aoi) + '.gpkg') + p.aoi_label = p.aoi + + filter_column = p.regions_column_label # if it's exactly 3 characters, assume it's an ISO3 code. + filter_value = p.aoi for current_aoi_path in hb.list_filtered_paths_nonrecursively(p.cur_dir, include_strings='aoi'): From e5bed421ddd87e1e6f19a7c1e0afd84787a0aa7b Mon Sep 17 00:00:00 2001 From: Justin Johnson Date: Fri, 5 Dec 2025 09:23:42 -0600 Subject: [PATCH 6/6] bonn done in devstack but not in release script. waiting for final feedback from bonn team. --- seals/run_seals_bonn.py | 105 +++++++++++++++-------- seals/seals_process_coarse_timeseries.py | 21 +++-- seals/seals_utils.py | 6 +- 3 files changed, 87 insertions(+), 45 deletions(-) diff --git a/seals/run_seals_bonn.py b/seals/run_seals_bonn.py index a5d6e66..7dbcf46 100644 --- a/seals/run_seals_bonn.py +++ b/seals/run_seals_bonn.py @@ -4,49 +4,84 @@ import hazelbean as hb import pandas as pd -from seals import seals_generate_base_data, seals_initialize_project, seals_main, seals_process_coarse_timeseries, seals_tasks, seals_visualization_tasks +from seals import seals_generate_base_data, seals_initialize_project, seals_main, seals_process_coarse_timeseries, seals_tasks, seals_utils, seals_visualization_tasks +# TODO +# 1. make regional projections be the change not the total. +# 2. decide how to handle project_dir in the project repo so it uses the git-cloned inputs. +# 3. 
test the setup.bat and decide if it's dumb to put it in the src (tho with git ignore listed) def convert_cgebox_output_to_seals_regional_projections_input(p): - input_path = p.regional_projections_input_path - output_path = os.path.join(p.cur_dir, 'regional_projections_input_pivoted.csv') - p.regional_projections_input_override_path = output_path + + p.regional_projections_input_override_paths = {} if p.run_this: - - if not hb.path_exists(output_path): - df = hb.df_read(input_path) - - # Step 1: Melt the DataFrame to convert year columns into rows. - - # Get the list of columns to unpivot (years) - years_to_unpivot = [col for col in df.columns if col.isdigit()] - # years_to_unpivot = [] - melted = df.melt( - id_vars=[p.regions_column_label, 'LandCover'], # Assumes the land cover column is named 'LandCover' - value_vars=years_to_unpivot, - var_name='year', - value_name='value' - ) - - # Step 2: Pivot the melted DataFrame. - # We set the region and year as the new index, and create new columns from 'LandCover' categories. - merged_pivoted = melted.pivot_table( - index=[p.regions_column_label, 'year'], - columns='LandCover', - values='value' - ).reset_index() - + for index, row in p.scenarios_df.iterrows(): + seals_utils.assign_df_row_to_object_attributes(p, row) + + if p.scenario_type != 'baseline': + input_path = p.regional_projections_input_path + output_path = os.path.join(p.cur_dir, f'regional_projections_input_pivoted_{p.exogenous_label}.csv') + + # START HERE, the override doesn't work cause it doesn't iterate over years.... use a catear? + p.regional_projections_input_override_paths[p.scenario_label] = output_path + + if not hb.path_exists(output_path): + df = hb.df_read(input_path) + + # Step 1: Melt the DataFrame to convert year columns into rows. + + # Get the list of columns to unpivot (years) + years_to_unpivot = [col for col in df.columns if col.isdigit()] + # years_to_unpivot = [] + melted = df.melt( + id_vars=[p.regions_column_label, 'LandCover'], # Assumes the land cover column is named 'LandCover' + value_vars=years_to_unpivot, + var_name='year', + value_name='value' + ) + + # Step 2: Pivot the melted DataFrame. + # We set the region and year as the new index, and create new columns from 'LandCover' categories. + merged_pivoted = melted.pivot_table( + index=[p.regions_column_label, 'year'], + columns='LandCover', + values='value' + ).reset_index() + + + # Now add nuts_id + merged_pivoted['nuts_id'], unique_countries = pd.factorize(merged_pivoted[p.regions_column_label]) + merged_pivoted['nuts_id'] = merged_pivoted['nuts_id'] + 1 + + # Define the columns for which the year-over-year change should be calculated + land_use_columns = ['cropland', 'forest', 'grassland', 'other', 'othernat', 'urban', 'water'] + + # Sort the DataFrame by 'nuts_label' and 'year' to ensure correct chronological order + df_sorted = merged_pivoted.sort_values(by=['nuts_label', 'year']) + + # Group by 'nuts_label' and calculate the difference for the specified columns + # .diff() calculates the difference from the previous row within each group + # .fillna(0) replaces the initial NaN values with 0 + df_sorted[land_use_columns] = df_sorted.groupby('nuts_label')[land_use_columns].diff().fillna(0) + + # The 'df_sorted' DataFrame now contains the year-over-year change. + # You can display the first few rows of the result for a specific region to verify. 
+ print("Year-over-year changes for CZ03:") + print(df_sorted[df_sorted['nuts_label'] == 'CZ03'].head()) + + # multiply by 1000 because i think cgebox outputs in thousands of ha + for col in land_use_columns: + df_sorted[col] = df_sorted[col] * 1000 + # 2019 2020 2021 2023 2025 2027 2029 2030 2031 2033 2035 2037 2039 2040 2041 2043 2045 2047 2049 2050 + # repeat these numbers + + # Write a new file in the task dir and reassign the project attribute to the new csv + hb.df_write(df_sorted, output_path) + - # Now add nuts_id - merged_pivoted['nuts_id'], unique_countries = pd.factorize(merged_pivoted[p.regions_column_label]) - - # Write a new file in the task dir and reassign the project attribute to the new csv - hb.df_write(merged_pivoted, output_path) - - def build_bonn_task_tree(p): # Define the project AOI diff --git a/seals/seals_process_coarse_timeseries.py b/seals/seals_process_coarse_timeseries.py index 13ab182..03b0233 100644 --- a/seals/seals_process_coarse_timeseries.py +++ b/seals/seals_process_coarse_timeseries.py @@ -61,8 +61,8 @@ def regional_change(p): output_filename_end = '_' + str(year) + '_' + str(previous_year) + '_ha_diff_' + p.exogenous_label + '_' + p.climate_label + '_' + p.model_label + '_' + p.counterfactual_label + '_regional_coarsified.tif' regions_column_label = p.regions_column_label - if getattr(p, 'regional_projections_input_override_path', None): - regional_change_classes_path = p.regional_projections_input_override_path + if getattr(p, 'regional_projections_input_override_paths', None): + regional_change_classes_path = p.regional_projections_input_override_paths[p.scenario_label] # TODOO I should have iterated on class label here, then called the util file only on that one seals_utils.convert_regional_change_to_coarse(regional_change_vector_path, @@ -102,7 +102,7 @@ def regional_change(p): regional_coarsified_raster = hb.as_array(regional_coarsified_path) # covariate_additive - covariate_additive = (current_luc_coarse_projections + regional_coarsified_raster) * 1000.0 + covariate_additive = (current_luc_coarse_projections + regional_coarsified_raster) hb.save_array_as_geotiff(covariate_additive, hb.suri(output_path_template, 'covariate_additive'), current_luc_coarse_projections_path) @@ -152,8 +152,11 @@ def regional_change(p): covariate_sum_shift_path = hb.suri(output_path_template, 'covariate_sum_shift') input = ((regional_coarsified_path, 1), (current_luc_coarse_projections_path, 1), (target_raster_path, 1)) + + print('WARNING! sometimes need to use *1000 here to convert from ha to m2. Check if this is desired behavior. GTAP NEEDS THIS OTHERS DONT') def op(a, b, c): - return (a - (c-b)) * 1000.0 + return (a - (c-b)) + # return (a - (c-b)) * 1000 hb.raster_calculator(input, op, covariate_sum_shift_path, 7, -9999.) #### COVARIATE MULTIPLY SHIFT @@ -230,7 +233,7 @@ def op(a, b, c): covariate_multiply_shift_path = hb.suri(output_path_template, 'covariate_multiply_shift') input = ((current_luc_coarse_projections_path, 1), (covariate_multiply_regional_change_sum_path, 1), (covariate_multiply_regional_change_sum_path, 1)) def op(a, b, c): - return (a * (b/c)) * 1000.0 + return (a * (b/c)) hb.raster_calculator(input, op, covariate_multiply_shift_path, 7, -9999.) # This is the one i want to use so also save it as the template. Can choose from different algorithms above. 
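As a quick sanity check on the covariate_sum_shift hunks above, the op can be reproduced on toy arrays. A minimal sketch, with array roles following the raster_calculator input tuple (a = regional coarsified change, b = current coarse projection, c = target raster), purely illustrative values, and the removed * 1000.0 scaling left out, as in the new version:

    import numpy as np

    # a = regional coarsified change, b = current coarse projection, c = target raster
    a = np.array([10.0, -5.0, 0.0])
    b = np.array([100.0, 50.0, 20.0])
    c = np.array([105.0, 48.0, 20.0])

    # The regional change minus whatever shift already separates target from current.
    covariate_sum_shift = a - (c - b)
    print(covariate_sum_shift)  # -> [ 5. -3.  0.]
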
@@ -238,7 +241,7 @@ def op(a, b, c): hb.path_copy(alg_to_use_path, output_path_template) 5 # Given all of these, copy the one that we want to use to the name without a label - + # 2019 2020 2021 2023 2025 2027 2029 2030 2031 2033 2035 2037 2039 2040 2041 2043 2045 2047 2049 2050 else: @@ -342,7 +345,7 @@ def regional_change_new_test(p): regional_coarsified_raster = hb.as_array(regional_coarsified_path) # covariate_additive - covariate_additive = (current_luc_coarse_projections + regional_coarsified_raster) * 1000.0 + covariate_additive = (current_luc_coarse_projections + regional_coarsified_raster) hb.save_array_as_geotiff(covariate_additive, hb.suri(output_path_template, 'covariate_additive'), current_luc_coarse_projections_path) @@ -393,7 +396,7 @@ def regional_change_new_test(p): covariate_sum_shift_path = hb.suri(output_path_template, 'covariate_sum_shift') input = ((regional_coarsified_path, 1), (current_luc_coarse_projections_path, 1), (target_raster_path, 1)) def op(a, b, c): - return (a - (c-b)) * 1000 + return (a - (c-b)) hb.raster_calculator(input, op, covariate_sum_shift_path, 7, -9999.) #### COVARIATE MULTIPLY SHIFT @@ -469,7 +472,7 @@ def op(a, b, c): covariate_multiply_shift_path = hb.suri(output_path_template, 'covariate_multiply_shift') input = ((current_luc_coarse_projections_path, 1), (covariate_multiply_regional_change_sum_path, 1), (covariate_multiply_regional_change_sum_path, 1)) def op(a, b, c): - return (a * (b/c)) * 1000 + return (a * (b/c)) hb.raster_calculator(input, op, covariate_multiply_shift_path, 7, -9999.) if p.region_to_coarse_algorithm == 'covariate_additive': diff --git a/seals/seals_utils.py b/seals/seals_utils.py index 44ca65b..1e8c1d4 100644 --- a/seals/seals_utils.py +++ b/seals/seals_utils.py @@ -1508,7 +1508,11 @@ def convert_regional_change_to_coarse(regional_change_vector_path, regional_chan if region_ids_raster_path is None: region_ids_raster_path = os.path.join(output_dir, 'region_ids.tif') + + merged_vector_output_path = os.path.join(os.path.split(region_ids_raster_path)[0], 'regional_change_with_ids_' + scenario_label + '.gpkg') + + merged.to_file(merged_vector_output_path, driver='GPKG') ### Rasterize the regions vector to a raster. @@ -1519,7 +1523,7 @@ def convert_regional_change_to_coarse(regional_change_vector_path, regional_chan # TODOO NOTE that here we are not using all_touched. This is a fundamental problem with coarse reclassification. Lots of the polygon will be missed. Ideally, you use all_touched=False for # country-country borders but all_touched=True for country-coastline boarders. Or join with EEZs? - hb.rasterize_to_match(regional_change_vector_path, coarse_ha_per_cell_path, region_ids_raster_path, burn_column_name=regions_column_id, burn_values=None, datatype=13, ndv=0, all_touched=False) + hb.rasterize_to_match(merged_vector_output_path, coarse_ha_per_cell_path, region_ids_raster_path, burn_column_name=regions_column_id, burn_values=None, datatype=13, ndv=0, all_touched=False) ### Get the number of cells per zone.
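
The series ends inside convert_regional_change_to_coarse, just before the per-zone cell counts are used. For reference, the allocation step that follows in that function (each zone's total change divided evenly over its coarse cells, then painted back through the zone-id raster) reduces to the minimal sketch below; the dictionaries and the 3x3 grid are toy stand-ins, and the final loop stands in for hb.reclassify_raster_hb with existing_values='zero':

    import numpy as np

    # Toy inputs: net change in ha per zone, coarse cell counts per zone, and a
    # zone-id grid where 0 means outside every region polygon.
    change_per_zone = {1: 1200.0, 2: -300.0}
    n_cells_per_zone = {1: 4, 2: 3}
    zone_ids = np.array([[1, 1, 2],
                         [1, 1, 2],
                         [0, 2, 0]])

    # Spread each zone's total evenly over its cells, as the allocation loop does.
    allocate_per_zone_dict = {}
    for zone, change in change_per_zone.items():
        n_cells = n_cells_per_zone.get(zone, 0)
        allocate_per_zone_dict[zone] = change / n_cells if n_cells > 0 else 0.0

    # Paint the per-cell values onto the grid; unmapped ids stay at zero.
    coarse = np.zeros(zone_ids.shape, dtype=float)
    for zone, value in allocate_per_zone_dict.items():
        coarse[zone_ids == zone] = value
    print(coarse)  # 300 ha per cell in zone 1, -100 ha per cell in zone 2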