Skip to content

Commit

Permalink
continue notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
konstantinstadler committed Jan 29, 2025
1 parent dd4b983 commit 8a6ede1
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 72 deletions.
52 changes: 42 additions & 10 deletions doc/source/notebooks/GLAM_EXIO_link.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,17 @@
from pathlib import Path
import pymrio

import warnings
import pandas as pd
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)


# %% [markdown]
# Next, we specify where the data should be stored


# %%
DATA_ROOT = Path("/tmp/glam_exio_tutorial") # set this to your data directory
# TODO: Fix back
# DATA_ROOT = Path("/tmp/glam_exio_tutorial") # set this to your data directory
DATA_ROOT = Path("/home/konstans/tmp/glam_exio_tutorial") # set this to your data directory

EXIOBASE_STORAGE_FOLDER = DATA_ROOT / "exiobase"
GLAM_STORAGE_FOLDER = DATA_ROOT / "glam"
Expand Down Expand Up @@ -91,6 +91,13 @@
# %%
GLAM_char = pymrio.GLAMprocessing.prep_GLAM(GLAM_data=GLAM_raw)

# TODO: remove later, just for fast testing
GLAM_char_archive = GLAM_char.copy()

# TODO: remove later
# take 10000 random samples:
GLAM_char = GLAM_char_archive.sample(10000)

# %% [markdown]
# This results in a long table with all characterization factors from GLAM.
# We can then later use this table to characterize EXIOBASE flows after renaming to GLAM flow names.
Expand Down Expand Up @@ -159,13 +166,37 @@
# We are now ready to convert these stressors to GLAM flows. To do so we use the convert function of Pymrio.
# This function can be used for many more things and is [explained in detail in the notebook here](./convert.ipynb)

# TODO: remove later, just a fast way to save and load for pymrio development

EXIO3_TMP = Path(EXIOBASE_STORAGE_FOLDER / "TMP_2018")
EXIO3_TMP.mkdir(parents=True, exist_ok=True)
exio3.save_all(EXIO3_TMP, table_format="parquet")

import pymrio
import pyinstrument

exio3 = pymrio.load_all(EXIO3_TMP)
exio3.reset_all_full()

# %%
debug_bridge = exio_glam_bridge

with pyinstrument.Profiler() as p:
debug_sat = exio3.satellite.convert(
debug_bridge, new_extension_name="GLAM flows",
unit_column_orig="EXIOBASE_unit",
unit_column_new="FLOW_unit",
ignore_columns=["comment"]
)
debug_sat.F


exio3.glam_flows = exio3.satellite.convert(
exio_glam_bridge, new_extension_name="GLAM flows",
unit_column_orig="EXIOBASE_unit",
unit_column_new="FLOW_unit",
ignore_columns=["comment"]
)
exio_glam_bridge, new_extension_name="GLAM flows",
unit_column_orig="EXIOBASE_unit",
unit_column_new="FLOW_unit",
ignore_columns=["comment"]
)

# %% [markdown]
# This now gives us a new satellite account "glam_flows".
Expand All @@ -175,7 +206,6 @@

# %% [markdown]
# With flow names corresponding to GLAM flows.
# Since we already had the consumption-based accounts calculated in EXIOBASE before, we can immediately see the same for the GLAM flows.

# %%
exio3.glam_flows.D_cba
Expand Down Expand Up @@ -206,8 +236,10 @@

GLAM_char = GLAM_char.loc[GLAM_char.LCIAMethod_name__FLOW_uuid == "EQ Land use"]


# TODO: fix region error - use GLAM_char only with land use for that
# %%
# NOTE(review): when debugging, only one country (200 columns) is present, not the full dataset.
# The result must be as long as the full dataset, filled with 0 otherwise.
exio3.glam_characterized = exio3.glam_flows.convert(
GLAM_char, new_extension_name="GLAM characterized"
)
Expand Down
106 changes: 44 additions & 62 deletions pymrio/tools/ioutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -1006,7 +1006,9 @@ def check_df_map(df_orig, df_map):
# would be in effect given df_orig.
pass

import line_profiler

@line_profiler.profile
def convert(
df_orig, df_map, agg_func="sum", drop_not_bridged_index=True, ignore_columns=None
):
Expand Down Expand Up @@ -1176,74 +1178,54 @@ def convert(
# and renames by the new one (bridge.new)

already_renamed = dict()

for bridge in bridges:

# encountering a bridge with the same orig name but which should
# lead to two new index levels
if bridge.orig in already_renamed.keys():
# duplicate the index level
_index_order = list(df_collected.index.names)
df_collected.reset_index(
level=already_renamed[bridge.orig].new, inplace=True
)
df_collected[bridge.new] = df_cur_map.index.get_level_values(
bridge.raw
)[0]
if (len(df_collected.index.names) == 1) and (
df_collected.index.names[0] is None
):
df_collected.set_index(
already_renamed[bridge.orig].new,
drop=True,
append=False,
inplace=True,
)
else:
df_collected.set_index(
already_renamed[bridge.orig].new,
drop=True,
append=True,
inplace=True,
)
df_collected.set_index(bridge.new, drop=True, append=True, inplace=True)
df_collected.index = df_collected.index.reorder_levels(
_index_order + [bridge.new]
)
# already renamed the index to another one previously,
# but we need to create more index levels for the
# same original index level
new_index_value = df_cur_map.index.get_level_values(bridge.raw)[0]
_old_index = df_collected.index.to_frame()
# as we go along in order, we add them to the end of the index
_old_index.insert(len(_old_index.columns), bridge.new, new_index_value)
df_collected.index = pd.MultiIndex.from_frame(_old_index)

continue

for idx_old_names in df_collected.index.names:
if bridge.orig in idx_old_names:
# rename the index names
if isinstance(df_collected.index, pd.MultiIndex):
df_collected.index = df_collected.index.set_names(
bridge.new, level=idx_old_names
)
else:
df_collected.index = df_collected.index.set_names(
bridge.new, level=None
)

# rename the actual index values
df_collected.reset_index(level=bridge.new, inplace=True)
for row in df_cur_map.reset_index().iterrows():
new_row_name = row[1][bridge.raw]
old_row_name = row[1][bridge.orig]
df_collected.loc[:, bridge.new] = df_collected.loc[
:, bridge.new
].str.replace(pat=old_row_name, repl=new_row_name, regex=True)

# put the index back
if df_collected.index.name is None:
# The case with a single index where the previous reset index
# left only a numerical index
df_collected.set_index(
bridge.new, drop=True, append=False, inplace=True
)
else:
df_collected.set_index(
bridge.new, drop=True, append=True, inplace=True
)
already_renamed[bridge.orig] = bridge
else:

for idx_old_names in df_collected.index.names:
if bridge.orig in idx_old_names:
# rename the index names
if isinstance(df_collected.index, pd.MultiIndex):
df_collected.index = df_collected.index.set_names(
bridge.new, level=idx_old_names
)
else:
df_collected.index = df_collected.index.set_names(
bridge.new, level=None
)

# rename the actual index values
df_collected = df_collected.reset_index(level=bridge.new)
for row in df_cur_map.reset_index().iterrows():
new_row_name = row[1][bridge.raw]
old_row_name = row[1][bridge.orig]
df_collected.loc[:, bridge.new] = df_collected.loc[
:, bridge.new
].str.replace(pat=old_row_name, repl=new_row_name, regex=True)

# put the index back
if df_collected.index.name is None:
# The case with a single index where the previous reset index
# left only a numerical index
df_collected = df_collected.set_index(bridge.new, drop=True, append=False)
else:
df_collected = df_collected.set_index(bridge.new, drop=True, append=True)

already_renamed[bridge.orig] = bridge

res_collector.append(
df_collected.groupby(by=df_collected.index.names).agg(agg_func)
Expand Down

0 comments on commit 8a6ede1

Please sign in to comment.