From 7c652f91162bf0f40bc03930dcc445e7848d5cf7 Mon Sep 17 00:00:00 2001
From: Konstantin Stadler
Date: Wed, 6 Nov 2024 16:21:26 +0100
Subject: [PATCH] Start auxiliary GLAM processing module

---
 doc/source/notebooks/convert.ipynb |  37 +++++---
 doc/source/notebooks/convert.py    |   4 +-
 pymrio/__init__.py                 |   3 +
 pymrio/aux/GLAM/GLAMprocessing.py  | 143 ++++++++++++++++++++++++++++-
 pymrio/aux/__init__.py             |   1 +
 pymrio/tools/__init__.py           |   1 +
 pymrio/tools/iodownloader.py       |  27 +++++-
 7 files changed, 196 insertions(+), 20 deletions(-)
 create mode 100644 pymrio/aux/__init__.py

diff --git a/doc/source/notebooks/convert.ipynb b/doc/source/notebooks/convert.ipynb
index 42c6cd54..6917f122 100644
--- a/doc/source/notebooks/convert.ipynb
+++ b/doc/source/notebooks/convert.ipynb
@@ -27,8 +27,11 @@
     "- Characterization of stressors to impact categories\n",
     "\n",
     "We will cover each of these points in the examples below.\n",
-    "We will start with applying the conversion to a single table\n",
-    "and then cover the conversion of a full MRIO extension.\n",
+    "First, [we will go through the general setup](#Basic-setup), describing the structure of the bridging/mapping table.\n",
+    "We will then cover applying the [conversion function to a single, standalone table](#Converting-standalone-tables).\n",
+    "There we show [how to rename the index/stressor names](#Renaming-the-index-of-a-single-table), do [unit conversions](#Unit-conversion),\n",
+    "and apply [global](#Global-characterization-factors) and [regional](#Regional-specific-characterization-factors) characterization factors.\n",
+    "For converting a full pymrio Extension, see the [Converting pymrio Extensions](#Converting-pymrio-Extensions) section.\n",
     "\n",
     "For the connected topic of *Aggregation of MRIOs*\n",
     "see the [Aggregation](./aggregation_examples.ipynb) page."
@@ -88,9 +91,19 @@
   {
    "cell_type": "markdown",
    "id": "ff2125d8",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "source": [
+    "## Converting standalone tables"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2cc15f8b",
    "metadata": {},
    "source": [
-    "## Renaming the index of a single table"
+    "### Renaming the index of a single table"
    ]
   },
@@ -481,7 +494,7 @@
    "id": "e5f1c043",
    "metadata": {},
    "source": [
-    "## Unit conversion"
+    "### Unit conversion"
    ]
   },
@@ -761,7 +774,7 @@
    "source": [
     "In case of unit conversion of pymrio satellite accounts,\n",
     "we can also check the unit before and set the unit after conversion:\n",
-    "TODO: unit conversion extensions"
+    "TODO: unit conversion extensions, link to extension section below"
@@ -769,7 +782,7 @@
    "id": "7024fc74",
    "metadata": {},
    "source": [
-    "## Characterization"
+    "### Characterization"
    ]
   },
@@ -1771,7 +1784,7 @@
    "source": [
     "The same principles as for individual tables can be used for converting full pymrio type Extensions (aka satellite accounts).\n",
     "In difference to the single tables, pymrio Extensions consist of several pandas DataFrames which can be converted in one go.\n",
-    "Almost the same bridge table structure as for single tables can be used. The main additional information needed is in regard to \n",
+    "Almost the same bridge table structure as for single tables can be used. The main additional information needed is in regard to\n",
     "units. Since pymrio Extensions include a unit dataframe, information about the unit names need to be included."
   ]
  },
@@ -1780,7 +1793,7 @@
    "id": "da124a31",
    "metadata": {},
    "source": [
-    "Extensions can be converted either one at a time, but the main power of the method lies in collecting stressor data across different extensions \n",
+    "Extensions can be converted one at a time, but the main power of the method lies in collecting stressor data across different extensions\n",
     "and converting them in one go."
    ]
   },
@@ -2315,7 +2328,7 @@
    "id": "93107bf8",
    "metadata": {},
    "source": [
-    "We now setup a bridge table for converting/characterizing these emission data \n",
+    "We now set up a bridge table for converting/characterizing these emission data\n",
     "to several other accounts."
    ]
   },
@@ -2640,9 +2653,7 @@
    "cell_type": "code",
    "execution_count": 72,
    "id": "63b953a2",
-   "metadata": {
-    "lines_to_next_cell": 2
-   },
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -2928,7 +2939,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.0"
+   "version": "3.9.19"
   }
  },
 "nbformat": 4,
diff --git a/doc/source/notebooks/convert.py b/doc/source/notebooks/convert.py
index ed26c7f7..e765e0c8 100644
--- a/doc/source/notebooks/convert.py
+++ b/doc/source/notebooks/convert.py
@@ -5,7 +5,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.15.2
+#       jupytext_version: 1.16.4
 #     kernelspec:
 #       display_name: Python 3 (ipykernel)
 #       language: python
@@ -384,7 +384,7 @@
 # This bridge table has to follow a specific format, depending on the table to be converted.

 # %% [markdown]
-## Converting pymrio Extensions
+# # Converting pymrio Extensions

 # %% [markdown]
 # The same principles as for individual tables can be used for converting full pymrio type Extensions (aka satellite accounts).
diff --git a/pymrio/__init__.py b/pymrio/__init__.py
index 3cba2309..c27da38b 100644
--- a/pymrio/__init__.py
+++ b/pymrio/__init__.py
@@ -80,4 +80,7 @@
     index_match,
     to_long,
 )
+
+from pymrio.aux.GLAM import GLAMprocessing
+
 from pymrio.version import __version__
diff --git a/pymrio/aux/GLAM/GLAMprocessing.py b/pymrio/aux/GLAM/GLAMprocessing.py
index ca8e6087..01678fde 100644
--- a/pymrio/aux/GLAM/GLAMprocessing.py
+++ b/pymrio/aux/GLAM/GLAMprocessing.py
@@ -6,9 +6,13 @@
 from pymrio.tools.iodownloader import _download_urls
 from pymrio.tools.iometadata import MRIOMetaData

+import zipfile
+
+import pandas as pd
+from pandas._libs.parsers import STR_NA_VALUES

 GLAM_CONFIG = {
     "V2024.10": "https://www.lifecycleinitiative.org/wp-content/uploads/2024/10/V1.0.2024.10.zip"
 }

 def get_GLAM(storage_folder, overwrite_existing=False, version="V2024.10"):
@@ -28,7 +32,7 @@ def get_GLAM(storage_folder, overwrite_existing=False, version="V2024.10"):
     be a url to the zip file to download.

     """
-    if type(storage_folder) is str:
+    if isinstance(storage_folder, str):
         storage_folder = Path(storage_folder)

     storage_folder.mkdir(exist_ok=True, parents=True)
@@ -52,3 +56,138 @@ def get_GLAM(storage_folder, overwrite_existing=False, version="V2024.10"):
     downlog.save()

     return downlog
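+
+# Usage sketch for get_GLAM (the folder name here is only an example, not
+# part of the API); with overwrite_existing=False an already downloaded
+# archive is kept instead of being fetched again:
+#
+#     downlog = get_GLAM(storage_folder="./GLAM_raw", version="V2024.10")
+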
+
+def prep_GLAM(GLAM_data, GLAM_char_table_file):
+    """Extract/read GLAM data and convert it to a valid characterization file.
+
+    This reads the data either from the GLAM zip archive or from the
+    extracted GLAM folder. It then merges all GLAM xlsx files and
+    renames the headers to make the result a valid characterization
+    table for pymrio.
+
+    Parameters
+    ----------
+    GLAM_data : Path or str
+        Path to the GLAM zip archive or the extracted GLAM folder.
+        If the ending is .zip, data is read directly from the zip archive.
+        Otherwise, the routine finds all xlsx files in the folder given
+        in GLAM_data.
+
+    GLAM_char_table_file : Path or str
+        Path to the file where the characterization table should be stored.
+
+    """
+    GLAM_subfolders = ["EQ", "HH", "SEA"]
+
+    if isinstance(GLAM_data, str):
+        GLAM_data = Path(GLAM_data)
+
+    def read_GLAM_xlsx(file):
+        # do not treat "NA" as missing - it is the ISO2 code for Namibia
+        accepted_na_values = STR_NA_VALUES - {"NA"}
+        GLAMdata = pd.read_excel(
+            file,
+            sheet_name="lciamethods_CF_GLAM",
+            keep_default_na=False,
+            na_values=accepted_na_values,
+            dtype={
+                "FLOW_uuid": str,
+                "FLOW_name": str,
+                "FLOW_casnumber": str,
+                "LCIAMethod_location": str,
+                "LCIAMethod_location_name": str,
+                "LCIAMethod_location_ISO2": str,
+                "CF": float,
+                "Unit": str,
+                "CF_Uncertainty_Lower": float,
+                "CF_Uncertainty_Higher": float,
+                "FLOW_class0": str,
+                "FLOW_class1": str,
+                "FLOW_class2": str,
+                "Species": str,
+                "LCIAMethod_realm": str,
+                "LCIAMethod_mathematicalApproach": str,
+                "Scenario": str,
+                "CF_derivation": str,
+                "Matching_CF": str,
+                "Matching_Compartment": str,
+                "LCIAMethod_type": str,
+                "LCIAMethod_name": str,
+            },
+        )
+        return GLAMdata
+
+    GLAM_collector = {}
+
+    if GLAM_data.suffix == ".zip":
+        with zipfile.ZipFile(GLAM_data, "r") as zz:
+            all_xlsx = [
+                xlsx
+                for xlsx in zz.namelist()
+                if any(subfolder in xlsx for subfolder in GLAM_subfolders)
+            ]
+            for xlsx in all_xlsx:
+                print(f"Reading {xlsx}")
+                data_name = Path(xlsx).stem
+                GLAM_collector[data_name] = read_GLAM_xlsx(zz.open(xlsx))
+
+    else:
+        # read all xlsx files in the EQ/HH/SEA subfolders
+        all_xlsx = [
+            xlsx
+            for xlsx in GLAM_data.rglob("*.xlsx")
+            if any(subfolder in xlsx.name for subfolder in GLAM_subfolders)
+        ]
+        for xlsx in all_xlsx:
+            print(f"Reading {xlsx}")
+            data_name = xlsx.stem
+            GLAM_collector[data_name] = read_GLAM_xlsx(xlsx)
+
+    GLAM_full = pd.concat(GLAM_collector, axis=0, ignore_index=True)
+
+    # TODO: base it on flow name/classes, not uuid
+    GLAM_char_col_rename = {
+        "LCIAMethod_name": "LCIAMethod_name__FLOW_uuid",
+        "LCIAMethod_realm": "LCIAMethod_realm__FLOW_uuid",
+        "LCIAMethod_location_ISO2": "region",
+        "Unit": "unit_new",
+        "CF": "factor",
+    }
+
+    GLAM_char_col_dtypes = {
+        "LCIAMethod_name__FLOW_uuid": str,
+        "LCIAMethod_realm__FLOW_uuid": str,
+        "region": str,
+        "unit_new": str,
+        "factor": float,
+    }
+
+    # Report rows with missing ISO2 location codes
+    # TODO: change to raise error
+    iso2nans = GLAM_full.loc[GLAM_full.LCIAMethod_location_ISO2.isna(), :]
+    print(f"Found {iso2nans.shape[0]} rows with nan in LCIAMethod_location_ISO2")
+
+    GLAM_res = (
+        GLAM_full.loc[
+            :,
+            [
+                "FLOW_uuid",
+                "LCIAMethod_name",
+                "LCIAMethod_realm",
+                "CF",
+                "LCIAMethod_location_ISO2",
+                "Unit",
+            ],
+        ]
+        .rename(columns=GLAM_char_col_rename)
+        .astype(GLAM_char_col_dtypes)
+    )
+
+    # make the unit conversion - this assumes the flows in EXIOBASE are already
+    # converted to the correct denominator. It will be checked (by the convert method)!
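+    #
+    # Illustration with a hypothetical entry: a CF reported in "PDF/kg"
+    # applies per kg of the original flow, so splitting the unit string
+    # keeps "kg" as unit_orig:
+    #
+    #     "PDF/kg".split("/")[1]  ->  "kg"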
+ GLAM_res.loc[:, "unit_orig"] = GLAM_res["unit_new"].str.split("/").str[1] + + # update the regions with the regex needed for EXIOBASE + + # global characterizations for Climate Change apply to all regions in EXIOBASE + GLAM_res.loc[GLAM_res.LCIAMethod_name__FLOW_uuid == "EQ Climate Change", "region"] = ( + ".*" + ) + + # using China characterization factors for Taiwan as well + GLAM_res.loc[GLAM_res.region == "CN", "region"] = "CN|TW" + + # Use the GLO value for all rest of world regions + GLAM_res.loc[GLAM_res.region == "GLO", "region"] = "WA|WL|WE|WF|WM" + + + + diff --git a/pymrio/aux/__init__.py b/pymrio/aux/__init__.py new file mode 100644 index 00000000..20674735 --- /dev/null +++ b/pymrio/aux/__init__.py @@ -0,0 +1 @@ +# Auxillary data processing and source data diff --git a/pymrio/tools/__init__.py b/pymrio/tools/__init__.py index e69de29b..9134b3eb 100644 --- a/pymrio/tools/__init__.py +++ b/pymrio/tools/__init__.py @@ -0,0 +1 @@ +# More or less generic tools for IO processing diff --git a/pymrio/tools/iodownloader.py b/pymrio/tools/iodownloader.py index 3cc1a81d..2b76ba67 100644 --- a/pymrio/tools/iodownloader.py +++ b/pymrio/tools/iodownloader.py @@ -103,12 +103,18 @@ GLORIA_CONFIG = {"datafiles": GLORIA_URLS} +HEADERS = { + # Standard headers for downloading files, just python requests gets blocked quite often + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/131.0' +} + def _get_url_datafiles( url_db_view, url_db_content, mrio_regex, access_cookie=None, + headers=HEADERS, requests_func=requests.post, ): """Urls of mrio files by parsing url content for mrio_regex @@ -129,10 +135,14 @@ def _get_url_datafiles( access_cookie: dict, optional If needed, cookie to access the database + headers: dict, optional + Header to be passed to the request_func function fetching the data + requests_func: function Function to use for retrieving the url content. 
diff --git a/pymrio/aux/__init__.py b/pymrio/aux/__init__.py
new file mode 100644
index 00000000..20674735
--- /dev/null
+++ b/pymrio/aux/__init__.py
@@ -0,0 +1 @@
+# Auxiliary data processing and source data
diff --git a/pymrio/tools/__init__.py b/pymrio/tools/__init__.py
index e69de29b..9134b3eb 100644
--- a/pymrio/tools/__init__.py
+++ b/pymrio/tools/__init__.py
@@ -0,0 +1 @@
+# More or less generic tools for IO processing
diff --git a/pymrio/tools/iodownloader.py b/pymrio/tools/iodownloader.py
index 3cc1a81d..2b76ba67 100644
--- a/pymrio/tools/iodownloader.py
+++ b/pymrio/tools/iodownloader.py
@@ -103,12 +103,18 @@
 GLORIA_CONFIG = {"datafiles": GLORIA_URLS}

+HEADERS = {
+    # Browser-like headers for downloading files - the default
+    # python-requests user agent gets blocked quite often
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/131.0"
+}
+

 def _get_url_datafiles(
     url_db_view,
     url_db_content,
     mrio_regex,
     access_cookie=None,
+    headers=HEADERS,
     requests_func=requests.post,
 ):
     """Urls of mrio files by parsing url content for mrio_regex
@@ -129,10 +135,14 @@
     access_cookie: dict, optional
         If needed, cookie to access the database

+    headers: dict, optional
+        Headers to be passed to the requests_func function fetching the data
+
     requests_func: function
         Function to use for retrieving the url content.
         Can be requests.get or requests.post
+
     Returns
     -------
     Named tuple:
@@ -143,13 +153,13 @@
     # Use post here - NB: get could be necessary for some other pages
     # but currently works for wiod and eora
     returnvalue = namedtuple("url_content", ["raw_text", "data_urls"])
-    url_text = requests_func(url_db_view, cookies=access_cookie).text
+    url_text = requests_func(url_db_view, cookies=access_cookie, headers=headers).text
     data_urls = [url_db_content + ff for ff in re.findall(mrio_regex, url_text)]
     return returnvalue(raw_text=url_text, data_urls=data_urls)


 def _download_urls(
-    url_list, storage_folder, overwrite_existing, downlog_handler, access_cookie=None
+    url_list,
+    storage_folder,
+    overwrite_existing,
+    downlog_handler,
+    access_cookie=None,
+    headers=HEADERS,
 ):
     """Save url from url_list to storage_folder
@@ -174,6 +184,10 @@
     access_cookie: cookie, optional
         Cookie to be passed to the requests.post function fetching the data

+    headers: dict, optional
+        Headers to be passed to the requests.get function fetching the data.
+        By default a Firefox user agent, set in the HEADERS variable
+

     Returns
     -------
@@ -181,18 +195,25 @@
         The downlog_handler is passed back

     """
     for url in url_list:
         filename = filename_from_url(url)
         if downlog_handler.name == "Eora":
             filename = filename.split(".zip")[0] + ".zip"
         if not overwrite_existing and filename in os.listdir(storage_folder):
+            downlog_handler._add_fileio(
+                "Skip download of existing file {}".format(filename)
+            )
             continue
         storage_file = os.path.join(storage_folder, filename)

         # Using requests here - tried with aiohttp but was actually slower
         # Also don’t use shutil.copyfileobj - corrupts zips from Eora
         # req = requests.post(url, stream=True, cookies=access_cookie)
-        req = requests.get(url, stream=True, cookies=access_cookie)
+        req = requests.get(url, stream=True, cookies=access_cookie, headers=headers)
+        if req.status_code != 200:
+            raise requests.exceptions.HTTPError(
+                "HTTP Error {} for {}".format(req.status_code, url)
+            )
         with open(storage_file, "wb") as lf:
             for chunk in req.iter_content(1024 * 5):
                 lf.write(chunk)