Merge pull request #77 from Project-Resilience/eluc-ci

Create yaml file for ELUC use case CI
Project-Resilience · Mar 22, 2024 · d5619d7
2 parents a1747c7 + 4fb6f99
commit d5619d7
Showing 27 changed files with 258 additions and 206 deletions.
diff --git a/.github/workflows/eluc.yml b/.github/workflows/eluc.yml
@@ -0,0 +1,36 @@
+# This runs the unit tests for the ELUC use case
+
+name: ELUC Use Case
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: ./use_cases/eluc
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.10"
+    - name: Set PYTHONPATH
+      run: echo "PYTHONPATH=$PWD" >> $GITHUB_ENV
+    - name: Test PYTHONPATH
+      run: printenv PYTHONPATH
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pylint
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Lint with PyLint
+      run: pylint --ignore="demo" --recursive=y --fail-under=9 ./*
+    - name: Run unit tests
+      run: python -m unittest
+
diff --git a/use_cases/eluc/.pylintrc b/use_cases/eluc/.pylintrc
@@ -0,0 +1,10 @@
+[MASTER]
+ignore=demo
+
+jobs=0
+
+max-line-length=120
+
+suggestion-mode=yes
+
+good-names=X_train, X_val, X_test, y_train, y_val, y_test, X, Y, y, X_test_scaled
diff --git a/use_cases/eluc/README.md b/use_cases/eluc/README.md
@@ -23,7 +23,7 @@ BLUE simulations with committed emissions could be used to estimate the long-ter
 "Committed emissions" means all the emissions that are caused by a land-use change event are attributed to the year
 of the event.
 BLUE (bookkeeping of land use emissions) is a bookkeeping model that attributes carbon fluxes to land use activities.
-See [BLUE: Bookkeeping of land use emissions](https://doi.org/10.1002/2014GB004997) for more details.  
+See [BLUE: Bookkeeping of land use emissions](https://doi.org/10.1002/2014GB004997) for more details. 
 
 ### LUC
 

diff --git a/use_cases/eluc/data/constants.py b/use_cases/eluc/data/constants.py
@@ -9,8 +9,8 @@
 CODES_PATH = "data/codes.csv"
 
 # Different variations of land-use change columns
-LAND_USE_COLS = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per', 
-                 'pastr', 'primf', 'primn', 
+LAND_USE_COLS = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per',
+                 'pastr', 'primf', 'primn',
                  'range', 'secdf', 'secdn', 'urban']
 CROP_COLS = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per']
 LAND_USE_COLS = ["crop"] + [col for col in LAND_USE_COLS if col not in CROP_COLS]
@@ -29,7 +29,8 @@
 
 # ["United Kingdom", "France", "Germany", "Netherlands", "Belgium", "Switzerland", "Ireland"]
 EU_COUNTRIES = ["GB", "FR", "DE", "NL", "BE", "CH", "IE"]
-# ["Brazil", "Bolivia", "Paraguay", "Peru", "Ecuador", "Colombia", "Venezuela", "Guyana", "Suriname", "Uruguay", "Argentina", "Chile"]
+# ["Brazil", "Bolivia", "Paraguay", "Peru", "Ecuador", "Colombia",
+# "Venezuela", "Guyana", "Suriname", "Uruguay", "Argentina", "Chile"]
 SA_COUNTRIES = ["BR", "BO", "PY", "PE", "EC", "CO", "VE", "GY", "SR", "UY", "AR", "CL"]
 # ["United States"]
 US_COUNTRIES = ["US"]

diff --git a/use_cases/eluc/data/conversion.py b/use_cases/eluc/data/conversion.py
@@ -9,7 +9,7 @@
 from data import constants
 
 # TODO: Note: This table is not perfect and has some errors,
-# we should consider manually fixing them. I tried my best but 
+# we should consider manually fixing them. I tried my best but
 # I'm not 100% sure it's correct.
 MANUAL_MAP = {
     "INDO": 360,
@@ -57,7 +57,7 @@ def construct_countries_df():
     # Replace all the bad codes with their real ones
     for i in range(len(countries_df)):
         old_abbrev = countries_df.iloc[i]["abbrevs"]
-        if old_abbrev in MANUAL_MAP.keys() and MANUAL_MAP[old_abbrev] in codes_df["Numeric code"].unique():
+        if old_abbrev in MANUAL_MAP and MANUAL_MAP[old_abbrev] in codes_df["Numeric code"].unique():
             countries_df.iloc[i]["abbrevs"] = codes_df[codes_df["Numeric code"] == MANUAL_MAP[old_abbrev]]["Alpha-2 code"].iloc[0]
 
     return countries_df
diff --git a/use_cases/eluc/data/eluc_data.py b/use_cases/eluc/data/eluc_data.py
@@ -42,9 +42,9 @@ def encode_as_df(self, df: pd.DataFrame) -> pd.DataFrame:
                 if min_val == max_val:
                     new_df[col] = 0
                 else:
-                    new_df[col] = (new_df[col] - self.fields[col]["range"][0]) / (self.fields[col]["range"][1] - self.fields[col]["range"][0])
+                    new_df[col] = (new_df[col] - min_val) / (max_val - min_val)
         return new_df
-    
+
     def decode_as_df(self, df: pd.DataFrame) -> pd.DataFrame:
         """
         Decodes a dataframe using the fields given in the constructor.
@@ -53,7 +53,9 @@ def decode_as_df(self, df: pd.DataFrame) -> pd.DataFrame:
         new_df = df.copy()
         for col in new_df.columns:
             if col in self.fields:
-                new_df[col] = new_df[col] * (self.fields[col]["range"][1] - self.fields[col]["range"][0]) + self.fields[col]["range"][0]
+                min_val = self.fields[col]["range"][0]
+                max_val = self.fields[col]["range"][1]
+                new_df[col] = new_df[col] * (max_val - min_val) + min_val
         return new_df
 
 
@@ -87,22 +89,23 @@ def get_encoded_train(self):
         if self.encoded_train_df is None:
             self.encoded_train_df = self.encoder.encode_as_df(self.train_df)
         return self.encoded_train_df
-    
+
     def get_encoded_test(self):
         """
         Same as above but for test data.
         """
         if self.encoded_test_df is None:
             self.encoded_test_df = self.encoder.encode_as_df(self.test_df)
         return self.encoded_test_df
-    
+
     def get_fields(self) -> dict:
         """
         Creates fields json object for the data encoder/prescriptor.
         """
-        fields_df = self.train_df[constants.CAO_MAPPING["context"] + constants.CAO_MAPPING["actions"] + ["ELUC"]].astype("float64")
-        fields = dict()
-        for col in constants.CAO_MAPPING["context"] + constants.CAO_MAPPING["actions"] + ["ELUC"]:
+        cao_cols = constants.CAO_MAPPING["context"] + constants.CAO_MAPPING["actions"] + ["ELUC"]
+        fields_df = self.train_df[cao_cols].astype("float64")
+        fields = {}
+        for col in cao_cols:
             # Set range of land and diff land uses manually to their true ranges because they
             # do not need to be scaled
             if col in constants.LAND_USE_COLS:
@@ -132,28 +135,27 @@ def get_fields(self) -> dict:
             "valued": "CONTINUOUS"
         }
 
-        return fields 
-    
+        return fields
+
     def push_to_hf(self, repo_path, commit_message, token=None):
         """
         Pushes data to huggingface repo. Don't use this unless you're sure you want to update it!
         :param repo_path: Path to huggingface repo.
         """
-
         whole_df = pd.concat([self.train_df, self.test_df])
         # We get the indices as columns anyways so we can drop them
         whole_df = whole_df.drop(["lat", "lon", "time"], axis=1)
         ds = Dataset.from_pandas(whole_df)
         if not token:
             token = os.getenv("HF_TOKEN")
         ds.push_to_hub(repo_path, commit_message=commit_message, token=token)
-        
+
 
 class ELUCData(AbstractData):
     """
     Loads ELUC data from HuggingFace repo and processes it.
     """
-    
+
     def __init__(self, start_year=1851, test_year=2012, end_year=2022, countries=None):
         """
         If update_path is given, load raw data the old way using 2 files that are merged.
@@ -169,12 +171,13 @@ def __init__(self, start_year=1851, test_year=2012, end_year=2022, countries=Non
 
         self.train_df = df.loc[start_year:test_year-1]
         self.test_df = df.loc[test_year:end_year-1]
-        
+
         self.encoder = ELUCEncoder(self.get_fields())
 
     def hf_to_df(self, hf_repo):
         """
-        Loads dataset from huggingface, converts to pandas, then sets indices appropriately to time/lat/lon.
+        Loads dataset from huggingface, converts to pandas, then sets indices
+        appropriately to time/lat/lon.
         Keep old time/lat/lon columns so we can use them as features later.
         """
         ds = load_dataset(hf_repo)["train"]
@@ -194,7 +197,7 @@ def __init__(self, path, update_path, start_year=1851, test_year=2012, end_year=
 
         self.train_df = df.loc[start_year:test_year-1]
         self.test_df = df.loc[test_year:end_year-1]
-        
+
         self.encoder = ELUCEncoder(self.get_fields())
 
     def import_data(self, path, update_path):
@@ -217,15 +220,17 @@ def import_data(self, path, update_path):
             raw = raw.merge(eluc)
 
             # Shift actions back a year
-            raw_diffs = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per', 'pastr', 'primf', 'primn', 'range', 'secdf', 'secdn', 'urban']
+            raw_diffs = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per',
+                         'pastr', 'primf', 'primn', 'range',
+                         'secdf', 'secdn', 'urban']
             raw_diffs = [f"{col}_diff" for col in raw_diffs]
             raw[raw_diffs] = raw[raw_diffs].shift(time=-1)
 
             # Finds country for each cell using lat/lon coordinates
             country_mask = regionmask.defined_regions.natural_earth_v5_0_0.countries_110.mask(raw)
             raw["country"] = country_mask
         return raw
-    
+
     def da_to_df(self, da: xr.DataArray, start_year=None, end_year=None, countries=None) -> pd.DataFrame:
         """
         Converts an xarray DataArray to a pandas DataFrame.
@@ -259,10 +264,10 @@ def da_to_df(self, da: xr.DataArray, start_year=None, end_year=None, countries=N
         # Merge crops into one column because BLUE model doesn't differentiate
         df["crop"] = df[constants.CROP_COLS].sum(axis=1)
         df["crop_diff"] = df[[f"{c}_diff" for c in constants.CROP_COLS]].sum(axis=1)
-            
+
         df['country_name'] = self.countries_df.loc[df['country'], 'names'].values
-        
+
         # Drop this column we used for preprocessing (?)
         df = df.drop("mask", axis=1)
-            
+
         return df
diff --git a/use_cases/eluc/data/torch_data.py b/use_cases/eluc/data/torch_data.py
@@ -15,7 +15,7 @@ class TorchDataset(Dataset):
     :param y: labels
     """
     def __init__(self, X: np.ndarray, y: np.ndarray, device="cpu"):
-        super().__init__()   
+        super().__init__()
         self.X = torch.tensor(X, dtype=torch.float32, device=device)
         self.y = torch.tensor(y, device=device)
         assert len(self.X) == len(self.y), "X and y must have the same length"
@@ -24,4 +24,4 @@ def __len__(self):
         return len(self.X)
 
     def __getitem__(self, idx: int) -> tuple:
-        return self.X[idx], self.y[idx]
+        return self.X[idx], self.y[idx]
diff --git a/use_cases/eluc/predictors/neural_network/__init__.py b/use_cases/eluc/predictors/neural_network/__init__.py