.

thornoe · Jan 16, 2024 · 74151a1 · 74151a1
1 parent 7ccdf47
commit 74151a1
Show file tree

Hide file tree

Showing 11 changed files with 7,044 additions and 6,979 deletions.
diff --git a/gis/imputation.py b/gis/imputation.py
@@ -19,27 +19,20 @@
 ########################################################################################
 import os
 
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import tqdm
 
-# import matplotlib.colors
-# import seaborn as sns
 # To use this experimental feature, we need to explicitly ask for it:
 from sklearn.experimental import enable_iterative_imputer  # noqa
 from sklearn.impute import IterativeImputer
 from sklearn.metrics import accuracy_score
 
-# from sklearn.kernel_approximation import Nystroem
-# from sklearn.ensemble import RandomForestRegressor
-# from sklearn.neighbors import KNeighborsRegressor
-
 
 # Function for score
 def AccuracyScore(y_true, y_pred):
-    """Return average precision (AP) of predicted ecological status compared to known ecological status after converting DVFI fauna index for streams to index of ecological status."""
-    eco_true, eco_pred = [], []
+    """Convert DVFI fauna index for streams to index of ecological status and return accuracy score, i.e., the share of observed streams each year where predicted ecological status matches the true ecological status (which LOO-CV omits)."""
+    eco_true, eco_pred = [], []  #  empy lists for storing transformed observations
     for a, b in zip([y_true, y_pred], [eco_true, eco_pred]):
         # Categorical variable for ecological status: Bad, Poor, Moderate, Good, High
         conditions = [
@@ -66,10 +59,6 @@ def AccuracyScore(y_true, y_pred):
 
 # Create typology dummies
 df_VP = pd.read_csv("output\\streams_VP.csv", index_col="wb")
-for RW in ["RW1", "RW2", "RW3", "RW4", "RW5", "RW6"]:
-    len(df_VP[df_VP.ov_typ == RW]), round(
-        100 * len(df_VP[df_VP.ov_typ == RW]) / len(df_VP)
-    )
 d = pd.get_dummies(df_VP["ov_typ"]).astype(int)
 d["softBottom"] = d["RW4"] + d["RW5"]
 d.columns = [
@@ -82,13 +71,19 @@ def AccuracyScore(y_true, y_pred):
 ]
 
 # Merge DataFrames for typology and DVFI fauna index
-dfTypology = dfIndicator.merge(
-    d[["small", "medium", "large", "softBottom"]], how="inner", on="wb"
-)
+col = ["small", "medium", "large", "softBottom"]
+dfTypology = dfIndicator.merge(d[col], how="inner", on="wb")
+
+# Typology for observed water bodies by year
+typ = pd.DataFrame(index=years)  #  empty DataFrame to store typology by year
+for c in col:
+    typ[c] = 100 * dfTypology[dfTypology[c] == 1].count() / dfTypology.count()
+    typ.loc["all", c] = 100 * len(d[d[c] == 1]) / len(d)
+typ.to_csv("output/streams_VP_stats.csv")
 
 
 ########################################################################################
-#   2. Multivariate feature imputation - leave-one-out CV makes it very slow!
+#   2. Multivariate feature imputation (note: LOO CV takes ~1 day to run for each model)
 ########################################################################################
 # Iterative imputer using the BayesianRidge() estimator with increased tolerance
 imputer = IterativeImputer(random_state=0, tol=1e-1)
@@ -119,16 +114,26 @@ def AccuracyScore(y_true, y_pred):
 #         1992: [np.nan, 1.4, 1.9, 2.4, 2.9],
 #     }
 # )
-# years = dfIndicator.columns
 # dfTypology = dfIndicator.copy()
 # dfTypology["small"] = [1, 1, 0, 0, 0]
+# years = dfIndicator.columns
+
+# Data subset to measure the prediction improvement from including 1988
+# dfIndicator = dfIndicator.drop(columns=1988)
+# dfTypology = dfTypology.drop(columns=1988)
+# years = [1989]
+# accuracy dfIndicator & dfTypology in 1989 drops by  up to if omitting year 1988
+#   0.6073825503355704 & 0.6191275167785235 respectively
+
+# DataFrame for storing accuracy scores by year and calculating weighted average
+scores = pd.DataFrame(dfIndicator.count(), index=years, columns=["obs"])
 
 # Leave-one-out cross-validation (LOO-CV) loop over every observed stream and year
-scores = pd.DataFrame()  #  empty DataFrame for accuracy scores by year
 dfIndicator.name = "No typology"
 dfTypology.name = "Typology"
 for df in (dfIndicator, dfTypology):
-    print(df.name, "used for imputation. LOO-CV applied to observed streams each year:")
+    print("Total:", (scores[df.name] * scores["obs"]).sum() / scores["obs"].sum())
+    print(df.name, "used for imputation. LOO-CV of observed streams each year:")
     for t in tqdm.tqdm(years):
         y = df[df[t].notnull()].index
         Y = pd.DataFrame(index=y)
@@ -144,48 +149,26 @@ def AccuracyScore(y_true, y_pred):
             Y.loc[i, "pred"] = X_imp.loc[i, t]
 
         # Accuracy of ecological status after converting DVFI fauna index for streams
-        acc = AccuracyScore(Y["true"], Y["pred"])
+        accuracy = AccuracyScore(Y["true"], Y["pred"])
 
         # Save accuracy score each year to DataFrame for scores
-        scores.loc[t, df.name] = acc
+        scores.loc[t, df.name] = accuracy
 
-scores
+    # Total accuracy (weighted by number of observations)
+    print("Total:", (scores[df.name] * scores["obs"]).sum() / scores["obs"].sum())
+    # accuracy dfIndicator - accuracy dfTypology (improvement of .14 percentage points)
+    #   0.6241328429537296 - 0.6255202942277622
 
-# Save accuracy to CSV
+# Save accuracy scores to CSV
 scores.to_csv("output/streams_eco_imp_accuracy.csv")
 
-# Total accuracy, i.e., weight by observations: dfIndicator.count()
-
 ########################################################################################
 #   4. Visualization: Distribution by year                                    #
 ########################################################################################
-b = "#1f77b4"  #  blue from d3.scale.category10c()
-o = "#ff7f0e"  #  orange from d3.scale.category10c()
-
-
-# plot accuracy scores - find pd.plot() bar example!; wider page margins in appendix?
-fig, ax = plt.figure(figsize=(12, 7.4))  #  create new figure
-plt.bar(ax - 0.2, scores["No typology"], 0.4, label="No typology")
-plt.bar(ax + 0.2, scores["Typology"], 0.4, label="Typology")
-ax.set(title="Accuracy of imputation using LOO-CV", ylabel="Accuracy score")
-ax.set_xticks(np.array(years))
-plt.legend()
-plt.show()
-fig.savefig("output/streams_eco_imp_accuracy.pdf", bbox_inches="tight")
+# Read accuracy scores from CSV
+scores = pd.read_csv("output/streams_eco_imp_accuracy.csv", index_col=0)
+s = scores[["No typology", "Typology"]]
 
-for t in xval:
-    ax.bar(
-        t,
-        scores[t],
-        xerr=stds_diabetes[t],
-        color=colors[t],
-        alpha=0.6,
-        align="center",
-    )
-
-ax.set_title("Imputation Techniques with Diabetes Data")
-ax.set_xlim(left=np.min(mses_diabetes) * 0.9, right=np.max(mses_diabetes) * 1.1)
-ax.set_yticks(xval)
-ax.set_xlabel("MSE")
-ax.invert_yaxis()
-ax.set_yticklabels(x_labels)
+# Plot accuracy scores
+fig = s.plot(kind="bar", figsize=(12, 6), ylabel="Accuracy score").get_figure()
+fig.savefig("output/streams_eco_imp_accuracy.pdf", bbox_inches="tight")
diff --git a/gis/output/streams_VP_stats.csv b/gis/output/streams_VP_stats.csv
@@ -0,0 +1,35 @@
+,small,medium,large,softBottom
+1988,63.28125,33.125,2.8125,0.78125
+1989,58.7248322147651,37.58389261744966,3.0201342281879193,0.6711409395973155
+1990,61.75115207373272,34.86943164362519,2.9185867895545314,0.4608294930875576
+1991,62.10350584307179,34.390651085141904,3.005008347245409,0.5008347245409015
+1992,44.31934493346981,48.925281473899695,5.527123848515865,1.2282497441146367
+1993,46.25550660792952,46.96035242290749,4.845814977973569,1.9383259911894273
+1994,47.23101265822785,46.04430379746835,4.034810126582278,2.689873417721519
+1995,46.56716417910448,47.53731343283582,3.5074626865671643,2.388059701492537
+1996,46.88221709006928,46.805234795996924,3.849114703618168,2.4634334103156275
+1997,44.067796610169495,48.92141756548536,4.545454545454546,2.4653312788906008
+1998,47.22222222222222,46.36752136752137,4.344729344729345,2.0655270655270654
+1999,46.44107351225204,47.0828471411902,4.3173862310385065,2.1586931155192532
+2000,51.3051305130513,42.484248424842484,3.7353735373537353,2.4752475247524752
+2001,50.023030861354215,43.98894518654998,3.546752648549056,2.4412713035467526
+2002,49.20494699646643,44.21378091872791,4.019434628975265,2.5618374558303887
+2003,46.98852772466539,46.46271510516252,4.3021032504780115,2.2466539196940727
+2004,50.16519823788546,42.951541850220266,3.909691629955947,2.973568281938326
+2005,45.85908529048208,46.9097651421508,4.511742892459827,2.719406674907293
+2006,48.65042791310072,44.04213298222515,4.937458854509546,2.369980250164582
+2007,45.66133108677338,46.58803706823926,5.475989890480202,2.274641954507161
+2008,36.417910447761194,54.72636815920398,5.5721393034825875,3.283582089552239
+2009,40.433925049309664,51.18343195266272,5.9171597633136095,2.465483234714004
+2010,50.21422450728363,42.673521850899746,4.284490145672665,2.827763496143959
+2011,53.203743700503956,40.244780417566595,4.031677465802736,2.5197984161267097
+2012,47.003154574132495,45.74132492113565,4.495268138801261,2.7602523659305995
+2013,44.160583941605836,47.66423357664234,4.671532846715328,3.5036496350364965
+2014,55.47355473554735,36.039360393603936,4.612546125461255,3.874538745387454
+2015,44.80349344978166,46.98689956331878,5.152838427947598,3.056768558951965
+2016,44.37194127243067,48.28711256117455,5.791190864600326,1.5497553017944534
+2017,62.55119953188999,30.251609128145112,3.920421299005266,3.2767700409596254
+2018,59.14915427985648,35.161455663762176,2.92157867760123,2.7678113787801126
+2019,47.345454545454544,46.03636363636364,4.1454545454545455,2.4727272727272727
+2020,52.13864306784661,39.233038348082594,5.3097345132743365,3.3185840707964602
+all,65.56765627331046,28.270923467104282,2.2676413546173357,3.893778904967925