Skip to content

Commit

Permalink
.
Browse files Browse the repository at this point in the history
  • Loading branch information
thornoe committed Jan 16, 2024
1 parent 7ccdf47 commit 74151a1
Show file tree
Hide file tree
Showing 11 changed files with 7,044 additions and 6,979 deletions.
93 changes: 38 additions & 55 deletions gis/imputation.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,27 +19,20 @@
########################################################################################
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm

# import matplotlib.colors
# import seaborn as sns
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_iterative_imputer # noqa
from sklearn.impute import IterativeImputer
from sklearn.metrics import accuracy_score

# from sklearn.kernel_approximation import Nystroem
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.neighbors import KNeighborsRegressor


# Function for score
def AccuracyScore(y_true, y_pred):
"""Return average precision (AP) of predicted ecological status compared to known ecological status after converting DVFI fauna index for streams to index of ecological status."""
eco_true, eco_pred = [], []
"""Convert DVFI fauna index for streams to index of ecological status and return accuracy score, i.e., the share of observed streams each year where predicted ecological status matches the true ecological status (which LOO-CV omits)."""
eco_true, eco_pred = [], [] # empy lists for storing transformed observations
for a, b in zip([y_true, y_pred], [eco_true, eco_pred]):
# Categorical variable for ecological status: Bad, Poor, Moderate, Good, High
conditions = [
Expand All @@ -66,10 +59,6 @@ def AccuracyScore(y_true, y_pred):

# Create typology dummies
df_VP = pd.read_csv("output\\streams_VP.csv", index_col="wb")
for RW in ["RW1", "RW2", "RW3", "RW4", "RW5", "RW6"]:
len(df_VP[df_VP.ov_typ == RW]), round(
100 * len(df_VP[df_VP.ov_typ == RW]) / len(df_VP)
)
d = pd.get_dummies(df_VP["ov_typ"]).astype(int)
d["softBottom"] = d["RW4"] + d["RW5"]
d.columns = [
Expand All @@ -82,13 +71,19 @@ def AccuracyScore(y_true, y_pred):
]

# Merge DataFrames for typology and DVFI fauna index
dfTypology = dfIndicator.merge(
d[["small", "medium", "large", "softBottom"]], how="inner", on="wb"
)
col = ["small", "medium", "large", "softBottom"]
dfTypology = dfIndicator.merge(d[col], how="inner", on="wb")

# Typology for observed water bodies by year
typ = pd.DataFrame(index=years) # empty DataFrame to store typology by year
for c in col:
typ[c] = 100 * dfTypology[dfTypology[c] == 1].count() / dfTypology.count()
typ.loc["all", c] = 100 * len(d[d[c] == 1]) / len(d)
typ.to_csv("output/streams_VP_stats.csv")


########################################################################################
# 2. Multivariate feature imputation - leave-one-out CV makes it very slow!
# 2. Multivariate feature imputation (note: LOO CV takes ~1 day to run for each model)
########################################################################################
# Iterative imputer using the BayesianRidge() estimator with increased tolerance
imputer = IterativeImputer(random_state=0, tol=1e-1)
Expand Down Expand Up @@ -119,16 +114,26 @@ def AccuracyScore(y_true, y_pred):
# 1992: [np.nan, 1.4, 1.9, 2.4, 2.9],
# }
# )
# years = dfIndicator.columns
# dfTypology = dfIndicator.copy()
# dfTypology["small"] = [1, 1, 0, 0, 0]
# years = dfIndicator.columns

# Data subset to measure the prediction improvement from including 1988
# dfIndicator = dfIndicator.drop(columns=1988)
# dfTypology = dfTypology.drop(columns=1988)
# years = [1989]
# accuracy dfIndicator & dfTypology in 1989 drops by up to if omitting year 1988
# 0.6073825503355704 & 0.6191275167785235 respectively

# DataFrame for storing accuracy scores by year and calculating weighted average
scores = pd.DataFrame(dfIndicator.count(), index=years, columns=["obs"])

# Leave-one-out cross-validation (LOO-CV) loop over every observed stream and year
scores = pd.DataFrame() # empty DataFrame for accuracy scores by year
dfIndicator.name = "No typology"
dfTypology.name = "Typology"
for df in (dfIndicator, dfTypology):
print(df.name, "used for imputation. LOO-CV applied to observed streams each year:")
print("Total:", (scores[df.name] * scores["obs"]).sum() / scores["obs"].sum())
print(df.name, "used for imputation. LOO-CV of observed streams each year:")
for t in tqdm.tqdm(years):
y = df[df[t].notnull()].index
Y = pd.DataFrame(index=y)
Expand All @@ -144,48 +149,26 @@ def AccuracyScore(y_true, y_pred):
Y.loc[i, "pred"] = X_imp.loc[i, t]

# Accuracy of ecological status after converting DVFI fauna index for streams
acc = AccuracyScore(Y["true"], Y["pred"])
accuracy = AccuracyScore(Y["true"], Y["pred"])

# Save accuracy score each year to DataFrame for scores
scores.loc[t, df.name] = acc
scores.loc[t, df.name] = accuracy

scores
# Total accuracy (weighted by number of observations)
print("Total:", (scores[df.name] * scores["obs"]).sum() / scores["obs"].sum())
# accuracy dfIndicator - accuracy dfTypology (improvement of .14 percentage points)
# 0.6241328429537296 - 0.6255202942277622

# Save accuracy to CSV
# Save accuracy scores to CSV
scores.to_csv("output/streams_eco_imp_accuracy.csv")

# Total accuracy, i.e., weight by observations: dfIndicator.count()

########################################################################################
# 4. Visualization: Distribution by year #
########################################################################################
b = "#1f77b4" # blue from d3.scale.category10c()
o = "#ff7f0e" # orange from d3.scale.category10c()


# plot accuracy scores - find pd.plot() bar example!; wider page margins in appendix?
fig, ax = plt.figure(figsize=(12, 7.4)) # create new figure
plt.bar(ax - 0.2, scores["No typology"], 0.4, label="No typology")
plt.bar(ax + 0.2, scores["Typology"], 0.4, label="Typology")
ax.set(title="Accuracy of imputation using LOO-CV", ylabel="Accuracy score")
ax.set_xticks(np.array(years))
plt.legend()
plt.show()
fig.savefig("output/streams_eco_imp_accuracy.pdf", bbox_inches="tight")
# Read accuracy scores from CSV
scores = pd.read_csv("output/streams_eco_imp_accuracy.csv", index_col=0)
s = scores[["No typology", "Typology"]]

for t in xval:
ax.bar(
t,
scores[t],
xerr=stds_diabetes[t],
color=colors[t],
alpha=0.6,
align="center",
)

ax.set_title("Imputation Techniques with Diabetes Data")
ax.set_xlim(left=np.min(mses_diabetes) * 0.9, right=np.max(mses_diabetes) * 1.1)
ax.set_yticks(xval)
ax.set_xlabel("MSE")
ax.invert_yaxis()
ax.set_yticklabels(x_labels)
# Plot accuracy scores
fig = s.plot(kind="bar", figsize=(12, 6), ylabel="Accuracy score").get_figure()
fig.savefig("output/streams_eco_imp_accuracy.pdf", bbox_inches="tight")
35 changes: 35 additions & 0 deletions gis/output/streams_VP_stats.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
,small,medium,large,softBottom
1988,63.28125,33.125,2.8125,0.78125
1989,58.7248322147651,37.58389261744966,3.0201342281879193,0.6711409395973155
1990,61.75115207373272,34.86943164362519,2.9185867895545314,0.4608294930875576
1991,62.10350584307179,34.390651085141904,3.005008347245409,0.5008347245409015
1992,44.31934493346981,48.925281473899695,5.527123848515865,1.2282497441146367
1993,46.25550660792952,46.96035242290749,4.845814977973569,1.9383259911894273
1994,47.23101265822785,46.04430379746835,4.034810126582278,2.689873417721519
1995,46.56716417910448,47.53731343283582,3.5074626865671643,2.388059701492537
1996,46.88221709006928,46.805234795996924,3.849114703618168,2.4634334103156275
1997,44.067796610169495,48.92141756548536,4.545454545454546,2.4653312788906008
1998,47.22222222222222,46.36752136752137,4.344729344729345,2.0655270655270654
1999,46.44107351225204,47.0828471411902,4.3173862310385065,2.1586931155192532
2000,51.3051305130513,42.484248424842484,3.7353735373537353,2.4752475247524752
2001,50.023030861354215,43.98894518654998,3.546752648549056,2.4412713035467526
2002,49.20494699646643,44.21378091872791,4.019434628975265,2.5618374558303887
2003,46.98852772466539,46.46271510516252,4.3021032504780115,2.2466539196940727
2004,50.16519823788546,42.951541850220266,3.909691629955947,2.973568281938326
2005,45.85908529048208,46.9097651421508,4.511742892459827,2.719406674907293
2006,48.65042791310072,44.04213298222515,4.937458854509546,2.369980250164582
2007,45.66133108677338,46.58803706823926,5.475989890480202,2.274641954507161
2008,36.417910447761194,54.72636815920398,5.5721393034825875,3.283582089552239
2009,40.433925049309664,51.18343195266272,5.9171597633136095,2.465483234714004
2010,50.21422450728363,42.673521850899746,4.284490145672665,2.827763496143959
2011,53.203743700503956,40.244780417566595,4.031677465802736,2.5197984161267097
2012,47.003154574132495,45.74132492113565,4.495268138801261,2.7602523659305995
2013,44.160583941605836,47.66423357664234,4.671532846715328,3.5036496350364965
2014,55.47355473554735,36.039360393603936,4.612546125461255,3.874538745387454
2015,44.80349344978166,46.98689956331878,5.152838427947598,3.056768558951965
2016,44.37194127243067,48.28711256117455,5.791190864600326,1.5497553017944534
2017,62.55119953188999,30.251609128145112,3.920421299005266,3.2767700409596254
2018,59.14915427985648,35.161455663762176,2.92157867760123,2.7678113787801126
2019,47.345454545454544,46.03636363636364,4.1454545454545455,2.4727272727272727
2020,52.13864306784661,39.233038348082594,5.3097345132743365,3.3185840707964602
all,65.56765627331046,28.270923467104282,2.2676413546173357,3.893778904967925
Loading

0 comments on commit 74151a1

Please sign in to comment.