Commit 00792f9

Merge pull request #10 from drewmee/develop
Develop
2 parents 3de48b4 + 6c0cdd4 commit 00792f9

14 files changed: +80,650 additions, −1,020 deletions

README.md

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
 [![pypi version](https://img.shields.io/pypi/v/pyeem.svg 'pypi version')](https://pypi.org/project/pyeem/)
 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pyeem.svg)](https://pypi.org/project/pyeem/)
 [![CircleCI](https://circleci.com/gh/drewmee/PyEEM.svg?style=shield&circle-token=ccdb16078dcb8ee4e4c9b923f547fc7cb2742aae)](https://app.circleci.com/pipelines/github/drewmee/PyEEM)
-[![Read the Docs](https://readthedocs.org/projects/drewmee-demo/badge/?version=latest)](https://pyeem.readthedocs.io/)
+[![Read the Docs](https://readthedocs.org/projects/pyeem/badge/?version=latest)](https://pyeem.readthedocs.io/)
 [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/drewmee/PyEEM/master?filepath=docs%2Fsource%2Ftutorials%2Fnotebooks)
 [![license](https://img.shields.io/github/license/mashape/apistatus.svg)](https://github.com/drewmee/PyEEM/blob/master/LICENSE)
 <!--- Badge for codecov -->
@@ -22,7 +22,7 @@ Install via pip:
 
 ## Dependencies for Local Development
 
-If you wish to build the local documentation or run unit tests, there are a few additional dependencies that are required including:
+If you wish to build the local documentation or run unit tests, there are a few additional dependencies. Those can be installed by:
 
 pip install -e ".[docs, tests]"
 

docs/source/_static/logo.png

-20.7 KB

docs/source/tutorials/notebooks/tutorial_1.ipynb

Lines changed: 80382 additions & 1003 deletions
Large diffs are not rendered by default.

pyeem/analysis/models/rutherfordnet.py

Lines changed: 54 additions & 4 deletions
@@ -5,8 +5,15 @@
 import numpy as np
 import pandas as pd
 import tensorflow as tf
-from tensorflow.keras.layers import (Activation, Conv2D, Dense, Dropout,
-                                     Flatten, MaxPooling2D)
+from scipy import stats
+from tensorflow.keras.layers import (
+    Activation,
+    Conv2D,
+    Dense,
+    Dropout,
+    Flatten,
+    MaxPooling2D,
+)
 from tensorflow.keras.models import Sequential
 
 
@@ -97,6 +104,8 @@ def get_training_data(self, dataset, ss_results_df, mix_results_df):
         X, y = [], []
 
         aug_df.index = aug_df.index.droplevel(drop_indices)
+        # shuffle
+        aug_df = aug_df.sample(frac=1)
         for concentrations, eem_df in aug_df.groupby(
             sources + ["source"], as_index=False
         ):
@@ -110,7 +119,15 @@ def get_training_data(self, dataset, ss_results_df, mix_results_df):
             X.append(eem_np)
             y.append(concentrations[:-1])
 
-        return np.asarray(X), np.asarray(y)
+        X = np.asarray(X)
+        y = np.asarray(y)
+
+        randomize = np.arange(len(X))
+        np.random.shuffle(randomize)
+        X = X[randomize]
+        y = y[randomize]
+
+        return X, y
 
     def _isolate_test_samples(self, dataset, routine_results_df):
         # Isolate test samples from the metadata
@@ -239,8 +256,41 @@ def train(self, X, y, fit_kws={}):
             tensorflow.python.keras.callbacks.History: [description]
         """
         default_fit_kws = dict(
-            batch_size=32, epochs=5, validation_split=0.5, shuffle=True
+            batch_size=32, epochs=5, validation_split=0.3, shuffle=True
         )
         fit_kws = dict(default_fit_kws, **fit_kws)
         history = self.model.fit(X, y, **fit_kws)
         return history
+
+    def get_prediction_results(self, dataset, predictions, y):
+        cal_sources = list(dataset.calibration_sources.keys())
+        true_df = pd.DataFrame(y, columns=cal_sources)
+        pred_df = pd.DataFrame(predictions, columns=cal_sources)
+
+        results_df = pd.DataFrame()
+        for source, units in dataset.calibration_sources.items():
+            tmp_df = pd.concat(
+                [
+                    true_df[source].to_frame(name="true_concentration"),
+                    pred_df[source].to_frame(name="predicted_concentration"),
+                ],
+                axis=1,
+            )
+            tmp_df[["source", "units"]] = source, units
+            (
+                tmp_df["slope"],
+                tmp_df["intercept"],
+                tmp_df["r_value"],
+                _,
+                _,
+            ) = stats.linregress(
+                tmp_df["true_concentration"], tmp_df["predicted_concentration"]
+            )
+            tmp_df["r_squared"] = tmp_df["r_value"] ** 2
+            tmp_df = tmp_df.set_index(
+                ["source", "units", "slope", "intercept", "r_squared"]
+            )
+            tmp_df = tmp_df.drop(columns="r_value")
+            results_df = pd.concat([results_df, tmp_df])
+
+        return results_df
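
Two small techniques carry this change: the training pairs are shuffled in unison by indexing both arrays with one shared permutation, and per-source parity statistics are computed with `scipy.stats.linregress`. A minimal, self-contained sketch of the unison shuffle on toy data (the shapes are illustrative, not real EEM dimensions):

```python
import numpy as np

# Toy stand-ins for the EEM tensors (X) and concentration labels (y).
X = np.arange(5, dtype=float).reshape(5, 1)  # 5 "samples"
y = X[:, 0] * 10                             # matching labels

# One shared permutation keeps each sample aligned with its label.
randomize = np.arange(len(X))
np.random.shuffle(randomize)
X, y = X[randomize], y[randomize]

# Pairs survive the shuffle intact.
assert np.allclose(y, X[:, 0] * 10)
```

The same idea underlies `aug_df.sample(frac=1)`, which reorders the augmented DataFrame's rows before grouping.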

pyeem/instruments/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
 
 supported, _supported = get_supported_instruments()
 
-__all__ = ["agilent", "horiba", "supported"]
+__all__ = ["agilent", "horiba", "get_supported_instruments", "supported"]

pyeem/plots/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 import matplotlib.pyplot as plt
 from pkg_resources import resource_filename
 
+from .analysis import model_history, prediction_parity_plots
 from .augmentations import (
     mixture_animation,
     plot_prototypical_spectra,
@@ -18,5 +19,5 @@
     "single_source_animation",
     "eem_plot",
     "plot_calibration_curves",
-    "plot_preprocessing"
+    "plot_preprocessing",
 ]

pyeem/plots/analysis.py

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+from .base import _get_subplot_dims
+
+
+def model_history(history):
+    """[summary]
+
+    Args:
+        history ([type]): [description]
+
+    Returns:
+        [type]: [description]
+    """
+    fig, axes = plt.subplots(figsize=(8, 4), ncols=2, sharex=True)
+    for i, metric in enumerate(["accuracy", "loss"]):
+        ax = axes[i]
+        ax.plot(history.history[metric])
+        ax.plot(history.history["val_%s" % metric])
+        ax.set_title("Model %s" % metric.title())
+        ax.set_ylabel(metric.title())
+        ax.set_xlabel("Epoch")
+        ax.legend(["Train", "Val."], loc="upper left", fontsize=11)
+
+    plt.tight_layout()
+    return axes
+
+
+def prediction_parity_plots(
+    dataset, test_df, train_df=None, subplots=False, fig_kws={}, **kwargs
+):
+    """[summary]
+
+    Args:
+        dataset ([type]): [description]
+        test_df ([type]): [description]
+        train_df ([type], optional): [description]. Defaults to None.
+        subplots (bool, optional): [description]. Defaults to False.
+        fig_kws (dict, optional): [description]. Defaults to {}.
+
+    Returns:
+        [type]: [description]
+    """
+    colors = plt.rcParams["axes.prop_cycle"]()
+    sources = dataset.calibration_sources
+
+    nsources = len(sources)
+    nrows, ncols = _get_subplot_dims(nsources)
+    nplots = nrows * ncols
+
+    default_fig_kws = dict(figsize=(ncols ** 2, nrows * ncols), squeeze=False)
+    fig_kws = dict(default_fig_kws, **fig_kws)
+
+    fig, axes = plt.subplots(1, nsources, **fig_kws)
+
+    def _get_regression_metric(source_df, metric):
+        return source_df.index.get_level_values(level=metric).unique().item()
+
+    pred_dfs = {"test": test_df, "train": train_df}
+
+    ax_idx = 0
+    lines = []
+    labels = []
+    for source in sources:
+        for key, df in pred_dfs.items():
+            if df is None:
+                continue
+
+            if key == "test":
+                marker_color = next(colors)["color"]
+                line_color = "black"
+                alpha = 1
+                zorder = 1
+            else:
+                marker_color = "lightblue"
+                line_color = "grey"
+                alpha = 0.25
+                zorder = -1
+
+            source_df = df.xs(source, level="source")
+            source_units = _get_regression_metric(source_df, "units")
+            slope = _get_regression_metric(source_df, "slope")
+            y_intercept = _get_regression_metric(source_df, "intercept")
+            r_squared = _get_regression_metric(source_df, "r_squared")
+            cal_poly = np.poly1d([slope, y_intercept])
+
+            x = source_df["true_concentration"]
+            y = source_df["predicted_concentration"]
+            axes.flat[ax_idx].scatter(
+                x, y, label=key, color=marker_color, alpha=alpha, zorder=zorder
+            )
+
+            x = np.linspace(
+                source_df["true_concentration"].min(),
+                source_df["true_concentration"].max(),
+            )
+            axes.flat[ax_idx].plot(
+                x,
+                cal_poly(x),
+                label="y = %s\n$R^2=%.2f$"
+                % (str(cal_poly).replace("\n", ""), r_squared),
+                color=line_color,
+                linestyle="--",
+                zorder=zorder,
+            )
+
+        formatted_source_str = source.replace("_", " ").title()
+        xlabel_str = "True Conc., %s" % source_units
+        ylabel_str = "Predicted Conc., %s" % source_units
+        axes.flat[ax_idx].set_xlabel(xlabel_str, fontsize=14)
+        axes.flat[ax_idx].set_ylabel(ylabel_str, fontsize=14)
+        axes.flat[ax_idx].tick_params(axis="both", which="major", labelsize=12)
+        axes.flat[ax_idx].set_title(
+            "Parity Plot for\n%s Concentration" % formatted_source_str,
+            pad=10,
+            fontsize=16,
+        )
+        ax_line, ax_label = axes.flat[ax_idx].get_legend_handles_labels()
+        lines.extend(ax_line)
+        labels.extend(ax_label)
+        axes.flat[ax_idx].legend(
+            loc="upper center", bbox_to_anchor=(0.5, -0.25), ncol=2, fontsize=11
+        )
+        ax_idx += 1
+
+    hspace = kwargs.get("subplot_hspace", 0)
+    wspace = kwargs.get("subplot_wspace", 0.3)
+    plt.subplots_adjust(wspace=wspace, hspace=hspace)
+    return axes
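
Both helpers are re-exported from `pyeem.plots` (see the `__init__.py` change above), so downstream code can call them directly. A plausible usage sketch after training; the variable names (`rutherfordnet`, `history`, `x_test`, `y_test`, `dataset`) are assumptions standing in for objects produced in earlier steps, not names fixed by this diff:

```python
import matplotlib.pyplot as plt
import pyeem

# history comes from rutherfordnet.train(x_train, y_train) (a Keras History);
# predictions are the raw model outputs for the held-out test EEMs.
predictions = rutherfordnet.model.predict(x_test)
test_results_df = rutherfordnet.get_prediction_results(dataset, predictions, y_test)

pyeem.plots.model_history(history)                             # accuracy/loss vs. epoch
pyeem.plots.prediction_parity_plots(dataset, test_results_df)  # per-source parity plots
plt.show()
```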

pyeem/plots/preprocessing.py

Lines changed: 4 additions & 1 deletion
@@ -219,7 +219,10 @@ def _get_regression_metric(source_df, metric):
         ax_line, ax_label = axes.flat[ax_idx].get_legend_handles_labels()
         lines.extend(ax_line)
         labels.extend(ax_label)
-        axes.flat[ax_idx].legend(loc="upper left", fontsize=11)
+        #axes.flat[ax_idx].legend(loc="upper left", fontsize=11)
+        axes.flat[ax_idx].legend(
+            loc="upper center", bbox_to_anchor=(0.5, -0.25), ncol=2, fontsize=11
+        )
         ax_idx += 1
 
     hspace = kwargs.get("subplot_hspace", 0)

pyeem/preprocessing/calibration/calibration.py

Lines changed: 14 additions & 5 deletions
@@ -124,10 +124,19 @@ def calibration_summary_info(cal_df):
 
     def _get_summary_info(row):
         source_df = cal_df.xs(row["source"], level="source")
-        row["Number of Samples"] = source_df.shape[0]
-        row["Min. Concentration"] = source_df["concentration"].min()
-        row["Max. Concentration"] = source_df["concentration"].max()
-        return row
+        num_samples = source_df.shape[0]
+        min_conc = source_df["concentration"].min()
+        max_conc = source_df["concentration"].max()
+        return pd.Series(
+            {
+                "Number of Samples": num_samples,
+                "Min. Concentration": min_conc,
+                "Max. Concentration": max_conc,
+            }
+        )
+
+    summary_df[
+        ["Number of Samples", "Min. Concentration", "Max. Concentration"]
+    ] = summary_df.apply(_get_summary_info, axis=1)
 
-    summary_df = summary_df.apply(_get_summary_info, axis=1)
     return summary_df
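
The refactor changes `_get_summary_info` to return a `pd.Series`, so a single `apply` fills several named columns at once instead of mutating each row. A minimal, self-contained sketch of that pandas pattern on made-up data (the toy values and index names are illustrative, not from PyEEM):

```python
import pandas as pd

# Toy calibration table indexed by (source, sample).
cal_df = pd.DataFrame(
    {
        "source": ["a", "a", "b"],
        "sample": [1, 2, 1],
        "concentration": [0.1, 2.0, 5.0],
    }
).set_index(["source", "sample"])

summary_df = pd.DataFrame({"source": ["a", "b"]})

def _get_summary_info(row):
    source_df = cal_df.xs(row["source"], level="source")
    # Returning a Series lets one .apply() populate several named columns.
    return pd.Series(
        {
            "Number of Samples": source_df.shape[0],
            "Min. Concentration": source_df["concentration"].min(),
            "Max. Concentration": source_df["concentration"].max(),
        }
    )

summary_df[
    ["Number of Samples", "Min. Concentration", "Max. Concentration"]
] = summary_df.apply(_get_summary_info, axis=1)
print(summary_df)
```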

pyeem/preprocessing/corrections/corrections.py

Lines changed: 2 additions & 0 deletions
@@ -133,6 +133,8 @@ def raman_normalization(eem_df, raman_source_type, raman_source, method="gradien
     # peak boundary definition (Murphy and others, 2011)
     # raman_sources = ['water_raman', 'blank', 'metadata']
 
+    # This really oughta be refactored ASAP
+
     if raman_source_type in ["blank", "water_raman"]:
         a = 371  # lower limit
         b = 428  # upper limit
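
For context on those boundaries: Raman normalization in the EEM literature (Murphy et al., 2011) integrates the water Raman scatter peak of a blank between roughly 371 and 428 nm emission and divides the EEM by that area, expressing intensities in Raman units. A generic sketch of that idea, not PyEEM's actual implementation:

```python
import numpy as np

def raman_peak_area(em_nm, intensity, a=371, b=428):
    """Trapezoidal area of a water Raman emission scan between a and b (nm)."""
    mask = (em_nm >= a) & (em_nm <= b)
    return np.trapz(intensity[mask], em_nm[mask])

# Dividing every EEM intensity by this area puts the EEM in Raman units (R.U.).
```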

tests/conftest.py

Lines changed: 18 additions & 2 deletions
@@ -8,8 +8,8 @@
 @pytest.fixture(scope="session", autouse=True)
 def tmp_dir_fixture(tmpdir_factory):
     # setup section
-    tmp_data_dir = tmpdir_factory.mktemp("demo_data")
-    # tmp_data_dir = "local_test_data"
+    #tmp_data_dir = tmpdir_factory.mktemp("demo_data")
+    tmp_data_dir = "local_test_data"
     yield tmp_data_dir
     # teardown section
     if tmp_data_dir != "local_test_data":
@@ -102,3 +102,19 @@ def demo_augmentation(tmp_dir_fixture, demo_preprocessed_dataset, demo_calibrati
         dataset, cal_df, conc_range=(0.01, 6.3), num_steps=5
     )
     return proto_results_df, ss_results_df, mix_results_df
+
+
+@pytest.fixture(scope="session", autouse=True)
+def demo_rutherfordnet(
+    tmp_dir_fixture, demo_preprocessed_dataset, demo_calibration, demo_augmentation
+):
+    dataset, routine_results_df = demo_preprocessed_dataset
+    cal_df = demo_calibration
+    (_, ss_results_df, mix_results_df,) = demo_augmentation
+
+    rutherfordnet = pyeem.analysis.models.RutherfordNet()
+    (x_train, y_train), (x_test, y_test) = rutherfordnet.prepare_data(
+        dataset, ss_results_df, mix_results_df, routine_results_df
+    )
+    rutherfordnet.train(x_train, y_train)
+    return rutherfordnet
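
Because the fixture is session-scoped, the network is trained once and then shared by any test that names it as an argument. A hypothetical consumer, not part of this commit:

```python
# Hypothetical test sketch; not included in this diff.
def test_demo_rutherfordnet_is_trained(demo_rutherfordnet):
    # The fixture hands back a RutherfordNet whose Keras model has already been fit.
    assert demo_rutherfordnet.model is not None
    assert demo_rutherfordnet.model.built
```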

tests/test_analysis.py

Lines changed: 3 additions & 0 deletions
@@ -60,4 +60,7 @@ def setup(self, demo_datasets):
         self.demo_datasets = demo_datasets
 
     def testFluorescenceRegionalIntegration(self):
+        #eem_df =
+        #integ_result = pyeem.analysis.basic.fluorescence_regional_integration(eem_df)
+        #assert integ_result ==
         return
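
The commented lines sketch the intended shape of the test. One way it might eventually be filled in, assuming `fluorescence_regional_integration` accepts an EEM DataFrame as in the commented call; the toy DataFrame and the deliberately weak assertion are illustrative only:

```python
import numpy as np
import pandas as pd

    def testFluorescenceRegionalIntegration(self):
        # Toy EEM: emission wavelengths as the index, excitation wavelengths as columns.
        em, ex = np.arange(300, 310), np.arange(250, 255)
        eem_df = pd.DataFrame(np.ones((len(em), len(ex))), index=em, columns=ex)

        # Hypothetical expectation; the real return type is not shown in this diff.
        integ_result = pyeem.analysis.basic.fluorescence_regional_integration(eem_df)
        assert integ_result is not None
```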
