From a937db38f4b760336e755e49705c85255777a1e3 Mon Sep 17 00:00:00 2001 From: JessyD Date: Thu, 10 Feb 2022 10:35:20 -0500 Subject: [PATCH] Rename python files --- .../multiverse_analysis_classification.ipynb | 1984 +++++++++-------- .../multiverse_analysis_classification.py | 480 ++-- .../multiverse_analysis_regression.ipynb | 2 +- notebooks/multiverse_analysis_regression.py | 162 +- 4 files changed, 1325 insertions(+), 1303 deletions(-) diff --git a/notebooks/multiverse_analysis_classification.ipynb b/notebooks/multiverse_analysis_classification.ipynb index 87e1587..80773ac 100644 --- a/notebooks/multiverse_analysis_classification.ipynb +++ b/notebooks/multiverse_analysis_classification.ipynb @@ -1,963 +1,1027 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook runs the classification analysis using the ABIDE dataset.\n", - "\n", - "The data for this analysis should be downloaded from http://preprocessed-connectomes-project.org/abide/download.html before running this notebook. Because downloading the dataset might take a few hours, we recommend downloading the data locally. \n", - "\n", - "Similarly to the regression analysis, we have provided together with the gitrepository the intermediate steps of this analysis. Therefore, the researcher interested in replicating parts of this notebook can skip the most time consuming steps and run only specific sections. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "todG1Zr1KuNz" - }, - "source": [ - "# 1. Setting up the enviroment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "eJcI72vCYmnE", - "outputId": "27449105-0b71-45fd-c1e6-f77c5a469e60" - }, - "outputs": [], - "source": [ - "# Install necessary python dependencies. Only necessary if the dependencies have not been previously installed.\n", - "# If you are running this notebook locally make sure you have a virtual environment and are running this notebook\n", - "# from inside the virtual environment. \n", - "! pip install -r requirements.txt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Define key variables\n", - "# Add the into-the-multiverse folder to the Python path. 
This allows the helperfunction\n", - "# to be used\n", - "import sys\n", - "sys.path.insert(1, 'into-the-multiverse')\n", - "\n", - "import numpy as np\n", - "np.random.seed(1234)\n", - "\n", - "import warnings\n", - "warnings.filterwarnings(\"ignore\")\n", - "from pathlib import Path\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Set up the local paths accordingly\n", - "# ----------------------------------------\n", - "# All paths are expected to be Path objects\n", - "# Specifiy the location of the code\n", - "path_to_project = Path.home() / 'Code'/ 'into-the-multiverse'\n", - "os.chdir(path_to_project)\n", - "PROJECT_ROOT = Path.cwd()\n", - "# Specify the path to where the data has been downloaded\n", - "data_root = Path('/Volumes/abide')\n", - "output_path = PROJECT_ROOT / 'output' / 'abide'\n", - "if not output_path.is_dir():\n", - " output_path.mkdir(parents=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from collections import OrderedDict\n", - "import pickle\n", - "import json\n", - "from functools import partial\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "from tqdm import tqdm\n", - "import matplotlib.pyplot as plt\n", - "import matplotlib.patches as mpatches\n", - "import matplotlib.lines as mlines\n", - "from scipy.stats import spearmanr\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.metrics import roc_auc_score\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn import manifold\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.metrics.pairwise import cosine_similarity\n", - "from sklearn.decomposition import PCA\n", - "from sklearn.gaussian_process import GaussianProcessRegressor\n", - "\n", - "from nilearn.connectome import ConnectivityMeasure\n", - "from umap.umap_ import UMAP\n", - "import phate\n", - "\n", - "from helperfunctions import (initialize_bo, run_bo, load_abide_demographics, plot_bo_estimated_space, plot_bo_evolution,\n", - " posteriorOnlyModels, plot_bo_repetions, objective_func_class)\n", - "\n", - "%load_ext autoreload\n", - "%autoreload 2\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "M8_vz3WpdZxi" - }, - "outputs": [], - "source": [ - "# Define the space variables\n", - "derivatives = ['rois_tt', 'rois_ho', 'rois_ez', 'rois_dosenbach160', 'rois_cc400', 'rois_cc200']\n", - "pipelines = ['cpac', 'ccs', 'dparsf', 'niak']\n", - "strategies = ['filt_global', 'nofilt_global', 'nofilt_noglobal', 'filt_noglobal']\n", - "conn_metrics = ['tangent', 'correlation', 'partial correlation', 'covariance']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Run the different analysis to bild the space" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The next step assumes that the data has been downloaded. The data can be downloaded from (http://preprocessed-connectomes-project.org/abide/download.html). For time reasons, we will not download the data within this notebook. 
To run this script the code expects the files to be in the following structure:\n", - "\n", - "```\n", - "├── ccs\n", - "│   ├── filt_global\n", - "│   ├── filt_noglobal\n", - "│   ├── nofilt_global\n", - "│   └── nofilt_noglobal\n", - "├── cpac\n", - "│   ├── filt_global\n", - "│   ├── filt_noglobal\n", - "│   ├── nofilt_global\n", - "│   └── nofilt_noglobal\n", - "├── dparsf\n", - "│   ├── filt_global\n", - "│   ├── filt_noglobal\n", - "│   ├── nofilt_global\n", - "│   └── nofilt_noglobal\n", - "└── niak\n", - " ├── filt_global\n", - " ├── filt_noglobal\n", - " ├── nofilt_global\n", - " └── nofilt_noglobal\n", - "```\n", - "\n", - "However, to facilitate reproducibility together with this code. We are providing the file `output/abide/abide_space.pckl`, which contains the output from the next cell. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# select the subjects we want to use to create the space (about 20% of the total subjects) making sure that\n", - "# both classes are equally represented. Uuse the remaining 80% for the active learning step and as a holdout\n", - "# dataset to evaluate the performance of the active learning step.\n", - "\n", - "# Load data demographics\n", - "abide_df = load_abide_demographics(data_root)\n", - "indices = np.arange(len(abide_df))\n", - "idx_space, idx_train_holdout = train_test_split(indices, test_size=.8, train_size=.2, random_state=0,\n", - " shuffle=True, stratify=abide_df['DX_GROUP'])\n", - "# Split the training data again, to keep part of the dataset as a hold out dataset\n", - "\n", - "idx_train, idx_holdout = train_test_split(idx_train_holdout, test_size=.25, train_size=.75, random_state=0,\n", - " shuffle=True, stratify=abide_df['DX_GROUP'].iloc[idx_train_holdout])\n", - "# Visualise stratification\n", - "space_df = abide_df.iloc[idx_space]\n", - "print('Numbers on space df')\n", - "print(space_df['DX_GROUP'].value_counts())\n", - "\n", - "train_df = abide_df.iloc[idx_train]\n", - "print('Numbers on training df')\n", - "print(train_df['DX_GROUP'].value_counts())\n", - "\n", - "holdout_df = abide_df.iloc[idx_holdout]\n", - "print('Numbers on hold out df')\n", - "print(holdout_df['DX_GROUP'].value_counts())\n", - "\n", - "# save list of indexes of the data split\n", - "indices = {'idx_train': idx_train.tolist(),\n", - " 'idx_space': idx_space.tolist(),\n", - " 'idx_holdout': idx_holdout.tolist()}\n", - "with open((output_path / f'indices_space_train.json'), 'w') as handle:\n", - " json.dump(indices, handle)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The next cell will create the space. This is a time consuming step and might take a few hours to run. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "n_idx_space = int(len(idx_space) * (len(idx_space) - 1) / 2)\n", - "count = 0\n", - "ResultsIndVar = np.zeros(((len(derivatives) * len(pipelines) * len(strategies) * len(conn_metrics)), n_idx_space))\n", - "methods_idx = {}\n", - "space_rois = {}\n", - "with tqdm(range(len(derivatives) * len(pipelines) * len(strategies) * len(conn_metrics))) as pbar:\n", - " for derivative in derivatives:\n", - " space_rois[derivative] = {}\n", - " for pipeline in pipelines:\n", - " space_rois[derivative][pipeline] = {}\n", - " for strategy in strategies:\n", - " space_rois[derivative][pipeline][strategy] = {}\n", - " for conn_metric in conn_metrics:\n", - " data_path = data_root / 'Outputs' / pipeline / strategy / derivative\n", - " space_rois[derivative][pipeline][strategy][conn_metric] = []\n", - " for subject_idx in idx_space:\n", - " subject = abide_df.iloc[subject_idx]['FILE_ID']\n", - " subject_path = data_path / f'{subject}_{derivative}.1D'\n", - " rois = pd.read_csv(subject_path, delimiter='\\t')\n", - " space_rois[derivative][pipeline][strategy][conn_metric].append(rois.to_numpy())\n", - " methods_idx[count] = [derivative, pipeline, strategy, conn_metric]\n", - " count += 1\n", - " pbar.update(1)\n", - "\n", - "count = 0\n", - "# Iterate over the possible configurations and calculate the connectivity metric.\n", - "with tqdm(range(len(derivatives) * len(pipelines) * len(strategies) * len(conn_metrics))) as pbar:\n", - " for derivative in derivatives:\n", - " for pipeline in pipelines:\n", - " for strategy in strategies:\n", - " for conn_metric in conn_metrics:\n", - " space_flat_rois = []\n", - " correlation_measure = ConnectivityMeasure(kind=conn_metric)\n", - " correlation_matrix = correlation_measure.fit_transform(\n", - " space_rois[derivative][pipeline][strategy][conn_metric])\n", - " # Plot the upper diagonal connectivity matrix, excluding the diagonal (k=1)\n", - " # correlation_matrix = np.triu(correlation_matrix, k=1)\n", - " # plotting.plot_matrix(correlation_matrix, colorbar=True, vmax=1, vmin=-1)\n", - " # plt.savefig(output_path / f'{subject}_{derivative}.png')\n", - " for subject_idx in range(len(idx_space)):\n", - " tmp = correlation_matrix[subject_idx][np.triu_indices(\n", - " space_rois[derivative][pipeline][strategy][conn_metric][0].shape[1], k=1)]\n", - " space_flat_rois.append(tmp)\n", - "\n", - " # Build an array of similarities between subjects for each analysis approach. This is used as a\n", - " # distance metric between the different subjects\n", - " cos_sim = cosine_similarity(space_flat_rois)\n", - " ResultsIndVar[count, :] = cos_sim[np.triu_indices(len(idx_space), k=1)]\n", - " count += 1\n", - " pbar.update(1)\n", - "\n", - "# Save results\n", - "save_results = {'Results': ResultsIndVar, 'methods_idx': methods_idx}\n", - "with open((output_path / 'abide_space.pckl'), 'wb') as handle:\n", - " pickle.dump(save_results, handle)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. 
Building and analysing the low-dimensional space" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Lv6WVWIcaVMB", - "outputId": "c10d3fce-9925-4ef7-ca17-d2290466a72b" - }, - "outputs": [], - "source": [ - "# Load the indices we want to use for the analysis\n", - "with open((output_path / f'indices_space_train.json'), 'r') as handle:\n", - " indices = json.load(handle)\n", - "\n", - "idx_train = indices['idx_train']\n", - "idx_space = indices['idx_space']\n", - "\n", - "train_df = abide_df.iloc[idx_train]\n", - "print('Numbers on training df')\n", - "print(train_df['DX_GROUP'].value_counts())\n", - "space_df = abide_df.iloc[idx_space]\n", - "print('Numbers on space df')\n", - "print(space_df['DX_GROUP'].value_counts())\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3NiHFqyBfJuH", - "outputId": "92707d75-8be0-44df-b50d-34926c6e0f25" - }, - "outputs": [], - "source": [ - "with open((output_path / 'abide_space.pckl'), 'rb') as handle:\n", - " save_results = pickle.load(handle)\n", - "ResultsIndVar = save_results['Results']\n", - "methods_idx = save_results['methods_idx']\n", - "\n", - "# Reduced dataset\n", - "data_reduced = {}\n", - "\n", - "# plot tSNE\n", - "Results = ResultsIndVar\n", - "scaler = StandardScaler()\n", - "X = scaler.fit_transform(Results.T)\n", - "X = X.T\n", - "n_neighbors = 60\n", - "n_components = 2\n", - "#Define different dimensionality reduction techniques\n", - "methods = OrderedDict()\n", - "LLE = partial(manifold.LocallyLinearEmbedding,\n", - " n_neighbors, n_components, eigen_solver='dense')\n", - "methods['LLE'] = LLE(method='standard', random_state=0)\n", - "methods['SE'] = manifold.SpectralEmbedding(n_components=n_components,\n", - " n_neighbors=n_neighbors, random_state=0)\n", - "methods['t-SNE'] = manifold.TSNE(n_components=n_components, init='pca', perplexity=150,\n", - " random_state=0)\n", - "methods['UMAP'] = UMAP(random_state=40, n_components=2, n_neighbors=200,\n", - " min_dist=.8)\n", - "methods['MDS'] = manifold.MDS(n_components, max_iter=100, n_init=10,\n", - " random_state=21, metric=True)\n", - "methods['PHATE'] = phate.PHATE()\n", - "\n", - "\n", - "methods['PCA'] = PCA(n_components=2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_I6WJ3-BiYrv" - }, - "outputs": [], - "source": [ - "# Define markers for the derivatives\n", - "markers = ['s', 'o', '^', 'D', 'v', '*']\n", - "markers_order = np.array([pip[0] for pip in methods_idx.values()])\n", - "\n", - "# Define colors and markers for the pipeliens\n", - "#colourmaps = {'ccs': 'Greens', 'cpac': 'Purples', 'dparsf': 'Blues', 'niak': 'Reds'}\n", - "colourmaps = {'correlation': 'Greens', 'covariance': 'Purples', 'partial correlation': 'Blues', 'tangent': 'Reds'}\n", - "metrics_order = np.array([pip[3] for pip in methods_idx.values()])\n", - "\n", - "# Define colors and markers for the strategies\n", - "markers_strategies = {'filt_global': .7, 'nofilt_global': .4, 'nofilt_noglobal': .15, 'filt_noglobal': .55}\n", - "strategies_order = [pip[2] for pip in methods_idx.values()]\n", - "strategies_int = np.array([markers_strategies[x] for x in strategies_order])\n", - "\n", - "markers_metric = ['-', '/', '.', \"x\"]\n", - "markers_map = {'cpac': '-', 'ccs': '/', 'dparsf': '.', 'niak': 'x'}\n", - "pipeline_order = np.array([pip[1] for pip in 
methods_idx.values()])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 917 - }, - "id": "2nKCbKUGk8Ns", - "outputId": "6dade7a5-0163-4924-e505-d523bfe629b5" - }, - "outputs": [], - "source": [ - "selected_analysis = 'MDS'\n", - "Lines = {}\n", - "Y = methods[selected_analysis].fit_transform(X)\n", - "data_reduced[selected_analysis] = Y\n", - "figMDS = plt.figure(figsize=(21, 15))\n", - "gsMDS = figMDS.add_gridspec(nrows=15, ncols=20)\n", - "axs = figMDS.add_subplot(gsMDS[:, 0:15])\n", - "#for idx_pip, pipeline in enumerate(sorted(colourmaps)):\n", - "for idx_metric, conn_metric in enumerate(sorted(colourmaps)):\n", - " for idx_pipeline, pipeline in enumerate(sorted(pipelines)):\n", - " for idx_derivative, derivative in enumerate(sorted(derivatives)):\n", - " axs.scatter(Y[:, 0][(markers_order == derivative) & (metrics_order == conn_metric) & (pipeline_order == pipeline)],\n", - " Y[:, 1][(markers_order == derivative) & (metrics_order == conn_metric) & (pipeline_order == pipeline)],\n", - " c=strategies_int[(markers_order == derivative) & (metrics_order == conn_metric) & (pipeline_order == pipeline)],\n", - " s=180, marker=markers[idx_derivative], hatch=4*markers_metric[idx_pipeline],\n", - " norm=plt.Normalize(vmin=0, vmax=1),\n", - " cmap=colourmaps[conn_metric])\n", - " Lines[idx_derivative] = mlines.Line2D([], [], color='black', linestyle='None', marker=markers[idx_derivative],\n", - " markersize=10, label=derivative)\n", - "axs.spines['top'].set_linewidth(1.5)\n", - "axs.spines['right'].set_linewidth(1.5)\n", - "axs.spines['bottom'].set_linewidth(1.5)\n", - "axs.spines['left'].set_linewidth(1.5)\n", - "axs.set_xlabel('dimension 2', fontsize=25)\n", - "axs.set_ylabel('dimension 1', fontsize=25)\n", - "axs.tick_params(labelsize=15)\n", - "axs.set_title(f'{selected_analysis}', fontsize=20, fontweight=\"bold\")\n", - "plt.axis('tight')\n", - "GreenPatch = mpatches.Patch(color='#52b365', label='correlation')\n", - "PurplePatch = mpatches.Patch(color='#8a86bf', label='covariance')\n", - "BluesPatch = mpatches.Patch(color='#4f9bcb', label='partial correlation')\n", - "RedsPatch = mpatches.Patch(color='#f34a36', label='tangent')\n", - "IntensityPatch1 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='filter and GSR',\n", - " alpha=1)\n", - "IntensityPatch2 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='filter and no GSR',\n", - " alpha=0.5)\n", - "IntensityPatch3 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='no filter and GSR',\n", - " alpha=0.2)\n", - "IntensityPatch4 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='no filter and no GSR',\n", - " alpha=0.1)\n", - "line_patchPatch = mpatches.Patch(facecolor=[0.1, 0.1, 0.1], hatch=4*markers_metric[0], label=sorted(pipelines)[0],\n", - " alpha=.1)\n", - "dot_patchPatch = mpatches.Patch(facecolor=[0.1, 0.1, 0.1], hatch=4*markers_metric[1], label=sorted(pipelines)[1],\n", - " alpha=.1)\n", - "diagonal_patchPatch = mpatches.Patch(facecolor=[0.1, 0.1, 0.1], hatch=4*markers_metric[2], label=sorted(pipelines)[2],\n", - " alpha=.1)\n", - "x_patchPatch = mpatches.Patch(facecolor=[0.1, 0.1, 0.1], hatch=4*markers_metric[3], label=sorted(pipelines)[3],\n", - " alpha=.1)\n", - "BlankLine = mlines.Line2D([], [], linestyle='None')\n", - "\n", - "plt.legend(handles=[GreenPatch, BluesPatch, PurplePatch, RedsPatch, BlankLine, IntensityPatch1,\n", - " IntensityPatch2, IntensityPatch3, IntensityPatch4, BlankLine,\n", - " Lines[0], Lines[1], Lines[2], Lines[3], Lines[4], 
Lines[5], BlankLine,\n",
-    "                     line_patchPatch, dot_patchPatch, diagonal_patchPatch, x_patchPatch\n",
-    "                    ],\n",
-    "           fontsize=24, frameon=False, bbox_to_anchor=(1.4, .97), bbox_transform=axs.transAxes)\n",
-    "plt.savefig(output_path / f'{selected_analysis}_v2.png', dpi=300)\n",
-    "plt.savefig(output_path / f'{selected_analysis}_v2.svg', format='svg')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "V6gXc-7vyMyj"
+      },
+      "source": [
+        "The data for this analysis should be downloaded from http://preprocessed-connectomes-project.org/abide/download.html before running this notebook. Because downloading the dataset might take a few hours, we recommend downloading the data locally. \n",
+        "\n",
+        "Similarly to the regression analysis, we have provided the intermediate steps of this analysis together with the git repository. Therefore, a researcher interested in replicating parts of this notebook can skip the most time-consuming steps and run only specific sections. \n",
+        "\n",
+        "Please note that we recommend re-running this analysis locally. However, if the user is only interested in replicating our figures, or in only one of our sub-analyses, the intermediary steps are provided as pickles and can be run using this Colab. \n",
+        "\n",
+        "Note: this script was run using Python version 3.7.12"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "todG1Zr1KuNz"
+      },
+      "source": [
+        "# 1. Setting up the environment"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!git clone https://github.com/Mind-the-Pineapple/into-the-multiverse/"
+      ],
+      "metadata": {
+        "id": "l-5hyNsAyWhP"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "eJcI72vCYmnE"
+      },
+      "outputs": [],
+      "source": [
+        "# Install the necessary Python dependencies. Only needed if the dependencies have not been installed before.\n",
+        "# If you are running this notebook locally, make sure you have a virtual environment and are running this notebook\n",
+        "# from inside the virtual environment. \n",
+        "! pip install -r into-the-multiverse/requirements.txt"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "OvjsBxZGyMyo"
+      },
+      "outputs": [],
+      "source": [
+        "# Define key variables\n",
+        "# Add the into-the-multiverse folder to the Python path. This allows the helperfunctions\n",
+        "# module to be used\n",
+        "import sys\n",
+        "sys.path.insert(1, 'into-the-multiverse')\n",
+        "\n",
+        "import numpy as np\n",
+        "np.random.seed(1234)\n",
+        "\n",
+        "import warnings\n",
+        "warnings.filterwarnings(\"ignore\")\n",
+        "from pathlib import Path\n",
+        "import os"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "The following setup runs the code on Colab. If you have the ABIDE data downloaded locally and want to re-run all the experiments, uncomment the local-path lines in the next cell instead. We recommend doing this in a notebook locally, as the download of the data alone will go over the allocated free time for Colab. 
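The new notebook gates each expensive step behind the `re_runanalysis` flag defined in the next cell, falling back to the pickled intermediates shipped with the repository when the flag is `False`. A minimal sketch of that recompute-or-load pattern, assuming a hypothetical `load_or_compute` wrapper (not a helper the notebook actually defines):

```python
import pickle
from pathlib import Path

def load_or_compute(cache_file: Path, compute_fn, re_run: bool):
    """Recompute and cache when re_run is True; otherwise load the
    intermediate pickle shipped with the repository."""
    if re_run:
        result = compute_fn()
        with open(cache_file, 'wb') as handle:
            pickle.dump(result, handle)
    else:
        with open(cache_file, 'rb') as handle:
            result = pickle.load(handle)
    return result

# Hypothetical usage mirroring the notebook's cells:
# space = load_or_compute(output_path / 'abide_space.pckl',
#                         build_space, re_runanalysis)
```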
" + ], + "metadata": { + "id": "EyG6P8E5MAoc" + } + }, + { + "cell_type": "code", + "source": [ + "# Set up the local paths accordingly\n", + "# ----------------------------------------\n", + "\n", + "# Paths using Colab\n", + "from pathlib import Path\n", + "PROJECT_ROOT = Path.cwd()\n", + "data_root = PROJECT_ROOT / 'into-the-multiverse' /'data' / 'abide'\n", + "output_path = PROJECT_ROOT / 'into-the-multiverse' / 'output'/ 'abide'\n", + "re_runanalysis = False\n", + "\n", + "# Local paths\n", + "# All paths are expected to be Path objects\n", + "# Specifiy the location of the code\n", + "#path_to_project = Path.home() / 'Code'/ 'into-the-multiverse'\n", + "#os.chdir(path_to_project)\n", + "#PROJECT_ROOT = Path.cwd()\n", + "# Specify the path to where the data has been downloaded\n", + "#data_root = Path('/Volumes/abide')\n", + "#output_path = PROJECT_ROOT / 'output' / 'abide'\n", + "#if not output_path.is_dir():\n", + "# output_path.mkdir(parents=True)\n", + "# re_runanalysis = True" + ], + "metadata": { + "id": "kUHrKOcbym9t" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "am11shEOyMyq" + }, + "outputs": [], + "source": [ + "from collections import OrderedDict\n", + "import pickle\n", + "import json\n", + "from functools import partial\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from tqdm import tqdm\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.patches as mpatches\n", + "import matplotlib.lines as mlines\n", + "from scipy.stats import spearmanr\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.metrics import roc_auc_score\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn import manifold\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.gaussian_process import GaussianProcessRegressor\n", + "\n", + "from nilearn.connectome import ConnectivityMeasure\n", + "from umap.umap_ import UMAP\n", + "import phate\n", + "\n", + "from helperfunctions import (initialize_bo, run_bo, load_abide_demographics, plot_bo_estimated_space, plot_bo_evolution,\n", + " posteriorOnlyModels, plot_bo_repetions, objective_func_class)\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "M8_vz3WpdZxi" + }, + "outputs": [], + "source": [ + "# Define the space variables\n", + "derivatives = ['rois_tt', 'rois_ho', 'rois_ez', 'rois_dosenbach160', 'rois_cc400', 'rois_cc200']\n", + "pipelines = ['cpac', 'ccs', 'dparsf', 'niak']\n", + "strategies = ['filt_global', 'nofilt_global', 'nofilt_noglobal', 'filt_noglobal']\n", + "conn_metrics = ['tangent', 'correlation', 'partial correlation', 'covariance']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e39Us6vbyMyu" + }, + "source": [ + "# 2. Run the different analysis to bild the space" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cpwXUSJ7yMyv" + }, + "source": [ + "The next step assumes that the data has been downloaded. The data can be downloaded from (http://preprocessed-connectomes-project.org/abide/download.html). For time reasons, we will not download the data within this notebook. 
To run this script, the code expects the files to be in the following structure:\n",
+        "\n",
+        "```\n",
+        "├── ccs\n",
+        "│   ├── filt_global\n",
+        "│   ├── filt_noglobal\n",
+        "│   ├── nofilt_global\n",
+        "│   └── nofilt_noglobal\n",
+        "├── cpac\n",
+        "│   ├── filt_global\n",
+        "│   ├── filt_noglobal\n",
+        "│   ├── nofilt_global\n",
+        "│   └── nofilt_noglobal\n",
+        "├── dparsf\n",
+        "│   ├── filt_global\n",
+        "│   ├── filt_noglobal\n",
+        "│   ├── nofilt_global\n",
+        "│   └── nofilt_noglobal\n",
+        "└── niak\n",
+        "    ├── filt_global\n",
+        "    ├── filt_noglobal\n",
+        "    ├── nofilt_global\n",
+        "    └── nofilt_noglobal\n",
+        "```\n",
+        "\n",
+        "However, to facilitate reproducibility, we provide together with this code the file `output/abide/abide_space.pckl`, which contains the output from the next cell. "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "s3BQKAk9yMyw"
+      },
+      "outputs": [],
+      "source": [
+        "# Select the subjects we want to use to create the space (about 20% of the total subjects), making sure that\n",
+        "# both classes are equally represented. Use the remaining 80% for the active learning step and as a holdout\n",
+        "# dataset to evaluate the performance of the active learning step.\n",
+        "\n",
+        "# Load data demographics\n",
+        "abide_df = load_abide_demographics(data_root)\n",
+        "indices = np.arange(len(abide_df))\n",
+        "idx_space, idx_train_holdout = train_test_split(indices, test_size=.8, train_size=.2, random_state=0,\n",
+        "                                                shuffle=True, stratify=abide_df['DX_GROUP'])\n",
+        "# Split the training data again, to keep part of the dataset as a holdout dataset\n",
+        "\n",
+        "idx_train, idx_holdout = train_test_split(idx_train_holdout, test_size=.25, train_size=.75, random_state=0,\n",
+        "                                          shuffle=True, stratify=abide_df['DX_GROUP'].iloc[idx_train_holdout])\n",
+        "# Visualise stratification\n",
+        "space_df = abide_df.iloc[idx_space]\n",
+        "print('Numbers on space df')\n",
+        "print(space_df['DX_GROUP'].value_counts())\n",
+        "\n",
+        "train_df = abide_df.iloc[idx_train]\n",
+        "print('Numbers on training df')\n",
+        "print(train_df['DX_GROUP'].value_counts())\n",
+        "\n",
+        "holdout_df = abide_df.iloc[idx_holdout]\n",
+        "print('Numbers on hold out df')\n",
+        "print(holdout_df['DX_GROUP'].value_counts())\n",
+        "\n",
+        "# Save the list of indices of the data split\n",
+        "indices = {'idx_train': idx_train.tolist(),\n",
+        "           'idx_space': idx_space.tolist(),\n",
+        "           'idx_holdout': idx_holdout.tolist()}\n",
+        "with open((output_path / f'indices_space_train.json'), 'w') as handle:\n",
+        "    json.dump(indices, handle)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "96LOy8fayMyx"
+      },
+      "source": [
+        "The next cell will create the space. This is a time-consuming step and might take a few hours to run. 
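The space-building cell boils down to three operations per analysis approach: one connectivity matrix per subject for the chosen metric, vectorised upper triangles, and one row of subject-by-subject cosine similarities. A minimal sketch with synthetic time series (the subject count and array shapes are assumptions for illustration only):

```python
import numpy as np
from nilearn.connectome import ConnectivityMeasure
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(0)
# Three synthetic subjects, 150 time points x 10 ROIs each,
# standing in for the {subject}_{derivative}.1D files.
time_series = [rng.standard_normal((150, 10)) for _ in range(3)]

# One connectivity matrix per subject for the chosen metric.
conn = ConnectivityMeasure(kind='correlation')
matrices = conn.fit_transform(time_series)      # shape (3, 10, 10)

# Vectorise each subject's upper triangle (k=1 skips the diagonal).
iu = np.triu_indices(10, k=1)
flat = np.array([m[iu] for m in matrices])      # shape (3, 45)

# One analysis approach's row in ResultsIndVar: the upper triangle
# of the subject-by-subject cosine-similarity matrix.
row = cosine_similarity(flat)[np.triu_indices(len(flat), k=1)]
print(row.shape)                                # (3,)
```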
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rsueufLdyMyz" + }, + "outputs": [], + "source": [ + "if re_runanalysis:\n", + " n_idx_space = int(len(idx_space) * (len(idx_space) - 1) / 2)\n", + " count = 0\n", + " ResultsIndVar = np.zeros(((len(derivatives) * len(pipelines) * len(strategies) * len(conn_metrics)), n_idx_space))\n", + " methods_idx = {}\n", + " space_rois = {}\n", + " with tqdm(range(len(derivatives) * len(pipelines) * len(strategies) * len(conn_metrics))) as pbar:\n", + " for derivative in derivatives:\n", + " space_rois[derivative] = {}\n", + " for pipeline in pipelines:\n", + " space_rois[derivative][pipeline] = {}\n", + " for strategy in strategies:\n", + " space_rois[derivative][pipeline][strategy] = {}\n", + " for conn_metric in conn_metrics:\n", + " data_path = data_root / 'Outputs' / pipeline / strategy / derivative\n", + " space_rois[derivative][pipeline][strategy][conn_metric] = []\n", + " for subject_idx in idx_space:\n", + " subject = abide_df.iloc[subject_idx]['FILE_ID']\n", + " subject_path = data_path / f'{subject}_{derivative}.1D'\n", + " rois = pd.read_csv(subject_path, delimiter='\\t')\n", + " space_rois[derivative][pipeline][strategy][conn_metric].append(rois.to_numpy())\n", + " methods_idx[count] = [derivative, pipeline, strategy, conn_metric]\n", + " count += 1\n", + " pbar.update(1)\n", + "\n", + " count = 0\n", + " # Iterate over the possible configurations and calculate the connectivity metric.\n", + " with tqdm(range(len(derivatives) * len(pipelines) * len(strategies) * len(conn_metrics))) as pbar:\n", + " for derivative in derivatives:\n", + " for pipeline in pipelines:\n", + " for strategy in strategies:\n", + " for conn_metric in conn_metrics:\n", + " space_flat_rois = []\n", + " correlation_measure = ConnectivityMeasure(kind=conn_metric)\n", + " correlation_matrix = correlation_measure.fit_transform(\n", + " space_rois[derivative][pipeline][strategy][conn_metric])\n", + " # Plot the upper diagonal connectivity matrix, excluding the diagonal (k=1)\n", + " # correlation_matrix = np.triu(correlation_matrix, k=1)\n", + " # plotting.plot_matrix(correlation_matrix, colorbar=True, vmax=1, vmin=-1)\n", + " # plt.savefig(output_path / f'{subject}_{derivative}.png')\n", + " for subject_idx in range(len(idx_space)):\n", + " tmp = correlation_matrix[subject_idx][np.triu_indices(\n", + " space_rois[derivative][pipeline][strategy][conn_metric][0].shape[1], k=1)]\n", + " space_flat_rois.append(tmp)\n", + "\n", + " # Build an array of similarities between subjects for each analysis approach. This is used as a\n", + " # distance metric between the different subjects\n", + " cos_sim = cosine_similarity(space_flat_rois)\n", + " ResultsIndVar[count, :] = cos_sim[np.triu_indices(len(idx_space), k=1)]\n", + " count += 1\n", + " pbar.update(1)\n", + "\n", + " # Save results\n", + " save_results = {'Results': ResultsIndVar, 'methods_idx': methods_idx}\n", + " with open((output_path / 'abide_space.pckl'), 'wb') as handle:\n", + " pickle.dump(save_results, handle)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0tS8y1KhyMy0" + }, + "source": [ + "# 3. 
Building and analysing the low-dimensional space"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Lv6WVWIcaVMB"
+      },
+      "outputs": [],
+      "source": [
+        "# Load the indices we want to use for the analysis\n",
+        "abide_df = load_abide_demographics(data_root)\n",
+        "with open((output_path / f'indices_space_train.json'), 'r') as handle:\n",
+        "    indices = json.load(handle)\n",
+        "\n",
+        "idx_train = indices['idx_train']\n",
+        "idx_space = indices['idx_space']\n",
+        "\n",
+        "train_df = abide_df.iloc[idx_train]\n",
+        "print('Numbers on training df')\n",
+        "print(train_df['DX_GROUP'].value_counts())\n",
+        "space_df = abide_df.iloc[idx_space]\n",
+        "print('Numbers on space df')\n",
+        "print(space_df['DX_GROUP'].value_counts())\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "3NiHFqyBfJuH"
+      },
+      "outputs": [],
+      "source": [
+        "with open((output_path / 'abide_space.pckl'), 'rb') as handle:\n",
+        "    save_results = pickle.load(handle)\n",
+        "ResultsIndVar = save_results['Results']\n",
+        "methods_idx = save_results['methods_idx']\n",
+        "\n",
+        "# Reduced dataset\n",
+        "data_reduced = {}\n",
+        "\n",
+        "# Standardise each analysis approach before the embedding\n",
+        "Results = ResultsIndVar\n",
+        "scaler = StandardScaler()\n",
+        "X = scaler.fit_transform(Results.T)\n",
+        "X = X.T\n",
+        "n_neighbors = 60\n",
+        "n_components = 2\n",
+        "# Define different dimensionality reduction techniques\n",
+        "methods = OrderedDict()\n",
+        "LLE = partial(manifold.LocallyLinearEmbedding,\n",
+        "              n_neighbors, n_components, eigen_solver='dense')\n",
+        "methods['LLE'] = LLE(method='standard', random_state=0)\n",
+        "methods['SE'] = manifold.SpectralEmbedding(n_components=n_components,\n",
+        "                                           n_neighbors=n_neighbors, random_state=0)\n",
+        "methods['t-SNE'] = manifold.TSNE(n_components=n_components, init='pca', perplexity=150,\n",
+        "                                 random_state=0)\n",
+        "methods['UMAP'] = UMAP(random_state=40, n_components=2, n_neighbors=200,\n",
+        "                       min_dist=.8)\n",
+        "methods['MDS'] = manifold.MDS(n_components, max_iter=100, n_init=10,\n",
+        "                              random_state=21, metric=True)\n",
+        "methods['PHATE'] = phate.PHATE()\n",
+        "\n",
+        "\n",
+        "methods['PCA'] = PCA(n_components=2)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "_I6WJ3-BiYrv"
+      },
+      "outputs": [],
+      "source": [
+        "# Define markers for the derivatives\n",
+        "markers = ['s', 'o', '^', 'D', 'v', '*']\n",
+        "markers_order = np.array([pip[0] for pip in methods_idx.values()])\n",
+        "\n",
+        "# Define colours for the connectivity metrics\n",
+        "#colourmaps = {'ccs': 'Greens', 'cpac': 'Purples', 'dparsf': 'Blues', 'niak': 'Reds'}\n",
+        "colourmaps = {'correlation': 'Greens', 'covariance': 'Purples', 'partial correlation': 'Blues', 'tangent': 'Reds'}\n",
+        "metrics_order = np.array([pip[3] for pip in methods_idx.values()])\n",
+        "\n",
+        "# Define colors and markers for the strategies\n",
+        "markers_strategies = {'filt_global': .7, 'nofilt_global': .4, 'nofilt_noglobal': .15, 'filt_noglobal': .55}\n",
+        "strategies_order = [pip[2] for pip in methods_idx.values()]\n",
+        "strategies_int = np.array([markers_strategies[x] for x in strategies_order])\n",
+        "\n",
+        "markers_metric = ['-', '/', '.', \"x\"]\n",
+        "markers_map = {'cpac': '-', 'ccs': '/', 'dparsf': '.', 'niak': 'x'}\n",
+        "pipeline_order = np.array([pip[1] for pip in methods_idx.values()])\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "2nKCbKUGk8Ns"
+      },
+      "outputs": [],
+      "source": [
+        "selected_analysis = 
'MDS'\n", + "Lines = {}\n", + "Y = methods[selected_analysis].fit_transform(X)\n", + "data_reduced[selected_analysis] = Y\n", + "figMDS = plt.figure(figsize=(21, 15))\n", + "gsMDS = figMDS.add_gridspec(nrows=15, ncols=20)\n", + "axs = figMDS.add_subplot(gsMDS[:, 0:15])\n", + "#for idx_pip, pipeline in enumerate(sorted(colourmaps)):\n", + "for idx_metric, conn_metric in enumerate(sorted(colourmaps)):\n", + " for idx_pipeline, pipeline in enumerate(sorted(pipelines)):\n", + " for idx_derivative, derivative in enumerate(sorted(derivatives)):\n", + " axs.scatter(Y[:, 0][(markers_order == derivative) & (metrics_order == conn_metric) & (pipeline_order == pipeline)],\n", + " Y[:, 1][(markers_order == derivative) & (metrics_order == conn_metric) & (pipeline_order == pipeline)],\n", + " c=strategies_int[(markers_order == derivative) & (metrics_order == conn_metric) & (pipeline_order == pipeline)],\n", + " s=180, marker=markers[idx_derivative], hatch=4*markers_metric[idx_pipeline],\n", + " norm=plt.Normalize(vmin=0, vmax=1),\n", + " cmap=colourmaps[conn_metric])\n", + " Lines[idx_derivative] = mlines.Line2D([], [], color='black', linestyle='None', marker=markers[idx_derivative],\n", + " markersize=10, label=derivative)\n", + "axs.spines['top'].set_linewidth(1.5)\n", + "axs.spines['right'].set_linewidth(1.5)\n", + "axs.spines['bottom'].set_linewidth(1.5)\n", + "axs.spines['left'].set_linewidth(1.5)\n", + "axs.set_xlabel('dimension 2', fontsize=25)\n", + "axs.set_ylabel('dimension 1', fontsize=25)\n", + "axs.tick_params(labelsize=15)\n", + "axs.set_title(f'{selected_analysis}', fontsize=20, fontweight=\"bold\")\n", + "plt.axis('tight')\n", + "GreenPatch = mpatches.Patch(color='#52b365', label='correlation')\n", + "PurplePatch = mpatches.Patch(color='#8a86bf', label='covariance')\n", + "BluesPatch = mpatches.Patch(color='#4f9bcb', label='partial correlation')\n", + "RedsPatch = mpatches.Patch(color='#f34a36', label='tangent')\n", + "IntensityPatch1 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='filter and GSR',\n", + " alpha=1)\n", + "IntensityPatch2 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='filter and no GSR',\n", + " alpha=0.5)\n", + "IntensityPatch3 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='no filter and GSR',\n", + " alpha=0.2)\n", + "IntensityPatch4 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='no filter and no GSR',\n", + " alpha=0.1)\n", + "line_patchPatch = mpatches.Patch(facecolor=[0.1, 0.1, 0.1], hatch=4*markers_metric[0], label=sorted(pipelines)[0],\n", + " alpha=.1)\n", + "dot_patchPatch = mpatches.Patch(facecolor=[0.1, 0.1, 0.1], hatch=4*markers_metric[1], label=sorted(pipelines)[1],\n", + " alpha=.1)\n", + "diagonal_patchPatch = mpatches.Patch(facecolor=[0.1, 0.1, 0.1], hatch=4*markers_metric[2], label=sorted(pipelines)[2],\n", + " alpha=.1)\n", + "x_patchPatch = mpatches.Patch(facecolor=[0.1, 0.1, 0.1], hatch=4*markers_metric[3], label=sorted(pipelines)[3],\n", + " alpha=.1)\n", + "BlankLine = mlines.Line2D([], [], linestyle='None')\n", + "\n", + "plt.legend(handles=[GreenPatch, BluesPatch, PurplePatch, RedsPatch, BlankLine, IntensityPatch1,\n", + " IntensityPatch2, IntensityPatch3, IntensityPatch4, BlankLine,\n", + " Lines[0], Lines[1], Lines[2], Lines[3], Lines[4], Lines[5], BlankLine,\n", + " line_patchPatch, dot_patchPatch, diagonal_patchPatch, x_patchPatch\n", + " ],\n", + " fontsize=24, frameon=False, bbox_to_anchor=(1.4, .97), bbox_transform=axs.transAxes)\n", + "plt.savefig(output_path / f'{selected_analysis}_v2.png', dpi=300)\n", + "plt.savefig(output_path / 
f'{selected_analysis}_v2.pdf', format='pdf')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6Obx2j7pif1c" + }, + "outputs": [], + "source": [ + "# Plot the other methods\n", + "# Reduced dimensions\n", + "# As we already analysed the MDS drop it from the dictionary\n", + "methods.pop(selected_analysis)\n", + "gsDE, axs = plt.subplots(3, 2, figsize=(16, 16), constrained_layout=True)\n", + "axs = axs.ravel()\n", + "for idx_method, (label, method) in enumerate(methods.items()):\n", + " Y = method.fit_transform(X)\n", + " # Save the results\n", + " data_reduced[label] = Y\n", + " Lines = {}\n", + "\n", + " # for idx_pip, pipeline in enumerate(sorted(colourmaps)):\n", + " for idx_metric, conn_metric in enumerate(sorted(colourmaps)):\n", + " for idx_pipeline, pipeline in enumerate(sorted(pipelines)):\n", + " for idx_derivative, derivative in enumerate(sorted(derivatives)):\n", + " axs[idx_method].scatter(Y[:, 0][(markers_order == derivative) & (metrics_order == conn_metric) & (\n", + " pipeline_order == pipeline)],\n", + " Y[:, 1][(markers_order == derivative) & (metrics_order == conn_metric) & (\n", + " pipeline_order == pipeline)],\n", + " c=strategies_int[(markers_order == derivative) & (metrics_order == conn_metric) & (\n", + " pipeline_order == pipeline)],\n", + " s=180, marker=markers[idx_derivative], hatch=4 * markers_metric[idx_pipeline],\n", + " norm=plt.Normalize(vmin=0, vmax=1),\n", + " cmap=colourmaps[conn_metric])\n", + " Lines[idx_derivative] = mlines.Line2D([], [], color='black', linestyle='None',\n", + " marker=markers[idx_derivative],\n", + " markersize=10, label=derivative)\n", + " if idx_method %2 == 0:\n", + " axs[idx_method].set_xlabel('Dimension 1', fontsize=20)\n", + " if (idx_method == 4) or (idx_method == 5):\n", + " axs[idx_method].set_ylabel('Dimension 2', fontsize=20)\n", + "\n", + " axs[idx_method].set_title(f'{label}', fontsize=20, fontweight=\"bold\")\n", + " axs[idx_method].axis('tight')\n", + " axs[idx_method].tick_params(labelsize=15)\n", + "\n", + "GreenPatch = mpatches.Patch(color='#52b365', label='correlation')\n", + "PurplePatch = mpatches.Patch(color='#8a86bf', label='covariance')\n", + "BluesPatch = mpatches.Patch(color='#4f9bcb', label='partial correlation')\n", + "RedsPatch = mpatches.Patch(color='#f34a36', label='tangent')\n", + "IntensityPatch1 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='filter and GSR',\n", + " alpha=1)\n", + "IntensityPatch2 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='filter and no GSR',\n", + " alpha=0.5)\n", + "IntensityPatch3 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='no filter and GSR',\n", + " alpha=0.2)\n", + "IntensityPatch4 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='no filter and no GSR',\n", + " alpha=0.1)\n", + "line_patchPatch = mpatches.Patch(facecolor=[0.1, 0.1, 0.1], hatch=4 * markers_metric[0], label=sorted(pipelines)[0],\n", + " alpha=.1)\n", + "dot_patchPatch = mpatches.Patch(facecolor=[0.1, 0.1, 0.1], hatch=4 * markers_metric[1], label=sorted(pipelines)[1],\n", + " alpha=.1)\n", + "diagonal_patchPatch = mpatches.Patch(facecolor=[0.1, 0.1, 0.1], hatch=4 * markers_metric[2],\n", + " label=sorted(pipelines)[2],\n", + " alpha=.1)\n", + "x_patchPatch = mpatches.Patch(facecolor=[0.1, 0.1, 0.1], hatch=4 * markers_metric[3], label=sorted(pipelines)[3],\n", + " alpha=.1)\n", + "BlankLine = mlines.Line2D([], [], linestyle='None')\n", + "\n", + "gsDE.legend(handles=[GreenPatch, BluesPatch, PurplePatch, RedsPatch, BlankLine, IntensityPatch1,\n", + " IntensityPatch2, 
IntensityPatch3, IntensityPatch4, BlankLine,\n",
+        "                     Lines[0], Lines[1], Lines[2], Lines[3], Lines[4], Lines[5], BlankLine,\n",
+        "                     line_patchPatch, dot_patchPatch, diagonal_patchPatch, x_patchPatch],\n",
+        "            fontsize=15, frameon=False, bbox_to_anchor=(1.25, 0.7))\n",
+        "\n",
+        "gsDE.savefig(str(output_path / 'dim_reduction.png'), dpi=300)\n",
+        "gsDE.savefig(str(output_path / 'dim_reduction.svg'), format='svg')\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "hg-IOOViimsO"
+      },
+      "outputs": [],
+      "source": [
+        "gsDE.savefig(str(output_path / 'dim_reduction.png'), dpi=300, bbox_inches='tight')\n",
+        "gsDE.savefig(str(output_path / 'dim_reduction.svg'), format='svg', bbox_inches='tight')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Paf6MLjVyMy7"
+      },
+      "outputs": [],
+      "source": [
+        "# Save embeddings\n",
+        "if re_runanalysis:\n",
+        "    with open((output_path / 'embeddings.pckl'), 'wb') as handle:\n",
+        "        pickle.dump(data_reduced, handle)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ljknINWDyMy7"
+      },
+      "source": [
+        "# 4. Exhaustive Search"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "MzoPBqJRyMy8"
+      },
+      "source": [
+        "As in step 1, this step also assumes that the data has been previously downloaded. If, for computational reasons, you do not want to download the data and re-calculate the predictions, we provide the exhaustively searched space: `output/abide/predictedAcc.pckl`\n",
+        "\n",
+        "Note: This is also a time-consuming step and might take about 28 hours to complete"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "6VwOkIHUyMy9"
+      },
+      "outputs": [],
+      "source": [
+        "# Load the embedding results\n",
+        "with open((output_path / 'embeddings.pckl'), 'rb') as handle:\n",
+        "    embeddings = pickle.load(handle)\n",
+        "# Load the labels for the analysis\n",
+        "with open(output_path / 'abide_space.pckl', 'rb') as handle:\n",
+        "    Data_Run = pickle.load(handle)\n",
+        "# Load indices of the subjects used for train and test\n",
+        "with open((output_path / f'indices_space_train.json'), 'rb') as handle:\n",
+        "    indices = json.load(handle)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "IjuhPVCKyMy9"
+      },
+      "outputs": [],
+      "source": [
+        "# We will use the MDS space\n",
+        "model_embedding = embeddings['MDS']\n",
+        "if re_runanalysis:\n",
+        "\n",
+        "    abide_df = load_abide_demographics(data_root)\n",
+        "    # Select only models to train on\n",
+        "    train_df = abide_df.iloc[indices['idx_train']]\n",
+        "    train_labels = train_df['DX_GROUP']\n",
+        "    files_id = train_df['FILE_ID']\n",
+        "\n",
+        "    PredictedAcc = np.zeros((len(Data_Run['Results'])))\n",
+        "    for count in tqdm(range(len(Data_Run['Results']))):\n",
+        "        PredictedAcc[count] = objective_func_class(Data_Run['methods_idx'], count, train_labels, files_id,\n",
+        "                                                   data_root, output_path)\n",
+        "\n",
+        "    # Dump predictions\n",
+        "    pickle.dump(PredictedAcc, open(str(output_path / 'predictedAcc.pckl'), 'wb'))\n",
+        "else:\n",
+        "    with open((output_path / f'predictedAcc.pckl'), 'rb') as handle:\n",
+        "        PredictedAcc = pickle.load(handle)\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "RNn9oPmTyMy9"
+      },
+      "outputs": [],
+      "source": [
+        "plt.figure()\n",
+        "plt.scatter(model_embedding[0: PredictedAcc.shape[0], 0],\n",
+        "            model_embedding[0: PredictedAcc.shape[0], 1],\n",
+        "            c=(PredictedAcc), cmap='bwr')\n",
+        
"plt.colorbar()\n", + "plt.savefig(output_path / 'Predictions.png')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kemFPhRVXl07" + }, + "source": [ + "# 5. Active Learning" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tQjGaERtyMy_" + }, + "source": [ + "Note: This step also requires the user to have previously downloaded the raw data and specified the path to it on top of this notebook. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "j63NOZxuyMy_" + }, + "outputs": [], + "source": [ + "def compute_active_learning(kappa, model_config, CassOrRegression):\n", + " # Load data demographics\n", + " abide_df = load_abide_demographics(data_root)\n", + "\n", + " # Load the embedding results\n", + " with open((output_path / 'embeddings.pckl'), 'rb') as handle:\n", + " embeddings = pickle.load(handle)\n", + " with open(output_path / 'abide_space.pckl', 'rb') as handle:\n", + " Data_Run = pickle.load(handle)\n", + " with open((output_path / 'predictedAcc.pckl'), 'rb') as handle:\n", + " PredictedAcc = pickle.load(handle)\n", + " model_embedding = embeddings['MDS']\n", + " # Load indices of the subjects used for train and test\n", + " with open((output_path / f'indices_space_train.json'), 'rb') as handle:\n", + " indices = json.load(handle)\n", + " # Remove subjects that were used to create the space\n", + " train_df = abide_df.iloc[indices['idx_train']]\n", + " Y = train_df['DX_GROUP']\n", + " files_id = train_df['FILE_ID']\n", + "\n", + " # Check range of predictions\n", + " PredictedAcc = pickle.load(open(str(output_path / \"predictedAcc.pckl\"), \"rb\"))\n", + " print(f'Max {np.max(PredictedAcc)}')\n", + " print(f'Min {np.min(PredictedAcc)}')\n", + " print(f'Mean and std {np.mean(PredictedAcc)} and {np.std(PredictedAcc)}')\n", + "\n", + " model_config['Data_Run'] = Data_Run['methods_idx']\n", + " model_config['files_id'] = train_df['FILE_ID']\n", + " model_config['output_path'] = output_path\n", + "\n", + " kernel, optimizer, utility, init_points, n_iter, pbounds, nbrs, RandomSeed = \\\n", + " initialize_bo(model_embedding, kappa)\n", + "\n", + " BadIter = run_bo(optimizer, utility, init_points, n_iter,\n", + " pbounds, nbrs, RandomSeed, model_embedding, model_config,\n", + " Y, ClassOrRegression, MultivariateUnivariate=True,\n", + " repetitions=False, verbose=True)\n", + "\n", + " x_exploratory, y_exploratory, z_exploratory, x, y, gp, vmax, vmin = \\\n", + " plot_bo_estimated_space(kappa, BadIter, optimizer, pbounds, model_embedding, PredictedAcc, kernel,\n", + " output_path, ClassOrRegression)\n", + "\n", + " corr = plot_bo_evolution(kappa, x_exploratory, y_exploratory, z_exploratory, x, y, gp,\n", + " vmax, vmin, model_embedding, PredictedAcc, output_path, ClassOrRegression)\n", + "\n", + " return corr\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qebDWGD7yMzA" + }, + "outputs": [], + "source": [ + "if re_runanalysis:\n", + " kappa = 10.0\n", + " # path to the raw data\n", + " model_config = {}\n", + " model_config['data_root'] = data_root\n", + " ClassOrRegression = 'Classification'\n", + " corr = compute_active_learning(kappa, model_config, ClassOrRegression)\n", + " print(f'Spearman correlation {corr}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true, + "id": "3SFnYtBcyMzA" + }, + "outputs": [], + "source": [ + "if re_runanalysis:\n", + " kappa = .1\n", + " # path to the raw data\n", + " model_config = {}\n", + 
" model_config['data_root'] = data_root\n", + " ClassOrRegression = 'Classification'\n", + " corr = compute_active_learning(kappa, model_config, ClassOrRegression)\n", + " print(f'Spearman correlation {corr}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7v8DtnR2yMzB" + }, + "source": [ + "## Repetitions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DrPpNR_XyMzC" + }, + "outputs": [], + "source": [ + "def calculate_conn(Y, files_id):\n", + " TotalSubjects = len(Y)\n", + " TempResults = []\n", + " pipeline = Data_Run['methods_idx'][TempModelNum][1]\n", + " strategy = Data_Run['methods_idx'][TempModelNum][2]\n", + " derivative = Data_Run['methods_idx'][TempModelNum][0]\n", + " data_path = data_root / 'Outputs' / pipeline / strategy / derivative\n", + " # Load the data for every subject.\n", + " for file_id in files_id:\n", + " subject_path = data_path / f'{file_id}_{derivative}.1D'\n", + " rois = pd.read_csv(subject_path, delimiter='\\t')\n", + " TempResults.append(rois.to_numpy())\n", + " # Calculate the correlation using the selected metric\n", + " correlation_measure = ConnectivityMeasure(kind=Data_Run['methods_idx'][TempModelNum][3])\n", + " correlation_matrix = correlation_measure.fit_transform(TempResults)\n", + " lower_diag_n = int(rois.shape[1] * (rois.shape[1] - 1) / 2)\n", + " rois_l = np.zeros((TotalSubjects, lower_diag_n))\n", + " for subject in range(TotalSubjects):\n", + " rois_l[subject, :] = correlation_matrix[subject, :, :][np.triu_indices(rois.shape[1], k=1)]\n", + " return rois_l\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iiISXMIRyMzD" + }, + "outputs": [], + "source": [ + "# Load data demographics\n", + "abide_df = load_abide_demographics(data_root)\n", + "\n", + "# Load the embedding results\n", + "with open((output_path / 'embeddings.pckl'), 'rb') as handle:\n", + " embeddings = pickle.load(handle)\n", + "# Load the labels for the analysis\n", + "with open(output_path / 'abide_space.pckl', 'rb') as handle:\n", + " Data_Run = pickle.load(handle)\n", + "# Load indices of the subjects used for train and test\n", + "with open((output_path / f'indices_space_train.json'), 'rb') as handle:\n", + " indices = json.load(handle)\n", + "\n", + "model_embedding = embeddings['MDS']\n", + "kappa = 10\n", + "\n", + "train_df = abide_df.iloc[indices['idx_train']]\n", + "train_Y = train_df['DX_GROUP']\n", + "train_files_id = train_df['FILE_ID']\n", + "holdout_df = abide_df.iloc[indices['idx_holdout']]\n", + "holdout_y = holdout_df['DX_GROUP']\n", + "holdout_files_id = holdout_df['FILE_ID']\n", + "\n", + "ClassOrRegress = 'Classification'\n", + "model_config = {}\n", + "model_config['Data_Run'] = Data_Run['methods_idx']\n", + "model_config['files_id'] = train_df['FILE_ID']\n", + "model_config['data_root'] = data_root\n", + "model_config['output_path'] = output_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pbhck23CyMzF" + }, + "outputs": [], + "source": [ + "# Check range of predictions\n", + "PredictedAcc = pickle.load(open(str(output_path / \"predictedAcc.pckl\"), \"rb\"))\n", + "print(f'Max {np.max(PredictedAcc)}')\n", + "print(f'Min {np.min(PredictedAcc)}')\n", + "print(f'Mean and std {np.mean(PredictedAcc)} and {np.std(PredictedAcc)}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jeiKalroyMzG" + }, + "source": [ + "Note the next steps might take a few hours to run and it requires the ABIDE dataset." 
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "VzwbrDQGyMzH"
+      },
+      "outputs": [],
+      "source": [
+        "if re_runanalysis:\n",
+        "    n_repetitions = 20\n",
+        "\n",
+        "    BestModelGPSpace = np.zeros(n_repetitions)\n",
+        "    BestModelGPSpaceModIndex = np.zeros(n_repetitions)\n",
+        "    BestModelEmpirical = np.zeros(n_repetitions)\n",
+        "    BestModelEmpiricalModIndex = np.zeros(n_repetitions)\n",
+        "    ModelActualAccuracyCorrelation = np.zeros(n_repetitions)\n",
+        "    cv_scores = np.zeros(n_repetitions)\n",
+        "\n",
+        "    for DiffInit in range(n_repetitions):\n",
+        "        print(f'Repetition #: {DiffInit}')\n",
+        "        # Define settings for the analysis\n",
+        "        kernel, optimizer, utility, init_points, n_iter, pbounds, nbrs, RandomSeed = \\\n",
+        "            initialize_bo(model_embedding, kappa, repetitions=True,\n",
+        "                          DiffInit=DiffInit)\n",
+        "\n",
+        "        FailedIters = run_bo(optimizer, utility, init_points,\n",
+        "                             n_iter, pbounds, nbrs, RandomSeed,\n",
+        "                             model_embedding, model_config, train_Y,\n",
+        "                             ClassOrRegress,\n",
+        "                             MultivariateUnivariate=True,\n",
+        "                             repetitions=True,\n",
+        "                             verbose=False)\n",
+        "        gp = GaussianProcessRegressor(kernel=kernel, normalize_y=True,\n",
+        "                                      n_restarts_optimizer=10)\n",
+        "\n",
+        "        x_temp = np.array([[res[\"params\"][\"b1\"]] for res in optimizer.res])\n",
+        "        y_temp = np.array([[res[\"params\"][\"b2\"]] for res in optimizer.res])\n",
+        "        z_temp = np.array([res[\"target\"] for res in optimizer.res])\n",
+        "\n",
+        "        x_obs = x_temp[FailedIters == 0]\n",
+        "        y_obs = y_temp[FailedIters == 0]\n",
+        "        z_obs = z_temp[FailedIters == 0]\n",
+        "\n",
+        "        muModEmb, sigmaModEmb, gpModEmb = posteriorOnlyModels(gp, x_obs, y_obs, z_obs,\n",
+        "                                                              model_embedding)\n",
+        "\n",
+        "        BestModelGPSpace[DiffInit] = muModEmb.max()\n",
+        "        BestModelGPSpaceModIndex[DiffInit] = muModEmb.argmax()\n",
+        "        BestModelEmpirical[DiffInit] = z_obs.max()\n",
+        "        Model_coord = np.array([[x_obs[z_obs.argmax()][-1], y_obs[z_obs.argmax()][-1]]])\n",
+        "        BestModelEmpiricalModIndex[DiffInit] = nbrs.kneighbors(Model_coord)[1][0][0]\n",
+        "        ModelActualAccuracyCorrelation[DiffInit] = spearmanr(muModEmb, PredictedAcc)[0]\n",
+        "        TempModelNum = muModEmb.argmax()\n",
+        "\n",
+        "        train_rois_l = calculate_conn(train_Y, train_files_id)\n",
+        "        holdout_rois_l = calculate_conn(holdout_y, holdout_files_id)\n",
+        "\n",
+        "        model = Pipeline([('scaler', StandardScaler()), ('reg', LogisticRegression(penalty='l2', random_state=0))])\n",
+        "\n",
+        "        model.fit(train_rois_l, train_Y.ravel())\n",
+        "        pred = model.predict(holdout_rois_l)\n",
+        "        y_proba = model.predict_proba(holdout_rois_l)[:, 1]\n",
+        "        score = roc_auc_score(holdout_y.ravel(), y_proba)\n",
+        "\n",
+        "        #CVPValBestModels[DiffInit] = pvalue\n",
+        "        cv_scores[DiffInit] = score\n",
+        "\n",
+        "    df_best = pd.DataFrame(columns=['repetition', 'pipeline', 'derivatives', 'strategies', 'conn_metrics', 'score'])\n",
+        "    for n in range(n_repetitions):\n",
+        "        n_results = {}\n",
+        "        n_results['repetition'] = n\n",
+        "        n_results['pipeline'] = Data_Run['methods_idx'][int(BestModelGPSpaceModIndex[n])][1]\n",
+        "        n_results['derivatives'] = Data_Run['methods_idx'][int(BestModelGPSpaceModIndex[n])][0]\n",
+        "        n_results['strategies'] = Data_Run['methods_idx'][int(BestModelGPSpaceModIndex[n])][2]\n",
+        "        n_results['conn_metrics'] = Data_Run['methods_idx'][int(BestModelGPSpaceModIndex[n])][3]\n",
+        "        n_results['score'] = cv_scores[n]\n",
+        "        df_best = df_best.append(n_results, ignore_index=True)\n",
+        "    df_best = df_best.set_index('repetition')\n",
+        "    # format the score 
column to a 3 digits\n", + " df_best['score'] = df_best['score'].apply('{:.3f}'.format)\n" + ] + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "vZeXcpXbPp9T" + }, + "execution_count": null, + "outputs": [] + } + ], + "metadata": { "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "6Obx2j7pif1c", - "outputId": "fe5209ab-57ec-4617-a7bb-9379d260de3c" - }, - "outputs": [], - "source": [ - "# Plot the other methods\n", - "# Reduced dimensions\n", - "# As we already analysed the MDS drop it from the dictionary\n", - "methods.pop(selected_analysis)\n", - "gsDE, axs = plt.subplots(3, 2, figsize=(16, 16), constrained_layout=True)\n", - "axs = axs.ravel()\n", - "for idx_method, (label, method) in enumerate(methods.items()):\n", - " Y = method.fit_transform(X)\n", - " # Save the results\n", - " data_reduced[label] = Y\n", - " Lines = {}\n", - "\n", - " # for idx_pip, pipeline in enumerate(sorted(colourmaps)):\n", - " for idx_metric, conn_metric in enumerate(sorted(colourmaps)):\n", - " for idx_pipeline, pipeline in enumerate(sorted(pipelines)):\n", - " for idx_derivative, derivative in enumerate(sorted(derivatives)):\n", - " axs[idx_method].scatter(Y[:, 0][(markers_order == derivative) & (metrics_order == conn_metric) & (\n", - " pipeline_order == pipeline)],\n", - " Y[:, 1][(markers_order == derivative) & (metrics_order == conn_metric) & (\n", - " pipeline_order == pipeline)],\n", - " c=strategies_int[(markers_order == derivative) & (metrics_order == conn_metric) & (\n", - " pipeline_order == pipeline)],\n", - " s=180, marker=markers[idx_derivative], hatch=4 * markers_metric[idx_pipeline],\n", - " norm=plt.Normalize(vmin=0, vmax=1),\n", - " cmap=colourmaps[conn_metric])\n", - " Lines[idx_derivative] = mlines.Line2D([], [], color='black', linestyle='None',\n", - " marker=markers[idx_derivative],\n", - " markersize=10, label=derivative)\n", - " if idx_method %2 == 0:\n", - " axs[idx_method].set_xlabel('Dimension 1', fontsize=20)\n", - " if (idx_method == 4) or (idx_method == 5):\n", - " axs[idx_method].set_ylabel('Dimension 2', fontsize=20)\n", - "\n", - " axs[idx_method].set_title(f'{label}', fontsize=20, fontweight=\"bold\")\n", - " axs[idx_method].axis('tight')\n", - " axs[idx_method].tick_params(labelsize=15)\n", - "\n", - "GreenPatch = mpatches.Patch(color='#52b365', label='correlation')\n", - "PurplePatch = mpatches.Patch(color='#8a86bf', label='covariance')\n", - "BluesPatch = mpatches.Patch(color='#4f9bcb', label='partial correlation')\n", - "RedsPatch = mpatches.Patch(color='#f34a36', label='tangent')\n", - "IntensityPatch1 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='filter and GSR',\n", - " alpha=1)\n", - "IntensityPatch2 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='filter and no GSR',\n", - " alpha=0.5)\n", - "IntensityPatch3 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='no filter and GSR',\n", - " alpha=0.2)\n", - "IntensityPatch4 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='no filter and no GSR',\n", - " alpha=0.1)\n", - "line_patchPatch = mpatches.Patch(facecolor=[0.1, 0.1, 0.1], hatch=4 * markers_metric[0], label=sorted(pipelines)[0],\n", - " alpha=.1)\n", - "dot_patchPatch = mpatches.Patch(facecolor=[0.1, 0.1, 0.1], hatch=4 * markers_metric[1], label=sorted(pipelines)[1],\n", - " alpha=.1)\n", - "diagonal_patchPatch = mpatches.Patch(facecolor=[0.1, 0.1, 0.1], hatch=4 * markers_metric[2],\n", - " label=sorted(pipelines)[2],\n", - " alpha=.1)\n", - "x_patchPatch = mpatches.Patch(facecolor=[0.1, 0.1, 0.1], hatch=4 * 
markers_metric[3], label=sorted(pipelines)[3],\n", - " alpha=.1)\n", - "BlankLine = mlines.Line2D([], [], linestyle='None')\n", - "\n", - "gsDE.legend(handles=[GreenPatch, BluesPatch, PurplePatch, RedsPatch, BlankLine, IntensityPatch1,\n", - " IntensityPatch2, IntensityPatch3, IntensityPatch4, BlankLine,\n", - " Lines[0], Lines[1], Lines[2], Lines[3], Lines[4], Lines[5], BlankLine,\n", - " line_patchPatch, dot_patchPatch, diagonal_patchPatch, x_patchPatch],\n", - " fontsize=15, frameon=False, bbox_to_anchor=(1.25, 0.7))\n", - "\n", - "gsDE.savefig(str(output_path / 'dim_reduction.png'), dpi=300)\n", - "gsDE.savefig(str(output_path / 'dim_reduction.svg'), format='svg')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hg-IOOViimsO" - }, - "outputs": [], - "source": [ - "gsDE.savefig(str(output_path / 'dim_reduction.png'), dpi=300, bbox_inches='tight')\n", - "gsDE.savefig(str(output_path / 'dim_reduction.svg'), format='svg', bbox_inches='tight')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# save embeddings\n", - "with open((output_path / 'embeddings.pckl'), 'wb') as handle:\n", - " pickle.dump(data_reduced, handle)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 4. Exhaustive Search" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As in step 1. this step also assumes that the data has been previously downloaded. If for computational purposes you do not want to download the data and re-calculate the predictions, we provide the exhaustively search spaced: `output/abide/predictedAcc.pckl`\n", - "\n", - "Note: This is also a time consuming step and might take about 28hrs to complete" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load the embedding results\n", - "with open((output_path / 'embeddings.pckl'), 'rb') as handle:\n", - " embeddings = pickle.load(handle)\n", - "# Load the labels for the analysis\n", - "with open(output_path / 'abide_space.pckl', 'rb') as handle:\n", - " Data_Run = pickle.load(handle)\n", - "# Load indices of the subjects used for train and test\n", - "with open((output_path / f'indices_space_train.json'), 'rb') as handle:\n", - " indices = json.load(handle)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# We will use the MDS space\n", - "model_embedding = embeddings['MDS']\n", - "\n", - "abide_df = load_abide_demographics(data_root)\n", - "# Select only models to train on\n", - "train_df = abide_df.iloc[indices['idx_train']]\n", - "train_labels = train_df['DX_GROUP']\n", - "files_id = train_df['FILE_ID']\n", - "\n", - "PredictedAcc = np.zeros((len(Data_Run['Results'])))\n", - "for count in tqdm(range(len(Data_Run['Results']))):\n", - " PredictedAcc[count] = objective_func_class(Data_Run['methods_idx'], count, train_labels, files_id,\n", - " data_root, output_path)\n", - "\n", - "# Dump predictions\n", - "pickle.dump(PredictedAcc, open(str(output_path / 'predictedAcc.pckl'), 'wb'))\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.figure()\n", - "plt.scatter(model_embedding[0: PredictedAcc.shape[0], 0],\n", - " model_embedding[0: PredictedAcc.shape[0], 1],\n", - " c=(PredictedAcc), cmap='bwr')\n", - "plt.colorbar()\n", - "plt.savefig(output_path / 'Predictions.png')" - ] - }, - { - 
"cell_type": "markdown", - "metadata": { - "id": "kemFPhRVXl07" - }, - "source": [ - "# 5. Active Learning" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note: This step also requires the user to have previously downloaded the raw data and specified the path to it on top of this notebook. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def compute_active_learning(kappa, model_config, CassOrRegression):\n", - " # Load data demographics\n", - " abide_df = load_abide_demographics(data_root)\n", - "\n", - " # Load the embedding results\n", - " with open((output_path / 'embeddings.pckl'), 'rb') as handle:\n", - " embeddings = pickle.load(handle)\n", - " with open(output_path / 'abide_space.pckl', 'rb') as handle:\n", - " Data_Run = pickle.load(handle)\n", - " with open((output_path / 'predictedAcc.pckl'), 'rb') as handle:\n", - " PredictedAcc = pickle.load(handle)\n", - " model_embedding = embeddings['MDS']\n", - " # Load indices of the subjects used for train and test\n", - " with open((output_path / f'indices_space_train.json'), 'rb') as handle:\n", - " indices = json.load(handle)\n", - " # Remove subjects that were used to create the space\n", - " train_df = abide_df.iloc[indices['idx_train']]\n", - " Y = train_df['DX_GROUP']\n", - " files_id = train_df['FILE_ID']\n", - "\n", - " # Check range of predictions\n", - " PredictedAcc = pickle.load(open(str(output_path / \"predictedAcc.pckl\"), \"rb\"))\n", - " print(f'Max {np.max(PredictedAcc)}')\n", - " print(f'Min {np.min(PredictedAcc)}')\n", - " print(f'Mean and std {np.mean(PredictedAcc)} and {np.std(PredictedAcc)}')\n", - "\n", - " model_config['Data_Run'] = Data_Run['methods_idx']\n", - " model_config['files_id'] = train_df['FILE_ID']\n", - " model_config['output_path'] = output_path\n", - "\n", - " kernel, optimizer, utility, init_points, n_iter, pbounds, nbrs, RandomSeed = \\\n", - " initialize_bo(model_embedding, kappa)\n", - "\n", - " BadIter = run_bo(optimizer, utility, init_points, n_iter,\n", - " pbounds, nbrs, RandomSeed, model_embedding, model_config,\n", - " Y, ClassOrRegression, MultivariateUnivariate=True,\n", - " repetitions=False, verbose=True)\n", - "\n", - " x_exploratory, y_exploratory, z_exploratory, x, y, gp, vmax, vmin = \\\n", - " plot_bo_estimated_space(kappa, BadIter, optimizer, pbounds, model_embedding, PredictedAcc, kernel,\n", - " output_path, ClassOrRegression)\n", - "\n", - " corr = plot_bo_evolution(kappa, x_exploratory, y_exploratory, z_exploratory, x, y, gp,\n", - " vmax, vmin, model_embedding, PredictedAcc, output_path, ClassOrRegression)\n", - "\n", - " return corr\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "kappa = 10.0\n", - "# path to the raw data\n", - "model_config = {}\n", - "model_config['data_root'] = data_root\n", - "ClassOrRegression = 'Classification'\n", - "corr = compute_active_learning(kappa, model_config, ClassOrRegression)\n", - "print(f'Spearman correlation {corr}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "kappa = .1\n", - "# path to the raw data\n", - "model_config = {}\n", - "model_config['data_root'] = data_root\n", - "ClassOrRegression = 'Classification'\n", - "corr = compute_active_learning(kappa, model_config, ClassOrRegression)\n", - "print(f'Spearman correlation {corr}')" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "## Repetitions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def calculate_conn(Y, files_id):\n", - " TotalSubjects = len(Y)\n", - " TempResults = []\n", - " pipeline = Data_Run['methods_idx'][TempModelNum][1]\n", - " strategy = Data_Run['methods_idx'][TempModelNum][2]\n", - " derivative = Data_Run['methods_idx'][TempModelNum][0]\n", - " data_path = data_root / 'Outputs' / pipeline / strategy / derivative\n", - " # Load the data for every subject.\n", - " for file_id in files_id:\n", - " subject_path = data_path / f'{file_id}_{derivative}.1D'\n", - " rois = pd.read_csv(subject_path, delimiter='\\t')\n", - " TempResults.append(rois.to_numpy())\n", - " # Calculate the correlation using the selected metric\n", - " correlation_measure = ConnectivityMeasure(kind=Data_Run['methods_idx'][TempModelNum][3])\n", - " correlation_matrix = correlation_measure.fit_transform(TempResults)\n", - " lower_diag_n = int(rois.shape[1] * (rois.shape[1] - 1) / 2)\n", - " rois_l = np.zeros((TotalSubjects, lower_diag_n))\n", - " for subject in range(TotalSubjects):\n", - " rois_l[subject, :] = correlation_matrix[subject, :, :][np.triu_indices(rois.shape[1], k=1)]\n", - " return rois_l\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load data demographics\n", - "abide_df = load_abide_demographics(data_root)\n", - "\n", - "# Load the embedding results\n", - "with open((output_path / 'embeddings.pckl'), 'rb') as handle:\n", - " embeddings = pickle.load(handle)\n", - "# Load the labels for the analysis\n", - "with open(output_path / 'abide_space.pckl', 'rb') as handle:\n", - " Data_Run = pickle.load(handle)\n", - "# Load indices of the subjects used for train and test\n", - "with open((output_path / f'indices_space_train.json'), 'rb') as handle:\n", - " indices = json.load(handle)\n", - "\n", - "# TODO: make this more generalisable. We will use the MDS space\n", - "model_embedding = embeddings['MDS']\n", - "kappa = 10\n", - "\n", - "train_df = abide_df.iloc[indices['idx_train']]\n", - "train_Y = train_df['DX_GROUP']\n", - "train_files_id = train_df['FILE_ID']\n", - "holdout_df = abide_df.iloc[indices['idx_holdout']]\n", - "holdout_y = holdout_df['DX_GROUP']\n", - "holdout_files_id = holdout_df['FILE_ID']\n", - "\n", - "ClassOrRegress = 'Classification'\n", - "model_config = {}\n", - "model_config['Data_Run'] = Data_Run['methods_idx']\n", - "model_config['files_id'] = train_df['FILE_ID']\n", - "model_config['data_root'] = data_root\n", - "model_config['output_path'] = output_path" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Check range of predictions\n", - "PredictedAcc = pickle.load(open(str(output_path / \"predictedAcc.pckl\"), \"rb\"))\n", - "print(f'Max {np.max(PredictedAcc)}')\n", - "print(f'Min {np.min(PredictedAcc)}')\n", - "print(f'Mean and std {np.mean(PredictedAcc)} and {np.std(PredictedAcc)}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note the next steps might take a few hours to run." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "n_repetitions = 20\n", - "\n", - "BestModelGPSpace = np.zeros(n_repetitions)\n", - "BestModelGPSpaceModIndex = np.zeros(n_repetitions)\n", - "BestModelEmpirical = np.zeros(n_repetitions)\n", - "BestModelEmpiricalModIndex = np.zeros(n_repetitions)\n", - "ModelActualAccuracyCorrelation = np.zeros(n_repetitions)\n", - "cv_scores = np.zeros(n_repetitions)\n", - "\n", - "for DiffInit in range(n_repetitions):\n", - " print(f'Repetiton #: {DiffInit}')\n", - " # Define settings for the analysis\n", - " kernel, optimizer, utility, init_points, n_iter, pbounds, nbrs, RandomSeed = \\\n", - " initialize_bo(model_embedding, kappa, repetitions=True,\n", - " DiffInit=DiffInit)\n", - "\n", - " FailedIters = run_bo(optimizer, utility, init_points,\n", - " n_iter, pbounds, nbrs, RandomSeed,\n", - " model_embedding, model_config, train_Y,\n", - " ClassOrRegress,\n", - " MultivariateUnivariate=True,\n", - " repetitions=True,\n", - " verbose=False)\n", - " gp = GaussianProcessRegressor(kernel=kernel, normalize_y=True,\n", - " n_restarts_optimizer=10)\n", - "\n", - " x_temp = np.array([[res[\"params\"][\"b1\"]] for res in optimizer.res])\n", - " y_temp = np.array([[res[\"params\"][\"b2\"]] for res in optimizer.res])\n", - " z_temp = np.array([res[\"target\"] for res in optimizer.res])\n", - "\n", - " x_obs = x_temp[FailedIters == 0]\n", - " y_obs = y_temp[FailedIters == 0]\n", - " z_obs = z_temp[FailedIters == 0]\n", - "\n", - " muModEmb, sigmaModEmb, gpModEmb = posteriorOnlyModels(gp, x_obs, y_obs, z_obs,\n", - " model_embedding)\n", - "\n", - " BestModelGPSpace[DiffInit] = muModEmb.max()\n", - " BestModelGPSpaceModIndex[DiffInit] = muModEmb.argmax()\n", - " BestModelEmpirical[DiffInit] = z_obs.max()\n", - " Model_coord = np.array([[x_obs[z_obs.argmax()][-1], y_obs[z_obs.argmax()][-1]]])\n", - " BestModelEmpiricalModIndex[DiffInit] = nbrs.kneighbors(Model_coord)[1][0][0]\n", - " ModelActualAccuracyCorrelation[DiffInit] = spearmanr(muModEmb, PredictedAcc)[0]\n", - " TempModelNum = muModEmb.argmax()\n", - "\n", - " train_rois_l = calculate_conn(train_Y, train_files_id)\n", - " holdout_rois_l = calculate_conn(holdout_y, holdout_files_id)\n", - "\n", - " model = Pipeline([('scaler', StandardScaler()), ('reg', LogisticRegression(penalty='l2', random_state=0))])\n", - "\n", - " model.fit(train_rois_l, train_Y.ravel())\n", - " pred = model.predict(holdout_rois_l)\n", - " y_proba = model.predict_proba(holdout_rois_l)[:, 1]\n", - " score = roc_auc_score(holdout_y.ravel(), y_proba)\n", - "\n", - " #CVPValBestModels[DiffInit] = pvalue\n", - " cv_scores[DiffInit] = score\n", - "\n", - "df_best = pd.DataFrame(columns=['repetition', 'pipeline', 'derivatives', 'strategies', 'conn_metrics', 'score'])\n", - "for n in range(n_repetitions):\n", - " n_results = {}\n", - " n_results['repetition'] = n\n", - " n_results['pipeline'] = Data_Run['methods_idx'][int(BestModelGPSpaceModIndex[n])][1]\n", - " n_results['derivatives'] = Data_Run['methods_idx'][int(BestModelGPSpaceModIndex[n])][0]\n", - " n_results['strategies'] = Data_Run['methods_idx'][int(BestModelGPSpaceModIndex[n])][2]\n", - " n_results['conn_metrics'] = Data_Run['methods_idx'][int(BestModelGPSpaceModIndex[n])][3]\n", - " n_results['score'] = cv_scores[n]\n", - " df_best = df_best.append(n_results, ignore_index=True)\n", - "df_best = df_best.set_index('repetition')\n", - "# format the score column to a 3 digits\n", - "df_best['score'] = 
df_best['score'].apply('{:.3f}'.format)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_best" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "Copy of multiverse_classification", - "provenance": [] - }, - "kernelspec": { - "display_name": "multiverse", - "language": "python", - "name": "multiverse" + "collapsed_sections": [], + "name": "into-the-multiverse-classification", + "provenance": [] + }, + "kernelspec": { + "display_name": "multiverse", + "language": "python", + "name": "multiverse" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/notebooks/multiverse_analysis_classification.py b/notebooks/multiverse_analysis_classification.py index 3a4b4e1..1ccb6ae 100644 --- a/notebooks/multiverse_analysis_classification.py +++ b/notebooks/multiverse_analysis_classification.py @@ -1,25 +1,28 @@ -#!/usr/bin/env python -# coding: utf-8 +# -*- coding: utf-8 -*- +"""into-the-multiverse-classification -# This notebook runs the classification analysis using the ABIDE dataset. -# -# The data for this analysis should be downloaded from http://preprocessed-connectomes-project.org/abide/download.html before running this notebook. Because downloading the dataset might take a few hours, we recommend downloading the data locally. -# -# Similarly to the regression analysis, we have provided together with the gitrepository the intermediate steps of this analysis. Therefore, the researcher interested in replicating parts of this notebook can skip the most time consuming steps and run only specific sections. +Automatically generated by Colaboratory. -# # 1. Setting up the enviroment +Original file is located at + https://colab.research.google.com/drive/1bKh49f1UgJOu3qOmRPg-YJaONF-KUOGh -# In[ ]: +The data for this analysis should be downloaded from http://preprocessed-connectomes-project.org/abide/download.html before running this notebook. Because downloading the dataset might take a few hours, we recommend downloading the data locally. +Similarly to the regression analysis, we have provided together with the gitrepository the intermediate steps of this analysis. Therefore, the researcher interested in replicating parts of this notebook can skip the most time-consuming steps and run only specific sections. -# Install necessary python dependencies. Only necessary if the dependencies have not been previously installed. -# If you are running this notebook locally make sure you have a virtual environment and are running this notebook -# from inside the virtual environment. -get_ipython().system(' pip install -r requirements.txt') +Please note, that we recommend re-running this analysis locally. 
However, if the user is only interested in replicating our figures or in only one of our sub-analyses, the intermediate steps are provided as pickles and can be run using this Colab.
+
+Note: this script was run using Python version 3.7.12.
+
+# 1. Setting up the environment
+"""

-# In[ ]:

+!git clone https://github.com/Mind-the-Pineapple/into-the-multiverse/

+# Install the necessary Python dependencies. This is only necessary if the dependencies have not been previously installed.
+# If you are running this notebook locally, make sure you have a virtual environment and are running this notebook
+# from inside the virtual environment.
+! pip install -r into-the-multiverse/requirements.txt

 # Define key variables
 # Add the into-the-multiverse folder to the Python path. This allows the helperfunction
@@ -35,27 +38,32 @@
 from pathlib import Path
 import os

+"""This setup configures the code to run on Colab. If you have the ABIDE data downloaded locally and are interested in re-running all the experiments, uncomment the local-path lines below instead. We recommend doing this in a notebook run locally, as the data download alone will exceed the free compute time allocated to a Colab session. """

 # Set up the local paths accordingly
 # ----------------------------------------
+
+# Paths using Colab
+from pathlib import Path
+PROJECT_ROOT = Path.cwd()
+data_root = PROJECT_ROOT / 'into-the-multiverse' /'data' / 'abide'
+output_path = PROJECT_ROOT / 'into-the-multiverse' / 'output'/ 'abide'
+re_runanalysis = False
+
+# Local paths
 # All paths are expected to be Path objects
 # Specify the location of the code
-path_to_project = Path.home() / 'Code'/ 'into-the-multiverse'
-os.chdir(path_to_project)
-PROJECT_ROOT = Path.cwd()
+#path_to_project = Path.home() / 'Code'/ 'into-the-multiverse'
+#os.chdir(path_to_project)
+#PROJECT_ROOT = Path.cwd()
 # Specify the path to where the data has been downloaded
-data_root = Path('/Volumes/abide')
-output_path = PROJECT_ROOT / 'output' / 'abide'
-if not output_path.is_dir():
-    output_path.mkdir(parents=True)
-
-
-# In[ ]:
-
+#data_root = Path('/Volumes/abide')
+#output_path = PROJECT_ROOT / 'output' / 'abide'
+#if not output_path.is_dir():
+#    output_path.mkdir(parents=True)
+# re_runanalysis = True

+# Commented out IPython magic to ensure Python compatibility.
 from collections import OrderedDict
 import pickle
 import json
@@ -85,12 +93,8 @@
 from helperfunctions import (initialize_bo, run_bo, load_abide_demographics, plot_bo_estimated_space, plot_bo_evolution,
                              posteriorOnlyModels, plot_bo_repetions, objective_func_class)

-get_ipython().run_line_magic('load_ext', 'autoreload')
-get_ipython().run_line_magic('autoreload', '2')
-
-
-# In[ ]:
-
+# %load_ext autoreload
+# %autoreload 2

 # Define the space variables
 derivatives = ['rois_tt', 'rois_ho', 'rois_ez', 'rois_dosenbach160', 'rois_cc400', 'rois_cc200']
@@ -98,38 +102,35 @@
 strategies = ['filt_global', 'nofilt_global', 'nofilt_noglobal', 'filt_noglobal']
 conn_metrics = ['tangent', 'correlation', 'partial correlation', 'covariance']

-
-# # 2. Run the different analysis to bild the space
-
-# The next step assumes that the data has been downloaded. The data can be downloaded from (http://preprocessed-connectomes-project.org/abide/download.html). For time reasons, we will not download the data within this notebook. To run this script the code expects the files to be in the following structure:
-#
-# ```
-# ├── ccs
-# │   ├── filt_global
-# │   ├── filt_noglobal
-# │   ├── nofilt_global
-# │   └── nofilt_noglobal
-# ├── cpac
-# │   ├── filt_global
-# │   ├── filt_noglobal
-# │   ├── nofilt_global
-# │   └── nofilt_noglobal
-# ├── dparsf
-# │   ├── filt_global
-# │   ├── filt_noglobal
-# │   ├── nofilt_global
-# │   └── nofilt_noglobal
-# └── niak
-#     ├── filt_global
-#     ├── filt_noglobal
-#     ├── nofilt_global
-#     └── nofilt_noglobal
-# ```
-#
-# However, to facilitate reproducibility together with this code. We are providing the file `output/abide/abide_space.pckl`, which contains the output from the next cell.

-# In[ ]:

+"""# 2. Run the different analyses to build the space
+
+The next step assumes that the data has been downloaded. The data can be downloaded from http://preprocessed-connectomes-project.org/abide/download.html. For time reasons, we will not download the data within this notebook. To run this script the code expects the files to be in the following structure:
+
+```
+├── ccs
+│   ├── filt_global
+│   ├── filt_noglobal
+│   ├── nofilt_global
+│   └── nofilt_noglobal
+├── cpac
+│   ├── filt_global
+│   ├── filt_noglobal
+│   ├── nofilt_global
+│   └── nofilt_noglobal
+├── dparsf
+│   ├── filt_global
+│   ├── filt_noglobal
+│   ├── nofilt_global
+│   └── nofilt_noglobal
+└── niak
+    ├── filt_global
+    ├── filt_noglobal
+    ├── nofilt_global
+    └── nofilt_noglobal
+```
+
+However, to facilitate reproducibility, we provide together with this code the file `output/abide/abide_space.pckl`, which contains the output of the next cell.
+"""

 # select the subjects we want to use to create the space (about 20% of the total subjects) making sure that
 # both classes are equally represented. Use the remaining 80% for the active learning step and as a holdout
@@ -164,18 +165,15 @@
 with open((output_path / f'indices_space_train.json'), 'w') as handle:
     json.dump(indices, handle)

+"""The next cell will create the space. This is a time-consuming step and might take a few hours to run. """

-# The next cell will create the space. This is a time consuming step and might take a few hours to run.
-
-# In[ ]:
-
-
-n_idx_space = int(len(idx_space) * (len(idx_space) - 1) / 2)
-count = 0
-ResultsIndVar = np.zeros(((len(derivatives) * len(pipelines) * len(strategies) * len(conn_metrics)), n_idx_space))
-methods_idx = {}
-space_rois = {}
-with tqdm(range(len(derivatives) * len(pipelines) * len(strategies) * len(conn_metrics))) as pbar:
+if re_runanalysis:
+    n_idx_space = int(len(idx_space) * (len(idx_space) - 1) / 2)
+    count = 0
+    ResultsIndVar = np.zeros(((len(derivatives) * len(pipelines) * len(strategies) * len(conn_metrics)), n_idx_space))
+    methods_idx = {}
+    space_rois = {}
+    with tqdm(range(len(derivatives) * len(pipelines) * len(strategies) * len(conn_metrics))) as pbar:
     for derivative in derivatives:
         space_rois[derivative] = {}
         for pipeline in pipelines:
@@ -194,9 +192,9 @@
             count += 1
             pbar.update(1)

-count = 0
-# Iterate over the possible configurations and calculate the connectivity metric.
-with tqdm(range(len(derivatives) * len(pipelines) * len(strategies) * len(conn_metrics))) as pbar:
+    count = 0
+    # Iterate over the possible configurations and calculate the connectivity metric.
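+    # With 6 derivatives, 4 pipelines, 4 strategies and 4 connectivity
+    # metrics, the loop below walks the full Cartesian product of
+    # 6 * 4 * 4 * 4 = 384 analysis configurations, filling one row of
+    # ResultsIndVar per configuration.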
+ with tqdm(range(len(derivatives) * len(pipelines) * len(strategies) * len(conn_metrics))) as pbar: for derivative in derivatives: for pipeline in pipelines: for strategy in strategies: @@ -221,18 +219,15 @@ count += 1 pbar.update(1) -# Save results -save_results = {'Results': ResultsIndVar, 'methods_idx': methods_idx} -with open((output_path / 'abide_space.pckl'), 'wb') as handle: + # Save results + save_results = {'Results': ResultsIndVar, 'methods_idx': methods_idx} + with open((output_path / 'abide_space.pckl'), 'wb') as handle: pickle.dump(save_results, handle) - -# # 3. Building and analysing the low-dimensional space - -# In[ ]: - +"""# 3. Building and analysing the low-dimensional space""" # Load the indices we want to use for the analysis +abide_df = load_abide_demographics(data_root) with open((output_path / f'indices_space_train.json'), 'r') as handle: indices = json.load(handle) @@ -246,10 +241,6 @@ print('Numbers on space df') print(space_df['DX_GROUP'].value_counts()) - -# In[ ]: - - with open((output_path / 'abide_space.pckl'), 'rb') as handle: save_results = pickle.load(handle) ResultsIndVar = save_results['Results'] @@ -283,10 +274,6 @@ methods['PCA'] = PCA(n_components=2) - -# In[ ]: - - # Define markers for the derivatives markers = ['s', 'o', '^', 'D', 'v', '*'] markers_order = np.array([pip[0] for pip in methods_idx.values()]) @@ -305,10 +292,6 @@ markers_map = {'cpac': '-', 'ccs': '/', 'dparsf': '.', 'niak': 'x'} pipeline_order = np.array([pip[1] for pip in methods_idx.values()]) - -# In[ ]: - - selected_analysis = 'MDS' Lines = {} Y = methods[selected_analysis].fit_transform(X) @@ -366,11 +349,7 @@ ], fontsize=24, frameon=False, bbox_to_anchor=(1.4, .97), bbox_transform=axs.transAxes) plt.savefig(output_path / f'{selected_analysis}_v2.png', dpi=300) -plt.savefig(output_path / f'{selected_analysis}_v2.svg', format='svg') - - -# In[ ]: - +plt.savefig(output_path / f'{selected_analysis}_v2.pdf', format='pdf') # Plot the other methods # Reduced dimensions @@ -441,30 +420,20 @@ gsDE.savefig(str(output_path / 'dim_reduction.png'), dpi=300) gsDE.savefig(str(output_path / 'dim_reduction.svg'), format='svg') - -# In[ ]: - - gsDE.savefig(str(output_path / 'dim_reduction.png'), dpi=300, bbox_inches='tight') gsDE.savefig(str(output_path / 'dim_reduction.svg'), format='svg', bbox_inches='tight') - -# In[ ]: - - # save embeddings -with open((output_path / 'embeddings.pckl'), 'wb') as handle: +if re_runanalysis: + with open((output_path / 'embeddings.pckl'), 'wb') as handle: pickle.dump(data_reduced, handle) +"""# 4. Exhaustive Search -# # 4. Exhaustive Search - -# As in step 1. this step also assumes that the data has been previously downloaded. If for computational purposes you do not want to download the data and re-calculate the predictions, we provide the exhaustively search spaced: `output/abide/predictedAcc.pckl` -# -# Note: This is also a time consuming step and might take about 28hrs to complete - -# In[ ]: +As in step 1. this step also assumes that the data has been previously downloaded. 
If for computational purposes you do not want to download the data and re-calculate the predictions, we provide the exhaustively searched space: `output/abide/predictedAcc.pckl`
+
+Note: This is also a time-consuming step and might take about 28 hrs to complete.
+"""

 # Load the embedding results
 with open((output_path / 'embeddings.pckl'), 'rb') as handle:
@@ -476,30 +445,26 @@
 with open((output_path / f'indices_space_train.json'), 'rb') as handle:
     indices = json.load(handle)

-
-# In[ ]:
-
-
 # We will use the MDS space
 model_embedding = embeddings['MDS']
+if re_runanalysis:

-abide_df = load_abide_demographics(data_root)
-# Select only models to train on
-train_df = abide_df.iloc[indices['idx_train']]
-train_labels = train_df['DX_GROUP']
-files_id = train_df['FILE_ID']
-
-PredictedAcc = np.zeros((len(Data_Run['Results'])))
-for count in tqdm(range(len(Data_Run['Results']))):
-    PredictedAcc[count] = objective_func_class(Data_Run['methods_idx'], count, train_labels, files_id,
-                                               data_root, output_path)
-
-# Dump predictions
-pickle.dump(PredictedAcc, open(str(output_path / 'predictedAcc.pckl'), 'wb'))
-
+    abide_df = load_abide_demographics(data_root)
+    # Select only models to train on
+    train_df = abide_df.iloc[indices['idx_train']]
+    train_labels = train_df['DX_GROUP']
+    files_id = train_df['FILE_ID']

-# In[ ]:
+    PredictedAcc = np.zeros((len(Data_Run['Results'])))
+    for count in tqdm(range(len(Data_Run['Results']))):
+        PredictedAcc[count] = objective_func_class(Data_Run['methods_idx'], count, train_labels, files_id,
+                                                   data_root, output_path)
+    # Dump predictions
+    pickle.dump(PredictedAcc, open(str(output_path / 'predictedAcc.pckl'), 'wb'))
+else:
+    with open((output_path / f'predictedAcc.pckl'), 'rb') as handle:
+        PredictedAcc = pickle.load(handle)

 plt.figure()
 plt.scatter(model_embedding[0: PredictedAcc.shape[0], 0],
@@ -508,13 +473,10 @@
 plt.colorbar()
 plt.savefig(output_path / 'Predictions.png')

+"""# 5. Active Learning

-# # 5. Active Learning
-
-# Note: This step also requires the user to have previously downloaded the raw data and specified the path to it on top of this notebook.
-
-# In[ ]:
-
+Note: This step also requires the user to have previously downloaded the raw data and specified the path to it at the top of this notebook.
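+
+For intuition: `kappa` plays the role of the exploration weight in an upper-confidence-bound acquisition, which scores a candidate point by its predicted mean plus `kappa` times its predicted uncertainty, so `kappa = 10.0` explores the space broadly while `kappa = .1` exploits the region around the current best model. A minimal sketch of the idea (illustrative names only, not this repository's API):
+
+```python
+import numpy as np
+
+def ucb_scores(gp, candidates, kappa):
+    # gp: a fitted sklearn GaussianProcessRegressor
+    # candidates: (n, 2) coordinates in the low-dimensional space
+    mu, sigma = gp.predict(candidates, return_std=True)
+    return mu + kappa * sigma
+
+# next_point = candidates[np.argmax(ucb_scores(gp, candidates, kappa=10.0))]
+```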
+""" def compute_active_learning(kappa, model_config, CassOrRegression): # Load data demographics @@ -546,51 +508,42 @@ def compute_active_learning(kappa, model_config, CassOrRegression): model_config['files_id'] = train_df['FILE_ID'] model_config['output_path'] = output_path - kernel, optimizer, utility, init_points, n_iter, pbounds, nbrs, RandomSeed = initialize_bo(model_embedding, kappa) + kernel, optimizer, utility, init_points, n_iter, pbounds, nbrs, RandomSeed = \ + initialize_bo(model_embedding, kappa) BadIter = run_bo(optimizer, utility, init_points, n_iter, pbounds, nbrs, RandomSeed, model_embedding, model_config, Y, ClassOrRegression, MultivariateUnivariate=True, repetitions=False, verbose=True) - x_exploratory, y_exploratory, z_exploratory, x, y, gp, vmax, vmin = plot_bo_estimated_space(kappa, BadIter, optimizer, pbounds, model_embedding, PredictedAcc, kernel, + x_exploratory, y_exploratory, z_exploratory, x, y, gp, vmax, vmin = \ + plot_bo_estimated_space(kappa, BadIter, optimizer, pbounds, model_embedding, PredictedAcc, kernel, output_path, ClassOrRegression) corr = plot_bo_evolution(kappa, x_exploratory, y_exploratory, z_exploratory, x, y, gp, vmax, vmin, model_embedding, PredictedAcc, output_path, ClassOrRegression) return corr - - - -# In[ ]: - - -kappa = 10.0 -# path to the raw data -model_config = {} -model_config['data_root'] = data_root -ClassOrRegression = 'Classification' -corr = compute_active_learning(kappa, model_config, ClassOrRegression) -print(f'Spearman correlation {corr}') - - -# In[ ]: - - -kappa = .1 -# path to the raw data -model_config = {} -model_config['data_root'] = data_root -ClassOrRegression = 'Classification' -corr = compute_active_learning(kappa, model_config, ClassOrRegression) -print(f'Spearman correlation {corr}') - - -# ## Repetitions - -# In[ ]: +if re_runanalysis: + kappa = 10.0 + # path to the raw data + model_config = {} + model_config['data_root'] = data_root + ClassOrRegression = 'Classification' + corr = compute_active_learning(kappa, model_config, ClassOrRegression) + print(f'Spearman correlation {corr}') + +if re_runanalysis: + kappa = .1 + # path to the raw data + model_config = {} + model_config['data_root'] = data_root + ClassOrRegression = 'Classification' + corr = compute_active_learning(kappa, model_config, ClassOrRegression) + print(f'Spearman correlation {corr}') + +"""## Repetitions""" def calculate_conn(Y, files_id): TotalSubjects = len(Y) @@ -613,10 +566,6 @@ def calculate_conn(Y, files_id): rois_l[subject, :] = correlation_matrix[subject, :, :][np.triu_indices(rois.shape[1], k=1)] return rois_l - -# In[ ]: - - # Load data demographics abide_df = load_abide_demographics(data_root) @@ -630,7 +579,6 @@ def calculate_conn(Y, files_id): with open((output_path / f'indices_space_train.json'), 'rb') as handle: indices = json.load(handle) -# TODO: make this more generalisable. We will use the MDS space model_embedding = embeddings['MDS'] kappa = 10 @@ -648,102 +596,84 @@ def calculate_conn(Y, files_id): model_config['data_root'] = data_root model_config['output_path'] = output_path - -# In[ ]: - - # Check range of predictions PredictedAcc = pickle.load(open(str(output_path / "predictedAcc.pckl"), "rb")) print(f'Max {np.max(PredictedAcc)}') print(f'Min {np.min(PredictedAcc)}') print(f'Mean and std {np.mean(PredictedAcc)} and {np.std(PredictedAcc)}') - -# Note the next steps might take a few hours to run. 
- -# In[ ]: - - -n_repetitions = 20 - -BestModelGPSpace = np.zeros(n_repetitions) -BestModelGPSpaceModIndex = np.zeros(n_repetitions) -BestModelEmpirical = np.zeros(n_repetitions) -BestModelEmpiricalModIndex = np.zeros(n_repetitions) -ModelActualAccuracyCorrelation = np.zeros(n_repetitions) -cv_scores = np.zeros(n_repetitions) - -for DiffInit in range(n_repetitions): - print(f'Repetiton #: {DiffInit}') - # Define settings for the analysis - kernel, optimizer, utility, init_points, n_iter, pbounds, nbrs, RandomSeed = initialize_bo(model_embedding, kappa, repetitions=True, - DiffInit=DiffInit) - - FailedIters = run_bo(optimizer, utility, init_points, - n_iter, pbounds, nbrs, RandomSeed, - model_embedding, model_config, train_Y, - ClassOrRegress, - MultivariateUnivariate=True, - repetitions=True, - verbose=False) - gp = GaussianProcessRegressor(kernel=kernel, normalize_y=True, - n_restarts_optimizer=10) - - x_temp = np.array([[res["params"]["b1"]] for res in optimizer.res]) - y_temp = np.array([[res["params"]["b2"]] for res in optimizer.res]) - z_temp = np.array([res["target"] for res in optimizer.res]) - - x_obs = x_temp[FailedIters == 0] - y_obs = y_temp[FailedIters == 0] - z_obs = z_temp[FailedIters == 0] - - muModEmb, sigmaModEmb, gpModEmb = posteriorOnlyModels(gp, x_obs, y_obs, z_obs, - model_embedding) - - BestModelGPSpace[DiffInit] = muModEmb.max() - BestModelGPSpaceModIndex[DiffInit] = muModEmb.argmax() - BestModelEmpirical[DiffInit] = z_obs.max() - Model_coord = np.array([[x_obs[z_obs.argmax()][-1], y_obs[z_obs.argmax()][-1]]]) - BestModelEmpiricalModIndex[DiffInit] = nbrs.kneighbors(Model_coord)[1][0][0] - ModelActualAccuracyCorrelation[DiffInit] = spearmanr(muModEmb, PredictedAcc)[0] - TempModelNum = muModEmb.argmax() - - train_rois_l = calculate_conn(train_Y, train_files_id) - holdout_rois_l = calculate_conn(holdout_y, holdout_files_id) - - model = Pipeline([('scaler', StandardScaler()), ('reg', LogisticRegression(penalty='l2', random_state=0))]) - - model.fit(train_rois_l, train_Y.ravel()) - pred = model.predict(holdout_rois_l) - y_proba = model.predict_proba(holdout_rois_l)[:, 1] - score = roc_auc_score(holdout_y.ravel(), y_proba) - - #CVPValBestModels[DiffInit] = pvalue - cv_scores[DiffInit] = score - -df_best = pd.DataFrame(columns=['repetition', 'pipeline', 'derivatives', 'strategies', 'conn_metrics', 'score']) -for n in range(n_repetitions): - n_results = {} - n_results['repetition'] = n - n_results['pipeline'] = Data_Run['methods_idx'][int(BestModelGPSpaceModIndex[n])][1] - n_results['derivatives'] = Data_Run['methods_idx'][int(BestModelGPSpaceModIndex[n])][0] - n_results['strategies'] = Data_Run['methods_idx'][int(BestModelGPSpaceModIndex[n])][2] - n_results['conn_metrics'] = Data_Run['methods_idx'][int(BestModelGPSpaceModIndex[n])][3] - n_results['score'] = cv_scores[n] - df_best = df_best.append(n_results, ignore_index=True) -df_best = df_best.set_index('repetition') -# format the score column to a 3 digits -df_best['score'] = df_best['score'].apply('{:.3f}'.format) - - -# In[ ]: - - -df_best - - -# In[ ]: - - - +"""Note the next steps might take a few hours to run and it requires the ABIDE dataset.""" + +if re_runanalysis: + n_repetitions = 20 + + BestModelGPSpace = np.zeros(n_repetitions) + BestModelGPSpaceModIndex = np.zeros(n_repetitions) + BestModelEmpirical = np.zeros(n_repetitions) + BestModelEmpiricalModIndex = np.zeros(n_repetitions) + ModelActualAccuracyCorrelation = np.zeros(n_repetitions) + cv_scores = np.zeros(n_repetitions) + + for DiffInit in 
range(n_repetitions):
+        print(f'Repetition #: {DiffInit}')
+        # Define settings for the analysis
+        kernel, optimizer, utility, init_points, n_iter, pbounds, nbrs, RandomSeed = \
+            initialize_bo(model_embedding, kappa, repetitions=True,
+                          DiffInit=DiffInit)
+
+        FailedIters = run_bo(optimizer, utility, init_points,
+                             n_iter, pbounds, nbrs, RandomSeed,
+                             model_embedding, model_config, train_Y,
+                             ClassOrRegress,
+                             MultivariateUnivariate=True,
+                             repetitions=True,
+                             verbose=False)
+        gp = GaussianProcessRegressor(kernel=kernel, normalize_y=True,
+                                      n_restarts_optimizer=10)
+
+        x_temp = np.array([[res["params"]["b1"]] for res in optimizer.res])
+        y_temp = np.array([[res["params"]["b2"]] for res in optimizer.res])
+        z_temp = np.array([res["target"] for res in optimizer.res])
+
+        x_obs = x_temp[FailedIters == 0]
+        y_obs = y_temp[FailedIters == 0]
+        z_obs = z_temp[FailedIters == 0]
+
+        muModEmb, sigmaModEmb, gpModEmb = posteriorOnlyModels(gp, x_obs, y_obs, z_obs,
+                                                              model_embedding)
+
+        BestModelGPSpace[DiffInit] = muModEmb.max()
+        BestModelGPSpaceModIndex[DiffInit] = muModEmb.argmax()
+        BestModelEmpirical[DiffInit] = z_obs.max()
+        Model_coord = np.array([[x_obs[z_obs.argmax()][-1], y_obs[z_obs.argmax()][-1]]])
+        BestModelEmpiricalModIndex[DiffInit] = nbrs.kneighbors(Model_coord)[1][0][0]
+        ModelActualAccuracyCorrelation[DiffInit] = spearmanr(muModEmb, PredictedAcc)[0]
+        TempModelNum = muModEmb.argmax()
+
+        train_rois_l = calculate_conn(train_Y, train_files_id)
+        holdout_rois_l = calculate_conn(holdout_y, holdout_files_id)
+
+        model = Pipeline([('scaler', StandardScaler()), ('reg', LogisticRegression(penalty='l2', random_state=0))])
+
+        model.fit(train_rois_l, train_Y.ravel())
+        pred = model.predict(holdout_rois_l)
+        y_proba = model.predict_proba(holdout_rois_l)[:, 1]
+        score = roc_auc_score(holdout_y.ravel(), y_proba)
+
+        #CVPValBestModels[DiffInit] = pvalue
+        cv_scores[DiffInit] = score
+
+    df_best = pd.DataFrame(columns=['repetition', 'pipeline', 'derivatives', 'strategies', 'conn_metrics', 'score'])
+    for n in range(n_repetitions):
+        n_results = {}
+        n_results['repetition'] = n
+        n_results['pipeline'] = Data_Run['methods_idx'][int(BestModelGPSpaceModIndex[n])][1]
+        n_results['derivatives'] = Data_Run['methods_idx'][int(BestModelGPSpaceModIndex[n])][0]
+        n_results['strategies'] = Data_Run['methods_idx'][int(BestModelGPSpaceModIndex[n])][2]
+        n_results['conn_metrics'] = Data_Run['methods_idx'][int(BestModelGPSpaceModIndex[n])][3]
+        n_results['score'] = cv_scores[n]
+        df_best = df_best.append(n_results, ignore_index=True)
+    df_best = df_best.set_index('repetition')
+    # format the score column to three decimal places
+    df_best['score'] = df_best['score'].apply('{:.3f}'.format)
diff --git a/notebooks/multiverse_analysis_regression.ipynb b/notebooks/multiverse_analysis_regression.ipynb
index a1259fe..cede9c4 100644
--- a/notebooks/multiverse_analysis_regression.ipynb
+++ b/notebooks/multiverse_analysis_regression.ipynb
@@ -3,7 +3,7 @@
  "nbformat_minor": 0,
  "metadata": {
   "colab": {
-   "name": "Copy of multiverse_analysis cleaner try",
+   "name": "into-the-multiverse-regression",
    "provenance": [],
    "collapsed_sections": []
   },
diff --git a/notebooks/multiverse_analysis_regression.py b/notebooks/multiverse_analysis_regression.py
index 6771eae..1b20c1f 100644
--- a/notebooks/multiverse_analysis_regression.py
+++ b/notebooks/multiverse_analysis_regression.py
@@ -1,19 +1,40 @@
 # -*- coding: utf-8 -*-
-"""Script to run the multiverse regression analysis. This was autogenerated from
-the jupyter notebook.
+"""into-the-multiverse-regression -Instead of running the notebok this python file can be run. However, take into -account that it mike take about 24hrs to run the entire script. +Automatically generated by Colaboratory. -This script assumes: -1. that the into=the-multiverse github repo -(git clone https://github.com/Mind-the-Pineapple/into-the-multiverse/) -has been already cloned locally. -2. There is a python enviroment with the required packaged installed. -3. That the data has been downloaded into the path into-the-multiverse/data/age. -The data can be downloaded from here: -https://figshare.com/articles/Data_for_Conservative_and_disruptive_modes_of_adolescent_change_in_human_brain_functional_connectivity_/11551602 +Original file is located at + https://colab.research.google.com/drive/1dcijYwsjw6eGoTBjEq9RkrliJwxL6FH- +This notebook can be run in two ways: +- Run all cells from beginning to end. However, this is a time-consuming process that will take about 10hrs. Note that the maximal runtime of a colab notebook is 12hrs. If a section is time-consuming, then the estimated time will be reported at the beginning of the section. + +- Run the different sections independently. The necessary files are provided in the output folder and are saved as pickle at the end of each section. + +If not downloaded to the your machine all the new generated pickles will be lost once the colab session is terminated. +If you want to download a file to your machine you can use: +```python +from google.colab import files +files.download(str(output_path / 'file_name.p')) +``` + +# 1. Setting up the enviroment +""" + +# Cloning the git repo with the data structure and complentary files +# Note: to run the code in this notebook, you will have to accept that the +# code was not developed by google. +!git clone https://github.com/Mind-the-Pineapple/into-the-multiverse/ + +# Install necessary python dependencies +! pip install -r into-the-multiverse/requirements.txt + +"""Note: Remember to restart the runtime by clicking on the button above to have the same version of matplotlib, mpl_toolkits, numpy as specified in the requirement.txt file. + +## Download the Data + +All the data used for this project is [public availiable](https://figshare.com/articles/Data_for_Conservative_and_disruptive_modes_of_adolescent_change_in_human_brain_functional_connectivity_/11551602) and consists of 520 scans from 298 healthy [Váša et. 
al, 2020](https://www.pnas.org/content/117/6/3248) individuals (age 14-26, mean age = 19.24, see for +details) """ from pathlib import Path @@ -23,6 +44,13 @@ if not data_path.is_dir(): data_path.mkdir(parents=True) +!wget -O into-the-multiverse/data/age/nspn.fmri.main.RData https://ndownloader.figshare.com/files/20958708 + +!wget -O into-the-multiverse/data/age/nspn.fmri.gsr.RData https://ndownloader.figshare.com/files/20958699 + +!wget -O into-the-multiverse/data/age/nspn.fmri.lowmot.RData https://ndownloader.figshare.com/files/20958702 + +!wget -O into-the-multiverse/data/age/nspn.fmri.general.vars.RData https://ndownloader.figshare.com/files/20819796 """## Define key variables @@ -38,7 +66,7 @@ import pickle import random -import pyreadr +import pyreadr import numpy as np import matplotlib.pyplot as plt import matplotlib.colorbar @@ -90,7 +118,7 @@ def get_variables_of_interest(): 'gateway degree': gateway_coef_sign, 'k-core centrality': bct.kcoreness_centrality_bu, } - #Get info about brain regions and find Yeo network IDs; useful later on for + #Get info about brain regions and find Yeo network IDs; useful later on for # graph metrics that need community labels. KeptIDs = np.asarray(genVar['hcp.keep.id']) YeoIDs = np.asarray(genVar['yeo.id.subc']) @@ -100,7 +128,7 @@ def get_variables_of_interest(): n_regions = 346 subject_array = 520 - #Get motion regression functional connectivity data and reshape into + #Get motion regression functional connectivity data and reshape into # region x region x subject array FC = np.asarray(data1['fc.main']) MainNoNan = np.nan_to_num(FC,copy=True,nan=1.0) @@ -118,15 +146,15 @@ def get_variables_of_interest(): IDMain=np.asarray(data1['id.main']) ages=np.asarray(data1['age.main']) - #Find unique subject IDs and index of first instance and find FC data + #Find unique subject IDs and index of first instance and find FC data # corresponding to these indices IDs,IDIndexUnique = np.unique(IDMain,return_index=True) MainNoNanReshapeUnique = MainNoNanReshape[:,:,IDIndexUnique] GSRNoNanReshapeUnique = GSRNoNanReshape[:,:,IDIndexUnique] AgesUnique = ages[IDIndexUnique] - - # Number of randomly selected subjects to be used to define the low-dimensional - # space then split FC data and age data into two: 50 for defining space and + + # Number of randomly selected subjects to be used to define the low-dimensional + # space then split FC data and age data into two: 50 for defining space and #remaining 248 for subsequent prediction SpaceDefineIdx = 50 LockBoxDataIdx = 100 @@ -134,11 +162,11 @@ def get_variables_of_interest(): MainNoNanModelSpace = MainNoNanReshapeUnique[:,:,RandomIndexes[0:SpaceDefineIdx]] MainNoNanLockBoxData = MainNoNanReshapeUnique[:, :, RandomIndexes[SpaceDefineIdx:LockBoxDataIdx]] MainNoNanPrediction = MainNoNanReshapeUnique[:,:,RandomIndexes[LockBoxDataIdx:]] - + GSRNoNanModelSpace = GSRNoNanReshapeUnique[:,:,RandomIndexes[0:SpaceDefineIdx]] GSRNoNanLockBoxData = GSRNoNanReshapeUnique[:,:,RandomIndexes[SpaceDefineIdx:LockBoxDataIdx]] GSRNoNanPrediction = GSRNoNanReshapeUnique[:,:,RandomIndexes[LockBoxDataIdx:]] - + AgesModelSpace = AgesUnique[RandomIndexes[0:SpaceDefineIdx]] AgesLockBoxData = AgesUnique[RandomIndexes[SpaceDefineIdx:LockBoxDataIdx]] AgesPrediction = AgesUnique[RandomIndexes[LockBoxDataIdx:]] @@ -199,28 +227,28 @@ def get_variables_of_interest(): x = bct.threshold_proportional(TempData[:,:,SubNum], TempThreshold, copy=True) ss = analysis_space(BCT_Num, BCT_models, x, KeptYeoIDs) - #For each subject for each approach keep the 346 
regional values. - TempResults[SubNum, :] = ss + #For each subject for each approach keep the 346 regional values. + TempResults[SubNum, :] = ss BCT_Run[count] = BCT_Num; Sparsities_Run[count] = TempThreshold Data_Run[count] = DataPreproc GroupSummary[count] ='Mean' # Build an array of similarities between subjects for each - # analysis approach - cos_sim = cosine_similarity(TempResults, TempResults) + # analysis approach + cos_sim = cosine_similarity(TempResults, TempResults) Results[count, :] = np.mean(TempResults, axis=0) - ResultsIndVar[count, :] = cos_sim[np.triu_indices(TotalSubjects, k=1)].T + ResultsIndVar[count, :] = cos_sim[np.triu_indices(TotalSubjects, k=1)].T count += 1 - pbar.update(1) - + pbar.update(1) + ModelsResults={"Results": Results, "ResultsIndVar": ResultsIndVar, "BCT": BCT_Run, - "Sparsities": Sparsities_Run, - "Data": Data_Run, + "Sparsities": Sparsities_Run, + "Data": Data_Run, "SummaryStat": GroupSummary} - + pickle.dump( ModelsResults, open(str(output_path / "ModelsResults.p"), "wb" ) ) """# 3. Building and analysing the low-dimensional space @@ -261,7 +289,7 @@ def get_variables_of_interest(): n_neighbors = 20 n_components = 2 #number of components requested. In this case for a 2D space. -#Define different dimensionality reduction techniques +#Define different dimensionality reduction techniques methods = OrderedDict() LLE = partial(manifold.LocallyLinearEmbedding, n_neighbors, n_components, eigen_solver='dense') @@ -302,14 +330,14 @@ def get_variables_of_interest(): for idx_bct, bct_model in enumerate(BCT_models): axs[idx_method].scatter(YTemp[:,0][BCTTemp==bct_model], YTemp[:,1][BCTTemp==bct_model], - c=SparsitiesTemp[BCTTemp==bct_model], + c=SparsitiesTemp[BCTTemp==bct_model], marker=markers[idx_bct], cmap=colourmaps[preprocessing], s=80) Lines[idx_bct] = mlines.Line2D([], [], color='black', linestyle='None', - marker=markers[idx_bct], markersize=10, + marker=markers[idx_bct], markersize=10, label=bct_model) # For visualisation purposes show the y and x labels only ons specific plots - if idx_method % 2 == 0: + if idx_method % 2 == 0: axs[idx_method].set_ylabel('Dimension 1',fontsize=20) if (idx_method == 4) or (idx_method == 5): axs[idx_method].set_xlabel('Dimension 2',fontsize=20) @@ -326,9 +354,9 @@ def get_variables_of_interest(): IntensityPatch1 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='threshold: 0.4', alpha=1) -IntensityPatch2 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='threshold: 0.1', +IntensityPatch2 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='threshold: 0.1', alpha=0.4) -IntensityPatch3 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='threshold: 0.01', +IntensityPatch3 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='threshold: 0.01', alpha=0.1) BlankLine=mlines.Line2D([], [], linestyle='None') @@ -344,7 +372,7 @@ def get_variables_of_interest(): gsDE.savefig(str(output_path / 'DifferentEmbeddings.svg'), format="svg", bbox_inches='tight') gsDE.show() -methods['MDS'] = manifold.MDS(n_components, max_iter=100, n_init=10, +methods['MDS'] = manifold.MDS(n_components, max_iter=100, n_init=10, random_state=21, metric=True) #Do the same as above but for MDS @@ -360,18 +388,18 @@ def get_variables_of_interest(): SparsitiesTemp=Sparsities[Data==preprocessing] YTemp=Y[Data==preprocessing,:] Lines={} - + for idx_bct, bct_model in enumerate(BCT_models): axs.scatter(YTemp[:,0][BCTTemp==bct_model], YTemp[:,1][BCTTemp==bct_model], - c=SparsitiesTemp[BCTTemp==bct_model], + c=SparsitiesTemp[BCTTemp==bct_model], marker=markers[idx_bct], 
norm=matplotlib.colors.Normalize( vmin=np.min(SparsitiesTemp[BCTTemp==bct_model]), vmax=np.max(SparsitiesTemp[BCTTemp==bct_model])), cmap=colourmaps[preprocessing], s=120) Lines[idx_bct] = mlines.Line2D([], [], color='black', linestyle='None', - marker=markers[idx_bct], markersize=10, + marker=markers[idx_bct], markersize=10, label=bct_model) axs.spines['top'].set_linewidth(1.5) @@ -391,9 +419,9 @@ def get_variables_of_interest(): IntensityPatch1 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='threshold: 0.4', alpha=1) -IntensityPatch2 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='threshold: 0.1', +IntensityPatch2 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='threshold: 0.1', alpha=0.4) -IntensityPatch3 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='threshold: 0.01', +IntensityPatch3 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='threshold: 0.01', alpha=0.1) BlankLine=mlines.Line2D([], [], linestyle='None') @@ -405,7 +433,7 @@ def get_variables_of_interest(): Lines[12],Lines[13],Lines[14],Lines[15]],fontsize=15, frameon=False,bbox_to_anchor=(1.4, 0.8),bbox_transform=axs.transAxes) - + figMDS.savefig(str(output_path / 'MDSSpace.png'), dpi=300) figMDS.savefig(str(output_path /'MDSSpace.svg'), format="svg") @@ -427,7 +455,7 @@ def get_variables_of_interest(): diss_tsne = get_dissimilarity_n_neighbours(neighbours_orig, neighbours_tsne) del neighbours_tsne -neighbours_lle, _ = get_models_neighbours(N, n_neighbors_step, +neighbours_lle, _ = get_models_neighbours(N, n_neighbors_step, data_reduced['LLE']) diss_lle = get_dissimilarity_n_neighbours(neighbours_orig,neighbours_lle) del neighbours_lle @@ -536,7 +564,7 @@ def get_variables_of_interest(): from sklearn.preprocessing import StandardScaler from sklearn.pipeline import Pipeline -from helperfunctions import (initialize_bo, run_bo, posterior, +from helperfunctions import (initialize_bo, run_bo, posterior, posteriorOnlyModels, display_gp_mean_uncertainty, plot_bo_estimated_space, plot_bo_evolution, analysis_space, plot_bo_repetions) @@ -578,22 +606,22 @@ def get_variables_of_interest(): kernel, optimizer, utility, init_points, n_iter, pbounds, nbrs, RandomSeed = \ initialize_bo(ModelEmbedding, kappa) -# Perform optimization. Given that the space is continuous and the analysis -# approaches are not, we penalize suggestions that are far from any actual +# Perform optimization. Given that the space is continuous and the analysis +# approaches are not, we penalize suggestions that are far from any actual # analysis approaches. For these suggestions the registered value is set to the # lowest value from the burn in. These points (BadIters) are only used # during search but exluded when recalculating the GP regression after search. BadIter = run_bo(optimizer, utility, init_points, n_iter, pbounds, nbrs, RandomSeed, - ModelEmbedding, model_config, + ModelEmbedding, model_config, AgesPrediction, ClassOrRegression, MultivariateUnivariate=True, verbose=False) x_exploratory, y_exploratory, z_exploratory, x, y, gp, vmax, vmin = \ plot_bo_estimated_space(kappa, BadIter, - optimizer, pbounds, - ModelEmbedding, PredictedAcc, + optimizer, pbounds, + ModelEmbedding, PredictedAcc, kernel, output_path, ClassOrRegression) # Display the results of the active search and the evolution of the search @@ -611,22 +639,22 @@ def get_variables_of_interest(): kernel, optimizer, utility, init_points, n_iter, pbounds, nbrs, RandomSeed = \ initialize_bo(ModelEmbedding, kappa) -# Perform optimization. 
Given that the space is continuous and the analysis -# approaches are not, we penalize suggestions that are far from any actual +# Perform optimization. Given that the space is continuous and the analysis +# approaches are not, we penalize suggestions that are far from any actual # analysis approaches. For these suggestions the registered value is set to the # lowest value from the burn in. These points (BadIters) are only used # during search but exluded when recalculating the GP regression after search. BadIter = run_bo(optimizer, utility, init_points, n_iter, pbounds, nbrs, RandomSeed, - ModelEmbedding, model_config, + ModelEmbedding, model_config, AgesPrediction, ClassOrRegression, MultivariateUnivariate=True, verbose=False) x_exploratory, y_exploratory, z_exploratory, x, y, gp, vmax, vmin = \ plot_bo_estimated_space(kappa, BadIter, - optimizer, pbounds, - ModelEmbedding, PredictedAcc, + optimizer, pbounds, + ModelEmbedding, PredictedAcc, kernel, output_path, ClassOrRegression) # Display the results of the active search and the evolution of the search @@ -670,11 +698,11 @@ def get_variables_of_interest(): kernel, optimizer, utility, init_points, n_iter, pbounds, nbrs, RandomSeed = \ initialize_bo(ModelEmbedding, kappa, repetitions=True, DiffInit=DiffInit) - + # Run BO on the Prediction again FailedIters = run_bo(optimizer, utility, init_points, n_iter, pbounds, nbrs, RandomSeed, - ModelEmbedding, model_config, + ModelEmbedding, model_config, AgesPrediction, ClassOrRegression, MultivariateUnivariate=True, repetitions=True, @@ -689,7 +717,7 @@ def get_variables_of_interest(): x_obs = x_temp[FailedIters==0] y_obs = y_temp[FailedIters==0] z_obs = z_temp[FailedIters==0] - + muModEmb, sigmaModEmb, gpModEmb = posteriorOnlyModels(gp, x_obs, y_obs, z_obs, ModelEmbedding) @@ -699,7 +727,7 @@ def get_variables_of_interest(): Model_coord = np.array([[x_obs[z_obs.argmax()][-1], y_obs[z_obs.argmax()][-1]]]) BestModelEmpiricalModIndex[DiffInit] = nbrs.kneighbors(Model_coord)[1][0][0] ModelActualAccuracyCorrelation[DiffInit] = spearmanr(muModEmb, PredictedAcc)[0] - + TempModelNum = muModEmb.argmax() TempThreshold = Sparsities_Run[TempModelNum] BCT_Num = BCT_Run[TempModelNum] @@ -716,7 +744,7 @@ def get_variables_of_interest(): TempDataPredictions = MainNoNanPrediction TotalSubjectslock = TempDataLockBox.shape[2] - TotalSubjectsPredictions = TempDataPredictions.shape[2] + TotalSubjectsPredictions = TempDataPredictions.shape[2] TempResultsLockData = np.zeros([TotalSubjectslock, n_regions]) for SubNum in range(0, TotalSubjectslock): @@ -724,7 +752,7 @@ def get_variables_of_interest(): x = bct.threshold_proportional(TempDataLockBox[:, :, SubNum], TempThreshold, copy=True) ss = analysis_space(BCT_Num, BCT_models, x, KeptYeoIDs) - TempResultsLockData[SubNum, :] = ss + TempResultsLockData[SubNum, :] = ss TempPredictionsData = np.zeros([TotalSubjectsPredictions, n_regions]) for SubNum in range(0, TotalSubjectsPredictions): @@ -732,7 +760,7 @@ def get_variables_of_interest(): x = bct.threshold_proportional(TempDataPredictions[:, :, SubNum], TempThreshold, copy=True) ss = analysis_space(BCT_Num, BCT_models, x, KeptYeoIDs) - TempPredictionsData[SubNum, :] = ss + TempPredictionsData[SubNum, :] = ss model = Pipeline([('scaler', StandardScaler()), ('svr', SVR())]) all_data = np.concatenate((TempPredictionsData, TempResultsLockData)) @@ -741,7 +769,7 @@ def get_variables_of_interest(): ps = PredefinedSplit(test_fold) mae, perm_score, p_val = permutation_test_score(model, all_data, all_ages, n_jobs=None, 
random_state=5, verbose=0, - groups=None, cv=ps, n_permutations=n_permutations, + groups=None, cv=ps, n_permutations=n_permutations, scoring="neg_mean_absolute_error") cv_mae[DiffInit] = mae CVPValBestModels[DiffInit] = p_val @@ -757,17 +785,17 @@ def get_variables_of_interest(): import pandas as pd # Obtain the list of 20 models that were defined as the best models -df = pd.DataFrame({'Data_Run': Data_Run,'sparsities': Sparsities_Run, +df = pd.DataFrame({'Data_Run': Data_Run,'sparsities': Sparsities_Run, 'bct': BCT_Run}) df_best = df.iloc[BestModelEmpiricalModIndex] -df_best['mae']= cv_mae +df_best['mae']= cv_mae df_best['p-val'] = CVPValBestModels df_best repetions_results = { 'dataframe': df_best, 'BestModelGPSpaceModIndex': BestModelGPSpaceModIndex, - 'BestModelEmpiricalIndex': BestModelEmpiricalModIndex, + 'BestModelEmpiricalIndex': BestModelEmpiricalModIndex, 'BestModelEmpirical': BestModelEmpirical, 'ModelActualAccuracyCorrelation': ModelActualAccuracyCorrelation }