Commit
add new scripts
timonmerk committed Oct 15, 2024
1 parent 670a7dd commit aa78bc9
Showing 18 changed files with 5,484 additions and 132 deletions.
74 changes: 74 additions & 0 deletions combine_data_Catboost_multiple_timepoints.py
@@ -0,0 +1,74 @@
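# Descriptive note: this script builds lagged feature matrices (file name suggests
# they feed a CatBoost model): for each run of consecutive 2-minute PKG samples,
# the preceding `consec_duration` rows are flattened into one wide row with
# "lag_{i}_" prefixed columns, and one CSV is written per window length.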
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os


if __name__ == "__main__":
    PATH_FIGURES = "/Users/Timon/Library/CloudStorage/OneDrive-Charité-UniversitätsmedizinBerlin/Shared Documents - ICN Data World/General/Data/UCSF_OLARU/figures_ucsf"
    PATH_READ = "/Users/Timon/Library/CloudStorage/OneDrive-Charité-UniversitätsmedizinBerlin/Shared Documents - ICN Data World/General/Data/UCSF_OLARU/features/merged_normalized_10s_window_length/480"
    df_all = pd.read_csv(os.path.join(PATH_READ, "all_merged_normed.csv"), index_col=0)
    df_all["pkg_dt"] = pd.to_datetime(df_all["pkg_dt"])
    subs = df_all["sub"].unique()

    def running_counter(values):
        # count consecutive non-zero entries, resetting at every zero
        counter = 0
        result = []
        for value in values:
            if value == 0:
                counter = 0  # Reset counter on zero
            else:
                counter += 1
            result.append(counter)
        return result

    # time difference between consecutive PKG timestamps in minutes
    df_all["time_diff"] = df_all["pkg_dt"].diff().dt.total_seconds() / 60
    df_all["counter"] = 0
    df_all.loc[df_all["time_diff"] == 2, "counter"] = 1

    df_all["cum_sum"] = running_counter(df_all["counter"].values)
    # keep only the pkg and cum_sum columns, the fft features, and the sub column
    df_all_filter = df_all[[c for c in df_all.columns if c.startswith("pkg") or c.startswith("cum_sum") or "fft" in c or "sub" == c]]

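    # For every index with at least `consec_duration` consecutive 2-minute samples,
    # flatten the preceding `consec_duration` rows into one wide row of lagged features.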
    for consec_duration in [2, 5, 10, 20, 50, 100, 200, 500]:
        idx_larger_ = np.where(df_all["cum_sum"] >= consec_duration)
        # for each such index, extract the previous consec_duration rows and append them to a list
        l_df = []
        for idx in idx_larger_[0]:
            df_ = df_all_filter.iloc[idx - consec_duration:idx]
            # prefix each row's columns with "lag_{i}_" (i = 0 keeps the original names)
            l_row = []
            for i in range(consec_duration):
                row = df_.iloc[i]
                if i > 0:
                    row = row.rename({c: f"lag_{i}_{c}" for c in row.index})
                l_row.append(row)
            l_df.append(pd.concat(l_row, axis=0))
        df_duration = pd.DataFrame(l_df)
        print(f"Duration: {consec_duration}")
        print(df_duration.shape)
        df_duration.to_csv(os.path.join(PATH_READ, f"all_merged_normed_merge_{consec_duration}_consec.csv"))



    PLT_ = False
    if PLT_:
        # histogram of consecutive-recording lengths (cum_sum counts 2-minute steps)
        plt.hist(df_all["cum_sum"] * 2, bins=50, density=False)
        plt.xlabel("Minutes")
        plt.title("Histogram of time series lengths")
        plt.savefig(os.path.join(PATH_FIGURES, "histogram_time_series_lengths.pdf"))
        plt.show(block=True)

        # same distribution plotted as a CDF
        plt.hist(df_all["cum_sum"] * 2, bins=50, density=True, cumulative=True, histtype='step')
        plt.xlabel("Minutes")
        plt.xlim(0, 610)
        plt.title("CDF of time series lengths")
        # plt.savefig(os.path.join(PATH_FIGURES, "cdf_time_series_lengths.pdf"))
        plt.show(block=True)
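
The file name suggests the lagged CSVs written above are meant to be consumed by a CatBoost model, but no training code is part of this diff. Below is a minimal, hypothetical sketch of how one of the written files might be used, assuming `pkg_dk` is the regression target and `sub` identifies the recording for grouped cross-validation; the chosen file name, target, and split strategy are illustrative assumptions, not part of the commit.

import os
import pandas as pd
from catboost import CatBoostRegressor  # assumed dependency, not part of this commit
from sklearn.model_selection import GroupKFold, cross_val_score

PATH_READ = "/path/to/merged_normalized_10s_window_length/480"  # placeholder path
df = pd.read_csv(os.path.join(PATH_READ, "all_merged_normed_merge_10_consec.csv"), index_col=0)
df = df.dropna(subset=["pkg_dk"])  # assuming pkg_dk is the regression target

X = df[[c for c in df.columns if "fft" in c]]  # lagged spectral features
y = df["pkg_dk"]
groups = df["sub"]  # keep each subject's data within a single fold

model = CatBoostRegressor(verbose=False)
scores = cross_val_score(model, X, y, groups=groups, cv=GroupKFold(n_splits=3))
print(scores)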
105 changes: 105 additions & 0 deletions connectome_lead_mapper_compute.py
@@ -0,0 +1,105 @@
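# Descriptive note: leave-one-subject-out R-map analysis. For every (classification
# flag, PKG label, electrode location) combination, an R-map is fit on the training
# subjects' connectivity fingerprints and decoding performances; the held-out
# subject's channels are ranked by spatial correlation with that R-map, and the
# best channel per hemisphere is stored in df_ch_sel_RMAP.csv.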
import pickle
import pandas as pd
import numpy as np
from py_neuromodulation import nm_RMAP, nm_stats
from matplotlib import pyplot as plt
import os
import nibabel as nib
import seaborn as sns

PATH_CONNECTIVITY = r"/Users/Timon/Library/CloudStorage/OneDrive-Charité-UniversitätsmedizinBerlin/Shared Documents - ICN Data World/General/Data/UCSF_OLARU/out_per/out_connectivity/out_conn"
PATH_PER = r'/Users/Timon/Library/CloudStorage/OneDrive-Charité-UniversitätsmedizinBerlin/Shared Documents - ICN Data World/General/Data/UCSF_OLARU/out_per/out_dir/df_per_ind_all_coords.csv'

PLT_ = False

df_per = pd.read_csv(PATH_PER, index_col=0)
l_ch_sel = []

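# df_per holds one decoding performance ("per") per subject-hemisphere and channel;
# l_ch_sel collects the R-map-selected channel for each condition.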
for class_idx, CLASSIFICATION in enumerate([True, False]):
    plt.figure(figsize=(10, 10))
    plt_idx = 0
    for label_idx, label_name in enumerate(["pkg_dk", "pkg_bk", "pkg_tremor"]):
        for loc_idx, loc in enumerate(["ECOG", "STN", "GP"]):
            df_disease = df_per.query(f"classification == {CLASSIFICATION} and label == '{label_name}' and loc == '{loc}'")

            subs = np.unique([sub[:-1] for sub in df_disease["sub"].unique()])
            df_disease["sub_ind"] = df_disease["sub"].apply(lambda x: x[:-1])
            df_disease["hemisphere"] = df_disease["sub"].apply(lambda x: x[-1])
            per_test_ = []
            pred_corr_test = []

            for sub_test in df_disease["sub_ind"].unique():
                l_fp_train = []
                per_train = []
                l_fp_test = []
                per_test = []
                chs_test = []
                hemispheres_test = []

                affine = None

                for idx, row in df_disease.iterrows():
                    f_name = f'{row["sub"]}_ROI_{row["ch_orig"]}_conn-PPMI74P15CxPatients_desc-AvgRFz_funcmap.nii'
                    f_path = os.path.join(PATH_CONNECTIVITY, f_name)
                    rmapsel = nm_RMAP.RMAPCross_Val_ChannelSelector()
                    fp = rmapsel.load_fingerprint(f_path)
                    if affine is None:
                        affine = rmapsel.affine

                    if row["sub_ind"] == sub_test:
                        l_fp_test.append(fp)
                        per_test.append(row["per"])
                        chs_test.append(row["ch_orig"])
                        hemispheres_test.append(row["hemisphere"])
                    else:
                        l_fp_train.append(fp)
                        per_train.append(row["per"])

                rmap = nm_RMAP.RMAPCross_Val_ChannelSelector().get_RMAP(np.array(l_fp_train).T, np.array(per_train))

                # nm_RMAP.RMAPCross_Val_ChannelSelector().save_Nii(rmap, affine, os.path.join(PATH_CONNECTIVITY, f"rmap_func_{label_name}_class_{CLASSIFICATION}_loc_{loc}.nii"))
                corr_test = [np.corrcoef(
                    np.nan_to_num(rmap.flatten()),
                    np.nan_to_num(fp.flatten())
                )[0][1] for fp in l_fp_test]
                per_test_.append(per_test)
                pred_corr_test.append(corr_test)

                df_test_pr = pd.DataFrame({"per": per_test, "corr": corr_test, "hemisphere": hemispheres_test, "ch": chs_test})

                for hem in df_test_pr["hemisphere"].unique():
                    df_test_sub = df_test_pr.query(f"hemisphere == '{hem}'")
                    ch_sel = df_test_sub["ch"].values[np.argmax(df_test_sub["corr"].values)]
                    l_ch_sel.append({
                        "label": label_name,
                        "CLASSIFICATION": CLASSIFICATION,
                        "loc": loc,
                        "sub": sub_test,
                        "ch": ch_sel,
                        "hemisphere": hem,
                        "corr": np.max(df_test_sub["corr"].values),
                        "per": df_test_sub.query(f"ch == '{ch_sel}'")["per"].values[0]
                    })
                    # chs_test[np.argmax(corr_test)]

            plt_idx += 1
            if PLT_:
                plt.subplot(3, 3, plt_idx)
                df_plt = pd.DataFrame({"per": np.concatenate(per_test_), "corr": np.concatenate(pred_corr_test)})
                sns.regplot(data=df_plt, x="corr", y="per")
                corr___ = np.corrcoef(np.concatenate(per_test_), np.concatenate(pred_corr_test))[0, 1]
                print(corr___)
                _, p = nm_stats.permutationTestSpearmansRho(np.concatenate(per_test_), np.concatenate(pred_corr_test), False, None, 1000)
                plt.title(f"{label_name} {loc}\n" +
                          f"cor = {np.round(corr___, 2)} p={np.round(p, 3)}")
    if PLT_:
        plt.suptitle(f"CLASS: {CLASSIFICATION}")
        plt.tight_layout()
        plt.show(block=True)
    print("ha")

df_ch_sel = pd.DataFrame(l_ch_sel)
df_ch_sel.to_csv(os.path.join(PATH_CONNECTIVITY, "df_ch_sel_RMAP.csv"))

# df_ch_sel.query("CLASSIFICATION == True and loc == 'STN' and label == 'pkg_dk'").sort_values("sub")
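
For orientation, an R-map in this kind of analysis is conventionally the voxel-wise Pearson correlation between connectivity fingerprints and a per-channel performance measure. The NumPy sketch below illustrates that idea only; it is not the py_neuromodulation implementation called above, and rmap_sketch is a hypothetical helper.

import numpy as np

def rmap_sketch(fingerprints: np.ndarray, performances: np.ndarray) -> np.ndarray:
    # fingerprints: (n_channels, n_voxels) flattened connectivity maps
    # performances: (n_channels,) decoding performance per channel
    fp = np.nan_to_num(fingerprints)
    fp_centered = fp - fp.mean(axis=0)
    per_centered = performances - performances.mean()
    numerator = fp_centered.T @ per_centered
    denominator = np.sqrt((fp_centered ** 2).sum(axis=0)) * np.sqrt((per_centered ** 2).sum())
    return numerator / (denominator + 1e-12)  # (n_voxels,) Pearson r per voxel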
122 changes: 122 additions & 0 deletions connectome_lead_mapper_write_data.py
@@ -0,0 +1,122 @@
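# Descriptive note: this script assembles per-subject feature tables. Channels are
# either the most-recorded ones (SELECT_ALL branch) or the R-map-selected ones from
# df_ch_sel_RMAP.csv, and their columns are renamed to canonical names
# (ch_cortex_1, ch_subcortex_1, ...) so data from different subjects can be pooled.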
import pandas as pd
import numpy as np
import os

PATH_RMAP_CH = '/Users/Timon/Library/CloudStorage/OneDrive-Charité-UniversitätsmedizinBerlin/Shared Documents - ICN Data World/General/Data/UCSF_OLARU/out_per/out_connectivity/out_conn/df_ch_sel_RMAP.csv'
PATH_DATA = '/Users/Timon/Library/CloudStorage/OneDrive-Charité-UniversitätsmedizinBerlin/Shared Documents - ICN Data World/General/Data/UCSF_OLARU/features/merged_std_10s_window_length_all_ch'
PATH_OUT = '/Users/Timon/Library/CloudStorage/OneDrive-Charité-UniversitätsmedizinBerlin/Shared Documents - ICN Data World/General/Data/UCSF_OLARU/features/merged_rmap'

cortex_ch_names = ['8-9', '8-10', '10-11', '9-11', '8-11', '9-10']
subcortex_ch_names = ['0-2', '0-1', '1-2', '1-3', '2-3', '0-3']

def get_most_recorded_chs(df_all):
    # get the two most recorded cortex channels
    chs_available = [f[:f.find("_raw_")] for f in df_all.columns if "raw_" in f and "mean" in f]
    size_ = []
    for ch in cortex_ch_names:
        if ch in chs_available:
            dat_ = df_all[f"{ch}_raw_mean"]
            # count the number of non-NaN values
            size_.append(dat_.dropna().shape[0])
        else:
            size_.append(0)
    # get the names of the two most recorded cortex channels
    ch_cortex_sel = [cortex_ch_names[i] for i in np.argsort(size_)[::-1][:2]]
    ch_cortex_sel = sorted(ch_cortex_sel, key=lambda x: int(x.split('-')[0]))

    size_ = []
    for ch in subcortex_ch_names:
        if ch in chs_available:
            dat_ = df_all[f"{ch}_raw_mean"]
            # count the number of non-NaN values
            size_.append(dat_.dropna().shape[0])
        else:
            size_.append(0)
    # get the names of the two most recorded subcortex channels
    ch_subcortex_sel = [subcortex_ch_names[i] for i in np.argsort(size_)[::-1][:2]]
    ch_subcortex_sel = sorted(ch_subcortex_sel, key=lambda x: int(x.split('-')[0]))
    return ch_cortex_sel, ch_subcortex_sel

df_rmap_ch = pd.read_csv(PATH_RMAP_CH)

# Step 1 towards computing performance on pooled data: rename the channel columns to canonical names

SELECT_ALL = False
subs = df_rmap_ch["sub"].unique()

if SELECT_ALL:

    dfs_save = []
    cols = None
    for sub in subs:
        print(sub)
        for hem in ["l", "r"]:
            if not os.path.exists(os.path.join(PATH_DATA, f"{sub}{hem}_merged.csv")):
                continue
            df_sub = pd.read_csv(os.path.join(PATH_DATA, f"{sub}{hem}_merged.csv"))
            ch_cortex, ch_subcortex = get_most_recorded_chs(df_sub)
            ch_used = ch_subcortex + ch_cortex
            # select all columns of df_sub that contain the ch_used
            col_used = [c for c in df_sub.columns if any([ch in c for ch in ch_used]) and "coh" not in c] + list(df_sub.columns[-4:])
            df_sub_used = df_sub[col_used]
            df_sub_orig = df_sub_used.copy()
            ch_names_replace = ["ch_subcortex_1", "ch_subcortex_2", "ch_cortex_1", "ch_cortex_2"]

            # for each column replace the ch_used with the ch_names_replace
            for ch, ch_name in zip(ch_used, ch_names_replace):
                df_sub_used.columns = [c.replace(ch+"_", ch_name+"_") for c in df_sub_used.columns]
            if cols is None:
                cols = df_sub_used.columns
            else:
                if not np.all(cols == df_sub_used.columns):
                    df_sub_used = df_sub_used[cols]

            df_sub_used["sub"] = sub+hem
            dfs_save.append(df_sub_used)
            print(df_sub_used.columns.shape)
            if df_sub_used.columns.shape[0] != 2725:
                print("break here")
            # df_sub_used can be saved

    df_all = pd.concat(dfs_save)
    df_all.to_csv(os.path.join(PATH_OUT, "all_ch_renamed_no_rmap.csv"))


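# For each classification setting and PKG label, keep only the R-map-selected channel
# per subject and hemisphere, rename its columns to canonical names, and write one
# pooled CSV per (label, classification) combination.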
for CLASSIFICATION in [True, False]:
    print(CLASSIFICATION)
    for label in ["pkg_dk", "pkg_bk", "pkg_tremor"]:
        print(label)
        df_l = []
        cols = None
        for sub in subs:
            print(sub)
            for hem in ["l", "r"]:
                if not os.path.exists(os.path.join(PATH_DATA, f"{sub}{hem}_merged.csv")):
                    continue
                df_sub = pd.read_csv(os.path.join(PATH_DATA, f"{sub}{hem}_merged.csv"))

                ch_sel = list(df_rmap_ch.query(f"sub == '{sub}' and hemisphere == '{hem}' and label == '{label}' and CLASSIFICATION == {CLASSIFICATION}")["ch"])
                # sort the channels by contact number
                ch_sel = sorted(ch_sel, key=lambda x: int(x.split('-')[0]))
                # select only the columns that belong to a channel in ch_sel
                col_used = [c for c in df_sub.columns if any([ch in c for ch in ch_sel]) and "coh" not in c] + list(df_sub.columns[-4:])
                df_sub_sel = df_sub[col_used]
                ch_replace = ["ch_subcortex_1", "ch_cortex_1"]
                for ch, ch_name in zip(ch_sel, ch_replace):
                    df_sub_sel.columns = [c.replace(ch+"_", ch_name+"_") for c in df_sub_sel.columns]
                if cols is None:
                    cols = df_sub_sel.columns
                else:
                    if len(cols) != len(df_sub_sel.columns):
                        continue
                    if not np.all(cols == df_sub_sel.columns):
                        try:
                            df_sub_sel = df_sub_sel[cols]
                        except KeyError:
                            continue
                df_sub_sel["sub"] = sub+hem
                df_l.append(df_sub_sel)
        df_rmap_cond = pd.concat(df_l)
        # remove entries where the label is NaN
        df_rmap_cond = df_rmap_cond[~df_rmap_cond[label].isna()]
        df_rmap_cond.to_csv(os.path.join(PATH_OUT, f"rmap_ch_{label}_class_{CLASSIFICATION}.csv"))
15 changes: 9 additions & 6 deletions figure_27_get_ch_ind_per_all_ch.py
@@ -42,12 +42,15 @@ def run_channel(sub, ch, ch_idx):
         X = X[~idx_nan]
         y = y[np.array(~idx_nan)]
 
-        y_pr = model_selection.cross_val_predict(
-            model,
-            X,
-            y,
-            cv=model_selection.KFold(n_splits=3, shuffle=False),
-        )
+        try:
+            y_pr = model_selection.cross_val_predict(
+                model,
+                X,
+                y,
+                cv=model_selection.KFold(n_splits=3, shuffle=False),
+            )
+        except:
+            continue
 
 
         if CLASSIFICATION: