From c2b24a39aa2291a99d6fbd24e5563b089353ddb0 Mon Sep 17 00:00:00 2001
From: dPys
Date: Tue, 22 Sep 2020 01:25:12 -0500
Subject: [PATCH] [ENH] Update prediction to accommodate 0 smoothing metaparam

---
 pynets/core/utils.py       |   1 +
 pynets/core/workflows.py   |   3 +-
 pynets/runconfig.yaml      |   2 +-
 pynets/stats/prediction.py | 104 ++++++++++++++++++++++++-------------
 4 files changed, 70 insertions(+), 40 deletions(-)

diff --git a/pynets/core/utils.py b/pynets/core/utils.py
index c4784610..1483bd5c 100644
--- a/pynets/core/utils.py
+++ b/pynets/core/utils.py
@@ -1822,6 +1822,7 @@ def filter_cols_from_targets(df, targets):
                 ''.join(
                     expr.format(w) for w in targets)))]
 
+    return out
 
diff --git a/pynets/core/workflows.py b/pynets/core/workflows.py
index c04f106b..b1f6d935 100644
--- a/pynets/core/workflows.py
+++ b/pynets/core/workflows.py
@@ -4646,8 +4646,7 @@ def fmri_connectometry(
                 ),
                 (check_orient_and_dims_roi_node,
                  register_roi_node,
-                 [("outfile",
-                   "roi")],
+                 [("outfile", "roi")],
                 ),
             ])
 
diff --git a/pynets/runconfig.yaml b/pynets/runconfig.yaml
index f88cae7f..d083481d 100755
--- a/pynets/runconfig.yaml
+++ b/pynets/runconfig.yaml
@@ -253,7 +253,7 @@ execution_dict:  # Nipype workflow global settings
 - 'keep_inputs':
     - False
 - 'remove_unnecessary_outputs':
-    - True
+    - False
 - 'remove_node_directories':
     - False
 - 'raise_insufficient':
diff --git a/pynets/stats/prediction.py b/pynets/stats/prediction.py
index c8dae5a4..3b9f93a2 100644
--- a/pynets/stats/prediction.py
+++ b/pynets/stats/prediction.py
@@ -684,7 +684,8 @@ def nested_fit(X, y, regressors, boot, pca_reduce, k_folds,
     if pca_reduce is True and X.shape[0] < X.shape[1]:
         est.alpha = float(best_regressor.split("alpha-")[1].split('_')[0])
         pca = decomposition.PCA(
-            n_components=int(best_regressor.split("nfeatures-")[1].split('_')[0]), whiten=True
+            n_components=int(best_regressor.split("nfeatures-")[1].split('_')[0]),
+            whiten=True
         )
         reg = Pipeline([("feature_select", pca),
                         (best_regressor.split("regressor-")[1].split('_')[0], est)])
@@ -714,11 +715,14 @@ def build_grid(modality, hyperparam_dict, hyperparams, ensembles):
             print(f"Failed to parse ensemble {ensemble}...")
 
     if "rsn" in hyperparam_dict.keys():
-        hyperparam_dict["rsn"] = [i for i in hyperparam_dict["rsn"] if "res" not in i]
+        hyperparam_dict["rsn"] = [i for i in hyperparam_dict["rsn"] if "res"
+                                  not in i]
 
-    hyperparam_dict = OrderedDict(sorted(hyperparam_dict.items(), key=lambda x: x[0]))
+    hyperparam_dict = OrderedDict(sorted(hyperparam_dict.items(),
+                                         key=lambda x: x[0]))
     grid = list(
-        itertools.product(*(hyperparam_dict[param] for param in hyperparam_dict.keys()))
+        itertools.product(*(hyperparam_dict[param] for param in
+                            hyperparam_dict.keys()))
     )
 
     return hyperparam_dict, grid
@@ -732,11 +736,14 @@ def get_coords_labels(embedding):
     return coords_file, labels_file
 
-def flatten_latent_positions(rsn, subject_dict, ID, ses, modality, grid_param, alg):
+def flatten_latent_positions(rsn, subject_dict, ID, ses, modality, grid_param,
+                             alg):
     import pickle
 
-    if ((rsn,) + grid_param) in subject_dict[ID][str(ses)][modality][alg].keys():
-        rsn_dict = subject_dict[ID][str(ses)][modality][alg][((rsn,) + grid_param)]
+    if ((rsn,) + grid_param) in \
+            subject_dict[ID][str(ses)][modality][alg].keys():
+        rsn_dict = subject_dict[ID][str(ses)][modality][alg][((rsn,) +
+                                                              grid_param)]
         if not isinstance(rsn_dict["coords"], list):
             if os.path.isfile(rsn_dict["coords"]):
                 with open(rsn_dict["coords"], "rb") as file_:
@@ -755,7 +762,8 @@ def flatten_latent_positions(rsn, subject_dict, ID, ses, modality, grid_param, a
                 1, rsn_dict["data"].T.shape[0] * rsn_dict["data"].T.shape[1]
             )
             if rsn_dict["data"].shape[1] == 1:
-                df_lps = pd.DataFrame(rsn_arr, columns=[f"{rsn}_{i}_dim1" for i in ixs])
+                df_lps = pd.DataFrame(rsn_arr, columns=[f"{rsn}_{i}_dim1"
+                                                        for i in ixs])
             elif rsn_dict["data"].shape[1] == 3:
                 df_lps = pd.DataFrame(
                     rsn_arr,
@@ -765,7 +773,8 @@ def flatten_latent_positions(rsn, subject_dict, ID, ses, modality, grid_param, a
                 )
             else:
                 raise ValueError(
-                    f"Number of dimensions {rsn_dict['data'].shape[1]} not supported. See flatten_latent_positions function..."
+                    f"Number of dimensions {rsn_dict['data'].shape[1]} "
+                    f"not supported. See flatten_latent_positions function..."
                 )
             # print(df_lps)
         else:
@@ -1033,7 +1042,9 @@ def bootstrapped_nested_cv(
         except:
             pass
 
-    if X.empty:
+    if X.empty or len(X.columns) < 5:
+        print("Low feature count. Setting performance on this feature-space "
+              "to NA...")
         return (
             grand_mean_best_estimator,
             grand_mean_best_Rsquared,
@@ -1047,9 +1058,6 @@ def bootstrapped_nested_cv(
     # y = pd.DataFrame(scaler.fit_transform(np.array(y).reshape(-1, 1)))
     y = pd.DataFrame(np.array(y).reshape(-1, 1))
 
-    if X.shape[1] < 5:
-        print(f"Low feature count: {X.shape[1]}")
-
     print(f"\nX: {X}\ny: {y}\n")
 
     # Bootstrap nested CV's "simulates" the variability of incoming data,
@@ -1219,6 +1227,8 @@ def make_subject_dict(
         if os.path.basename(i).startswith("sub")
     ]
 
+    ids = [i for i in ids if 's030' not in i]
+
     if alg == "ASE" or alg == "OMNI" or alg == "vectorize":
         ensembles = get_ensembles_embedding(modality, alg, base_dir)
 
@@ -1446,12 +1456,14 @@ def populate_subject_dict(
                     elif len(embeddings) == 1:
                         embedding = embeddings[0]
                     else:
-                        embeddings_raw = [i for i in embeddings if "thrtype" not in i]
+                        embeddings_raw = [i for i in embeddings if "thrtype"
+                                          not in i]
                         if len(embeddings_raw) == 1:
                             embedding = embeddings[0]
                         elif len(embeddings_raw) > 1:
-                            sorted_embeddings = sorted(embeddings_raw, key=os.path.getmtime)
+                            sorted_embeddings = sorted(embeddings_raw,
+                                                       key=os.path.getmtime)
                             print(
                                 f"Multiple functional embeddings found for {id} and"
                                 f" recipe {comb_tuple}:\n{embeddings}\nTaking the most"
@@ -1500,7 +1512,7 @@ def populate_subject_dict(
                 elif alg == "topology":
                     data = np.empty([len(mets), 1], dtype=np.float32)
                     data[:] = np.nan
-                    if smooth == 0:
+                    if smooth == '0':
                         targets = [
                             f"extract-{extract}",
                             f"hpass-{hpass}Hz",
@@ -1528,7 +1540,7 @@ def populate_subject_dict(
                         if len(col_met) == 1:
                             col = col_met[0]
                         elif len(col_met) > 1:
-                            if comb_tuple[-1] == 0:
+                            if comb_tuple[-1] == '0':
                                 col = [i for i in col_met if "fwhm" not in i][0]
                             else:
                                 print(f"Multiple columns detected: {col_met}")
@@ -1620,12 +1632,14 @@ def populate_subject_dict(
                     elif len(embeddings) == 1:
                         embedding = embeddings[0]
                     else:
-                        embeddings_raw = [i for i in embeddings if "thrtype" not in i]
+                        embeddings_raw = [i for i in embeddings if "thrtype"
+                                          not in i]
                         if len(embeddings_raw) == 1:
                             embedding = embeddings[0]
                         elif len(embeddings_raw) > 1:
-                            sorted_embeddings = sorted(embeddings_raw, key=os.path.getmtime)
+                            sorted_embeddings = sorted(embeddings_raw,
+                                                       key=os.path.getmtime)
                             print(
                                 f"Multiple functional embeddings found for {id} and"
                                 f" recipe {comb_tuple}:\n{embeddings}\nTaking the most"
                             )
                             embedding = sorted_embeddings[0]
                         else:
-                            sorted_embeddings = sorted(embeddings, key=os.path.getmtime)
+                            sorted_embeddings = sorted(embeddings,
+                                                       key=os.path.getmtime)
                             print(
                                 f"Multiple functional embeddings found for {id} and"
                                 f" recipe {comb_tuple}:\n{embeddings}\nTaking the most"
@@ -1831,7 +1846,8 @@ def make_x_y(input_dict, drop_cols, target_var, alg, grid_param):
             df_all = pd.Series()
         else:
             df_all.drop(columns=["id"], inplace=True)
-            if len(df_all.columns) < 10:
+            if len(df_all.columns) < 5:
+                print(f"Too few columns detected for {grid_param}...")
                 df_all = pd.Series()
     except:
         df_all = pd.Series()
@@ -1861,7 +1877,8 @@ def concatenate_frames(out_dir, modality, alg, target_var, files_):
             except BaseException:
                 pass
             dfs.append(df)
-        frame = pd.concat(dfs, axis=0, join="outer", sort=True, ignore_index=False)
+        frame = pd.concat(dfs, axis=0, join="outer", sort=True,
+                          ignore_index=False)
         out_path = f"{out_dir}/final_df_{modality}_{alg}_{target_var}.csv"
         print(f"Saving to {out_path}...")
         if os.path.isfile(out_path):
@@ -1925,9 +1942,11 @@ def _run_interface(self, runtime):
                 self.inputs.target_var == "rum_1":
             drop_cols = [self.inputs.target_var, "dep_persist", "rum_persist"]
         elif self.inputs.target_var == "dep_2":
-            drop_cols = [self.inputs.target_var, "dep_persist", "rum_persist", "rum_2"]
+            drop_cols = [self.inputs.target_var, "dep_persist", "rum_persist",
+                         "rum_2"]
         elif self.inputs.target_var == "rum_2":
-            drop_cols = [self.inputs.target_var, "dep_persist", "rum_persist", "dep_2"]
+            drop_cols = [self.inputs.target_var, "dep_persist", "rum_persist",
+                         "dep_2"]
         else:
             drop_cols = [self.inputs.target_var, "rum_persist", "dep_persist",
                          "dep_1", "rum_1"]
@@ -1941,7 +1960,11 @@ def _run_interface(self, runtime):
         )
 
         if isinstance(X, pd.DataFrame):
-            out_X = f"{runtime.cwd}/X_{self.inputs.target_var}_{self.inputs.modality}_{self.inputs.alg}_" f"{'_'.join(str(v) for v in self.inputs.grid_param)}.csv"
+            out_X = f"{runtime.cwd}/X_" \
+                    f"{self.inputs.target_var}_" \
+                    f"{self.inputs.modality}_" \
+                    f"{self.inputs.alg}_" \
+                    f"{'_'.join(str(v) for v in self.inputs.grid_param)}.csv"
             if os.path.isfile(out_X):
                 os.remove(out_X)
@@ -2156,7 +2179,11 @@ def _run_interface(self, runtime):
             df_summary.at[0, "MSE"] = np.nan
             df_summary.at[0, "lp_importance"] = np.nan
 
-        out_df_summary = f"{runtime.cwd}/df_summary_{self.inputs.target_var}_{self.inputs.modality}_{self.inputs.alg}_{'_'.join(str(v) for v in self.inputs.grid_param)}.csv"
+        out_df_summary = f"{runtime.cwd}/df_summary_" \
+                         f"{self.inputs.target_var}_" \
+                         f"{self.inputs.modality}_" \
+                         f"{self.inputs.alg}_" \
+                         f"{'_'.join(str(v) for v in self.inputs.grid_param)}.csv"
         if os.path.isfile(out_df_summary):
             os.remove(out_df_summary)
         df_summary.to_csv(out_df_summary, index=False)
@@ -2331,7 +2358,7 @@ def build_predict_workflow(args, retval):
     modality = args["modality"]
 
     run_uuid = f"{strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4()}"
-    ml_meta_wf = pe.Workflow(name="pynets_multiperform")
+    ml_meta_wf = pe.Workflow(name="pynets_multipredict")
     ml_meta_wf.base_dir = f"{base_dir}/pynets_multiperform_{run_uuid}"
     os.makedirs(ml_meta_wf.base_dir, exist_ok=True)
 
@@ -2424,7 +2451,7 @@ def build_predict_workflow(args, retval):
     )
     execution_dict = {}
     execution_dict["crashdump_dir"] = str(ml_meta_wf.base_dir)
-    execution_dict["poll_sleep_duration"] = 1
+    execution_dict["poll_sleep_duration"] = 0.5
     execution_dict["crashfile_format"] = "txt"
     execution_dict["local_hash_check"] = False
     execution_dict["stop_on_first_crash"] = False
@@ -2463,7 +2490,8 @@ def main():
     )
 
     # Hard-Coded #
-    embedding_types = ['OMNI', 'ASE', 'topology']
+    #embedding_types = ['OMNI', 'ASE', 'topology']
+    embedding_types = ['topology']
     #modalities = ["func", "dwi"]
     modalities = ["func"]
     thr_type = "MST"
@@ -2482,7 +2510,9 @@ def main():
 
     # User-Specified #
     target_modality = 'func'
-    target_vars = ["rum_2", "dep_2"]
["rum_2", "dep_2"] + #target_vars = ["rum_2", "dep_2"] + target_vars = ["rum_persist", "dep_persist", "rum_2", "dep_2", "dep_1", + "rum_1"] sessions = ["1"] subject_dict_file_path = ( @@ -2503,10 +2533,13 @@ def main(): sessions ) sub_dict_clean = cleanNullTerms(subject_dict) - final_missingness_summary = pd.concat( - [i for i in missingness_frames if isinstance(i, pd.DataFrame)] - ) - final_missingness_summary.to_csv(missingness_summary, index=False) + missingness_frames = [i for i in missingness_frames if isinstance(i, pd.DataFrame)] + if len(missingness_frames) != 0: + if len(missingness_frames) > 1: + final_missingness_summary = pd.concat(missingness_frames) + elif len(missingness_frames) == 1: + final_missingness_summary = missingness_frames[0] + final_missingness_summary.to_csv(missingness_summary, index=False) with open(subject_dict_file_path, "wb") as f: dill.dump(sub_dict_clean, f) f.close() @@ -2576,9 +2609,6 @@ def main(): # with open('pynets_ml_dict_func_topology.pkl', "rb") as f: # ml_dfs = dill.load(f) # f.close() - # with open('pynets_ml_dict_dwi_topology_bak.pkl', "rb") as f: - # ml_dfs = dill.load(f) - # f.close() tables = list(itertools.product(modalities, embedding_types))