diff --git a/src/ageml/ui.py b/src/ageml/ui.py index 0f22fb4..fbb59d7 100644 --- a/src/ageml/ui.py +++ b/src/ageml/ui.py @@ -44,6 +44,8 @@ class Interface: --------------- setup(self): Creates required directories and files to store results. + command_setup(self, dir_path): Create required directories and files to store results for command. + set_flags(self): Set flags. set_visualizer(self): Set visualizer with output directory. @@ -52,6 +54,8 @@ class Interface: set_classifier(self): Set classifier with parameters. + update_params(self): Update initial parameters after load. + check_file(self, file): Check that file exists. load_csv(self, file): Use panda to load csv into dataframe. @@ -117,13 +121,11 @@ def __init__(self, args): def setup(self): """Create required directories and files to store results.""" - # Create directories + # Create directory self.dir_path = os.path.join(self.args.output, "ageml") if os.path.exists(self.dir_path): - warnings.warn( - "Directory %s already exists files may be overwritten." % self.dir_path, - category=UserWarning, - ) + warnings.warn("Directory %s already exists files may be overwritten." % self.dir_path, + category=UserWarning) create_directory(self.dir_path) # Create .txt log file and log time @@ -132,10 +134,32 @@ def setup(self): current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") f.write(current_time + "\n") + def command_setup(self, dir_path): + """Create required directories and files to store results for command. + + Parameters + ---------- + dir_path: directory path to create""" + + # Create directory + command_dir = os.path.join(self.dir_path, dir_path) + create_directory(command_dir) + self.set_visualizer(command_dir) + + # Reset flags + self.set_flags() + + # Set initial parameters for model to defaults + self.naming = "" + self.subject_types = ['cn'] + self.covars = ['all'] + self.systems = ['all'] + def set_flags(self): """Set flags.""" - self.flags = {"clinical": False, "covariates": False, "covarname": False, "systems": False} + self.flags = {"clinical": False, "covariates": False, "covarname": False, + "systems": False, "ages": False} def set_visualizer(self, dir): """Set visualizer with output directory.""" @@ -168,6 +192,21 @@ def generate_classifier(self): return classifier + def update_params(self): + """Update initial parameters after load.""" + + # Check possible flags of interest + if self.flags['clinical']: + self.subject_types = self.df_clinical.columns.to_list() + if self.flags['covarname']: + self.covars = pd.unique(self.df_covariates[self.args.covar_name]).tolist() + self.naming += f"_{self.args.covar_name}" + if self.flags['systems']: + self.systems = list(self.dict_systems.keys()) + self.naming += "_multisystem" + if self.flags['ages']: + self.systems = [col[6:] for col in self.df_ages.columns if "delta" in col] + def check_file(self, file): """Check that file exists.""" if not os.path.exists(file): @@ -331,6 +370,7 @@ def load_ages(self, required=False): return df # Required columns + self.flags['ages'] = True req_cols = ["age", "predicted_age", "corrected_age", "delta"] cols = [col.lower() for col in df.columns.to_list()] @@ -512,6 +552,9 @@ def load_data(self, required=None): index = self.df_ages.index self.df_clinical = pd.DataFrame(index=index, columns=['cn'], data=True) + # Update initial parameters after load + self.update_params() + def age_distribution(self, ages_dict: dict, name=""): """Use visualizer to show age distribution. @@ -818,50 +861,31 @@ def run_age(self): print("Running age modelling...") # Set up directory - command_dir = os.path.join(self.dir_path, "model_age") - create_directory(command_dir) - self.set_visualizer(command_dir) - - # Reset flags - self.set_flags() - - # Set initial parameters for model to defaults - naming = "" - subject_types = ['cn'] - covars = ['all'] - systems = ['all'] + self.command_setup('model_age') # Load data self.load_data(required=["features"]) - # Check possible flags of interest - if self.flags['clinical']: - subject_types = self.df_clinical.columns.to_list() - if self.flags['covarname']: - covars = pd.unique(self.df_covariates[self.args.covar_name]).tolist() - naming += f"_{self.args.covar_name}" - if self.flags['systems']: - systems = list(self.dict_systems.keys()) - naming += "_multisystem" - # Initialized dictionaries - dfs = {subject_type: {covar: {system: {} for system in systems} for covar in covars} for subject_type in subject_types} - preds = {subject_type: {covar: {system: {} for system in systems} for covar in covars} for subject_type in subject_types} - models = {covar: {system: {} for system in systems} for covar in covars} - betas = {covar: {system: {} for system in systems} for covar in covars} + dfs = {subject_type: {covar: {system: {} for system in self.systems} + for covar in self.covars} for subject_type in self.subject_types} + preds = {subject_type: {covar: {system: {} for system in self.systems} + for covar in self.covars} for subject_type in self.subject_types} + models = {covar: {system: {} for system in self.systems} for covar in self.covars} + betas = {covar: {system: {} for system in self.systems} for covar in self.covars} # Obtain dataframes for each subject type, covariate and system - for subject_type in subject_types: + for subject_type in self.subject_types: # Keep only the subjects of the specified type df_sub = self.df_features[self.df_clinical[subject_type]] - for covar in covars: + for covar in self.covars: # Keep subjects with the specified covariate if self.flags['covarname']: covar_index = set(self.df_covariates[self.df_covariates[self.args.covar_name] == covar].index) df_cov = df_sub[df_sub.index.isin(covar_index)] else: df_cov = df_sub - for system in systems: + for system in self.systems: # Keep only the features of the system if self.flags['systems']: df_sys = df_cov[['age'] + self.dict_systems[system]] @@ -871,17 +895,17 @@ def run_age(self): dfs[subject_type][covar][system] = df_sys # Use visualizer to show age distribution of controls per covariate (all systems share the age distribution) - cn_ages = {covar: dfs['cn'][covar][systems[0]]['age'].to_list() for covar in covars} - self.age_distribution(cn_ages, name="controls" + naming) + cn_ages = {covar: dfs['cn'][covar][self.systems[0]]['age'].to_list() for covar in self.covars} + self.age_distribution(cn_ages, name="controls" + self.naming) # Show features vs age for controls for each system - for system in systems: - cn_features = {covar: dfs['cn'][covar][system] for covar in covars} - self.features_vs_age(cn_features, name="controls" + naming + "_" + system) + for system in self.systems: + cn_features = {covar: dfs['cn'][covar][system] for covar in self.covars} + self.features_vs_age(cn_features, name="controls" + self.naming + "_" + system) # Model age for each system on controls - for covar in covars: - for system in systems: + for covar in self.covars: + for system in self.systems: model_name = f"{covar}_{system}" ageml_model = self.generate_model() models[covar][system], df_pred, betas[covar][system] = self.model_age(dfs['cn'][covar][system], @@ -891,12 +915,12 @@ def run_age(self): preds['cn'][covar][system] = df_pred # Apply to all other subject types - for subject_type in subject_types: + for subject_type in self.subject_types: # Do not apply to controls if subject_type == 'cn': continue - for covar in covars: - for system in systems: + for covar in self.covars: + for system in self.systems: model_name = f"{covar}_{system}" df_pred = self.predict_age(dfs[subject_type][covar][system], models[covar][system], betas[covar][system], model_name=model_name) @@ -906,9 +930,9 @@ def run_age(self): # Concatenate predictions into a DataFrame stack = [] - for subject_type in subject_types: - for covar in covars: - df_systems = pd.concat([preds[subject_type][covar][system] for system in systems], axis=1) + for subject_type in self.subject_types: + for covar in self.covars: + df_systems = pd.concat([preds[subject_type][covar][system] for system in self.systems], axis=1) stack.append(df_systems) df_ages = pd.concat(stack, axis=0) @@ -919,7 +943,7 @@ def run_age(self): df_ages = pd.concat([self.df_features['age'], df_ages], axis=1) # Save dataframe to csv - filename = "predicted_age" + naming + ".csv" + filename = "predicted_age" + self.naming + ".csv" df_ages.to_csv(os.path.join(self.dir_path, filename)) def run_factor_correlation(self): @@ -927,33 +951,18 @@ def run_factor_correlation(self): print("Running factors correlation analysis...") - # Set up directory - command_dir = os.path.join(self.dir_path, "factor_correlation") - create_directory(command_dir) - self.set_visualizer(command_dir) - - # Reset flags - self.set_flags() - - # Initial parameters - subject_types = ['cn'] + # Set up + self.command_setup('factor_correlation') # Load data self.load_data(required=["ages", "factors"]) - # Check possible flags of interest - if self.flags['clinical']: - subject_types = self.df_clinical.columns.to_list() - - # Obtain systems - systems = [col[6:] for col in self.df_ages.columns if "delta" in col] - - # For each subject type - for subject_type in subject_types: + # For each subject type and system run correlation analysis + for subject_type in self.subject_types: dfs_systems = {} df_sub = self.df_ages.loc[self.df_clinical[subject_type]] df_factors = self.df_factors.loc[df_sub.index] - for system in systems: + for system in self.systems: df_sys = df_sub[[col for col in df_sub.columns if system in col]] dfs_systems[system] = df_sys self.factors_vs_deltas(dfs_systems, df_factors, subject_type) @@ -963,29 +972,22 @@ def run_clinical(self): print("Running clinical outcomes...") - # Set up directory - command_dir = os.path.join(self.dir_path, "clinical_groups") - create_directory(command_dir) - self.set_visualizer(command_dir) - - # Reset flags - self.set_flags() + # Set up + self.command_setup('clinical_groups') # Load data self.load_data(required=["ages", "clinical"]) # Obtain dataframes for each group - groups = self.df_clinical.columns.to_list() - dfs = {g: self.df_ages.loc[self.df_clinical[g]] for g in groups} + dfs = {g: self.df_ages.loc[self.df_clinical[g]] for g in self.subject_types} # Use visualizer to show age distribution per clinical group - ages = {g: dfs[g].iloc[:, 0].to_list() for g in groups} + ages = {g: dfs[g].iloc[:, 0].to_list() for g in self.subject_types} self.age_distribution(ages, name="clinical_groups") # Show differences in groups per system - systems = [col[6:] for col in self.df_ages.columns if "delta" in col] - for system in systems: - dfs_systems = {g: dfs[g][[col for col in dfs[g].columns if system in col]] for g in groups} + for system in self.systems: + dfs_systems = {g: dfs[g][[col for col in dfs[g].columns if system in col]] for g in self.subject_types} self.deltas_by_group(dfs_systems, system=system) def run_classification(self): @@ -993,13 +995,8 @@ def run_classification(self): print("Running classification...") - # Set up directory - command_dir = os.path.join(self.dir_path, "clinical_classify") - create_directory(command_dir) - self.set_visualizer(command_dir) - - # Reset flags - self.set_flags() + # Set up + self.command_setup('clinical_classify') # Load data self.load_data(required=["ages", "clinical"]) @@ -1009,22 +1006,18 @@ def run_classification(self): raise ValueError("Must provide two groups to classify.") elif self.args.group1 not in self.df_clinical.columns or self.args.group2 not in self.df_clinical.columns: raise ValueError("Classes must be one of the following: %s" % self.df_clinical.columns.to_list()) - - # Obtain dataframes for each clinical group - df_group1 = self.df_ages[self.df_clinical[self.args.group1]] - df_group2 = self.df_ages[self.df_clinical[self.args.group2]] - - # Obtain systems - systems = [col[6:] for col in self.df_ages.columns if "delta" in col] + else: + df_group1 = self.df_ages[self.df_clinical[self.args.group1]] + df_group2 = self.df_ages[self.df_clinical[self.args.group2]] # Create a classifier for each system - for system in systems: + for system in self.systems: df_group1_system = df_group1[[col for col in df_group1.columns if system in col]] df_group2_system = df_group2[[col for col in df_group2.columns if system in col]] self.classify(df_group1_system, df_group2_system, [self.args.group1, self.args.group2], system=system) # Create a classifier for all systems - if len(systems) > 1: + if len(self.systems) > 1: self.classify(df_group1, df_group2, [self.args.group1, self.args.group2], system="all")