Skip to content

Commit

Permalink
[ENH] Create setup and update params functions
Browse files Browse the repository at this point in the history
  • Loading branch information
JGarciaCondado committed Mar 7, 2024
1 parent 22f1d1d commit 963c8da
Showing 1 changed file with 91 additions and 98 deletions.
189 changes: 91 additions & 98 deletions src/ageml/ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ class Interface:
---------------
setup(self): Creates required directories and files to store results.
command_setup(self, dir_path): Create required directories and files to store results for command.
set_flags(self): Set flags.
set_visualizer(self): Set visualizer with output directory.
Expand All @@ -52,6 +54,8 @@ class Interface:
set_classifier(self): Set classifier with parameters.
update_params(self): Update initial parameters after load.
check_file(self, file): Check that file exists.
load_csv(self, file): Use pandas to load a CSV file into a DataFrame.
Expand Down Expand Up @@ -117,13 +121,11 @@ def __init__(self, args):
def setup(self):
"""Create required directories and files to store results."""

# Create directories
# Create directory
self.dir_path = os.path.join(self.args.output, "ageml")
if os.path.exists(self.dir_path):
warnings.warn(
"Directory %s already exists files may be overwritten." % self.dir_path,
category=UserWarning,
)
warnings.warn("Directory %s already exists files may be overwritten." % self.dir_path,
category=UserWarning)
create_directory(self.dir_path)

# Create .txt log file and log time
Expand All @@ -132,10 +134,32 @@ def setup(self):
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
f.write(current_time + "\n")

def command_setup(self, dir_path):
    """Prepare the output directory and default state for a single command.

    Parameters
    ----------
    dir_path: name of the sub-directory to create inside self.dir_path
    """

    # The command's output folder lives under the main results directory
    target_dir = os.path.join(self.dir_path, dir_path)
    create_directory(target_dir)
    self.set_visualizer(target_dir)

    # Start every command from a clean set of flags
    self.set_flags()

    # Defaults that stay in place unless update_params overrides them
    self.naming, self.subject_types = "", ['cn']
    self.covars, self.systems = ['all'], ['all']

def set_flags(self):
    """Reset every entry of self.flags to False.

    The dictionary is rebuilt from scratch on each call, so any flag that
    was switched on earlier (for example 'ages') is cleared again.
    """

    # Single assignment: a previous four-key dict assigned just before this
    # one was immediately overwritten and has been dropped as dead code.
    self.flags = {"clinical": False, "covariates": False, "covarname": False,
                  "systems": False, "ages": False}

def set_visualizer(self, dir):
"""Set visualizer with output directory."""
Expand Down Expand Up @@ -168,6 +192,21 @@ def generate_classifier(self):

return classifier

def update_params(self):
    """Refresh subject types, covariates, systems and naming from the flags.

    Each flag that is set replaces the matching default with values taken
    from the corresponding loaded data; unset flags leave the current
    values untouched.
    """

    flags = self.flags

    # Clinical groups are the columns of the clinical dataframe
    if flags['clinical']:
        self.subject_types = self.df_clinical.columns.to_list()

    # One entry per distinct value of the chosen covariate column
    if flags['covarname']:
        covar_column = self.df_covariates[self.args.covar_name]
        self.covars = pd.unique(covar_column).tolist()
        self.naming += f"_{self.args.covar_name}"

    # Systems are the keys of the systems dictionary
    if flags['systems']:
        self.systems = list(self.dict_systems.keys())
        self.naming += "_multisystem"

    # With ages available, systems come from the delta column names;
    # the first six characters of each such column are dropped.
    if flags['ages']:
        delta_systems = []
        for column in self.df_ages.columns:
            if "delta" in column:
                delta_systems.append(column[6:])
        self.systems = delta_systems

def check_file(self, file):
"""Check that file exists."""
if not os.path.exists(file):
Expand Down Expand Up @@ -331,6 +370,7 @@ def load_ages(self, required=False):
return df

# Required columns
self.flags['ages'] = True
req_cols = ["age", "predicted_age", "corrected_age", "delta"]
cols = [col.lower() for col in df.columns.to_list()]

Expand Down Expand Up @@ -512,6 +552,9 @@ def load_data(self, required=None):
index = self.df_ages.index
self.df_clinical = pd.DataFrame(index=index, columns=['cn'], data=True)

# Update initial parameters after load
self.update_params()

def age_distribution(self, ages_dict: dict, name=""):
"""Use visualizer to show age distribution.
Expand Down Expand Up @@ -818,50 +861,31 @@ def run_age(self):
print("Running age modelling...")

# Set up directory
command_dir = os.path.join(self.dir_path, "model_age")
create_directory(command_dir)
self.set_visualizer(command_dir)

# Reset flags
self.set_flags()

# Set initial parameters for model to defaults
naming = ""
subject_types = ['cn']
covars = ['all']
systems = ['all']
self.command_setup('model_age')

# Load data
self.load_data(required=["features"])

# Check possible flags of interest
if self.flags['clinical']:
subject_types = self.df_clinical.columns.to_list()
if self.flags['covarname']:
covars = pd.unique(self.df_covariates[self.args.covar_name]).tolist()
naming += f"_{self.args.covar_name}"
if self.flags['systems']:
systems = list(self.dict_systems.keys())
naming += "_multisystem"

# Initialized dictionaries
dfs = {subject_type: {covar: {system: {} for system in systems} for covar in covars} for subject_type in subject_types}
preds = {subject_type: {covar: {system: {} for system in systems} for covar in covars} for subject_type in subject_types}
models = {covar: {system: {} for system in systems} for covar in covars}
betas = {covar: {system: {} for system in systems} for covar in covars}
dfs = {subject_type: {covar: {system: {} for system in self.systems}
for covar in self.covars} for subject_type in self.subject_types}
preds = {subject_type: {covar: {system: {} for system in self.systems}
for covar in self.covars} for subject_type in self.subject_types}
models = {covar: {system: {} for system in self.systems} for covar in self.covars}
betas = {covar: {system: {} for system in self.systems} for covar in self.covars}

# Obtain dataframes for each subject type, covariate and system
for subject_type in subject_types:
for subject_type in self.subject_types:
# Keep only the subjects of the specified type
df_sub = self.df_features[self.df_clinical[subject_type]]
for covar in covars:
for covar in self.covars:
# Keep subjects with the specified covariate
if self.flags['covarname']:
covar_index = set(self.df_covariates[self.df_covariates[self.args.covar_name] == covar].index)
df_cov = df_sub[df_sub.index.isin(covar_index)]
else:
df_cov = df_sub
for system in systems:
for system in self.systems:
# Keep only the features of the system
if self.flags['systems']:
df_sys = df_cov[['age'] + self.dict_systems[system]]
Expand All @@ -871,17 +895,17 @@ def run_age(self):
dfs[subject_type][covar][system] = df_sys

# Use visualizer to show age distribution of controls per covariate (all systems share the age distribution)
cn_ages = {covar: dfs['cn'][covar][systems[0]]['age'].to_list() for covar in covars}
self.age_distribution(cn_ages, name="controls" + naming)
cn_ages = {covar: dfs['cn'][covar][self.systems[0]]['age'].to_list() for covar in self.covars}
self.age_distribution(cn_ages, name="controls" + self.naming)

# Show features vs age for controls for each system
for system in systems:
cn_features = {covar: dfs['cn'][covar][system] for covar in covars}
self.features_vs_age(cn_features, name="controls" + naming + "_" + system)
for system in self.systems:
cn_features = {covar: dfs['cn'][covar][system] for covar in self.covars}
self.features_vs_age(cn_features, name="controls" + self.naming + "_" + system)

# Model age for each system on controls
for covar in covars:
for system in systems:
for covar in self.covars:
for system in self.systems:
model_name = f"{covar}_{system}"
ageml_model = self.generate_model()
models[covar][system], df_pred, betas[covar][system] = self.model_age(dfs['cn'][covar][system],
Expand All @@ -891,12 +915,12 @@ def run_age(self):
preds['cn'][covar][system] = df_pred

# Apply to all other subject types
for subject_type in subject_types:
for subject_type in self.subject_types:
# Do not apply to controls
if subject_type == 'cn':
continue
for covar in covars:
for system in systems:
for covar in self.covars:
for system in self.systems:
model_name = f"{covar}_{system}"
df_pred = self.predict_age(dfs[subject_type][covar][system], models[covar][system],
betas[covar][system], model_name=model_name)
Expand All @@ -906,9 +930,9 @@ def run_age(self):

# Concatenate predictions into a DataFrame
stack = []
for subject_type in subject_types:
for covar in covars:
df_systems = pd.concat([preds[subject_type][covar][system] for system in systems], axis=1)
for subject_type in self.subject_types:
for covar in self.covars:
df_systems = pd.concat([preds[subject_type][covar][system] for system in self.systems], axis=1)
stack.append(df_systems)
df_ages = pd.concat(stack, axis=0)

Expand All @@ -919,41 +943,26 @@ def run_age(self):
df_ages = pd.concat([self.df_features['age'], df_ages], axis=1)

# Save dataframe to csv
filename = "predicted_age" + naming + ".csv"
filename = "predicted_age" + self.naming + ".csv"
df_ages.to_csv(os.path.join(self.dir_path, filename))

def run_factor_correlation(self):
"""Run factor correlation analysis between deltas and factors."""

print("Running factors correlation analysis...")

# Set up directory
command_dir = os.path.join(self.dir_path, "factor_correlation")
create_directory(command_dir)
self.set_visualizer(command_dir)

# Reset flags
self.set_flags()

# Initial parameters
subject_types = ['cn']
# Set up
self.command_setup('factor_correlation')

# Load data
self.load_data(required=["ages", "factors"])

# Check possible flags of interest
if self.flags['clinical']:
subject_types = self.df_clinical.columns.to_list()

# Obtain systems
systems = [col[6:] for col in self.df_ages.columns if "delta" in col]

# For each subject type
for subject_type in subject_types:
# For each subject type and system run correlation analysis
for subject_type in self.subject_types:
dfs_systems = {}
df_sub = self.df_ages.loc[self.df_clinical[subject_type]]
df_factors = self.df_factors.loc[df_sub.index]
for system in systems:
for system in self.systems:
df_sys = df_sub[[col for col in df_sub.columns if system in col]]
dfs_systems[system] = df_sys
self.factors_vs_deltas(dfs_systems, df_factors, subject_type)
Expand All @@ -963,43 +972,31 @@ def run_clinical(self):

print("Running clinical outcomes...")

# Set up directory
command_dir = os.path.join(self.dir_path, "clinical_groups")
create_directory(command_dir)
self.set_visualizer(command_dir)

# Reset flags
self.set_flags()
# Set up
self.command_setup('clinical_groups')

# Load data
self.load_data(required=["ages", "clinical"])

# Obtain dataframes for each group
groups = self.df_clinical.columns.to_list()
dfs = {g: self.df_ages.loc[self.df_clinical[g]] for g in groups}
dfs = {g: self.df_ages.loc[self.df_clinical[g]] for g in self.subject_types}

# Use visualizer to show age distribution per clinical group
ages = {g: dfs[g].iloc[:, 0].to_list() for g in groups}
ages = {g: dfs[g].iloc[:, 0].to_list() for g in self.subject_types}
self.age_distribution(ages, name="clinical_groups")

# Show differences in groups per system
systems = [col[6:] for col in self.df_ages.columns if "delta" in col]
for system in systems:
dfs_systems = {g: dfs[g][[col for col in dfs[g].columns if system in col]] for g in groups}
for system in self.systems:
dfs_systems = {g: dfs[g][[col for col in dfs[g].columns if system in col]] for g in self.subject_types}
self.deltas_by_group(dfs_systems, system=system)

def run_classification(self):
"""Run classification between two different clinical groups."""

print("Running classification...")

# Set up directory
command_dir = os.path.join(self.dir_path, "clinical_classify")
create_directory(command_dir)
self.set_visualizer(command_dir)

# Reset flags
self.set_flags()
# Set up
self.command_setup('clinical_classify')

# Load data
self.load_data(required=["ages", "clinical"])
Expand All @@ -1009,22 +1006,18 @@ def run_classification(self):
raise ValueError("Must provide two groups to classify.")
elif self.args.group1 not in self.df_clinical.columns or self.args.group2 not in self.df_clinical.columns:
raise ValueError("Classes must be one of the following: %s" % self.df_clinical.columns.to_list())

# Obtain dataframes for each clinical group
df_group1 = self.df_ages[self.df_clinical[self.args.group1]]
df_group2 = self.df_ages[self.df_clinical[self.args.group2]]

# Obtain systems
systems = [col[6:] for col in self.df_ages.columns if "delta" in col]
else:
df_group1 = self.df_ages[self.df_clinical[self.args.group1]]
df_group2 = self.df_ages[self.df_clinical[self.args.group2]]

# Create a classifier for each system
for system in systems:
for system in self.systems:
df_group1_system = df_group1[[col for col in df_group1.columns if system in col]]
df_group2_system = df_group2[[col for col in df_group2.columns if system in col]]
self.classify(df_group1_system, df_group2_system, [self.args.group1, self.args.group2], system=system)

# Create a classifier for all systems
if len(systems) > 1:
if len(self.systems) > 1:
self.classify(df_group1, df_group2, [self.args.group1, self.args.group2], system="all")


Expand Down

0 comments on commit 963c8da

Please sign in to comment.