output genome metrics
fernandomeyer committed Jun 11, 2021
1 parent f1f7cc6 commit f46ce06
Showing 4 changed files with 51 additions and 31 deletions.
37 changes: 23 additions & 14 deletions amber.py
@@ -179,9 +179,6 @@ def evaluate(queries_list, sample_id):
df_summary = pd.concat([df_summary, query_metrics_df], ignore_index=True, sort=True)

query.precision_df[utils_labels.TOOL] = query.label
-
- # query.recall_df.to_csv(os.path.join(query.options.output_dir, 'genome', query.label, 'metrics_per_genome.tsv'), sep='\t', index=False)
-
pd_bins_all = pd.concat([pd_bins_all, query.precision_df.reset_index()], ignore_index=True, sort=True)

pd_bins_all['sample_id'] = sample_id
@@ -207,10 +204,10 @@ def evaluate_samples_queries(sample_id_to_queries_list):
return df_summary_all, pd_bins_all


- def save_metrics(df_summary, pd_bins, output_dir, stdout):
+ def save_metrics(sample_id_to_queries_list, df_summary, pd_bins, output_dir, stdout):
logging.getLogger('amber').info('Saving computed metrics')
df_summary.to_csv(os.path.join(output_dir, 'results.tsv'), sep='\t', index=False)
- # pd_bins.to_csv(os.path.join(output_dir, 'bins.tsv'), index=False, sep='\t')
+ pd_bins.to_csv(os.path.join(output_dir, 'bin_metrics.tsv'), index=False, sep='\t')
if stdout:
summary_columns = [utils_labels.TOOL] + [col for col in df_summary if col != utils_labels.TOOL]
print(df_summary[summary_columns].to_string(index=False))
@@ -225,6 +222,18 @@ def save_metrics(df_summary, pd_bins, output_dir, stdout):
table = pd_group[['sample_id'] + list(bins_columns.keys())].rename(columns=dict(bins_columns))
table.to_csv(os.path.join(output_dir, 'taxonomic', tool, 'metrics_per_bin.tsv'), sep='\t', index=False)

+ pd_genomes_all = pd.DataFrame()
+ for sample_id in sample_id_to_queries_list:
+     pd_genomes_sample = pd.DataFrame()
+     for query in sample_id_to_queries_list[sample_id]:
+         if isinstance(query, binning_classes.GenomeQuery):
+             query.recall_df_cami1[utils_labels.TOOL] = query.label
+             pd_genomes_sample = pd.concat([pd_genomes_sample, query.recall_df_cami1], ignore_index=True, sort=False)
+     pd_genomes_sample['sample_id'] = sample_id
+     pd_genomes_all = pd.concat([pd_genomes_all, pd_genomes_sample], ignore_index=True, sort=False)
+ if not pd_genomes_all.empty:
+     pd_genomes_all.to_csv(os.path.join(output_dir, 'genome_metrics_cami1.tsv'), index=False, sep='\t')


def main(args=None):
parser = argparse.ArgumentParser(description="AMBER: Assessment of Metagenome BinnERs",
@@ -286,16 +295,16 @@ def main(args=None):

df_summary, pd_bins = evaluate_samples_queries(sample_id_to_queries_list)

- save_metrics(df_summary, pd_bins, output_dir, args.stdout)
+ save_metrics(sample_id_to_queries_list, df_summary, pd_bins, output_dir, args.stdout)

- # plot_genome_binning(args.colors,
- #                     sample_id_to_queries_list,
- #                     df_summary,
- #                     pd_bins[pd_bins['rank'] == 'NA'],
- #                     labels,
- #                     coverages_pd,
- #                     output_dir)
- # plot_taxonomic_binning(args.colors, df_summary, pd_bins, labels, output_dir)
+ plot_genome_binning(args.colors,
+                     sample_id_to_queries_list,
+                     df_summary,
+                     pd_bins[pd_bins['rank'] == 'NA'],
+                     labels,
+                     coverages_pd,
+                     output_dir)
+ plot_taxonomic_binning(args.colors, df_summary, pd_bins, labels, output_dir)

amber_html.create_html(df_summary,
pd_bins,
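Note on the new export above: save_metrics now walks every sample's queries, collects each GenomeQuery's per-genome CAMI 1 recall table, tags the rows with their sample, and writes one combined genome_metrics_cami1.tsv. A minimal self-contained sketch of that aggregation, using hypothetical stand-in DataFrames in place of AMBER's query objects:

```python
import os
import pandas as pd

# Hypothetical stand-ins: in AMBER, each GenomeQuery carries a
# recall_df_cami1 DataFrame with one row per gold-standard genome.
sample_id_to_recall_dfs = {
    'sample_0': [pd.DataFrame({'genome_id': ['g1', 'g2'],
                               'recall_bp': [0.9, 0.0],
                               'Tool': 'binner_A'})],
    'sample_1': [pd.DataFrame({'genome_id': ['g1', 'g2'],
                               'recall_bp': [0.7, 0.5],
                               'Tool': 'binner_A'})],
}

pd_genomes_all = pd.DataFrame()
for sample_id, recall_dfs in sample_id_to_recall_dfs.items():
    # Concatenate all queries of one sample, then tag rows with the sample id.
    pd_genomes_sample = pd.concat(recall_dfs, ignore_index=True, sort=False)
    pd_genomes_sample['sample_id'] = sample_id
    pd_genomes_all = pd.concat([pd_genomes_all, pd_genomes_sample],
                               ignore_index=True, sort=False)

# One combined table across samples, written only if something was evaluated.
if not pd_genomes_all.empty:
    pd_genomes_all.to_csv(os.path.join('.', 'genome_metrics_cami1.tsv'),
                          index=False, sep='\t')
```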
20 changes: 16 additions & 4 deletions src/binning_classes.py
@@ -1,4 +1,4 @@
- # Copyright 2020 Department of Computational Biology for Infection Research - Helmholtz Centre for Infection Research
+ # Copyright 2021 Department of Computational Biology for Infection Research - Helmholtz Centre for Infection Research
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -390,7 +390,7 @@ def __init__(self, label, sample_id, options):
self.__gold_standard = None
self.__gold_standard_df = None
self.__precision_df = pd.DataFrame()
- self.__recall_df = None
+ self.__recall_df = pd.DataFrame()
self.__confusion_df = None
self.__metrics = None
self.__metrics_filtered = None
@@ -490,16 +490,25 @@ class GenomeQuery(Query):
def __init__(self, df, label, sample_id, options):
super().__init__(label, sample_id, options)
self.__df = df
+ self.__recall_df_cami1 = pd.DataFrame()
self.metrics = Metrics()

@property
def df(self):
return self.__df

+ @property
+ def recall_df_cami1(self):
+     return self.__recall_df_cami1

@df.setter
def df(self, df):
self.__df = df

+ @recall_df_cami1.setter
+ def recall_df_cami1(self, recall_df_cami1):
+     self.__recall_df_cami1 = recall_df_cami1

def get_metrics_df(self):
metrics_dict = self.metrics.get_ordered_dict()
metrics_dict[utils_labels.TOOL] = self.label
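The new accessor follows the same property/setter pattern the class already uses for df: the double-underscore attribute is name-mangled, so outside code goes through the property. A generic sketch of the pattern; the class name here is illustrative, not AMBER's:

```python
import pandas as pd

class Query:
    def __init__(self):
        # Name-mangled to _Query__recall_df_cami1, so outside code cannot
        # reach it directly and goes through the property instead.
        self.__recall_df_cami1 = pd.DataFrame()

    @property
    def recall_df_cami1(self):
        return self.__recall_df_cami1

    @recall_df_cami1.setter
    def recall_df_cami1(self, recall_df_cami1):
        self.__recall_df_cami1 = recall_df_cami1

query = Query()
query.recall_df_cami1 = pd.DataFrame({'recall_bp': [0.8]})  # uses the setter
print(query.recall_df_cami1.empty)  # False
```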
@@ -618,15 +627,15 @@ def compute_metrics(self):
if self.options.genome_to_unique_common:
unmapped_genomes -= set(self.options.genome_to_unique_common)
num_unmapped_genomes = len(unmapped_genomes)
- prec_copy = precision_df[['recall_bp', 'recall_seq']].reset_index()
+ # prec_copy = precision_df[['recall_bp', 'recall_seq', 'genome_id']].loc[precision_df.groupby('genome_id', sort=False)['recall_bp'].idxmax()].reset_index()
+ prec_copy = precision_df.reset_index()
if num_unmapped_genomes:
prec_copy = prec_copy.reindex(prec_copy.index.tolist() + list(range(len(prec_copy), len(prec_copy) + num_unmapped_genomes))).fillna(.0)
self.metrics.recall_avg_bp_cami1 = prec_copy['recall_bp'].mean()
self.metrics.recall_avg_seq_cami1 = prec_copy['recall_seq'].mean()
self.metrics.recall_avg_bp_sem_cami1 = prec_copy['recall_bp'].sem()
self.metrics.recall_avg_seq_sem_cami1 = prec_copy['recall_seq'].sem()
self.metrics.recall_avg_bp_var_cami1 = prec_copy['recall_bp'].var()
+ self.recall_df_cami1 = prec_copy
# End Compute recall as in CAMI 1

self.metrics.accuracy_bp = precision_df['tp_length'].sum() / recall_df['length_gs'].sum()
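The prec_copy padding above implements the CAMI 1 convention that gold-standard genomes the binner never recovered still count toward the averages, with recall 0. A small sketch with made-up numbers; prec_copy and num_unmapped_genomes stand in for the values computed in compute_metrics:

```python
import pandas as pd

# Per-genome recall for the genomes the binner did recover (made-up values).
prec_copy = pd.DataFrame({'genome_id': ['g1', 'g2'],
                          'recall_bp': [0.8, 0.6],
                          'recall_seq': [0.7, 0.5]})
num_unmapped_genomes = 2  # gold-standard genomes the binner missed entirely

# Reindex past the end of the frame to append all-NaN rows, then zero them
# out (note fillna also zeroes the genome_id of the padded rows).
if num_unmapped_genomes:
    prec_copy = prec_copy.reindex(
        prec_copy.index.tolist() +
        list(range(len(prec_copy), len(prec_copy) + num_unmapped_genomes))).fillna(.0)

print(prec_copy['recall_bp'].mean())  # (0.8 + 0.6 + 0 + 0) / 4 = 0.35
print(prec_copy['recall_bp'].sem())   # standard error now spans all 4 genomes
```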
@@ -732,6 +741,9 @@ def rank_to_df(self, rank_to_df):
self.__rank_to_df = rank_to_df

def _create_profile(self, all_bins):
+ if self.recall_df.empty:
+     return [], []

class Prediction:
def __init__(self):
pass
20 changes: 9 additions & 11 deletions src/plots.py
@@ -84,17 +84,15 @@ def plot_by_genome_coverage(pd_bins, pd_target_column, available_tools, output_dir):

for i, (color, tool) in enumerate(zip(colors_list, available_tools)):
pd_tool = pd_bins[pd_bins[utils_labels.TOOL] == tool].sort_values(by=['genome_index'])
- axs.scatter(pd_tool['genome_coverage'], pd_tool[pd_target_column], marker='o', color=colors_list[i], s=[2] * pd_tool.shape[0])
+ axs.scatter(pd_tool['genome_coverage'], pd_tool[pd_target_column], marker='o', color=colors_list[i], s=[3] * pd_tool.shape[0])
window = 50
rolling_mean = pd_tool[pd_target_column].rolling(window=window, min_periods=10).mean()
axs.plot(pd_tool['genome_coverage'], rolling_mean, color=colors_list[i])

axs.set_xlim([0.0, pd_tool['genome_coverage'].max()])
- axs.set_ylim([0.0, 1.0])
+ axs.set_ylim([-0.01, 1.01])

# transform plot_labels to percentages
- vals = axs.get_yticks()
- axs.set_yticklabels(['{:3.0f}'.format(x * 100) for x in vals], fontsize=12)
axs.set_xticklabels(['{:,.1f}'.format(np.exp(x)) for x in axs.get_xticks()], fontsize=12)
+ axs.set_yticklabels(['{:3.0f}'.format(x * 100) for x in axs.get_yticks()], fontsize=12)

axs.tick_params(axis='x', labelsize=12)

@@ -106,7 +104,7 @@ def plot_by_genome_coverage(pd_bins, pd_target_column, available_tools, output_dir):
file_name = 'completeness_by_genome_coverage'

plt.ylabel(ylabel, fontsize=15)
- plt.xlabel('log$_{10}$(average genome coverage)', fontsize=15)
+ plt.xlabel('Average genome coverage', fontsize=15)

colors_iter = iter(colors_list)
circles = []
@@ -124,7 +122,7 @@ def get_pd_genomes_recall(sample_id_to_queries_list):
for query in sample_id_to_queries_list[sample_id]:
if not isinstance(query, binning_classes.GenomeQuery):
continue
- recall_df = query.recall_df[['genome_id', 'recall_bp']].copy()
+ recall_df = query.recall_df_cami1[['genome_id', 'recall_bp']].copy()
recall_df[utils_labels.TOOL] = query.label
recall_df['sample_id'] = sample_id
recall_df = recall_df.reset_index().set_index(['sample_id', utils_labels.TOOL])
@@ -141,13 +139,13 @@ def plot_precision_recall_by_coverage(sample_id_to_queries_list, pd_bins_g, cove

pd_genomes_recall = get_pd_genomes_recall(sample_id_to_queries_list)
pd_genomes_recall['genome_index'] = pd_genomes_recall['genome_id'].map(coverages_pd['rank'].to_dict())
- pd_genomes_recall = pd_genomes_recall.groupby([utils_labels.TOOL, 'genome_id']).mean().reset_index()
- pd_genomes_recall['genome_coverage'] = np.log10(pd_genomes_recall['genome_id'].map(coverages_pd['COVERAGE'].to_dict()))
+ pd_genomes_recall = pd_genomes_recall.reset_index()
+ pd_genomes_recall['genome_coverage'] = np.log(pd_genomes_recall['genome_id'].map(coverages_pd['COVERAGE'].to_dict()))
plot_by_genome_coverage(pd_genomes_recall, 'recall_bp', available_tools, output_dir)

pd_bins_precision = pd_bins_g[[utils_labels.TOOL, 'precision_bp', 'genome_id']].copy().dropna(subset=['precision_bp'])
pd_bins_precision['genome_index'] = pd_bins_precision['genome_id'].map(coverages_pd['rank'].to_dict())
- pd_bins_precision['genome_coverage'] = np.log10(pd_bins_precision['genome_id'].map(coverages_pd['COVERAGE'].to_dict()))
+ pd_bins_precision['genome_coverage'] = np.log(pd_bins_precision['genome_id'].map(coverages_pd['COVERAGE'].to_dict()))
plot_by_genome_coverage(pd_bins_precision, 'precision_bp', available_tools, output_dir)


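On the plots.py changes: genome coverage is now transformed with np.log instead of np.log10, which matches the existing np.exp tick-label transform, so the ticks read in actual coverage units and the x axis can be labeled plainly. A minimal sketch of the technique with toy data; the data, window size, and file name are illustrative only:

```python
import matplotlib
matplotlib.use('Agg')  # headless backend for batch runs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Toy data standing in for one tool's per-genome completeness values.
rng = np.random.default_rng(0)
coverage = np.sort(rng.uniform(1, 100, 200))
pd_tool = pd.DataFrame({'genome_coverage': np.log(coverage),
                        'recall_bp': rng.uniform(0.0, 1.0, 200)})

fig, axs = plt.subplots(figsize=(6, 5))
axs.scatter(pd_tool['genome_coverage'], pd_tool['recall_bp'], marker='o', s=3)

# Rolling mean over genomes ordered by coverage, as in plot_by_genome_coverage.
rolling_mean = pd_tool['recall_bp'].rolling(window=50, min_periods=10).mean()
axs.plot(pd_tool['genome_coverage'], rolling_mean)

axs.set_ylim([-0.01, 1.01])
# Ticks live in natural-log space; exponentiate the labels back to coverage units.
axs.set_xticklabels(['{:,.1f}'.format(np.exp(x)) for x in axs.get_xticks()])
axs.set_yticklabels(['{:3.0f}'.format(x * 100) for x in axs.get_yticks()])
plt.xlabel('Average genome coverage')
plt.ylabel('Completeness (%)')
fig.savefig('completeness_by_genome_coverage.png', dpi=100, bbox_inches='tight')
```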
5 changes: 3 additions & 2 deletions src/utils/load_data.py
@@ -132,12 +132,13 @@ def read_metadata(file_path_query):


def load_binnings(samples_metadata, file_path_query):
+ columns = ['SEQUENCEID', 'BINID', 'TAXID', 'LENGTH', '_LENGTH']
sample_id_to_query_df = OrderedDict()
for metadata in samples_metadata:
logging.getLogger('amber').info('Loading ' + metadata[2]['SAMPLEID'])
nrows = metadata[1] - metadata[0] + 1

- df = pd.read_csv(file_path_query, sep='\t', comment='#', skiprows=metadata[0], nrows=nrows, header=None)
+ col_indices = [k for k, v in metadata[3].items() if v in columns]
+ df = pd.read_csv(file_path_query, sep='\t', comment='#', skiprows=metadata[0], nrows=nrows, header=None, usecols=col_indices)
df.rename(columns=metadata[3], inplace=True)
df.rename(columns={'_LENGTH': 'LENGTH'}, inplace=True)
sample_id_to_query_df[metadata[2]['SAMPLEID']] = df
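The loader change parses only the columns AMBER understands: metadata[3] maps a column's position in the binning file to the name declared in its header, so usecols can drop extra columns before pandas materializes them. A sketch with a hypothetical mapping and an in-memory file:

```python
import io
import pandas as pd

# Hypothetical stand-ins for one sample: metadata[3] maps column positions
# in the file to the names declared in the binning file's header.
col_index_to_name = {0: 'SEQUENCEID', 1: 'BINID', 2: 'UNUSED', 3: '_LENGTH'}
columns = ['SEQUENCEID', 'BINID', 'TAXID', 'LENGTH', '_LENGTH']

query_tsv = io.StringIO('read_1\tbin_1\tx\t1000\nread_2\tbin_2\ty\t2500\n')

# Keep only the positions whose declared name AMBER understands.
col_indices = [k for k, v in col_index_to_name.items() if v in columns]
df = pd.read_csv(query_tsv, sep='\t', header=None, usecols=col_indices)
df.rename(columns=col_index_to_name, inplace=True)
df.rename(columns={'_LENGTH': 'LENGTH'}, inplace=True)
print(df)  # SEQUENCEID, BINID, LENGTH; the unused column was never parsed
```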
