Skip to content

Commit

Permalink
fix: store taxonomic binning results
Browse files (browse the repository at this point in the history)
  • Loading branch information
fernandomeyer committed Aug 16, 2024
1 parent 50326b2 commit 29e54c7
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 20 deletions.
5 changes: 3 additions & 2 deletions amber.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,14 +161,15 @@ def main(args=None):
ncbi_dir=args.ncbi_dir,
skip_gs=args.skip_gs)

sample_id_to_g_queries_list, sample_id_to_t_queries_list, sample_ids_list = load_data.load_queries_mthreaded(
sample_id_to_g_queries_list, sample_id_to_t_queries_list, sample_ids_list = load_data.load_queries(
args.gold_standard_file, args.bin_files, labels, options, options_gs)

coverages_pd = load_data.open_coverages(args.genome_coverage)

sample_id_to_queries_list = defaultdict(list)
for sample_id in sample_id_to_g_queries_list | sample_id_to_t_queries_list:
for sample_id in sample_id_to_g_queries_list:
sample_id_to_queries_list[sample_id] += sample_id_to_g_queries_list[sample_id]
for sample_id in sample_id_to_t_queries_list:
sample_id_to_queries_list[sample_id] += sample_id_to_t_queries_list[sample_id]

create_output_directories(output_dir, sample_id_to_queries_list)
Expand Down
30 changes: 16 additions & 14 deletions cami_amber/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,24 @@ def evaluate_sample(queries_list):


def evaluate_samples_queries(sample_id_to_g_queries_list, sample_id_to_t_queries_list):
for sample_id in sample_id_to_g_queries_list:
if not sample_id_to_g_queries_list[sample_id]:
continue
evaluate_sample(sample_id_to_g_queries_list[sample_id])
for sample_id in sample_id_to_t_queries_list:
if not sample_id_to_t_queries_list[sample_id]:
continue
evaluate_sample(sample_id_to_t_queries_list[sample_id])

pd_bins = pd.DataFrame()
df_summary = pd.DataFrame()
for sample_id in sample_id_to_g_queries_list:
for query in sample_id_to_g_queries_list[sample_id]:
if query.eval_success:
df_summary = pd.concat([df_summary, query.get_metrics_df().dropna(axis=1, how='all')], ignore_index=True, sort=True)
pd_bins = pd.concat([pd_bins, query.precision_df.reset_index()], ignore_index=True, sort=True)

def get_metrics(sample_id_to_queries_list):
nonlocal pd_bins
nonlocal df_summary

for sample_id in sample_id_to_queries_list:
evaluate_sample(sample_id_to_queries_list[sample_id])

for sample_id in sample_id_to_queries_list:
for query in sample_id_to_queries_list[sample_id]:
if query.eval_success:
df_summary = pd.concat([df_summary, query.get_metrics_df().dropna(axis=1, how='all')], ignore_index=True, sort=True)
pd_bins = pd.concat([pd_bins, query.precision_df.reset_index()], ignore_index=True, sort=True)

get_metrics(sample_id_to_g_queries_list)
get_metrics(sample_id_to_t_queries_list)

# Gold standard only has unfiltered metrics, so copy values to unfiltered columns
for col in df_summary.columns:
Expand Down
2 changes: 2 additions & 0 deletions cami_amber/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,8 @@ def plot_boxplot(sample_id_to_queries_list, metric_name, output_dir, available_t
pd_bins = pd.DataFrame()
for sample_id in sample_id_to_queries_list:
for query in sample_id_to_queries_list[sample_id]:
if not isinstance(query, binning_classes.GenomeQuery):
continue
metric_df = getattr(query, metric_name.replace('_bp', '_df')).copy()
metric_df[utils_labels.TOOL] = query.label
metric_df['sample_id'] = sample_id
Expand Down
10 changes: 6 additions & 4 deletions cami_amber/utils/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ def get_rank_to_df(query_df, taxonomy_df, label, is_gs=False):
return rank_to_df


def load_queries_mthreaded(gold_standard_file, bin_files, labels, options=None, options_gs=None):
def load_queries(gold_standard_file, bin_files, labels, options=None, options_gs=None):
max_workers = min(len(labels) + 1, os.cpu_count() or 1)
pool = ThreadPool(max_workers)

Expand Down Expand Up @@ -273,13 +273,15 @@ def load_queries_mthreaded(gold_standard_file, bin_files, labels, options=None,
if 'BINID' in columns:
g_query_gs = binning_classes.GenomeQuery(utils_labels.GS, sample_id, options_gs, metadata, True)
g_query_gs.gold_standard = g_query_gs
sample_id_to_g_queries_list[sample_id].append(g_query_gs)
sample_id_to_g_gs[sample_id] = g_query_gs
if not options.skip_gs:
sample_id_to_g_queries_list[sample_id].append(g_query_gs)
if 'TAXID' in columns and not taxonomy_df.empty:
t_query_gs = binning_classes.TaxonomicQuery(utils_labels.GS, sample_id, options_gs, metadata, taxonomy_df, True)
t_query_gs.gold_standard = t_query_gs
sample_id_to_t_queries_list[sample_id].append(t_query_gs)
sample_id_to_t_gs[sample_id] = t_query_gs
if not options.skip_gs:
sample_id_to_t_queries_list[sample_id].append(t_query_gs)

for query, label in zip(samples_metadata_queries, labels):
for metadata in query:
Expand All @@ -300,6 +302,6 @@ def load_queries_mthreaded(gold_standard_file, bin_files, labels, options=None,
t_query = binning_classes.TaxonomicQuery(label, sample_id, options, metadata, taxonomy_df)
t_query.gold_standard = sample_id_to_t_gs[sample_id]
sample_id_to_t_queries_list[sample_id].append(t_query)
options.only_taxonomic_queries = options_gs.only_genome_queries = False
options.only_genome_queries = options_gs.only_genome_queries = False

return sample_id_to_g_queries_list, sample_id_to_t_queries_list, [metadata[2]['SAMPLEID'] for metadata in samples_metadata_gs]

0 comments on commit 29e54c7

Please sign in to comment.