Skip to content

Error model output #169

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 18 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions conda-recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ requirements:
- python {{ python }}
- biom-format {{ biom_format }}
- bioconductor-dada2
- matplotlib >=3.8.4
- seaborn >=0.12.2
- q2templates {{ qiime2_epoch }}.*
- r-base {{ r_base }}
- r-optparse >=1.7.1
- openjdk
Expand Down
12 changes: 12 additions & 0 deletions q2_dada2/_dada_stats/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2016-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from ._visualizer import stats_viz


__all__ = ['stats_viz']
98 changes: 98 additions & 0 deletions q2_dada2/_dada_stats/_visualizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2016-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import os
import pkg_resources
import matplotlib.pyplot as plt
import seaborn as sns
import qiime2.util
import q2templates
import qiime2

TEMPLATES = pkg_resources.resource_filename('q2_dada2._dada_stats', 'assets')


def _plot_errors(transdf, image_paths_arr, output_dir,
image_prefix, error_in, error_out, nominalq):

# generates baseline graph of obs errors
filtered_data = transdf[(transdf['from'].isin(["A", "C", "G", "T"])) &
(transdf['to'].isin(["A", "C", "G", "T"]))]

g = sns.FacetGrid(filtered_data, col="Transition", col_wrap=4)
g.map(plt.scatter, "Qual", "Observed", color="gray")

# options for error graph; estimated line, nominal line, and input line
if error_out is True:
g.map(plt.plot, "Qual", "Estimated", color="gray")
if nominalq is True:
g.map(plt.plot, "Qual", "Nominal", color="red")
if error_in is True:
g.map(plt.plot, "Qual", "Input", color="blue")

# layout options to bring it into line with the dada2 ouput version
g.tight_layout()
g.set(yscale="log")
g.set_axis_labels("", "")
g.figure.text(0.5, 0.009, 'Consensus quality score',
ha='center', va='center', fontsize=15)
g.figure.text(0.009, 0.5, 'Error frequency (log10)',
ha='center', va='center', rotation='vertical', fontsize=15)

# save image for html rendering
for ext in ['png', 'svg']:
img_fp = os.path.join(output_dir,
image_prefix + 'error_graph.%s' % ext)
if ext == 'png':
image_paths_arr.append(
'./' + image_prefix + 'error_graph.png'
)
plt.savefig(img_fp)
plt.clf()
return image_paths_arr # pass back path to image


def stats_viz(output_dir: str, dada2_error_stats: qiime2.Metadata,
nominalq: bool = False, error_in: bool = False,
error_out: bool = True):

# intitalize indexes for html rendering
index = os.path.join(TEMPLATES, 'index.html')

# initalize data structures for passing to html
image_paths_arr = []
paired_or_not = ''
temp_df = dada2_error_stats.to_dataframe()
# determines if the reads are paired
if len(temp_df.columns.tolist()) > 12:
df_fwd_subset = temp_df.iloc[:, :10]
df_fwd_subset.columns = \
df_fwd_subset.columns.str.replace('^F_', '', regex=True)
df_rev_subset = temp_df.iloc[:, -10:]
df_rev_subset.columns = \
df_rev_subset.columns.str.replace('^R_', '', regex=True)
image_paths_arr = _plot_errors(df_fwd_subset, image_paths_arr,
output_dir, "Forward_",
error_in, error_out, nominalq)
image_paths_arr = _plot_errors(df_rev_subset, image_paths_arr,
output_dir, "Reverse_",
error_in, error_out, nominalq)
else:
image_paths_arr = _plot_errors(temp_df, image_paths_arr,
output_dir, "",
error_in, error_out, nominalq)

# intializes the context variable for passing to html
context = {
'paired_or_not': paired_or_not,
'graph_paths': image_paths_arr
}

# renders the template and initalizes the
# js sorter for the dada2 read table
q2templates.render(index, output_dir, context=context)
45 changes: 45 additions & 0 deletions q2_dada2/_dada_stats/assets/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{% extends 'base.html' %}

{% block content %}
<style>
#imager {
display: flex;
justify-content: center;
gap: 20px;
}
#imager img {
max-width: 700px;
height: 700px;
padding: 10px;
border: 1px solid black;
}
</style>


{% for index in range(graph_paths|length) %}

<body>
<h1 ALIGN=CENTER>
{% if graph_paths|length == 1 %}

DADA2 Error Plot<br>

{% else %}
{% if index == 0 %}
Forward Read DADA2 Error Plot<br>
{% else %}
Reverse Read DADA2 Error Plot<br>
{% endif %}

{% endif %}
</h1>

<div id=imager>
<div>
<img src='{{graph_paths[index]}}' >
</div>
</div>
</body>
{% endfor %}

{% endblock %}
36 changes: 26 additions & 10 deletions q2_dada2/_denoise.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,11 @@
import pandas as pd
import numpy as np

from qiime2.plugin.util import run_commands

from q2_types.feature_data import DNAIterator
from q2_types.per_sample_sequences import (
SingleLanePerSampleSingleEndFastqDirFmt,
SingleLanePerSamplePairedEndFastqDirFmt)
from qiime2.plugin.util import run_commands


def _check_featureless_table(fp):
Expand Down Expand Up @@ -103,7 +102,8 @@ def _filepath_to_sample_paired(fp):
# to have occurred in the calling functions, this is primarily for making
# sure that DADA2 is able to do what it needs to do.

def _denoise_helper(biom_fp, track_fp, hashed_feature_ids, retain_all_samples,
def _denoise_helper(biom_fp, track_fp, err_track_fp,
hashed_feature_ids, retain_all_samples,
paired=False):

_check_featureless_table(biom_fp)
Expand Down Expand Up @@ -153,6 +153,12 @@ def _denoise_helper(biom_fp, track_fp, hashed_feature_ids, retain_all_samples,
df = df.round(round_cols)
metadata = qiime2.Metadata(df)

# reads in error plot df
df_err = pd.read_csv(err_track_fp, sep='\t', index_col=0)
df_err.index.name = 'id'
df_err.index = df_err.index.astype(str)
metadata_err = qiime2.Metadata(df_err)

# Currently the sample IDs in DADA2 are the file names. We make
# them the sample id part of the filename here.
sid_map = {id_: filepath_to_sample(id_)
Expand Down Expand Up @@ -187,7 +193,10 @@ def _denoise_helper(biom_fp, track_fp, hashed_feature_ids, retain_all_samples,
rep_sequences = DNAIterator(
(skbio.DNA(id_, metadata={'id': id_})
for id_ in table.ids(axis='observation')))
return table, rep_sequences, metadata

# initalize and populate DADA2 diagnoistic Stats dictionary
return table, rep_sequences, {"denoised-read-stats": metadata,
"error-plot-stats": metadata_err}


def _denoise_single(demultiplexed_seqs, trunc_len, trim_left, max_ee, trunc_q,
Expand All @@ -208,11 +217,13 @@ def _denoise_single(demultiplexed_seqs, trunc_len, trim_left, max_ee, trunc_q,
with tempfile.TemporaryDirectory() as temp_dir_name:
biom_fp = os.path.join(temp_dir_name, 'output.tsv.biom')
track_fp = os.path.join(temp_dir_name, 'track.tsv')
err_track_fp = os.path.join(temp_dir_name, 'err_track.tsv')

cmd = ['run_dada.R',
'--input_directory', str(demultiplexed_seqs),
'--output_path', biom_fp,
'--output_track', track_fp,
'--output_err_track', err_track_fp,
'--filtered_directory', temp_dir_name,
'--truncation_length', str(trunc_len),
'--trim_left', str(trim_left),
Expand Down Expand Up @@ -240,8 +251,8 @@ def _denoise_single(demultiplexed_seqs, trunc_len, trim_left, max_ee, trunc_q,
raise Exception("An error was encountered while running DADA2"
" in R (return code %d), please inspect stdout"
" and stderr to learn more." % e.returncode)
return _denoise_helper(biom_fp, track_fp, hashed_feature_ids,
retain_all_samples)
return _denoise_helper(biom_fp, track_fp, err_track_fp,
hashed_feature_ids, retain_all_samples)


def denoise_single(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
Expand Down Expand Up @@ -298,6 +309,7 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
tmp_reverse = os.path.join(temp_dir, 'reverse')
biom_fp = os.path.join(temp_dir, 'output.tsv.biom')
track_fp = os.path.join(temp_dir, 'track.tsv')
err_track_fp = os.path.join(temp_dir, 'err_track.tsv')
filt_forward = os.path.join(temp_dir, 'filt_f')
filt_reverse = os.path.join(temp_dir, 'filt_r')
manifest_df = demultiplexed_seqs.manifest.view(pd.DataFrame)
Expand All @@ -321,6 +333,7 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
'--input_directory_reverse', tmp_reverse,
'--output_path', biom_fp,
'--output_track', track_fp,
'--output_err_track', err_track_fp,
'--filtered_directory', filt_forward,
'--filtered_directory_reverse', filt_reverse,
'--truncation_length', str(trunc_len_f),
Expand Down Expand Up @@ -355,8 +368,9 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
" in R (return code %d), please inspect stdout"
" and stderr to learn more." % e.returncode)

return _denoise_helper(biom_fp, track_fp, hashed_feature_ids,
retain_all_samples, paired=True)
return _denoise_helper(biom_fp, track_fp, err_track_fp,
hashed_feature_ids, retain_all_samples,
paired=True)


def _remove_barcode(filename):
Expand Down Expand Up @@ -425,6 +439,7 @@ def denoise_ccs(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
with tempfile.TemporaryDirectory() as temp_dir_name:
biom_fp = os.path.join(temp_dir_name, 'output.tsv.biom')
track_fp = os.path.join(temp_dir_name, 'track.tsv')
err_track_fp = os.path.join(temp_dir_name, 'err_track.tsv')
nop_fp = os.path.join(temp_dir_name, 'nop')
filt_fp = os.path.join(temp_dir_name, 'filt')
for fp in nop_fp, filt_fp:
Expand All @@ -434,6 +449,7 @@ def denoise_ccs(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
'--input_directory', str(demultiplexed_seqs),
'--output_path', biom_fp,
'--output_track', track_fp,
'--output_err_track', err_track_fp,
'--removed_primer_directory', nop_fp,
'--filtered_directory', filt_fp,
'--forward_primer', str(front),
Expand Down Expand Up @@ -470,5 +486,5 @@ def denoise_ccs(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
raise Exception("An error was encountered while running DADA2"
" in R (return code %d), please inspect stdout"
" and stderr to learn more." % e.returncode)
return _denoise_helper(biom_fp, track_fp, hashed_feature_ids,
retain_all_samples)
return _denoise_helper(biom_fp, track_fp, err_track_fp,
hashed_feature_ids, retain_all_samples)
4 changes: 2 additions & 2 deletions q2_dada2/_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def denoise_single(use):

rep_seqs.assert_output_type('FeatureData[Sequence]')
table_dada2.assert_output_type('FeatureTable[Frequency]')
denoise_stats.assert_output_type('SampleData[DADA2Stats]')
denoise_stats.assert_output_type('Collection[SampleData[DADA2Stats]]')


def denoise_paired(use):
Expand All @@ -55,4 +55,4 @@ def denoise_paired(use):

rep_seqs.assert_output_type('FeatureData[Sequence]')
table_dada2.assert_output_type('FeatureTable[Frequency]')
denoise_stats.assert_output_type('SampleData[DADA2Stats]')
denoise_stats.assert_output_type('Collection[SampleData[DADA2Stats]]')
13 changes: 13 additions & 0 deletions q2_dada2/_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,16 @@ def validate(*args):

DADA2StatsDirFmt = model.SingleFileDirectoryFormat(
'DADA2StatsDirFmt', 'stats.tsv', DADA2StatsFormat)


DADA2ErrorStats = SemanticType('DADA2ErrorStats',
variant_of=SampleData.field['type'])


class DADA2ErrorStatsFormat(model.TextFileFormat):
def validate(*args):
pass


DADA2StatsDirFmt = model.SingleFileDirectoryFormat(
'DADA2StatsDirFmt', 'stats.tsv', DADA2ErrorStatsFormat)
14 changes: 13 additions & 1 deletion q2_dada2/_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import qiime2

from q2_dada2 import DADA2StatsFormat
from q2_dada2 import DADA2StatsFormat, DADA2ErrorStatsFormat
from q2_dada2.plugin_setup import plugin


Expand All @@ -22,3 +22,15 @@ def _2(obj: qiime2.Metadata) -> DADA2StatsFormat:
ff = DADA2StatsFormat()
obj.save(str(ff))
return ff


@plugin.register_transformer
def _3(ff: DADA2ErrorStatsFormat) -> qiime2.Metadata:
return qiime2.Metadata.load(str(ff))


@plugin.register_transformer
def _4(obj: qiime2.Metadata) -> DADA2ErrorStatsFormat:
ff = DADA2ErrorStatsFormat()
obj.save(str(ff))
return ff
Loading
Loading