Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updated snpeff version, fixed gb parsing py script #6387

Merged
merged 19 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 38 additions & 27 deletions tool_collections/snpeff/gbk2fa.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@


def get_opener(gbk_filename):
"""Determines the appropriate opener for a given file, supporting
bzip2, gzip, or standard open.
"""
try:
bz2.open(gbk_filename).read(1)
return bz2.open
Expand All @@ -18,30 +21,38 @@ def get_opener(gbk_filename):
return open


parser = argparse.ArgumentParser()
parser.add_argument(
"genbank_file",
help="GenBank input file. Can be compressed with gzip or bzip2"
)
parser.add_argument(
"fasta_file", help="FASTA output datset"
)
parser.add_argument(
"--remove_version", action="store_true",
help="Remove version number from NCBI form formatted accession numbers. "
"For example, this would convert 'B000657.2' to 'B000657'"
)
args = parser.parse_args()


gbk_open = get_opener(args.genbank_file)
with gbk_open(args.genbank_file, 'rt') as input_handle, \
open(args.fasta_file, 'w') as output_handle:
for seq_record in SeqIO.parse(input_handle, 'genbank'):
if args.remove_version:
seq_id = seq_record.id.split('.')[0]
else:
seq_id = seq_record.id
print('Writing FASTA record: {}'.format(seq_id))
print('>' + seq_id, file=output_handle)
print(seq_record.seq, file=output_handle)
def main():
    """Command-line entry point: convert a GenBank file to FASTA.

    Parses the positional ``genbank_file`` and ``fasta_file`` arguments and
    the optional ``--remove_version`` flag, then writes one FASTA record
    (header line plus sequence line) per GenBank record. A progress line is
    printed to stdout for every record written.
    """
    cli = argparse.ArgumentParser(
        description="Convert GenBank files to FASTA format. "
                    "Supports gzip and bzip2 compressed files."
    )
    cli.add_argument(
        "genbank_file",
        help="GenBank input file. Can be compressed with gzip or bzip2"
    )
    cli.add_argument(
        "fasta_file",
        help="FASTA output dataset"
    )
    cli.add_argument(
        "--remove_version", action="store_true",
        help="Remove version number from NCBI formatted accession numbers. "
             "For example, this converts 'B000657.2' to 'B000657'."
    )
    options = cli.parse_args()

    # get_opener selects the open function to use for the input file.
    open_input = get_opener(options.genbank_file)
    with open_input(options.genbank_file, 'rt') as genbank_in:
        with open(options.fasta_file, 'w') as fasta_out:
            for record in SeqIO.parse(genbank_in, 'genbank'):
                # With --remove_version, keep only the text before the
                # first '.' of the record id.
                seq_id = (
                    record.id.partition('.')[0]
                    if options.remove_version
                    else record.id
                )
                print(f'Writing FASTA record: {seq_id}')
                fasta_out.write(f'>{seq_id}\n{record.seq}\n')


if __name__ == "__main__":
main()
69 changes: 20 additions & 49 deletions tool_collections/snpeff/snpEff.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="snpEff" name="SnpEff eff:" version="@WRAPPER_VERSION@.galaxy2">
<tool id="snpEff" name="SnpEff eff:" version="@SNPEFF_VERSION@+galaxy@WRAPPER_VERSION@" profile="23.0">
<description> annotate variants</description>
<macros>
<import>snpEff_macros.xml</import>
Expand All @@ -13,7 +13,7 @@
ln -s '${intervals}' intervals.bed &&
#end if
snpEff @JAVA_OPTIONS@ eff
-i $inputFormat -o ${outputConditional.outputFormat} -upDownStreamLen $udLength
-i $inputFormat -o ${outputFormat} -upDownStreamLen $udLength
#if $spliceSiteSize and str($spliceSiteSize) != '':
-spliceSiteSize "$spliceSiteSize"
#end if
Expand Down Expand Up @@ -53,9 +53,6 @@
#if $csvStats:
-csvStats '$csvFile'
#end if
#if str($offset) != 'default':
${offset}
#end if
#if str($chr).strip() != '':
-chr '$chr'
#end if
Expand Down Expand Up @@ -103,35 +100,21 @@
mkdir '$statsFile.files_path' &&
mv '$genes_file' '#echo os.path.join($statsFile.files_path, $genes_file_name)#'
#end if
#if $outputConditional.outputFormat == 'gatk' and $outputConditional.gatk_v1
&&
## Replace real SnpEff version with 2.0.5 to prevent this GATK 1.x error: "The version of SnpEff used to generate the SnpEff input file (x.x) is not currently supported by the GATK. Supported versions are: [2.0.5]"
sed -i.bak -e 's/^\#\#SnpEffVersion="\(\S*\s\)/\#\#SnpEffVersion="2.0.5 - real is \1/' '$snpeff_output'
#end if
]]></command>
<inputs>
<param name="input" type="data" format="vcf,bed" label="Sequence changes (SNPs, MNPs, InDels)"/>

<param name="inputFormat" type="select" label="Input format">
<param argument="-i" name="inputFormat" type="select" label="Input format" help="Specify the format of input dataset(s)">
<option value="vcf" selected="true">VCF</option>
<option value="bed">BED (Deprecated)</option>
<option value="bed">BED</option>
</param>

<conditional name="outputConditional">
<param name="outputFormat" type="select" label="Output format">
<option value="vcf" selected="true">VCF (only if input is VCF)</option>
<option value="gatk">GATK-compatible VCF (only if input is VCF)</option>
<option value="bed">BED</option>
<option value="bedAnn">BED annotations</option>
</param>
<when value="vcf" />
<when value="gatk">
<param name="gatk_v1" type="boolean" checked="true" label="Compatible with GATK 1.x" />
</when>
<when value="bed" />
<when value="bedAnn" />
</conditional>
<param name="csvStats" type="boolean" truevalue="-csvStats" falsevalue="" checked="false" label="Create CSV report, useful for downstream analysis (-csvStats)" />
<param argument="-o" name="outputFormat" type="select" label="Output format" help="Specify output format">
<option value="vcf" selected="true">VCF (only if input is VCF)</option>
<option value="gatk">GATK-compatible VCF (only if input is VCF)</option>
<option value="bed">BED</option>
<option value="bedAnn">BED annotations</option>
</param>
<param argument="-csvStats" type="boolean" truevalue="-csvStats" falsevalue="" checked="false" label="Create CSV report?" help="Useful for downstream analyses and report generation" />
<param argument="-noStats" name="generate_stats" type="boolean" truevalue="" falsevalue="-noStats" checked="true" label="Produce Summary Stats?" help="Generates an HTML summary of results"/>
<conditional name="snpDb">
<param name="genomeSrc" type="select" label="Genome source">
<!-- These options are referenced in the help section of SnpEff download tool. If you change them, change help of SnpEff download as well -->
Expand Down Expand Up @@ -171,8 +154,7 @@
</section>
</when>
<when value="named">
<param name="genome_version" type="text" value="" label="Snpff Genome Version Name (e.g. GRCh38.86)">
<help>@SNPEFF_DATABASE_URL@</help>
<param name="genome_version" type="text" value="" label="Snpff Genome Version Name (e.g. GRCh38.86)" help="A list of databases can be obtained with 'snpEff download' tool">
<validator type="empty_field" message="A genome version name is required" />
</param>
</when>
Expand Down Expand Up @@ -209,18 +191,16 @@
</param>
</when>
</conditional>

<param name="udLength" argument="-ud" type="select" label="Upstream / Downstream length">
<option value="0">No upstream / downstream intervals (0 bases)</option>
<option value="0" selected="true">No upstream / downstream intervals (0 bases)</option>
<option value="200">200 bases</option>
nekrut marked this conversation as resolved.
Show resolved Hide resolved
<option value="500">500 bases</option>
<option value="1000">1000 bases</option>
<option value="2000">2000 bases</option>
<option value="5000" selected="true">5000 bases</option>
<option value="5000">5000 bases</option>
<option value="10000">10000 bases</option>
<option value="20000">20000 bases</option>
</param>

<param name="spliceSiteSize" argument="-ss" type="select" optional="true" label="Set size for splice sites (donor and acceptor) in bases">
<option value="1">1 base</option>
<option value="2" selected="true">2 bases</option>
Expand All @@ -232,7 +212,6 @@
<option value="8">8 bases</option>
<option value="9">9 bases</option>
</param>

<conditional name="spliceRegion">
<param name="setSpliceRegions" type="select" label="spliceRegion Settings">
<option value="no">Use Defaults</option>
Expand All @@ -245,7 +224,6 @@
<param argument="-spliceRegionIntronMax" type="integer" value="" min="1" max="10" optional="true" label="Set maximum number of bases for splice site region within intron. Default: 8 bases" />
</when>
</conditional>

<param name="annotations" type="select" display="checkboxes" multiple="true" label="Annotation options">
<option value="-formatEff">Use 'EFF' field compatible with older versions (instead of 'ANN')</option>
<option value="-classic">Use Classic Effect names and amino acid variant annotations (NON_SYNONYMOUS_CODING vs missense_variant and G180R vs p.Gly180Arg/c.538G>C)</option>
Expand Down Expand Up @@ -334,20 +312,13 @@
</param>
</when>
</conditional>

<param name="offset" type="select" display="radio" label="Chromosomal position">
<option value="default" selected="true">Use default (based on input type)</option>
<option value="-0">Force zero-based positions (both input and output)</option>
<option value="-1">Force one-based positions (both input and output)</option>
</param>
<param argument="-chr" type="text" label="Text to prepend to chromosome name">
<help>
By default SnpEff simplifies all chromosome names. For instance 'chr1' is just '1'.
You can prepend any string you want to the chromosome name
</help>
<validator type="regex" message="No whitespace allowed">^\S*$</validator>
</param>
<param name="generate_stats" argument="-noStats" type="boolean" truevalue="" falsevalue="-noStats" checked="true" label="Produce Summary Stats" />
<param argument="-noLog" type="boolean" truevalue="-noLog" falsevalue="" checked="true" label="Suppress reporting usage statistics to server" />
</inputs>
<outputs>
Expand Down Expand Up @@ -375,8 +346,8 @@
<param name="generate_stats" value="true"/>
<output name="snpeff_output">
<assert_contents>
<has_text_matching expression="KJ660346\t572\t.*missense_variant" />
<has_text_matching expression="KJ660346\t1024\t.*synonymous_variant" />
<has_text_matching expression="KJ660346.1\t572\t.*missense_variant" />
<has_text_matching expression="KJ660346.1\t1024\t.*synonymous_variant" />
</assert_contents>
</output>
<output name="statsFile">
Expand All @@ -398,13 +369,13 @@
<param name="csvStats" value="true"/>
<output name="snpeff_output">
<assert_contents>
<has_text_matching expression="KJ660346\t572\t.*missense_variant" />
<has_text_matching expression="KJ660346\t1024\t.*synonymous_variant" />
<has_text_matching expression="KJ660346.1\t572\t.*missense_variant" />
<has_text_matching expression="KJ660346.1\t1024\t.*synonymous_variant" />
</assert_contents>
</output>
<output name="csvFile">
<assert_contents>
<has_n_lines n="185"/>
<has_n_lines n="134"/>
<has_n_columns n="1" sep=","/>
</assert_contents>
</output>
Expand Down
10 changes: 5 additions & 5 deletions tool_collections/snpeff/snpEff_create_db.xml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
<tool id="snpEff_build_gb" name="SnpEff build:" version="@WRAPPER_VERSION@.galaxy6" profile="22.01">
<tool id="snpEff_build_gb" name="SnpEff build:" version="@SNPEFF_VERSION@+galaxy@WRAPPER_VERSION@" profile="23.0">
<description> database from Genbank or GFF record</description>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instead of SNPEFF_VERSION could you please use TOOL_VERSION - https://galaxy-iuc-standards.readthedocs.io/en/latest/best_practices/tool_xml.html#tool-versions and also use @VERSION_SUFFIX@

this will help our bot with autoupdates.

<macros>
<import>snpEff_macros.xml</import>
</macros>
<requirements>
<expand macro="requirement" />
<requirement type="package" version="1.79">biopython</requirement>
<requirement type="package" version="1.84">biopython</requirement>
</requirements>
<expand macro="stdio" />
<expand macro="version_command" />
Expand Down Expand Up @@ -36,7 +36,7 @@
ln -s '${input_type.input}' 'snpeff_output/${genome_version}/genes.${input_type.input_type_selector}' &&
#end if

snpEff @JAVA_OPTIONS@ build -v
snpEff @JAVA_OPTIONS@ build -noCheckCds -noCheckProtein -v
-configOption '${genome_version}'.genome='${genome_version}'
-configOption '${genome_version}'.codonTable='${codon_table}'
#if str($input_type.input_type_selector) == "gb":
Expand Down Expand Up @@ -186,7 +186,7 @@
<help><![CDATA[
**What it does**

This tool uses `"snpEff build -genbank"` or `"snpEff build -gff3"` commands to create a snpEff database.
This tool uses `snpEff build` to create a snpEff database.

------

Expand All @@ -201,7 +201,7 @@ Using Genbank data for creating databases has several advantages:

.. class:: warningmark

SnpEff errors out on highly fragmented genomes containing multiple scaffolds. This is because a single gene may be split between multiple scaffolds causing SnpEff to crash. If this is happening use GFF route described below.
SnpEff errors out on highly fragmented genomes containing multiple scaffolds. This is because a single gene may be split between multiple scaffolds causing SnpEff to crash. If this is happening use the GFF route described below.

-------

Expand Down
24 changes: 18 additions & 6 deletions tool_collections/snpeff/snpEff_databases.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="snpEff_databases" name="SnpEff databases:" version="@WRAPPER_VERSION@.galaxy2">
<tool id="snpEff_databases" name="SnpEff databases:" version="@SNPEFF_VERSION@+galaxy@WRAPPER_VERSION@" profile="23.0">
<description> list available databases</description>
<macros>
<import>snpEff_macros.xml</import>
Expand All @@ -19,6 +19,10 @@
| grep -v '${exclude_pattern}'
#end if

#if str($include_download_path) == "no":
| cut -f 1,2,3,4
#end if

> '${snpeff_dbs}'
]]></command>
<inputs>
Expand All @@ -38,7 +42,10 @@
</valid>
</sanitizer>
</param>

<param name="include_download_path" type="select" display="radio" label="Include download paths?" help="When snpEff dumps the list of available databases, it includes their download paths. These are not needed in the Galaxy context.">
<option value="yes">Yes</option>
<option value="no" selected="true">No</option>
</param>
</inputs>
<outputs>
<data name="snpeff_dbs" format="tabular" label="${tool.name} @SNPEFF_VERSION@ available databases" />
Expand All @@ -63,12 +70,16 @@
<help><![CDATA[
**What it does**

This tool downloads the master list of snpEff databases from @SNPEFF_DATABASE_URL@. You can then look at this list and decide which database to use for your analysis. For example, if **List entries matching the following expression** parameter of this tool is set to *Mouse* the it will produce a tabular dataset with the following content::
This tool downloads the master list of snpEff databases from a remote SnpEff repository. You can then look at this list and decide which database to use for your analysis. For example, if **List entries matching the following expression** parameter of this tool is set to *Mouse*, it will produce a tabular dataset with the following content::

mm10 Mouse
mm39 Mouse
mm9 Mouse

mm10 Mouse http://downloads.sourceforge.net/project/snpeff/databases/v4_3/snpEff_v4_3_mm10.zip
mm9 Mouse http://downloads.sourceforge.net/project/snpeff/databases/v4_3/snpEff_v4_3_mm9.zip
This means that there are three available snpEff databases for the mouse genome. If you want to use mm39 in your analysis:

This means that there two available snpEff databases for mouse genome versions mm9 and mm10. In order to download these databases you should use identifier from the first column (e.g., mm9 or mm10 in this case).
- set **Genome source** option of **SnpEff eff** Galaxy tool to *Download on demand*
- enter 'mm39' into **Snpff Genome Version Name** text box

-------

Expand All @@ -83,6 +94,7 @@ There are two ways to use names of databases obtained with this tool in Galaxy's

@SNPEFF_IN_GALAXY_INFO@
@EXTERNAL_DOCUMENTATION@

]]></help>
<expand macro="citations" />
</tool>
Expand Down
4 changes: 2 additions & 2 deletions tool_collections/snpeff/snpEff_download.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="snpEff_download" name="SnpEff download:" version="@WRAPPER_VERSION@.galaxy2">
<tool id="snpEff_download" name="SnpEff download:" version="@SNPEFF_VERSION@+galaxy@WRAPPER_VERSION@" profile="23.0">
<description> download a pre-built database</description>
<macros>
<import>snpEff_macros.xml</import>
Expand Down Expand Up @@ -42,7 +42,7 @@ mv temp/'$genome_version' '$snpeff_db.files_path'
<help><![CDATA[
**What it does**

This tool downloads a specified database from @SNPEFF_DATABASE_URL@. It deposits it into the history.
This tool downloads a specified database from a remote SnpEff repository. It deposits it into the history.

-------

Expand Down
9 changes: 4 additions & 5 deletions tool_collections/snpeff/snpEff_macros.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<macros>
<xml name="requirement">
<requirement type="package" version="4.3.1t">snpeff</requirement>
<requirement type="package" version="5.2">snpeff</requirement>
<yield/>
</xml>
<xml name="stdio">
Expand All @@ -14,9 +14,8 @@
snpEff -version
]]></version_command>
</xml>
<token name="@WRAPPER_VERSION@">4.3+T</token>
<token name="@SNPEFF_VERSION@">SnpEff4.3</token>
<token name="@SNPEFF_DATABASE_URL@">https://sourceforge.net/projects/snpeff/files/databases/v4_3/</token>
<token name="@WRAPPER_VERSION@">0</token>
<token name="@SNPEFF_VERSION@">5.2</token>
<token name="@JAVA_OPTIONS@">-Xmx\${GALAXY_MEMORY_MB:-8192}m</token>
<xml name="ref_select">
<conditional name="reference_source">
Expand Down Expand Up @@ -59,7 +58,7 @@ In you *do not see them* keep reading...

**Download pre-built databases**

SnpEff project generates large numbers of pre-build databases. These are available at @SNPEFF_DATABASE_URL@ and can downloaded. Follow these steps:
The SnpEff project generates a large number of pre-built databases. To obtain and use them, follow these steps:

#. Use **SnpEff databases** tool to generate a list of existing databases. Note the name of the database you need.
#. Use **SnpEff download** tool to download the database.
Expand Down
2 changes: 1 addition & 1 deletion tool_collections/snpeff/snpeff_get_chr_names.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="snpEff_get_chr_names" name="SnpEff chromosome-info:" version="@WRAPPER_VERSION@.galaxy2">
<tool id="snpEff_get_chr_names" name="SnpEff chromosome-info:" version="@SNPEFF_VERSION@+galaxy@WRAPPER_VERSION@" profile="23.0">
<description>list chromosome names/lengths</description>
<macros>
<import>snpEff_macros.xml</import>
Expand Down
Loading
Loading