From cab95d61fc82cfcb7f2ee60155122b862bb6aa66 Mon Sep 17 00:00:00 2001
From: "Richard C. Burhans" <rico@bx.psu.edu>
Date: Mon, 14 Oct 2024 15:44:17 -0400
Subject: [PATCH 1/4] adding NCBI EGAPx

---
 tools/ncbi_egapx/.shed.yml                    |  20 ++
 tools/ncbi_egapx/macros.xml                   |  20 ++
 tools/ncbi_egapx/ncbi_egapx.xml               | 313 ++++++++++++++++++
 tools/ncbi_egapx/test-data/input.yaml         |  17 +
 .../ncbi_egapx/tool-data/all_fasta.loc.sample |  18 +
 .../tool_data_table_conf.xml.sample           |   7 +
 6 files changed, 395 insertions(+)
 create mode 100644 tools/ncbi_egapx/.shed.yml
 create mode 100644 tools/ncbi_egapx/macros.xml
 create mode 100644 tools/ncbi_egapx/ncbi_egapx.xml
 create mode 100644 tools/ncbi_egapx/test-data/input.yaml
 create mode 100644 tools/ncbi_egapx/tool-data/all_fasta.loc.sample
 create mode 100644 tools/ncbi_egapx/tool_data_table_conf.xml.sample
diff --git a/tools/ncbi_egapx/.shed.yml b/tools/ncbi_egapx/.shed.yml
new file mode 100644
index 00000000000..33c26243340
--- /dev/null
+++ b/tools/ncbi_egapx/.shed.yml
@@ -0,0 +1,20 @@
+categories:
+  - Genome annotation
+description: Eukaryotic Genome Annotation Pipeline - External (EGAPx)
+homepage_url: https://github.com/ncbi/egapx
+long_description: |
+  EGAPx is the publicly accessible version of the updated NCBI Eukaryotic
+  Genome Annotation Pipeline. EGAPx takes an assembly fasta file, a
+  taxid of the organism, and RNA-seq data. Based on the taxid, EGAPx will
+  pick protein sets and HMM models. The pipeline runs miniprot to align
+  protein sequences, and STAR to align RNA-seq to the assembly. Protein
+  alignments and RNA-seq read alignments are then passed to Gnomon for
+  gene prediction. In the first step of Gnomon, the short alignments
+  are chained together into putative gene models. In the second step,
+  these predictions are further supplemented by ab-initio predictions
+  based on HMM models. The final annotation for the input assembly is
+  produced as a gff file.
+name: ncbi_egapx
+owner: richard-burhans
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/ncbi_egapx
+type: unrestricted
diff --git a/tools/ncbi_egapx/macros.xml b/tools/ncbi_egapx/macros.xml
new file mode 100644
index 00000000000..ef2f9352a64
--- /dev/null
+++ b/tools/ncbi_egapx/macros.xml
@@ -0,0 +1,20 @@
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <container type="docker">quay.io/richard-burhans/egapx:@TOOL_VERSION@</container>
+        </requirements>
+    </xml>
+    <token name="@TOOL_VERSION@">0.2-alpha</token>
+    <token name="@VERSION_SUFFIX@">4</token>
+    <token name="@PROFILE@">22.05</token>
+    <xml name="edam_ontology">
+        <edam_operations>
+            <edam_operation>operation_0362</edam_operation>
+        </edam_operations>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1093/bioinformatics/bts573</citation>
+        </citations>
+    </xml>
+</macros>
diff --git a/tools/ncbi_egapx/ncbi_egapx.xml b/tools/ncbi_egapx/ncbi_egapx.xml
new file mode 100644
index 00000000000..fc09d9c50d6
--- /dev/null
+++ b/tools/ncbi_egapx/ncbi_egapx.xml
@@ -0,0 +1,313 @@
+<tool id="ncbi_egapx" name="NCBI EGAPx" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>annotates eukaryotic genomes</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="edam_ontology"/>
+    <expand macro="requirements"/>
+    <command detect_errors="aggressive"><![CDATA[
+    #if str($cond_input_style.input_style) == "fillform":
+        #set yamlconfig = "egapx.yaml"
+        echo '# yaml generated by ncbi_egapx.xml' > '$yamlconfig' &&
+        #if str($reference_genome.genome_type_select) == "history"
+            echo 'genome: $reference_genome.genome' >> '$yamlconfig' &&
+        #elif str($reference_genome.genome_type_select) == "indexed":
+            echo 'genome: $reference_genome.genome.fields.path' >> '$yamlconfig' &&
+        #else:
+            echo 'genome: $reference_genome.uri' >> '$yamlconfig' &&
+        #end if
+        echo 'taxid: $taxid' >> '$yamlconfig' &&
+        echo 'reads:' >> '$yamlconfig' &&
+        #if str($condrnaseq.rna_type_select) == "list":
+            #set rs = $rnaseq.split()
+            #set rsplit = [x.strip() for x in $rs]
+            #for $r in $rsplit:
+                echo '  - $r'  >> '$yamlconfig' &&
+            #end for
+        #else:
+            #for $r in $rnaseq:
+                echo '  - $r'  >> '$yamlconfig' &&
+            #end for
+        #end if
+        #if $proteins:
+            echo 'proteins: $proteins' >> '$yamlconfig' &&
+        #end if
+        #if len($xtra.strip()) > 0:
+            #set lxtra = $xtra.split("\n")
+            #for row in $lxtra:
+                echo '$row' >> '$yamlconfig' &&
+            #end for
+        #end if
+        echo '' >> '$yamlconfig' &&
+        echo "Calculated contents of egapx yaml" &&
+        cat '$yamlconfig' &&
+    #else:
+        #set yamlconfig = $yamlin
+    #end if
+    source /galaxy/env.bash &&
+    echo \${PATH} &&
+    ln -s /galaxy/egapx/egapx_config &&
+    python3 /galaxy/egapx/ui/egapx.py '$yamlconfig' -e galaxy -o 'egapx_out'
+    ]]></command>
+    <inputs>
+        <conditional name="cond_input_style">
+            <param name="input_style" type="select" label="Fill in a tool form or use an existing yaml configuration from the current history?"
+                help="Use the tool form to select inputs from the history, or use a pre-prepared yaml file.">
+                <option value="fillform" selected="True">Provide configuration details for conversion into a configuration yaml</option>
+                <option value="history">Use a pre-prepared yaml egapx configuration</option>
+            </param> 
+            <when value="fillform">
+                <conditional name="reference_genome">
+                    <param name="genome_type_select" type="select" label="Reference genome source for mapping supplied RNA-seq reads"
+                        help="Select a built in, history or remote URI for the reference genome fasta">
+                        <option value="history" selected="True">Use a genome fasta file from the current history</option>
+                        <option value="indexed">Use a Galaxy server built-in genome</option>
+                        <option value="uri">Provide a remote web link URI ("https://...") pointing at the required genome reference fasta file</option>
+                    </param>
+                    <when value="history">
+                        <param name="genome" type="data" format="fasta" label="Select the reference genome fasta from the current history"/>
+                    </when>
+                    <when value="indexed">
+                        <param name="genome" type="select" label="Select a built in reference genome or custom genome"
+                            help="If not listed, add a custom genome or use a reference genome from the history">
+                            <options from_data_table="all_fasta">
+                                <validator message="No genomes are available " type="no_options"/>
+                            </options>
+                        </param>
+                    </when>
+                    <when value="uri">
+                        <param name="uri" type="text" label="URI pointing to the reference genome fasta file"/>
+                    </when>
+                </conditional>
+                <param name="taxid" type="text" label="NCBI Taxon ID" help="Used to identify the HMM model files needed"/>
+                <conditional name="condrnaseq">
+                    <param name="rna_type_select" type="select" label="RNA sequence data source"
+                        help="Select RNAseq input data from history or input a list of SRA identifiers or remote URI">
+                        <option value="list" selected="True">Type in a list of SRA identifiers and/or remote RNA-seq fasta URI</option>
+                        <option value="history">Select one or more RNA-seq fastq datasets from the current history</option>
+                    </param>
+                    <when value="list">
+                        <param name="rnaseq" type="text" area="true" label="List all required individual RNA-seq URI or SRA identifiers, separated by spaces or newlines"
+                            help="Either a working URI for a RNA-seq fasta, or a bare SRA identifier will work - can be mixed">
+                            <validator type="empty_field"/>
+                        </param>
+                    </when>
+                    <when value="history">
+                        <param name="rnaseq" type="data" format="fastqsanger,fastqsanger.gz" multiple="true" label="Select multiple RNA-seq fastqsanger inputs from the current history"
+                            help="All selected rna-seq fastqsanger will be added to the yaml for egapx configuration"/>
+                    </when>
+                </conditional>
+                <param name="proteins" type="data" format="fasta,tasta.gz" optional="true" label="Select a protein set"/>
+                <param name="xtra" type="text" area="true" label="Additional yaml to append to the egapx.yaml configuration"
+                    help="Not normally needed but useful for testing additional configuration elements">
+                    <sanitizer invalid_char="">
+                        <valid initial="string.printable"/>
+                    </sanitizer>
+                </param>
+            </when>
+            <when value="history">
+                <param name="yamlin" type="data" format="yaml,txt" label="egapx configuration yaml file to pass to Nextflow"/>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="output" format="gff" label="EGAPx annotation for ${on_string}" from_work_dir="egapx_out/accept.gff"/>
+        <collection name="nextflow_stats" type="list" label="EGAPx nextflow stats for ${on_string}">
+            <data name="nf_log" format="txt" label="Nextflow execution log" from_work_dir="egapx_out/nextflow.log"/>
+            <data name="nf_report" format="html" label="Nextflow execution report" from_work_dir="egapx_out/run.report.html"/>
+            <data name="nf_trace" format="tabular" label="Nextflow trace file" from_work_dir="egapx_out/run.trace.txt"/>
+            <data name="nf_timeline" format="html" label="Nextflow execution timeline" from_work_dir="egapx_out/run.timeline.html"/>
+            <data name="nf_params" format="yaml" label="Nextflow run parameters" from_work_dir="egapx_out/run_params.yaml"/>
+        </collection>
+    </outputs>
+    <tests>
+        <test expect_test_failure="true">
+            <param name="input_style" value="history"/>
+            <param name="yamlin" value="input.yaml"/>
+            <output name="output"><assert_contents><has_size min="1"/></assert_contents></output>
+            <output_collection name="nextflow_stats" type="list">
+                <element name="nf_log"><assert_contents><has_size min="1"/></assert_contents></element>
+                <element name="nf_report"><assert_contents><has_size min="1"/></assert_contents></element>
+                <element name="nf_trace"><assert_contents><has_size min="1"/></assert_contents></element>
+                <element name="nf_timeline"><assert_contents><has_size min="1"/></assert_contents></element>
+                <element name="nf_params"><assert_contents><has_size min="1"/></assert_contents></element>
+            </output_collection>
+        </test>
+        <test expect_test_failure="true">
+            <param name="input_style" value="fillform"/>
+            <param name="taxid" value="6954"/>
+            <param name="genome_type_select" value="uri"/>
+            <param name="uri" value="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/020/809/275/GCF_020809275.1_ASM2080927v1/GCF_020809275.1_ASM2080927v1_genomic.fna.gz"/>
+            <param name="rna_type_select" value="list"/>
+            <param name="rnaseq" value="https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.1 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.2 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.1 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.2"/>
+            <param name="xtra" value="proteins: []&#10;hmm: https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/gnomon/hmm_parameters/6956.params&#10;tasks:&#10;  star_wnode:&#10;    star_wnode: -cpus-per-worker 4"/>
+            <output name="output"><assert_contents><has_size min="1"/></assert_contents></output>
+            <output_collection name="nextflow_stats" type="list">
+                <element name="nf_log"><assert_contents><has_size min="1"/></assert_contents></element>
+                <element name="nf_report"><assert_contents><has_size min="1"/></assert_contents></element>
+                <element name="nf_trace"><assert_contents><has_size min="1"/></assert_contents></element>
+                <element name="nf_timeline"><assert_contents><has_size min="1"/></assert_contents></element>
+                <element name="nf_params"><assert_contents><has_size min="1"/></assert_contents></element>
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+Galaxy tool wrapping the Eukaryotic Genome Annotation Pipeline (EGAPx)
+=================================================================================================
+
+.. class:: warningmark
+
+**Proof of concept: a hack to run a NF workflow inside a specialised Galaxy tool wrapper**
+
+EGAPx is a big, complicated Nextflow workflow, challenging and costly to re-implement **properly**, requiring dozens of new tools and replicating a lot of
+complicated *groovy* workflow logic.
+
+It is also very new and in rapid development. Investing developer effort and keeping updated as EGAPx changes rapidly may be *inefficient of developer resources*.
+
+This wrapper is designed to allow measuring how *inefficient* it is in terms of computing resource utilisation, in comparison to the developer effort
+required to convert Nextflow DDL into tools and WF logic. Balancing these competing requirements is a fundamental Galaxy challenge.
+
+
+EGAPx requires very substantial resources to run with real data. *132GB and 32 cores* are the minimum requirement; *256GB and 64 cores* are recommended.
+
+A special minimal example that can be run in 6GB with 4 cores is provided as a yaml configuration and is used for the tool test.
+
+In this implementation, the user must supply a yaml configuration file as initial proof of concept.
+History inputs and even a yaml editor might be provided in future.
+
+The NF workflow to tool model tested here may be applicable to other NF workflows that take a single configuration yaml.
+
+.. class:: warningmark
+
+The computational resource cost of typing the wrong SRA identifiers into a tool form is potentially enormous with this tool!
+
+
+Sample yaml configurations
+===========================
+
+YAML sample configurations can be uploaded into your Galaxy history from the `EGAPx github repository <https://github.com/ncbi/egapx/tree/main/examples/>`_.
+The simplest possible example is shown below - can be cut/paste into a history dataset in the upload tool.
+
+
+*./examples/input_D_farinae_small.yaml* is shown below and can be cut and pasted into the upload form to create a yaml file.
+RNA-seq data is provided as URI to the reads FASTA files.
+
+input_D_farinae_small.yaml
+
+::
+
+  genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/020/809/275/GCF_020809275.1_ASM2080927v1/GCF_020809275.1_ASM2080927v1_genomic.fna.gz
+  taxid: 6954
+  reads:
+    - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.1
+    - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.2
+    - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.1
+    - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.2
+
+
+input_Gavia_stellata.yaml
+
+::
+
+  genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/030/936/135/GCF_030936135.1_bGavSte3.hap2/GCF_030936135.1_bGavSte3.hap2_genomic.fna.gz
+  reads: txid37040[Organism] AND biomol_transcript[properties] NOT SRS024887[Accession]
+  taxid: 37040
+
+input_C_longicornis.yaml
+
+::
+
+  genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/029/603/195/GCF_029603195.1_ASM2960319v2/GCF_029603195.1_ASM2960319v2_genomic.fna.gz
+  reads: txid2530218[Organism] AND biomol_transcript[properties] NOT SRS024887[Accession]
+  taxid: 2530218
+
+Purpose
+========
+
+**This is not intended for production**
+
+Just a proof of concept.
+It is possibly too inefficient to be useful although it may turn out not to be a problem if run on a dedicated workstation.
+At least the efficiency can now be more easily estimated.
+
+This tool is not recommended for public deployment because of the resource demands.
+
+EGAPx Overview
+===============
+
+.. image:: $PATH_TO_IMAGES/Pipeline_sm_ncRNA_CAGE_80pct.png
+
+**Warning:**
+The current version is an alpha release with limited features and organism scope to collect initial feedback on execution. Outputs are not yet complete and not intended for production use. Please open a GitHub `Issue <https://github.com/ncbi/egapx/issues>`_ if you encounter any problems with EGAPx. You can also write to cgr@nlm.nih.gov to give us your feedback or if you have any questions.
+
+EGAPx is the publicly accessible version of the updated NCBI [Eukaryotic Genome Annotation Pipeline](https://www.ncbi.nlm.nih.gov/genome/annotation_euk/process/).
+
+EGAPx takes an assembly fasta file, a taxid of the organism, and RNA-seq data. Based on the taxid, EGAPx will pick protein sets and HMM models. The pipeline runs `miniprot` to align protein sequences, and `STAR` to align RNA-seq to the assembly. Protein alignments and RNA-seq read alignments are then passed to `Gnomon` for gene prediction. In the first step of `Gnomon`, the short alignments are chained together into putative gene models.
+In the second step, these predictions are further supplemented by *ab-initio* predictions based on HMM models. The final annotation for the input assembly is produced as a `gff` file.
+
+**Security Notice:**
+
+EGAPx has dependencies in and outside of its execution path that include several thousand files from the `NCBI C++ toolkit <https://www.ncbi.nlm.nih.gov/toolkit>`_, and more than a million total lines of code. Static Application Security Testing has shown a small number of verified buffer overrun security vulnerabilities. Users should consult with their organizational security team on risk and if there is concern, consider mitigating options like running via VM or cloud instance.
+
+
+*To specify an array of NCBI SRA datasets in yaml*
+
+::
+
+   reads:
+     - SRR8506572
+     - SRR9005248
+
+
+*To specify an SRA entrez query*
+
+::
+
+  reads: 'txid6954[Organism] AND biomol_transcript[properties] NOT SRS024887[Accession] AND (SRR8506572[Accession] OR SRR9005248[Accession] )'
+
+
+**Note:** Both the above examples will have more RNA-seq data than the `input_D_farinae_small.yaml` example. To make sure the entrez query does not produce a large number of SRA runs, please run it first at the `NCBI SRA page <https://www.ncbi.nlm.nih.gov/sra>`_. If there are too many SRA runs, then select a few of them and list them in the input yaml.
+
+Output
+=======
+
+EGAPx output will appear as a collection in the user history. The main annotation file is called *accept.gff*.
+
+::
+
+ accept.gff
+ annot_builder_output
+ nextflow.log
+ run.report.html
+ run.timeline.html
+ run.trace.txt
+ run_params.yaml
+
+
+The *nextflow.log* is the log file that captures all the process information and their work directories. ``run_params.yaml`` has all the parameters that were used in the EGAPx run. More information about the process time and resources can be found in the other run* files.
+
+**Intermediate files**
+
+In the log, each line denotes the process that completed in the workflow. The first column (*e.g.* `[96/621c4b]`) is the subdirectory where the intermediate output files and logs are found for the process in the same line, *i.e.*, `egapx:miniprot:run_miniprot`. To see the intermediate files for that process, you can go to the work directory path that you had supplied and traverse to the subdirectory `96/621c4b`:
+
+::
+
+ $ aws s3 ls s3://temp_datapath/D_farinae/96/
+                           PRE 06834b76c8d7ceb8c97d2ccf75cda4/
+                           PRE 621c4ba4e6e87a4d869c696fe50034/
+ $ aws s3 ls s3://temp_datapath/D_farinae/96/621c4ba4e6e87a4d869c696fe50034/
+                           PRE output/
+ 2024-03-27 11:19:18          0
+ 2024-03-27 11:19:28          6 .command.begin
+ 2024-03-27 11:20:24        762 .command.err
+ 2024-03-27 11:20:26        762 .command.log
+ 2024-03-27 11:20:23          0 .command.out
+ 2024-03-27 11:19:18      13103 .command.run
+ 2024-03-27 11:19:18        129 .command.sh
+ 2024-03-27 11:20:24        276 .command.trace
+ 2024-03-27 11:20:25          1 .exitcode
+ $ aws s3 ls s3://temp_datapath/D_farinae/96/621c4ba4e6e87a4d869c696fe50034/output/
+ 2024-03-27 11:20:24   17127134 aligns.paf
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/ncbi_egapx/test-data/input.yaml b/tools/ncbi_egapx/test-data/input.yaml
new file mode 100644
index 00000000000..84cb5618c3b
--- /dev/null
+++ b/tools/ncbi_egapx/test-data/input.yaml
@@ -0,0 +1,17 @@
+# This is a very minimal example of EGAPx, it fits into 4 CPU cores and 6GB of memory.
+# To be able to do this, we culled the input files and some stages of execution.
+# To limit the requirements you also need to use -e docker_minimal
+
+genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/020/809/275/GCF_020809275.1_ASM2080927v1/GCF_020809275.1_ASM2080927v1_genomic.fna.gz
+reads:
+  - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.1
+  - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.2
+  - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.1
+  - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.2
+taxid: 6954
+proteins: []
+hmm: https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/gnomon/hmm_parameters/6956.params
+tasks:
+  star_wnode:
+    star_wnode: -cpus-per-worker 4
+
diff --git a/tools/ncbi_egapx/tool-data/all_fasta.loc.sample b/tools/ncbi_egapx/tool-data/all_fasta.loc.sample
new file mode 100644
index 00000000000..1a5a28d5e3f
--- /dev/null
+++ b/tools/ncbi_egapx/tool-data/all_fasta.loc.sample
@@ -0,0 +1,18 @@
+#This file lists the locations and dbkeys of all the fasta files
+#under the "genome" directory (a directory that contains a directory
+#for each build). The script extract_fasta.py will generate the file
+#all_fasta.loc. This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_build_id>	<dbkey>	<display_name>	<file_path>
+#
+#So, all_fasta.loc could look something like this:
+#
+#apiMel3	apiMel3	Honeybee (Apis mellifera): apiMel3	/path/to/genome/apiMel3/apiMel3.fa
+#hg19canon	hg19	Human (Homo sapiens): hg19 Canonical	/path/to/genome/hg19/hg19canon.fa
+#hg19full	hg19	Human (Homo sapiens): hg19 Full	/path/to/genome/hg19/hg19full.fa
+#
+#Your all_fasta.loc file should contain an entry for each individual
+#fasta file. So there will be multiple fasta files for each build,
+#such as with hg19 above.
+#
diff --git a/tools/ncbi_egapx/tool_data_table_conf.xml.sample b/tools/ncbi_egapx/tool_data_table_conf.xml.sample
new file mode 100644
index 00000000000..d5c59b96f59
--- /dev/null
+++ b/tools/ncbi_egapx/tool_data_table_conf.xml.sample
@@ -0,0 +1,7 @@
+<tables>
+    <!-- all_fasta: locations of all FASTA files under the genome directory, read from tool-data/all_fasta.loc; the columns below name that file's tab-separated fields in order -->
+    <table name="all_fasta" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/all_fasta.loc" />
+    </table>
+</tables>

From 2c75d46ed8760a442bbd2e95eff3bc0bd13a0e6f Mon Sep 17 00:00:00 2001
From: "Richard C. Burhans" <rico@bx.psu.edu>
Date: Mon, 14 Oct 2024 15:46:29 -0400
Subject: [PATCH 2/4] changing owner to iuc

---
 tools/ncbi_egapx/.shed.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/ncbi_egapx/.shed.yml b/tools/ncbi_egapx/.shed.yml
index 33c26243340..67eb96ce779 100644
--- a/tools/ncbi_egapx/.shed.yml
+++ b/tools/ncbi_egapx/.shed.yml
@@ -15,6 +15,6 @@ long_description: |
   based on HMM models. The final annotation for the input assembly is
   produced as a gff file.
 name: ncbi_egapx
-owner: richard-burhans
+owner: iuc
 remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/ncbi_egapx
 type: unrestricted

From 7f45fc844cc583d4d72bc740a94c1cfd6ac31005 Mon Sep 17 00:00:00 2001
From: "Richard C. Burhans" <rico@bx.psu.edu>
Date: Mon, 14 Oct 2024 15:46:59 -0400
Subject: [PATCH 3/4] resetting

---
 tools/ncbi_egapx/macros.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/ncbi_egapx/macros.xml b/tools/ncbi_egapx/macros.xml
index ef2f9352a64..b14fa3b7739 100644
--- a/tools/ncbi_egapx/macros.xml
+++ b/tools/ncbi_egapx/macros.xml
@@ -5,7 +5,7 @@
         </requirements>
     </xml>
     <token name="@TOOL_VERSION@">0.2-alpha</token>
-    <token name="@VERSION_SUFFIX@">4</token>
+    <token name="@VERSION_SUFFIX@">0</token>
     <token name="@PROFILE@">22.05</token>
     <xml name="edam_ontology">
         <edam_operations>

From 8d00bd93c488f6825429bc6f4dc12240e4130a0e Mon Sep 17 00:00:00 2001
From: "Richard C. Burhans" <rico@bx.psu.edu>
Date: Wed, 16 Oct 2024 14:33:59 -0400
Subject: [PATCH 4/4] switch to using a configfile, replace "fasta" with
 "FASTA"

---
 tools/ncbi_egapx/ncbi_egapx.xml       | 115 +++++++++++++-------------
 tools/ncbi_egapx/test-data/input.yaml |   4 +-
 2 files changed, 58 insertions(+), 61 deletions(-)

diff --git a/tools/ncbi_egapx/ncbi_egapx.xml b/tools/ncbi_egapx/ncbi_egapx.xml
index fc09d9c50d6..506e8663c47 100644
--- a/tools/ncbi_egapx/ncbi_egapx.xml
+++ b/tools/ncbi_egapx/ncbi_egapx.xml
@@ -6,42 +6,9 @@
     <expand macro="edam_ontology"/>
     <expand macro="requirements"/>
     <command detect_errors="aggressive"><![CDATA[
-    #if str($cond_input_style.input_style) == "fillform":
-        #set yamlconfig = "egapx.yaml"
-        echo '# yaml generated by ncbi_egapx.xml' > '$yamlconfig' &&
-        #if str($reference_genome.genome_type_select) == "history"
-            echo 'genome: $reference_genome.genome' >> '$yamlconfig' &&
-        #elif str($reference_genome.genome_type_select) == "indexed":
-            echo 'genome: $reference_genome.genome.fields.path' >> '$yamlconfig' &&
-        #else:
-            echo 'genome: $reference_genome.uri' >> '$yamlconfig' &&
-        #end if
-        echo 'taxid: $taxid' >> '$yamlconfig' &&
-        echo 'reads:' >> '$yamlconfig' &&
-        #if str($condrnaseq.rna_type_select) == "list":
-            #set rs = $rnaseq.split()
-            #set rsplit = [x.strip() for x in $rs]
-            #for $r in $rsplit:
-                echo '  - $r'  >> '$yamlconfig' &&
-            #end for
-        #else:
-            #for $r in $rnaseq:
-                echo '  - $r'  >> '$yamlconfig' &&
-            #end for
-        #end if
-        #if $proteins:
-            echo 'proteins: $proteins' >> '$yamlconfig' &&
-        #end if
-        #if len($xtra.strip()) > 0:
-            #set lxtra = $xtra.split("\n")
-            #for row in $lxtra:
-                echo '$row' >> '$yamlconfig' &&
-            #end for
-        #end if
-        echo '' >> '$yamlconfig' &&
-        echo "Calculated contents of egapx yaml" &&
-        cat '$yamlconfig' &&
-    #else:
+    #if str($cond_input_style.input_style) == "fillform"
+        #set yamlconfig = $egapx_config
+    #else
         #set yamlconfig = $yamlin
     #end if
     source /galaxy/env.bash &&
@@ -49,6 +16,37 @@
     ln -s /galaxy/egapx/egapx_config &&
     python3 /galaxy/egapx/ui/egapx.py '$yamlconfig' -e galaxy -o 'egapx_out'
     ]]></command>
+    <configfiles><!-- egapx_config: YAML rendered only for the "fillform" input style (otherwise left empty); each read is coerced with str() so history datasets resolve to paths before strip() -->
+        <configfile name="egapx_config"><![CDATA[
+#if str($cond_input_style.input_style) == "fillform"
+# yaml generated by ncbi_egapx.xml
+    #if str($reference_genome.genome_type_select) == "history"
+        #set genome_value = $reference_genome.genome
+    #elif str($reference_genome.genome_type_select) == "indexed"
+        #set genome_value = $reference_genome.genome.fields.path
+    #else
+        #set genome_value = $reference_genome.uri
+    #end if
+genome: $genome_value
+taxid: $taxid
+    #if str($condrnaseq.rna_type_select) == "list"
+        #set $reads_values = $rnaseq.split()
+    #else
+        #set $reads_values = $rnaseq
+    #end if
+reads:
+    #for r in [str(x).strip() for x in $reads_values]
+  - $r
+    #end for
+    #if str($proteins) != "None"
+proteins: $proteins
+    #end if
+    #for row in $xtra.strip().split("\n")
+$row
+    #end for
+#end if
+        ]]></configfile>
+    </configfiles>
     <inputs>
         <conditional name="cond_input_style">
             <param name="input_style" type="select" label="Fill in a tool form or use an existing yaml configuration from the current history?"
@@ -59,13 +57,13 @@
             <when value="fillform">
                 <conditional name="reference_genome">
                     <param name="genome_type_select" type="select" label="Reference genome source for mapping supplied RNA-seq reads"
-                        help="Select a built in, history or remote URI for the reference genome fasta">
-                        <option value="history" selected="True">Use a genome fasta file from the current history</option>
+                        help="Select a built in, history or remote URI for the reference genome FASTA">
+                        <option value="history" selected="True">Use a genome FASTA file from the current history</option>
                         <option value="indexed">Use a Galaxy server built-in genome</option>
-                        <option value="uri">Provide a remote web link URI ("https://...") pointing at the required genome reference fasta file</option>
+                        <option value="uri">Provide a remote web link URI ("https://...") pointing at the required genome reference FASTA file</option>
                     </param>
                     <when value="history">
-                        <param name="genome" type="data" format="fasta" label="Select the reference genome fasta from the current history"/>
+                        <param name="genome" type="data" format="fasta" label="Select the reference genome FASTA from the current history"/>
                     </when>
                     <when value="indexed">
                         <param name="genome" type="select" label="Select a built in reference genome or custom genome"
@@ -76,19 +74,20 @@
                         </param>
                     </when>
                     <when value="uri">
-                        <param name="uri" type="text" label="URI pointing to the reference genome fasta file"/>
+                        <param name="uri" type="text" label="URI pointing to the reference genome FASTA file">
+                        </param>
                     </when>
                 </conditional>
-                <param name="taxid" type="text" label="NCBI Taxon ID" help="Used to identify the HMM model files needed"/>
+                <param name="taxid" type="integer" min="0" label="NCBI Taxon ID" help="Used to identify the HMM model files needed"/>
                 <conditional name="condrnaseq">
                     <param name="rna_type_select" type="select" label="RNA sequence data source"
                         help="Select RNAseq input data from history or input a list of SRA identifiers or remote URI">
-                        <option value="list" selected="True">Type in a list of SRA identifiers and/or remote RNA-seq fasta URI</option>
+                        <option value="list" selected="True">Type in a list of SRA identifiers and/or remote RNA-seq FASTA URI</option>
                         <option value="history">Select one or more RNA-seq fastq datasets from the current history</option>
                     </param>
                     <when value="list">
                         <param name="rnaseq" type="text" area="true" label="List all required individual RNA-seq URI or SRA identifiers, separated by spaces or newlines"
-                            help="Either a working URI for a RNA-seq fasta, or a bare SRA identifier will work - can be mixed">
+                            help="Either a working URI for a RNA-seq FASTA, or a bare SRA identifier will work - can be mixed">
                             <validator type="empty_field"/>
                         </param>
                     </when>
@@ -97,7 +96,7 @@
                             help="All selected rna-seq fastqsanger will be added to the yaml for egapx configuration"/>
                     </when>
                 </conditional>
-                <param name="proteins" type="data" format="fasta,tasta.gz" optional="true" label="Select a protein set"/>
+                <param name="proteins" type="data" format="fasta,fasta.gz" optional="true" label="Select a protein set"/>
                 <param name="xtra" type="text" area="true" label="Additional yaml to append to the egapx.yaml configuration"
                     help="Not normally needed but useful for testing additional configuration elements">
                     <sanitizer invalid_char="">
@@ -106,7 +105,7 @@
                 </param>
             </when>
             <when value="history">
-                <param name="yamlin" type="data" format="yaml,txt" label="egapx configuration yaml file to pass to Nextflow"/>
+                <param name="yamlin" type="data" format="yaml" label="egapx configuration yaml file to pass to Nextflow"/>
             </when>
         </conditional>
     </inputs>
@@ -121,9 +120,14 @@
         </collection>
     </outputs>
     <tests>
-        <test expect_test_failure="true">
-            <param name="input_style" value="history"/>
-            <param name="yamlin" value="input.yaml"/>
+        <test expect_test_failure="false">
+            <param name="input_style" value="fillform"/>
+            <param name="taxid" value="6954"/>
+            <param name="genome_type_select" value="uri"/>
+            <param name="uri" value="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/020/809/275/GCF_020809275.1_ASM2080927v1/GCF_020809275.1_ASM2080927v1_genomic.fna.gz"/>
+            <param name="rna_type_select" value="list"/>
+            <param name="rnaseq" value="https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.1 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.2 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.1 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.2"/>
+            <param name="xtra" value="hmm: https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/gnomon/hmm_parameters/6956.params&#10;tasks:&#10;  star_wnode:&#10;    star_wnode: -cpus-per-worker 4"/>
             <output name="output"><assert_contents><has_size min="1"/></assert_contents></output>
             <output_collection name="nextflow_stats" type="list">
                 <element name="nf_log"><assert_contents><has_size min="1"/></assert_contents></element>
@@ -133,14 +137,9 @@
                 <element name="nf_params"><assert_contents><has_size min="1"/></assert_contents></element>
             </output_collection>
         </test>
-        <test expect_test_failure="true">
-            <param name="input_style" value="fillform"/>
-            <param name="taxid" value="6954"/>
-            <param name="genome_type_select" value="uri"/>
-            <param name="uri" value="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/020/809/275/GCF_020809275.1_ASM2080927v1/GCF_020809275.1_ASM2080927v1_genomic.fna.gz"/>
-            <param name="rna_type_select" value="list"/>
-            <param name="rnaseq" value="https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.1 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.2 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.1 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.2"/>
-            <param name="xtra" value="proteins: []&#10;hmm: https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/gnomon/hmm_parameters/6956.params&#10;tasks:&#10;  star_wnode:&#10;    star_wnode: -cpus-per-worker 4"/>
+        <test expect_test_failure="false">
+            <param name="input_style" value="history"/>
+            <param name="yamlin" value="input.yaml"/>
             <output name="output"><assert_contents><has_size min="1"/></assert_contents></output>
             <output_collection name="nextflow_stats" type="list">
                 <element name="nf_log"><assert_contents><has_size min="1"/></assert_contents></element>
@@ -242,7 +241,7 @@ The current version is an alpha release with limited features and organism scope
 
 EGAPx is the publicly accessible version of the updated NCBI [Eukaryotic Genome Annotation Pipeline](https://www.ncbi.nlm.nih.gov/genome/annotation_euk/process/).
 
-EGAPx takes an assembly fasta file, a taxid of the organism, and RNA-seq data. Based on the taxid, EGAPx will pick protein sets and HMM models. The pipeline runs `miniprot` to align protein sequences, and `STAR` to align RNA-seq to the assembly. Protein alignments and RNA-seq read alignments are then passed to `Gnomon` for gene prediction. In the first step of `Gnomon`, the short alignments are chained together into putative gene models.
+EGAPx takes an assembly FASTA file, a taxid of the organism, and RNA-seq data. Based on the taxid, EGAPx will pick protein sets and HMM models. The pipeline runs `miniprot` to align protein sequences, and `STAR` to align RNA-seq to the assembly. Protein alignments and RNA-seq read alignments are then passed to `Gnomon` for gene prediction. In the first step of `Gnomon`, the short alignments are chained together into putative gene models.
 In the second step, these predictions are further supplemented by *ab-initio* predictions based on HMM models. The final annotation for the input assembly is produced as a `gff` file.
 
 **Security Notice:**
diff --git a/tools/ncbi_egapx/test-data/input.yaml b/tools/ncbi_egapx/test-data/input.yaml
index 84cb5618c3b..585fd76ff02 100644
--- a/tools/ncbi_egapx/test-data/input.yaml
+++ b/tools/ncbi_egapx/test-data/input.yaml
@@ -3,15 +3,13 @@
 # To limit the requirements you also need to use -e docker_minimal
 
 genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/020/809/275/GCF_020809275.1_ASM2080927v1/GCF_020809275.1_ASM2080927v1_genomic.fna.gz
+taxid: 6954
 reads:
   - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.1
   - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.2
   - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.1
   - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.2
-taxid: 6954
-proteins: []
 hmm: https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/gnomon/hmm_parameters/6956.params
 tasks:
   star_wnode:
     star_wnode: -cpus-per-worker 4
-