Merge pull request #75 from ncbi/release-0.3.2-alpha

Release 0.3.2-alpha
ncbi · Jan 7, 2025 · 92cf9bd · 92cf9bd
2 parents 1057884 + 02439ea
commit 92cf9bd
Show file tree

Hide file tree

Showing 20 changed files with 633 additions and 252 deletions.
diff --git a/README.md b/README.md
@@ -15,10 +15,11 @@ We currently have protein datasets posted that are suitable for most vertebrates
 
 Fungi, protists and nematodes are currently out-of-scope for EGAPx pending additional refinements.
 
-
+**Submitting to GenBank:**
+If you’d like to be an early tester as we refine the output and workflow for submitting EGAPx annotation to GenBank, please contact us at cgr@nlm.nih.gov.
 
 **Warning:**
-The current version is an alpha release with limited features and organism scope to collect initial feedback on execution. Outputs are not yet complete and not intended for production use. Please open a GitHub [Issue](https://github.com/ncbi/egapx/issues)  if you encounter any problems with EGAPx. You can also write to cgr@nlm.nih.gov to give us your feedback or if you have any questions.  
+The current version is an early release that is still under active development as we add features and refine outputs. The workflow for GenBank submission is also still under development. Please open a GitHub [Issue](https://github.com/ncbi/egapx/issues) if you encounter any problems with EGAPx. You can also write to cgr@nlm.nih.gov to give us your feedback or if you have any questions.  
 
 
 **Security Notice:**
@@ -59,7 +60,9 @@ Input to EGAPx is in the form of a YAML file.
   taxid: NCBI Taxonomy identifier of the target organism 
   reads: RNA-seq data
   ```
-  You can obtain taxid from the [NCBI Taxonomy page](https://www.ncbi.nlm.nih.gov/taxonomy).
+  - The assembled genome should be screened for contamination prior to running EGAPx. See the NCBI [Foreign Contamination Screen](https://github.com/ncbi/fcs) for a fast, user-friendly contamination screening tool. 
+
+  - You can obtain taxid from the [NCBI Taxonomy page](https://www.ncbi.nlm.nih.gov/taxonomy).
 
 
   - RNA-seq data can be supplied in any one of the following ways:
@@ -71,9 +74,9 @@ Input to EGAPx is in the form of a YAML file.
     reads: SRA query for reads
     ```
 
-- The following are the _optional_ key-value pairs for the input file:  
+- The following are the _optional_ key-value pairs for the input file. The default taxid-based settings (i.e. omitting these parameters) are recommended for most use cases:  
 
-  - A protein set. A taxid-based protein set will be chosen if no protein set is provided.
+  - A protein set. A taxid-based protein set will be chosen if no protein set is provided. Supplying your own protein set should only be needed when annotating obscure organisms or those with little RNA-seq data available.
     ```
     proteins: path to proteins data in FASTA format. 
     ```
@@ -420,7 +423,7 @@ If you do not have internet access from your cluster, you can run EGAPx in offli
 ```
 rm egap*sif
 singularity cache clean
-singularity pull docker://ncbi/egapx:0.3.1-alpha
+singularity pull docker://ncbi/egapx:0.3.2-alpha
 ```
 
 - Clone the repo:
@@ -452,7 +455,7 @@ Now edit the file paths of SRA reads files in `examples/input_D_farinae_small.ya
 - Run `egapx.py` first to edit the `biowulf_cluster.config`:
 ```
 ui/egapx.py examples/input_D_farinae_small.yaml -e biowulf_cluster -w dfs_work -o dfs_out -lc ../local_cache
-echo "process.container = '/path_to_/egapx_0.3-alpha.sif'" >> egapx_config/biowulf_cluster.config
+echo "process.container = '/path_to_/egapx_0.3.2-alpha.sif'" >> egapx_config/biowulf_cluster.config
 ```
 
 - Run `egapx.py`:

diff --git a/nf/subworkflows/ncbi/annot_proc/diamond/main.nf b/nf/subworkflows/ncbi/annot_proc/diamond/main.nf
diff --git a/nf/subworkflows/ncbi/annot_proc/gnomon_biotype/main.nf b/nf/subworkflows/ncbi/annot_proc/gnomon_biotype/main.nf
@@ -14,17 +14,15 @@ workflow gnomon_biotype {
         raw_blastp_hits
         parameters  // Map : extra parameter and parameter update
     main:
-        default_params = ""
-        effective_params = merge_params(default_params, parameters, 'gnomon_biotype')
-        run_gnomon_biotype(models_files, splices_files, denylist, gencoll_asn, swiss_prot_asn,  lds2_source, raw_blastp_hits, default_params)
+        def effective_params = merge_params("", parameters, 'gnomon_biotype')
+        run_gnomon_biotype(models_files, splices_files, denylist, gencoll_asn, swiss_prot_asn,  lds2_source, raw_blastp_hits, effective_params)
     emit:
         biotypes = run_gnomon_biotype.out.biotypes
         prots_rpt = run_gnomon_biotype.out.prots_rpt
         all = run_gnomon_biotype.out.all
 }
 
 
-
 process run_gnomon_biotype {
     input:
         path models_files
@@ -34,7 +32,7 @@ process run_gnomon_biotype {
         path swiss_prot_asn
         path lds2_source, stageAs: 'genome/*'
         path raw_blastp_hits
-        val parameters
+        val  parameters
     output:
         path ('output/biotypes.tsv'), emit: 'biotypes'
         path ('output/prots_rpt.tsv'), emit: 'prots_rpt'
@@ -45,18 +43,18 @@ process run_gnomon_biotype {
     mkdir -p ./asncache/
     prime_cache -cache ./asncache/ -ifmt asnb-seq-entry  -i ${swiss_prot_asn} -oseq-ids spids -split-sequences
     prime_cache -cache ./asncache/ -ifmt asnb-seq-entry  -i ${models_files} -oseq-ids gnids -split-sequences
-    lds2_indexer -source genome/ -db LDS2 
+    lds2_indexer -source genome/ -db LDS2
     echo "${raw_blastp_hits.join('\n')}" > raw_blastp_hits.mft
     merge_blastp_hits -asn-cache ./asncache/ -nogenbank -lds2 LDS2 -input-manifest raw_blastp_hits.mft -o prot_hits.asn
     echo "${models_files.join('\n')}" > models.mft
     echo "prot_hits.asn" > prot_hits.mft
     echo "${splices_files.join('\n')}" > splices.mft
-    if [ -z "$denylist" ]
-    then
-      gnomon_biotype  -gc $gencoll_asn -asn-cache ./asncache/ -lds2 ./LDS2  -nogenbank -gnomon_models models.mft -o output/biotypes.tsv -o_prots_rpt output/prots_rpt.tsv -prot_hits prot_hits.mft -prot_splices splices.mft  -reftrack-server 'NONE' -allow_lt631 true
-    else
-      gnomon_biotype  -gc $gencoll_asn -asn-cache ./asncache/ -lds2 ./LDS2  -nogenbank -gnomon_models models.mft -o output/biotypes.tsv -o_prots_rpt output/prots_rpt.tsv -prot_denylist $denylist -prot_hits prot_hits.mft -prot_splices splices.mft  -reftrack-server 'NONE' -allow_lt631 true
+    effective_params="${parameters}"
+    if [ -n "$denylist" ]; then
+        effective_params="\$effective_params -prot_denylist $denylist"
     fi
+    gnomon_biotype \$effective_params -logfile ./gn_biotype_log.txt -gc $gencoll_asn -asn-cache ./asncache/ -lds2 ./LDS2  -nogenbank -gnomon_models models.mft -o output/biotypes.tsv -o_prots_rpt output/prots_rpt.tsv -prot_hits prot_hits.mft -prot_splices splices.mft -reftrack-server 'NONE' -allow_lt631 true
+    cat ./gn_biotype_log.txt
     """
     stub:
     """
@@ -65,4 +63,3 @@ process run_gnomon_biotype {
     touch output/biotypes.tsv
     """
 }
-
diff --git a/nf/subworkflows/ncbi/annot_proc/main.nf b/nf/subworkflows/ncbi/annot_proc/main.nf
@@ -59,6 +59,7 @@ workflow annot_proc_plane {
         symbol_format_class // string for how to format gene names 
         ortho_files      /// ortho reference input files
         reference_sets  // reference sets, for now only swissprot
+        prot_denylist
         task_params     // task parameters for every task
     main:
         // Post GNOMON
@@ -70,7 +71,7 @@ workflow annot_proc_plane {
         // Seed Protein-Model Hits
         diamond_worker(prot_gnomon_prepare.out.prot_ids, swiss_prot_ids, gnomon_models, swiss_prot_asn, task_params.get('diamond_identify', [:]))
         best_protein_hits(gnomon_models, swiss_prot_asn,  diamond_worker.out.alignments , task_params.get('best_protein_hits', [:]))
-        gnomon_biotype(gnomon_models,/*splices_file  -- constant*/ [],  /*denylist -- constant*/ [], gencoll_asn, swiss_prot_asn, [], diamond_worker.out.alignments,task_params.get('gnomon_biotype', [:]))
+        gnomon_biotype(gnomon_models,/*splices_file  -- constant*/ [], prot_denylist, gencoll_asn, swiss_prot_asn, [], diamond_worker.out.alignments,task_params.get('gnomon_biotype', [:]))
 
         annot_builder(gencoll_asn, gnomon_models, genome_asn, task_params.get('annot_builder', [:]))
         def accept_ftable_file = annot_builder.out.accept_ftable_annot

diff --git a/nf/subworkflows/ncbi/default/convert_annotations/main.nf b/nf/subworkflows/ncbi/default/convert_annotations/main.nf
@@ -34,58 +34,45 @@ process run_converter {
         path 'output/*.cds.fna', emit: 'cds_fasta'
         path 'output/*.proteins.faa', emit: 'proteins_fasta'
     script:
-        //def basename = asn_file.baseName.toString()
-        def basename = asn_files.first().baseName.toString()
     """
-    echo "${asn_files.join('\n')}" > ${basename}.mft
     mkdir -p output
-    ##if [ -s ${asn_files} ]; then
-        mkdir -p tmpout
-        for af in ${asn_files}
-        do
-          afb=\$(basename \$af)
-          annotwriter ${gff_params} -nogenbank -i \${af} -format gff3 -o tmpout/\${afb}.genomic.gff
-          annotwriter ${gtf_params} -nogenbank -i \${af} -format gtf -o tmpout/\${afb}.genomic.gtf
-          asn2fasta -nogenbank -i \${af} -nucs-only |sed -e 's/^>lcl|\\(.*\\)/>\\1/' > tmpout/\${afb}.genomic.fna
-          asn2fasta -nogenbank -i \${af} -feats rna_fasta -o tmpout/\${afb}.transcripts.fna
-          asn2fasta -nogenbank -i \${af} -feats fasta_cds_na -o tmpout/\${afb}.cds.fna
-          asn2fasta -nogenbank -i \${af} -prots-only -o tmpout/\${afb}.proteins.faa
-        done
-        cat tmpout/*.gff > output/complete.genomic.gff
-        cat tmpout/*.gtf > output/complete.genomic.gtf
-        cat tmpout/*.genomic.fna > output/complete.genomic.fna
-        cat tmpout/*.transcripts.fna > output/complete.transcripts.fna
-        cat tmpout/*.cds.fna > output/complete.cds.fna
-        cat tmpout/*.proteins.faa > output/complete.proteins.faa
-        rm tmpout/*
-      
-        ##annotwriter ${gff_params} -nogenbank -i ${asn_files} -format gff3 -o output/${basename}.genomic.gff
-        ##annotwriter ${gtf_params} -nogenbank -i ${asn_files} -format gtf -o output/${basename}.genomic.gtf
-        ##asn2fasta -nogenbank -nucs-only -indir asn_inputs  -o - |sed -e 's/^>lcl|\\(.*\\)/>\\1/' >output/${basename}.genomic.fna
-        ##asn2fasta -nogenbank -feats rna_fasta        -indir asn_inputs -o output/${basename}.transcripts.fna
-        ##asn2fasta -nogenbank -feats fasta_cds_na -i  -indir asn_inputs -o output/${basename}.cds.fna
-        ##asn2fasta -nogenbank -prots-only -i -indir asn_inputs -o output/${basename}.proteins.faa
-    ##else
-    ##    touch output/${basename}.genomic.gff
-    ##    touch output/${basename}.genomic.gtf
-    ##    touch output/${basename}.genomic.fna
-    ##    touch output/${basename}.transcripts.fna
-    ##    touch output/${basename}.cds.fna
-    ##    touch output/${basename}.proteins.faa
-    ##fi
+    mkdir -p tmpout
+    found_afbs=(0)
+    for af in asn_inputs/*
+    do
+        afb=\$(basename \$af)
+        found_afbs+=(\${afb})
+        annotwriter ${gff_params} -nogenbank -i \${af} -format gff3 -o tmpout/\${afb}.genomic.gff
+        annotwriter ${gtf_params} -nogenbank -i \${af} -format gtf -o tmpout/\${afb}.genomic.gtf
+        asn2fasta -nogenbank -i \${af} -nucs-only |sed -e 's/^>lcl|\\(.*\\)/>\\1/' > tmpout/\${afb}.genomic.fna
+        asn2fasta -nogenbank -i \${af} -feats rna_fasta -o tmpout/\${afb}.transcripts.fna
+        asn2fasta -nogenbank -i \${af} -feats fasta_cds_na -o tmpout/\${afb}.cds.fna
+        asn2fasta -nogenbank -i \${af} -prots-only -o tmpout/\${afb}.proteins.faa
+    done
+    ##echo 'D: ' \${found_afbs[@]}
+    cat `find tmpout -name g*.gff -o -name all_unannot*.genomic.gff` > output/complete.genomic.gff
+    cat `find tmpout -name g*.gtf -o -name all_unannot*.genomic.gtf` > output/complete.genomic.gtf
+    cat `find tmpout -name g*.genomic.fna -o -name all_unannot*.genomic.fna` > output/complete.genomic.fna
+    cat `find tmpout -name g*.transcripts.fna -o -name all_unannot*.transcripts.fna` > output/complete.transcripts.fna
+    cat `find tmpout -name g*.cds.fna -o -name all_unannot*.cds.fna` > output/complete.cds.fna
+    cat `find tmpout -name g*.proteins.faa -o -name all_unannot*.proteins.faa` > output/complete.proteins.faa
+    rm tmpout/*
+    touch output/complete.genomic.gff
+    touch output/complete.genomic.gtf
+    touch output/complete.genomic.fna
+    touch output/complete.transcripts.fna
+    touch output/complete.cds.fna
+    touch output/complete.proteins.faa
     """
 
     stub:
-        def basename = asn_files.first().baseName.toString()
-        print(asn_files)
-        print(basename)
     """
     mkdir -p output
-    echo "Genomic GFF"    > output/${basename}.genomic.gff
-    echo "Genomic GTF"    > output/${basename}.genomic.gtf
-    echo "Genomic FASTA"  > output/${basename}.genomic.fna
-    echo "Transcript FASTA" > output/${basename}.transcripts.fna
-    echo "CDS FASTA" > output/${basename}.cds.fna
-    echo "Protein FASTA"   > output/${basename}.proteins.faa
+    echo "Genomic GFF"      > output/complete.genomic.gff
+    echo "Genomic GTF"      > output/complete.genomic.gtf
+    echo "Genomic FASTA"    > output/complete.genomic.fna
+    echo "Transcript FASTA" > output/complete.transcripts.fna
+    echo "CDS FASTA"        > output/complete.cds.fna
+    echo "Protein FASTA"    > output/complete.proteins.faa
     """
 }
diff --git a/nf/subworkflows/ncbi/gnomon-training-iteration/utilities.nf b/nf/subworkflows/ncbi/gnomon-training-iteration/utilities.nf
@@ -25,7 +25,7 @@ workflow gnomon_training_iteration {
 
         chainer(chainer_alignments, initial_hmm_params, chainer_evidence_denylist, chainer_gap_fill_allowlist, chainer_scaffolds, chainer_trusted_genes, genome_asn, proteins_asn, parameters.get('chainer_wnode', [:]))
         gnomon_wnode(gnomon_scaffolds, chainer.out.chains, chainer.out.chains_slices, initial_hmm_params, gnomon_softmask, [], genome_asn, proteins_asn,  parameters.get('gnomon_wnode', [:]))
-        gnomon_training(genome_asn, gnomon_wnode.out.outputs, max_intron, parameters.get('gnomon_training', [:]))
+        gnomon_training(genome_asn, gnomon_wnode.out.gn_models, max_intron, parameters.get('gnomon_training', [:]))
 
     emit:
         hmm_params_file = gnomon_training.out.hmm_params_file

diff --git a/nf/subworkflows/ncbi/gnomon/chainer_wnode/main.nf b/nf/subworkflows/ncbi/gnomon/chainer_wnode/main.nf
@@ -121,9 +121,9 @@ process run_chainer {
     # with the same filename. We need to avoid that to be able to stage
     # the output files for gpx_make_outputs. We add the job file numeric
     # extension as a prefix to the filename.
-    mkdir interim
+    mkdir -p interim
     chainer_wnode $params -start-job-id \$start_job_id  -workers 32 -input-jobs ${job} -O interim -nogenbank -lds2 LDS2 -evidence-denylist-manifest evidence_denylist.mft -gap-fill-allowlist-manifest gap_fill_allowlist.mft -param ${hmm_params} -scaffolds-manifest scaffolds.mft -trusted-genes-manifest trusted_genes.mft
-    mkdir output
+    mkdir -p output
     for f in interim/*; do
         if [ -f \$f ]; then
             mv \$f output/\${extension}_\$(basename \$f)