Merge pull request #109 from ncsa/develop

Develop
ncsa · May 16, 2024 · 66ab859 · 66ab859
2 parents 2c80a44 + c8bf73d
commit 66ab859
Show file tree

Hide file tree

Showing 14 changed files with 2,035 additions and 49 deletions.
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -1,53 +1,61 @@
-# This workflow will install Python dependencies, run tests and lint with a single version of Python
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+# This workflow configures the environment and executes NEAT read-simulator tests using relative paths for a series of configuration files individually
+# For more information on using Python with GitHub Actions, refer to:
+# https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 
-name: NEAT unit tests
+name: NEAT Unit Tests
 
 on:
   push:
-    branches: [ "main", "develop" ]
+    branches: [develop, main]
   pull_request:
-    branches: [ "main" ]
+    branches: [main]
 
 jobs:
-  build:
+  detailed_test_execution:
     runs-on: ubuntu-latest
-
     steps:
       - uses: actions/checkout@v3
       - uses: s-weigand/setup-conda@v1.1.1
         with:
-          conda-channels: bioconda, conda-forge
+          conda-channels: 'bioconda, conda-forge'  # action inputs must be scalars; this one is a comma-separated string
           activate-conda: true
           repository: NCSA/NEAT
-      - name: basic test
+      - name: Environment Setup
         run: |
           conda env create -f environment.yml -n test_neat
           conda activate test_neat
           poetry install
-          neat
+          cd config_template
 
-      - name: run coverage tests
-        run: |
-          conda activate test_neat
-          python tests/coverage_tests.py
-      
-#       - name: lint with flake8
-#         run: |
-#           conda activate neat
-#           # stop the build if there are Python syntax errors or undefined names
-#           flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-#           # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-#           flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-#       - name: Execute test_gen_reads
-#         run: |
-#           conda activate neat
-#           cd ${{ github.workspace }}
-#           poetry install
-#           neat --log-level ERROR --no-log read-simulator -c data/test_config.yml -o test
-#       - run: echo "This job's status is ${{ job.status }}."
-#       - name: Execute seq_err_model_test
-#         run: |
-#           cd ${{ github.workspace }}
-#           neat --log-level ERROR --no-log model-seq-err -i data/baby.fastq
-#       - run: echo "This job's status is ${{ job.status }}." 
+      - name: Run NEAT Simulation for config_test1  # each step is a fresh shell: cd and conda activate must be repeated
+        run: cd config_template && conda activate test_neat && python -m neat --log-level DEBUG read-simulator -c config_test1.yml -o ../outputs/test1_read-simulator
+
+      - name: Run NEAT Simulation for config_test2
+        run: cd config_template && conda activate test_neat && python -m neat --log-level DEBUG read-simulator -c config_test2.yml -o ../outputs/test2_read-simulator
+
+      - name: Run NEAT Simulation for config_test3
+        run: cd config_template && conda activate test_neat && python -m neat --log-level DEBUG read-simulator -c config_test3.yml -o ../outputs/test3_read-simulator
+
+      - name: Run NEAT Simulation for config_test4
+        run: cd config_template && conda activate test_neat && python -m neat --log-level DEBUG read-simulator -c config_test4.yml -o ../outputs/test4_read-simulator
+
+      - name: Run NEAT Simulation for config_test5
+        run: cd config_template && conda activate test_neat && python -m neat --log-level DEBUG read-simulator -c config_test5.yml -o ../outputs/test5_read-simulator
+
+      - name: Run NEAT Simulation for config_test6
+        run: cd config_template && conda activate test_neat && python -m neat --log-level DEBUG read-simulator -c config_test6.yml -o ../outputs/test6_read-simulator
+
+      - name: Run NEAT Simulation for config_test7
+        run: cd config_template && conda activate test_neat && python -m neat --log-level DEBUG read-simulator -c config_test7.yml -o ../outputs/test7_read-simulator
+
+      - name: Run NEAT Simulation for config_test8
+        run: cd config_template && conda activate test_neat && python -m neat --log-level DEBUG read-simulator -c config_test8.yml -o ../outputs/test8_read-simulator
+
+      - name: Run NEAT Simulation for config_test9
+        run: cd config_template && conda activate test_neat && python -m neat --log-level DEBUG read-simulator -c config_test9.yml -o ../outputs/test9_read-simulator
+
+      - name: Run NEAT Simulation for config_test10
+        run: cd config_template && conda activate test_neat && python -m neat --log-level DEBUG read-simulator -c config_test10.yml -o ../outputs/test10_read-simulator
+
+      - name: Run NEAT Simulation for config_test11
+        run: cd config_template && conda activate test_neat && python -m neat --log-level DEBUG read-simulator -c config_test11.yml -o ../outputs/test11_read-simulator
diff --git a/config_template/config_test1.yml b/config_template/config_test1.yml
@@ -0,0 +1,177 @@
+# Test 1: Default parameters, H1N1 data
+
+## Template for gen_reads parallel
+## Any parameter that is not required but has a default value will use the
+## default value even if the variable is not included in the config. For
+## required items, they must be included in the config and they must be given a value.
+## All other items can be present or not. If present and the value is set to a single
+## period, the variable will be treated as though it had been omitted. Please do
+## not modify this template, but instead make a copy in your working directory. Done this
+## way, you can run without even needing to declare -c.
+
+# Absolute path to input reference fasta file
+# type = string | required: yes
+reference: ../data/H1N1.fa
+
+# How to partition the reference for analysis. By default, NEAT will
+# attempt to process one contig per thread. However, if you have very
+# large fasta files, you will see additional runtime benefit from choosing
+# the subdivision method, which will split the contigs up into equal sizes
+# for processing. If you need further speedups and have access to a distributed system
+# you can use a shell script wrapper around NEAT to split the fasta into
+# contigs, then join the results later. NEAT does not feature translocations, so
+# this will not affect NEAT's output. Note that subdivision will only activate for
+# number of threads > 1.
+# type = string | required: no | default = chrom | possible values: chrom, subdivision
+partition_mode: .
+
+# Read length of the reads in the fastq output. Only required if @produce_fastq is set to true
+# type = int | required: no | default = 101
+read_len: .
+
+# Number of threads to request for NEAT. The recommended amount is the number of chromosomes in
+# your input fasta plus 1.
+# type = int | required: no | default = 1
+threads: .
+
+# Average Coverage for the entire genome.
+# type = float | required: no | default = 10.0
+coverage: .
+
+# Absolute path to file with sequencing error model
+# type = string | required: no | default: <NEAT_DIR>/neat/models/defaults/default_error_model.pickle.gz
+error_model: .
+
+# Average sequencing error rate for the sequencing machine
+# type = float | required = no | must be between 0.0 and 0.3
+avg_seq_error: .
+
+# This scales the quality scores to match the desired average sequencing error rate
+# specified by avg_seq_error.
+# type: boolean | required = no | default = false
+rescale_qualities: .
+
+# This is the factor to add to the quality scores to get the ascii text version of the
+# score. The default follows the Sanger quality offset.
+# type: int | required = no | default = 33
+quality_offset: .
+
+# Desired ploidy
+# type = int | required = no | default = 2
+ploidy: .
+
+# Absolute path to vcf file containing variants that will always be included, regardless
+# of genotype and filter. You can pre-filter your vcf for these fields before inputting it
+# if this is not the desired behavior.
+# type: string | required = no
+input_variants: .
+
+# Absolute path to bed file containing reference regions that the simulation
+# should target.
+# type = string | required = no
+target_bed: .
+
+# Scalar value for coverage in regions outside the targeted bed. Example 0.5
+# would get you roughly half the coverage as the on target areas. Default is
+# 2% of total coverage in off-target regions.
+# type: float | required = no | default = 0.02
+off_target_scalar: .
+
+# Whether to discard areas outside the targeted bed region. By default, this is set
+# to false and NEAT will use a different model for off-target regions but still
+# include them in the final output.
+# TODO this may not be necessary
+# type: boolean | required = no | default = false
+discard_offtarget: .
+
+# Absolute path to bed file containing reference regions that the simulation
+# should discard.
+# type = string | required = no
+discard_bed: .
+
+# Absolute path to the mutation model pickle file. Omitting this value will cause
+# NEAT to use the default model, with some standard parameters, and generally uniform biases.
+# type: string | required = no
+mutation_model: .
+
+# Average mutation rate per base pair. Overall average is 0.001, or model default
+# Use this value to override the mutation rate of the default or input model.
+# type: float | required = no | must be between 0.0 and 0.3
+mutation_rate: .
+
+# Absolute path to a bed file with mutation rates by region.
+# Rates must be in the fourth column and be of the form "mut_rate=x.xx"
+# Rates must be between 0.00 and 0.03
+# type: string | required = no
+mutation_bed: .
+
+# Absolute path to GC content model generated by compute_gc.py
+# type: string | required = no | default: <NEAT_DIR>/neat/models/defaults/default_gc_bias_model.pickle.gz
+gc_model: .
+
+# Whether the output should be paired ended. For certain conditions (i.e., vcf only or
+# fasta only), this will be ignored. If this is true, then there must be an included fragment
+# length model output from runner.py or a mean and standard deviation
+# by declaring values for @fragment_mean and @fragment_st_dev.
+# type: boolean | required = no | default = false
+paired_ended: .
+
+# Absolute path to a pickle file containing the fragment length model output
+# from runner.py.
+# type: string | required = no | default: <NEAT_DIR>/neat/models/defaults/default_fraglen_model.pickle.gz
+fragment_model: .
+
+# Mean for the paired end fragment length. This only applies if paired-ended is set to true.
+# This number will form the mean for the sample distribution of the fragment lengths in the simulation
+# Note: This number is REQUIRED if paired_ended is set to true, unless a fragment length model is used.
+# type: float | required: no (unless paired-ended)
+fragment_mean: .
+
+# Standard deviation for the paired end fragment length. This only applies if paired-ended is set to true.
+# This number will form the standard deviation about the mean specified above for the sample distribution
+# of the fragment lengths in the simulation.
+# Note: This number is REQUIRED if paired_ended is set to true, unless a fragment length model is used.
+# type: float | required: no (unless paired-ended)
+fragment_st_dev: .
+
+# Whether to produce the golden bam file. This file will contain the reads
+# aligned with the exact region of the genome
+# type: boolean | required = no | default = false
+produce_bam: .
+
+# Whether to produce a vcf file containing all the mutation errors added
+# by NEAT.
+# type: boolean | required = no | default = false
+produce_vcf: .
+
+# Whether to output the mutated fasta. This will output a fasta file with mutations
+# inserted. It does not include sequencing errors or read information. Useful for
+# multigenerational mutations.
+# type: boolean | required = no | default = false
+produce_fasta: .
+
+# Whether to output the fastq(s) of the reads. This is the default output. NEAT
+# will produce 1 fastq for single ended reads or 2 fastqs for paired ended.
+# type: boolean | required = no | default = true
+produce_fastq: .
+
+# If set to true, this will ignore statistical models and force coverage to be
+# constant across the genome. This is considered a debugging feature.
+# type: boolean | required = no | default = false
+no_coverage_bias: .
+
+# Set an RNG seed value. Runs using identical RNG values should produce identical results
+# so things like read locations, variant positions, error positions, etc. should be the same.
+# Useful for debugging.
+# type: int | required = no
+rng_seed: .
+
+# Set an absolute minimum number of mutations. The program always adds at least 1 mutation.
+# Useful for very small datasets.
+# type: int | required = no
+min_mutations: .
+
+# Overwrite the output files, if they are named the same as the current run.
+# Default is to quit if files already exist to avoid data destruction
+# type: bool | required = no | default = false
+overwrite_output: true