This repository has been archived by the owner on Mar 28, 2023. It is now read-only.

add sompy app and indexing vcfs #5

Merged (4 commits, Jun 1, 2022)
9 changes: 6 additions & 3 deletions README.md
@@ -1,7 +1,7 @@
# tso500_output_parser_v1.1
# tso500_output_parser_v1.2

## What does this app do?
This app takes a jobid from the TSO500 docker app and sets off additional dx run commands including fastqc, coverage calculations and multiqc.
This app takes a jobid from the TSO500 docker app and sets off additional dx run commands including fastqc, coverage calculations, sompy (for HD200 commercial control samples) and multiqc.

## What are typical use cases for this app?
This app runs after the applet TSO500app_v1.1. It is used to set off QC steps for specific files output by this job using existing applets (which are set up to process one file at a time).
@@ -13,20 +13,23 @@ This app runs after the applet TSO500app_v1.1. It is used to set off QC step
* coverage_bedfile_id - the id of the BED file used for coverage project-abc:file123
* coverage_app_id - the id of the coverage app used in format project-abc:applet123
* fastqc_app_id - the id of the fastqc app used in format project-abc:applet123
* sompy_app_id - the id of the sompy app used in format project-abc:applet123
* multiqc_app_id - the id of the multiqc app used in format project-abc:applet123
* upload_multiqc_app_id - the id of the upload_multiqc app used in format project-abc:applet123
* coverage_commands - any extra commands for coverage (eg -iadditional_sambamba_flags and -iadditional_filter_commands)
* coverage_level - the required read depth to be used for the coverage calculation (string)
* multiqc_coverage_level - the required read depth to be used in multiQC (string)
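
As an illustration, a complete invocation of the app might look like the sketch below. This is hypothetical: every project, applet and file ID is a placeholder, not a real DNAnexus object.

```shell
# Hypothetical dx run invocation (all IDs and values below are placeholders)
parser_cmd="dx run project-abc:applet-tso500parser \
 -iproject_name=003_example_run \
 -iproject_id=project-abc \
 -itso500_jobid=job-xyz \
 -icoverage_bedfile_id=project-abc:file123 \
 -icoverage_app_id=project-abc:applet123 \
 -ifastqc_app_id=project-abc:applet124 \
 -isompy_app_id=project-abc:applet125 \
 -imultiqc_app_id=project-abc:applet126 \
 -iupload_multiqc_app_id=project-abc:applet127 \
 -icoverage_level=150 \
 -imultiqc_coverage_level=100"
echo "$parser_cmd"
```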

## How does this app work?
The app takes the job id from the TSO500 applet and parses the `fastqs` and `bams_for_coverage` outputs.
The app takes the job id from the TSO500 applet and parses the `fastqs`, `bams_for_coverage` and `results_vcfs` outputs.
dx describe functions are called on each output and used to extract the fileids and build dx run commands.
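
This extraction step can be sketched as follows. The sketch is self-contained: the JSON shape (objects with `id`, `name` and `project` fields) is assumed, and the live `dx describe --json --multi` output is replaced with a static string.

```shell
# Simulated `dx describe --json --multi <jobid>:results_vcfs` output
# (field shape assumed; the real script pipes in the live dx output)
describe_json='[{"id":"file-111","name":"sampleA_MergedSmallVariants.genome.vcf","project":"project-abc"},{"id":"file-222","name":"sampleA_SmallVariants.vcf","project":"project-abc"}]'

# collapse each object onto one line and keep only the merged genome VCFs
vcfs_array=$(printf '%s' "$describe_json" | jq -c '.[]' | grep genome.vcf)

for vcf_json in $vcfs_array; do
    # pull the file id and filename out of each one-line JSON object
    fileid=$(jq -r '.id' <<< "$vcf_json")
    filename=$(jq -r '.name' <<< "$vcf_json")
    echo "would build a dx run command for $filename ($fileid)"
done
```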

The fastqc app is run for each fastq.gz file in the output and the jobid from the resulting job captured.

The sambamba_chanjo coverage app is run for each pair of bam/bai files in the bams_for_coverage output.

The sompy app is run for any sample with "HD200" in the sample name, comparing it with the HD200 known variants to generate recall statistics.
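
The control detection itself is a simple pattern match on the sample name (the filename below is a made-up example):

```shell
# Made-up example filename; the real name comes from the parsed dx describe output
filename="HD200_ctrl_S1_MergedSmallVariants.genome.vcf"
if [[ "$filename" =~ "HD200" ]]; then
    echo "HD200 control detected: a sompy job would be launched for $filename"
fi
```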

Finally, a dx run command for MultiQC is built, with the fastqc jobids used in the --depends-on flag to delay the start until all fastqc jobs have finished successfully.

These commands are built using arguments and inputs provided to this app.
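
The delayed-start pattern above can be sketched as follows. The job ids and applet name are placeholders; in the real script each id is captured from `dx run --brief` as the upstream jobs are launched.

```shell
# Placeholder job ids stand in for the captured fastqc/sompy job ids
depends_list=""
for jobid in job-001 job-002 job-003; do
    # each upstream job is appended as a -d (depends-on) flag
    depends_list="${depends_list} -d ${jobid}"
done

# MultiQC starts only after every job named with -d has finished successfully
multiqc_cmd="dx run applet-multiqc --detach -y --brief${depends_list} -iproject_for_multiqc=003_example_run"
echo "$multiqc_cmd"
```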
25 changes: 21 additions & 4 deletions dxapp.json
@@ -1,11 +1,11 @@
{
"name": "tso500_output_parser_v1.1",
"title": "tso500_output_parser_v1.1",
"name": "tso500_output_parser_v2.0",
"title": "tso500_output_parser_v2.0",
"summary": "Parse the output of the TSO500 app and set off jobs",
"tags": [
],
"properties": {
"github release": "v1.1"
"github release": "v2.0"
},
"dxapi": "1.0.0",
"inputSpec": [
@@ -46,6 +46,12 @@
"class": "string"
},
{
"name": "sompy_app_id",
"label": "sompy_app_id",
"help": "the id of the sompy applet in format project-abc:applet123",
"class": "string"
},
{
"name": "multiqc_app_id",
"label": "multiqc_app_id",
"help": "the id of the multiqc applet in format project-abc:applet123",
@@ -86,13 +92,24 @@
"help": "log and dx run commands files",
"class": "array:file",
"optional": true
},
{
"name": "vcf_index",
"label": "indexed vcfs",
"help": "Indexed vcf files (vcf.gz + vcf.gz.tbi)",
"class": "array:file",
"optional": true
}
],
"runSpec": {
"execDepends": [
{
"name": "tabix",
"version": "1.10.2-3"
}
],
"distribution" : "Ubuntu",
"release":"16.04",
"release":"20.04",
"version": "0",
"file": "src/code.sh",
"interpreter": "bash",
45 changes: 43 additions & 2 deletions src/code.sh
@@ -4,13 +4,15 @@
set -e -x -o pipefail

# make output folder
mkdir -p /home/dnanexus/out/logfiles/logfiles/
mkdir -p /home/dnanexus/out/logfiles/logfiles/ /home/dnanexus/out/vcf_index/vcf_index
# install tabix (needed for indexing the vcfs); provided via the tabix execDepends entry in dxapp.json
# sudo apt-get install tabix

# Store the API key. Grants the script access to DNAnexus resources
API_KEY=$(dx cat project-FQqXfYQ0Z0gqx7XG9Z2b4K43:mokaguys_nexus_auth_key)

# print the arguments to the app to the logfile
printf "projectname $project_name\nprojectid $project_id\ntso500_jobid $tso500_jobid\ncoverage_bedfile_id $coverage_bedfile_id\ncoverage_app_id $coverage_app_id\nfastqc_app_id $fastqc_app_id\nmultiqc_app_id $multiqc_app_id\n" >> /home/dnanexus/out/logfiles/logfiles/$project_name.output_parser.log
printf "projectname $project_name\nprojectid $project_id\ntso500_jobid $tso500_jobid\ncoverage_bedfile_id $coverage_bedfile_id\ncoverage_app_id $coverage_app_id\nfastqc_app_id $fastqc_app_id\nsompy_app_id $sompy_app_id\nmultiqc_app_id $multiqc_app_id\n" >> /home/dnanexus/out/logfiles/logfiles/$project_name.output_parser.log

#Add a cautionary note to the dx_run_cmds.sh
printf "#note whilst these are the dx run commands used this script does not capture the jobids required to delay the multiqc app\n" >> /home/dnanexus/out/logfiles/logfiles/$project_name.dx_run_cmds.sh
@@ -78,6 +80,45 @@ for bai in $bai_array
$dx_run_cmd
done

### run sompy on HD200 sample if present
printf "\nVCFs to be searched for HD200 sample and sompy run if present\nVCF indexes created at this stage\n" >> /home/dnanexus/out/logfiles/logfiles/$project_name.output_parser.log
# create an array of results vcf names
# parse the dx describe output for all outputs in the results_vcfs output
# collapse the json for each output into a single line, filter for genome.vcf (the merged small variants vcf file) and return the file names
vcfs_array=$(dx describe --json --multi $tso500_jobid:results_vcfs | jq -c '.[]' | grep genome.vcf)
#loop through vcfs to find control and build dx run command for sompy
for genome_vcf in $vcfs_array
do
# for each input (a json) return the id field and filename
fileid=$(jq -r '.id' <<< $genome_vcf)
filename=$(jq -r '.name' <<< $genome_vcf)

if [[ "$filename" =~ .*"HD200".* ]];
then
# build sompy command using the provided appid
sompy_command="dx run $sompy_app_id --detach -y --brief --name=$filename -itruthVCF=project-ByfFPz00jy1fk6PjpZ95F27J:file-G7g9Pfj0jy1f87k1J1qqX83X -iqueryVCF=$fileid -iTSO=true -iskip=false --dest=$project_name:/ --auth-token $API_KEY"
# write cmd to file and to stdout
echo $sompy_command
echo "jobid=\$($sompy_command)" >> /home/dnanexus/out/logfiles/logfiles/$project_name.dx_run_cmds.sh
#execute the command and capture jobid to delay multiqc
jobid=$($sompy_command)
depends_list="${depends_list} -d ${jobid}"
fi

# create indexed vcfs
# use project and file ids to download the vcf
projectid=$(jq -r '.project' <<< $genome_vcf)
vcf_id=$projectid:$fileid
dx download $vcf_id
# create path for bgzipped vcf
echo "creating indexed vcf for file name $filename file ID $vcf_id"
gzip_vcf_path=/home/dnanexus/out/vcf_index/vcf_index/$filename.gz
bgzip -c $filename > $gzip_vcf_path
cd /home/dnanexus/out/vcf_index/vcf_index
tabix -p vcf $filename.gz
cd ~
done

# create dx run command for multiqc - giving depends on list, echo it to file and execute
multiqc_cmd="dx run $multiqc_app_id --detach -y --brief $depends_list -iproject_for_multiqc=$project_name -icoverage_level=$multiqc_coverage_level --dest=$project_id:/ --auth-token $API_KEY"
echo $multiqc_cmd >> /home/dnanexus/out/logfiles/logfiles/$project_name.dx_run_cmds.sh