Allow for consensus files as input

aknijn · Apr 12, 2021 · 1e2cc15 · 1e2cc15
1 parent df77f5e
commit 1e2cc15
Show file tree

Hide file tree

Showing 9 changed files with 87 additions and 53 deletions.
diff --git a/RECoVG/RECoVG.py b/RECoVG/RECoVG.py
@@ -56,16 +56,15 @@ def __main__():
     # Sanger
     elif args.library=='sang':
         # ALIGN SARS-COV-2 GENOME
-        if os.stat(args.input1).st_size < 5120:
-            # SPIKE
-            subprocess.call("bowtie2 -p ${GALAXY_SLOTS:-4} -x '" + TOOL_DIR + "/data/genome_Spike' -f -U '" + args.input1 + "' --very-sensitive | samtools sort -@${GALAXY_SLOTS:-2} -O bam -o " + args.covidref_aligned, shell=True)
-        else:
-            # CONSENSUS
-            subprocess.call("minimap2 -t ${GALAXY_SLOTS:-4} " + TOOL_DIR + "/data/genome.fa '" + args.input1 + "' -a | samtools sort -@${GALAXY_SLOTS:-2} -O bam -o " + args.covidref_aligned, shell=True)
+        subprocess.call("bowtie2 -p ${GALAXY_SLOTS:-4} -x '" + TOOL_DIR + "/data/genome_Spike' -f -U '" + args.input1 + "' --very-sensitive | samtools sort -@${GALAXY_SLOTS:-2} -O bam -o " + args.covidref_aligned, shell=True)
+    # Consensus
+    elif args.library=='cons':
+        # ALIGN SARS-COV-2 GENOME
+        subprocess.call("minimap2 -t ${GALAXY_SLOTS:-4} " + TOOL_DIR + "/data/genome.fa '" + args.input1 + "' -a | samtools sort -@${GALAXY_SLOTS:-2} -O bam -o " + args.covidref_aligned, shell=True)
 
 
     # COPY CORRESPONDING REFERENCE
-    if args.library=='sang' and os.stat(args.input1).st_size < 5120:
+    if args.library=='sang':
         shutil.copy(TOOL_DIR + "/data/CovidREF_Spike.gbk", args.reference_genbank)
         shutil.copy(TOOL_DIR + "/data/genome_Spike.fa", args.reference_fasta)
     else:

diff --git a/RECoVG/data/CovidREF.gbk b/RECoVG/data/CovidREF.gbk
@@ -2,7 +2,7 @@ LOCUS       NC_045512              29903 bp ss-RNA     linear   VRL 13-MAR-2020
 DEFINITION  Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1,
             complete genome.
 ACCESSION   NC_045512
-VERSION     NC_045512.2
+VERSION     NC_045512
 DBLINK      BioProject: PRJNA485481
 KEYWORDS    RefSeq.
 SOURCE      Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)
@@ -205,6 +205,8 @@ FEATURES             Location/Qualifiers
                      /gene="orf1ab nsp12"
      CDS             join(13442..13468,13468..16236)
                      /gene="orf1ab nsp12"
+                     /ribosomal_slippage
+                     /codon_start=1
                      /product="orf1ab nsp12"
                      /translation="SADAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAG
                      FAKFLKTNCCRFQEKDEDDNLIDSYFVVKRHTFSNYQHEETIYNLLKDCPAVAKHDFF

diff --git a/RECoVG/data/ProteineNt_Covid19.fasta b/RECoVG/data/ProteineNt_Covid19.fasta
diff --git a/RECoVG/recovg.xml b/RECoVG/recovg.xml
@@ -11,10 +11,15 @@
     </stdio>
     <command>
 <![CDATA[
+    #import os
     #if $token == "D85SteptkQZ9MobOk9pa":
         #if str( $input_pc.forward.name ) == "dummy.fastq" or str( $input_pc.reverse.name ) == "dummy.fastq":
           #if str( $input_pc.forward.name ).endswith(".fasta") or str( $input_pc.reverse.name ).endswith(".fasta"):
-            #set $library = "sang"
+		    #if os.stat( str($input_pc.forward) ).st_size < 5120 and os.stat( str($input_pc.reverse) ).st_size < 5120:
+              #set $library = "sang"
+			#else:
+              #set $library = "cons"
+			#end if
           #else:
             #if str( $input_pc.forward.name ).endswith(".fast5") or str( $input_pc.reverse.name ).endswith(".fast5"):
               #set $library = "nano"
@@ -28,10 +33,10 @@
         echo $library > $librarytype &&
         echo $strain > $strainname &&
 		touch $uploaded_fasta &&
-        #if str( $input_pc.forward.name ) == "dummy.fastq" and $library == "sang":
+        #if str( $input_pc.forward.name ) == "dummy.fastq" and str( $input_pc.reverse.name ).endswith(".fasta"):
 		  cp '${input_pc.reverse}' $uploaded_fasta &&
 		#else:
-		  #if str( $input_pc.reverse.name ) == "dummy.fastq" and $library == "sang":
+		  #if str( $input_pc.reverse.name ) == "dummy.fastq" and str( $input_pc.forward.name ).endswith(".fasta"):
             cp '${input_pc.forward}' $uploaded_fasta &&
 		  #end if
 		#end if
@@ -52,10 +57,11 @@
         <param name="token" type="text" label="execution token" />
         <param name="strain" type="text" label="Set output FASTA ID with Strain name" />
         <param name="library" type="select" label="What kind of sequences are you using?">
-            <option value="iont">Ion torrent</option>
+            <option value="iont">Ion Torrent</option>
             <option value="illu">Illumina</option>
             <option value="nano">Nanopore</option>
-            <option value="nano">Sanger</option>
+            <option value="sang">Sanger</option>
+            <option value="cons">Consensus</option>
 		</param>
         <param name="input_pc" type="data_collection"  format="fastqsanger" collection_type="paired" label="Paired-end FASTQ collection" help="Must be of datatype &quot;fastqsanger&quot;" optional="false" />
 	</inputs>

diff --git a/RECoVJ/RECoVJ.py b/RECoVJ/RECoVJ.py
@@ -123,6 +123,17 @@ def main():
         report_data["information_name"] = args.strain
         report_data["region"] = args.region
         report_data["year"] = args.year
+        library = open(args.librarytype).readline().rstrip()
+        if library == 'iont':
+            report_data["sequence"] = "Ion Torrent"
+        elif library == 'illu':
+            report_data["sequence"] = "Illumina"
+        elif library == 'nano':
+            report_data["sequence"] = "Nanopore"
+        elif library == 'sang':
+            report_data["sequence"] = "Sanger"
+        elif library == 'cons':
+            report_data["sequence"] = "Consensus"
         with open(args.lineage) as table_in:
             tab_lineage = [[str(col).rstrip() for col in row.split(',')] for row in table_in]
         report_data["lineage"] = tab_lineage[1][1] + " (" + tab_lineage[1][2] + ")"
@@ -136,7 +147,7 @@ def main():
             report_data["qc_status"] = 'Passed'
         with open(args.variants) as table_in:
             tab_variants = [[str(col).rstrip() for col in row.split('\t')] for row in table_in]
-        if open(args.librarytype).readline().rstrip() == 'sang':
+        if library == 'sang':
             strDefault = "ND"
         else:
             strDefault = "="

diff --git a/tools/ivar_covid_consensus.xml b/tools/ivar_covid_consensus.xml
@@ -12,8 +12,8 @@
         #set $idname = $subprocess.getoutput('cat ' + str($strainname))
         #set $library = $subprocess.getoutput('cat ' + str($librarytype))
         #set $clean_name = re.sub('[^\w\-]', '_', str($idname))
-        #if str( $library ) == "sang":
-            cp $uploaded_fasta consensus.fa
+        #if str( $library ) == "sang" or str( $library ) == "cons":
+            cat $uploaded_fasta > consensus.fa &&
 		#else:
             ln -s '$input_bam' sorted.bam &&
             samtools mpileup -A -d 0 -Q 0 sorted.bam | ivar consensus -p consensus -t 0.2 -n N -q 20 -m 30 &&

diff --git a/tools/ivar_covid_variants.xml b/tools/ivar_covid_variants.xml
@@ -15,7 +15,7 @@
         -p variants
         -q 20
         -t 0.5 &&
-        #if str( $library ) == "sang":
+        #if str( $library ) == "sang" or str( $library ) == "cons":
 		  mv variants.tsv $output_variants
 		#else:
 		  awk -F '\t' '$12>9 { print }' variants.tsv > $output_variants

diff --git a/tools/remove_aa_artifact.py b/tools/remove_aa_artifact.py
@@ -21,13 +21,11 @@
     'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W'}
 
 def getAABase(AA):
-    if len(AA) <= 1:
-        AABase = '-'
-    elif AA[-1].isnumeric():
-        AABase = AA
-    else:
-        AABase = AA[:-1]
-    return AABase
+    AABase=''
+    for x in AA:
+        if x.isdigit():
+            AABase+=x
+    return int(AABase)
 
 def __main__():
     parser = argparse.ArgumentParser()
@@ -40,26 +38,47 @@ def __main__():
     out_file = open(args.outtab,'w')
     out_file.write('\t'.join(read_csv[0])+'\n')
 
+    read_csv2=[]
+    read_csv2.append(read_csv[0])
+    ff=1
+    while ff<len(read_csv[:-1]):
+        if read_csv[ff][7]=='':
+            read_csv2.append(read_csv[ff])
+            ff += 1
+        if read_csv[ff][7] != '' and read_csv[ff-1][7]=='':
+            read_csv2.append(read_csv[ff])
+            ff += 1
+        if read_csv[ff-1][5].find('DELETION')!=-1:
+            n1 = getAABase(read_csv[ff][7])
+            n2 = getAABase(read_csv[ff - 1][7])
+            if n1==n2:
+                ff+=1
+            else:
+                read_csv2.append(read_csv[ff])
+                ff+=1
+        else:
+            read_csv2.append(read_csv[ff])
+            ff+=1
+    read_csv2.append(read_csv[ff])
+
     index=1
-    while index<len(read_csv[:-1]):
-        aa=getAABase(read_csv[index][7])
+    while index<len(read_csv2[:-1]):
+        aa=read_csv2[index][7][:-1]
         codone=[]
         riga=''
         non_trovato=0
-        if aa != '-':
-            if aa==getAABase(read_csv[index+1][7]):
-                codone.append(read_csv[index])
-                codone.append(read_csv[index+1])
-                index += 1
-                non_trovato+=1
-
-            if aa==getAABase(read_csv[index+1][7]):
-                codone.append(read_csv[index+1])
-                index += 1
-                non_trovato+=1
 
+        if aa==read_csv2[index+1][7][:-1]:
+            codone.append(read_csv2[index])
+            codone.append(read_csv2[index+1])
+            index += 1
+            non_trovato+=1
+        if aa==read_csv2[index+1][7][:-1]:
+            codone.append(read_csv2[index+1])
+            index += 1
+            non_trovato+=1
         if non_trovato==0:
-            out_file.write('\t'.join(read_csv[index])+'\n')
+            out_file.write('\t'.join(read_csv2[index])+'\n')
         if non_trovato==1:
             riga += codone[0][0] + '\t' + codone[0][1]+ '\t'
             cod=[]
@@ -68,7 +87,6 @@ def __main__():
             codon=''
             for x in range(len(codone)):
                 cod.append(codone[x][6])
-
             for x in range(0, 7):
                 if x<3:
                     if cod[0][x].isupper() and cod[1][x].islower():
@@ -95,15 +113,13 @@ def __main__():
                 mutations+='SYNONYMOUS_CODING'
             else:
                 mutations+='NON_SYNONYMOUS_CODING'
-
             riga+=cod1+'\t'+cod2+'\t'+codone[0][0]+'\t'+mutations+'\t'+codon+'\t'+gencode.get(codon[0:-4].upper())+codone[0][7][1:-1]+gencode.get(codon[4:].upper())+'\n'
             out_file.write(riga)
         if non_trovato==2:
             newcodon=codone[0][6][4]+codone[1][6][5]+codone[2][6][6]
-            riga+=codone[0][0]+'\t'+codone[0][1]+'\t'+read_csv[index][6][0:3].upper()+'\t'+newcodon+'\t'+codone[0][0]+'\t'+codone[0][5]+'\t'+read_csv[index][6][0:3].upper()+'/'+newcodon+'\t'+codone[0][7][:-1]+gencode.get(newcodon)+'\n'
+            riga+=codone[0][0]+'\t'+codone[0][1]+'\t'+read_csv2[index][6][0:3].upper()+'\t'+newcodon+'\t'+codone[0][0]+'\t'+codone[0][5]+'\t'+read_csv2[index][6][0:3].upper()+'/'+newcodon+'\t'+codone[0][7][:-1]+gencode.get(newcodon)+'\n'
             out_file.write(riga)
         index+=1
-
     if len(read_csv) > 0:
         out_file.write('\t'.join(read_csv[len(read_csv)-1])+'\n')
     out_file.close

diff --git a/tools/remove_aa_artifact.xml b/tools/remove_aa_artifact.xml
@@ -2,7 +2,7 @@
     <description>Transforms SNVs on the same codon in MNVs</description>
     <command detect_errors="exit_code">
 <![CDATA[
-	awk -F '\t' '$6!="N" && $4!="N" { print }' $input_tab > clean_input_tab &&
+	awk -F '\t' '$6!="N" && $4!="N" && $1!="" { print }' $input_tab > clean_input_tab &&
     python $__tool_directory__/remove_aa_artifact.py -i clean_input_tab -o $output_tab
 ]]>
     </command>