Merge pull request #29 from nf-core/dev

PR for 1.2.0 Release
nf-core · Jan 21, 2019 · 8f2b0ae · 8f2b0ae
2 parents 84967b1 + 0ed1312
commit 8f2b0ae
Show file tree

Hide file tree

Showing 12 changed files with 713 additions and 94 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -13,7 +13,7 @@ before_install:
   # Pull the docker image first so the test doesn't wait for this
   - docker pull nfcore/mhcquant:dev
   # Fake the tag locally so that the pipeline runs properly
-  - docker tag nfcore/mhcquant:dev nfcore/mhcquant:1.1.0
+  - docker tag nfcore/mhcquant:dev nfcore/mhcquant:1.2.0
 
 install:
   # Install Nextflow

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,15 @@
 # nf-core/mhcquant: Changelog
 
+## v1.2.0 nf-core/mhcquant "Golden Eagle" - 2019/01/19
+
+### `Added`
+Subset FDR refinement option
+Fred2 dependency
+vcf parser and translation to proteins
+
+### `Fixed`
+Documentation
+
 ## v1.1.0 nf-core/mhcquant "Black Crow" - 2019/01/04
 
 ### `Added`

diff --git a/Dockerfile b/Dockerfile
@@ -6,5 +6,5 @@ COPY environment.yml /
 COPY environment-percolator.yml /
 RUN conda env create -f /environment.yml && conda clean -a
 RUN conda env create -f /environment-percolator.yml && conda clean -a
-ENV PATH /opt/conda/envs/nf-core-mhcquant-1.1.0/bin:$PATH
-ENV PATH /opt/conda/envs/nf-core-mhcquant-percolator-1.0dev/bin:$PATH
+ENV PATH /opt/conda/envs/nf-core-mhcquant-1.2.0/bin:$PATH
+ENV PATH /opt/conda/envs/nf-core-mhcquant-percolator-1.2.0/bin:$PATH
diff --git a/Singularity b/Singularity
@@ -3,10 +3,10 @@ Bootstrap:docker
 
 %labels
     DESCRIPTION Singularity image containing all requirements for the nf-core/mhcquant pipeline
-    VERSION 1.1.0
+    VERSION 1.2.0
 
 %environment
-    PATH=/opt/conda/envs/nf-core-mhcquant-1.1.0/bin:$PATH
+    PATH=/opt/conda/envs/nf-core-mhcquant-1.2.0/bin:$PATH
     PATH=/opt/conda/envs/nf-core-mhcquant-percolator-1.0dev/bin:$PATH
     export PATH
 

diff --git a/bin/mhcflurry_predict_mztab_for_filtering.py b/bin/mhcflurry_predict_mztab_for_filtering.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+from mhcflurry import Class1AffinityPredictor
+import sys
+
+#List of alleles supported by mhclurry
+supported_alleles="A*01:01,A*02:01,A*02:02,A*02:03,A*02:05,A*02:06,A*02:07,A*02:11,A*02:12,A*02:16,A*02:17,A*02:19,A*02:50,A*03:01,A*11:01,A*23:01,A*24:02,A*24:03,A*25:01,A*26:01,A*26:02,A*26:03,A*29:02,A*30:01,A*30:02,A*31:01,A*32:01,A*33:01,A*66:01,A*68:01,A*68:02,A*68:23,A*69:01,A*80:01,B*07:01,B*07:02,B*08:01,B*08:02,B*08:03,B*14:02,B*15:01,B*15:02,B*15:03,B*15:09,B*15:17,B*18:01,B*27:02,B*27:03,B*27:04,B*27:05,B*27:06,B*35:01,B*35:03,B*37:01,B*38:01,B*39:01,B*39:06,B*40:01,B*40:02,B*42:01,B*44:02,B*44:03,B*45:01,B*46:01,B*48:01,B*51:01,B*53:01,B*54:01,B*57:01,B*58:01,B*83:01,C*03:03,C*04:01,C*05:01,C*06:02,C*07:02,C*08:02,C*12:03,C*14:02,C*15:02".split(",")
+
+#read provided allotypes
+op=open(sys.argv[-4])
+alleles=op.read().split("\n")
+op.close()
+
+alleles=[a for a in alleles if a in supported_alleles]
+
+#read identified peptides with q-value < threshold
+mztab=open(sys.argv[-3])
+mztab_read=mztab.readlines()
+mztab.close()
+seqs=[l.split()[1] for l in mztab_read if l.startswith("PSM")]
+seqs_new_smaller_qval=list(set(seqs))
+
+#read all remaining peptides with q-value > threshold
+mztab=open(sys.argv[-2])
+mztab_read=mztab.readlines()
+mztab.close()
+seqs=[l.split()[1] for l in mztab_read if l.startswith("PSM") if l.split()[1] not in seqs_new_smaller_qval]
+seqs_new_greater_qval=list(set(seqs))
+seqs_new_greater_qval=[s for s in seqs_new_greater_qval if 7<len(s)<13]
+
+#call mhcflurry
+#subprocess.call("mhcflurry-predict --peptides {p} --alleles {a} --out {o}".format(p=" ".join(seqs_new), a=" ".join(alleles), o=sys.argv[-1]))
+seqs_filtered=[]
+for allele in alleles:
+   print allele
+   predictor = Class1AffinityPredictor.load()
+   df_pred=predictor.predict_to_dataframe(allele=allele, peptides=seqs_new_greater_qval)
+   seqs_filtered+=df_pred[df_pred['prediction']<=sys.argv[-5]]['peptide'].values.tolist()
+
+#merge sequence lists and append decoys
+seqs_new_all=list(set(seqs_new_smaller_qval+seqs_filtered))
+seqs_new_all=seqs_new_all+[s[::-1] for s in seqs_new_all]
+
+#write idXML for filtering
+op=open(sys.argv[-1],'w')
+op.write('<PeptideIdentification score_type="q-value" higher_score_better="false">' + '\n')
+for pep in seqs_new_all:
+	op.write('\t\t\t' + '<PeptideHit sequence="' + pep + '" score="0" charge="0" >' + '\n')
+	op.write('\t\t\t' + '</PeptideHit>' + '\n')
+op.write('</PeptideIdentification>' + '\n')
+op.close()
+
+
+
diff --git a/bin/variants2fasta.py b/bin/variants2fasta.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python
+import time
+import sys
+import argparse
+
+from Fred2.Core import Protein, Peptide, Allele, MutationSyntax, Variant
+from Fred2.Core.Variant import VariationType
+from Fred2.IO import read_lines, MartsAdapter, read_annovar_exonic
+from Fred2.EpitopePrediction import EpitopePredictorFactory
+from Fred2.Core import generate_peptides_from_proteins, generate_peptides_from_variants, generate_transcripts_from_variants, generate_proteins_from_transcripts
+from Fred2.IO.ADBAdapter import EIdentifierTypes, EAdapterFields
+from Fred2.IO.FileReader import read_vcf
+
+
+MARTDBURL = {"GRCH37": "http://grch37.ensembl.org/biomart/martservice?query=",
+             "GRCH38": "http://www.ensembl.org/biomart/martservice?query="} # is corrently set to GRCh38
+
+def read_variant_effect_predictor(file, gene_filter=None):
+    """
+    Reads a VCF (v4.1) file generatede by variant effect predictor and generates variant objects
+
+    :param str file: Path to vcf file
+    :param list gene_filter: List of proteins (in HGNC) of inerrest. Variants are filter according to this list
+    :return: list(Variant) - a list of Fred2.Core.Variant objects
+    """
+    vars = []
+    def get_type(ref, alt):
+        """
+            returns the variant type
+        """
+        if len(ref)==1 and len(alt)==1:
+            return VariationType.SNP
+        if len(ref)>0 and len(alt)==0:
+            if len(ref)%3 == 0:
+                return VariationType.DEL
+            else:
+                return VariationType.FSDEL
+        if len(ref) == 0 and len(alt)>0:
+            if len(alt)% 3 == 0:
+                return VariationType.INS
+            else:
+                return VariationType.FSINS
+        return VariationType.UNKNOWN
+
+    coding_types = set(["3_prime_UTR_variant", "5_prime_UTR_variant", "start_lost", "stop_gained",
+        "frameshift_variant", "start_lost", "inframe_insertion", "inframe_deletion", "missense_variant",
+        "protein_altering_variant", "splice_region_variant", "incomplete_terminal_codon_variant", "stop_retained_variant",
+        "synonymous_variant", "coding_sequence_variant"])
+
+    with open(file, "r") as f:
+        for i,l in enumerate(f):
+
+            #skip comments
+            if l.startswith("#") or l.strip() == "":
+                continue
+
+            chrom, gene_pos,var_id,ref,alt,_,filter_flag,info= l.strip().split("\t")[:8]
+            coding = {}
+            isSynonymous = False
+
+            for co in info.split(","):
+                #Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|TSL|APPRIS|SIFT|PolyPhen|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE">
+                _,var_type,_,gene,_,transcript_type,transcript_id,_,_,_,_,_,_,transcript_pos,prot_pos,aa_mutation = co.strip().split("|")[:16]
+                HGNC_ID=co.strip().split("|")[22]
+
+                #pass every other feature type except Transcript (RegulatoryFeature, MotifFeature.)
+                #pass genes that are uninterresting for us
+                if transcript_type != "Transcript" or (HGNC_ID not in gene_filter and gene_filter):
+                    continue
+
+                #pass all intronic and other mutations that do not directly influence the protein sequence
+                if any(t in coding_types for t in var_type.split("&")):
+                    #generate mutation syntax
+
+                    #positioning in Fred2 is 0-based!!!
+                    if transcript_pos != "":
+                        coding[transcript_id] = MutationSyntax(transcript_id, int(transcript_pos)-1, 
+                            -1 if prot_pos  == "" else int(prot_pos)-1, co, "", geneID=HGNC_ID)
+
+                #is variant synonymous?
+                isSynonymous = any(t == "synonymous_variant" for t in var_type.split("&"))
+            if coding:
+                vars.append(Variant(var_id, get_type(ref, alt), chrom, int(gene_pos), ref.upper(), alt.upper(), coding, False, isSynonymous))
+    return vars
+
+
+def main():
+
+    model = argparse.ArgumentParser(description='Neoepitope protein fasta generation from variant vcf')
+
+    model.add_argument(
+        '-v', '--vcf',
+        type=str,
+        default=None,
+        help='Path to the vcf input file'
+        )
+
+    model.add_argument(
+        '-t', '--type',
+        type=str,
+        choices=["VEP", "ANNOVAR", "SNPEFF"],
+        default="VEP",
+        help='Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)'
+        )
+
+    model.add_argument(
+        '-f', '--fasta_ref',
+        type=str,
+        default=None,
+        help='Path to the fasta input file'
+        )
+
+    model.add_argument(
+        '-p','--proteins',
+        type=str,
+        default=None,
+        help='Path to the protein ID input file (in HGNC-ID)'
+        )
+
+    model.add_argument(
+        '-r' ,'--reference',
+        type=str,
+        default='GRCh38',
+        help='The reference genome used for varinat annotation and calling.'
+        )
+
+    model.add_argument(
+        '-fINDEL' ,'--filterINDEL',
+        action="store_true",
+        help='Filter insertions and deletions (including frameshifts)'
+        )
+
+    model.add_argument(
+        '-fFS' ,'--filterFSINDEL',
+        action="store_true",
+        help='Filter frameshift INDELs'
+        )
+
+    model.add_argument(
+        '-fSNP' ,'--filterSNP',
+        action="store_true",
+        help='Filter SNPs'
+        )
+
+    model.add_argument(
+        '-o','--output',
+        type=str,
+        required=True,
+        help='Path to the output file'
+        )
+
+
+    args = model.parse_args()
+
+    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])
+
+
+    if args.vcf is None:
+        sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n")
+        return -1
+
+    # if vcf file is given: generate variants and filter them if HGNC IDs ar given
+    if args.vcf is not None:
+        protein_ids = []
+        if args.proteins is not None:
+            with open(args.proteins, "r") as f:
+                for l in f:
+                    l = l.strip()
+                    if l != "":
+                        protein_ids.append(l)
+
+        if args.type == "VEP":
+            variants = read_variant_effect_predictor(args.vcf, gene_filter=protein_ids)
+
+        elif args.type == "SNPEFF":
+            variants = read_vcf(args.vcf)[0]
+
+        else:
+            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)
+
+
+        if args.filterSNP:
+            variants = filter(lambda x: x.type != VariationType.SNP, variants)
+
+        if args.filterINDEL:
+            variants = filter(lambda x: x.type not in [VariationType.INS,
+                                                       VariationType.DEL,
+                                                       VariationType.FSDEL,
+                                                       VariationType.FSINS], variants)
+
+        if args.filterFSINDEL:
+            variants = filter(lambda x: x.type not in [VariationType.FSDEL, VariationType.FSINS], variants)
+
+        if not variants:
+            sys.stderr.write("No variants left after filtering. Please refine your filtering criteria.\n")
+            return -1
+
+        #generate transcripts
+        transcripts = generate_transcripts_from_variants(variants, martDB, EIdentifierTypes.ENSEMBL)
+
+        #generate proteins
+        proteins = generate_proteins_from_transcripts(transcripts)
+
+        print proteins
+
+        #write fasta file
+        with open(args.output.replace('.fasta','_vcf.fasta'), "w") as f:
+            for p in proteins:
+                f.write('>' + str(p.transcript_id) + '|' + str(p.vars) + '_var_' + '\n')
+                f.write(str(p)+ '\n')
+
+
+        #concatenate fasta file with fasta reference
+        op=open(args.output.replace('.fasta','_vcf.fasta'))
+        opr1=op.read()
+        op.close()
+
+        op=open(args.fasta_ref)
+        opr2=op.read()
+        op.close()
+
+        concat=opr1+'\n'+opr2
+
+        op=open(args.output,'w')
+        op.write(concat)
+        op.close()
+
+    else:
+        sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n")
+        return -1              
+
+    return 0
+
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/conf/test.config b/conf/test.config
@@ -15,4 +15,5 @@ params {
   fasta = ['https://github.com/nf-core/test-datasets/raw/mhcquant/test.fasta'] 
   mzmls = ['https://github.com/nf-core/test-datasets/raw/mhcquant/MM15_Melanom_1_A_1.mzML', 'https://github.com/nf-core/test-datasets/raw/mhcquant/MM15_Melanom_1_B_1.mzML', 'https://github.com/nf-core/test-datasets/raw/mhcquant/MM15_Melanom_2_A_1.mzML', 'https://github.com/nf-core/test-datasets/raw/mhcquant/MM15_Melanom_2_B_1.mzML']
   alleles = ['https://github.com/nf-core/test-datasets/raw/mhcquant/alleles.tsv']
+  vcf = ['https://github.com/nf-core/test-datasets/raw/mhcquant/GRIAFFLKY_snpeff.vcf']
 }