
Commit

Merge pull request #29 from nf-core/dev
PR for 1.2.0 Release
apeltzer authored Jan 21, 2019
2 parents 84967b1 + 0ed1312 commit 8f2b0ae
Showing 12 changed files with 713 additions and 94 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -13,7 +13,7 @@ before_install:
# Pull the docker image first so the test doesn't wait for this
- docker pull nfcore/mhcquant:dev
# Fake the tag locally so that the pipeline runs properly
- docker tag nfcore/mhcquant:dev nfcore/mhcquant:1.1.0
- docker tag nfcore/mhcquant:dev nfcore/mhcquant:1.2.0

install:
# Install Nextflow
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,15 @@
# nf-core/mhcquant: Changelog

## v1.2.0 nf-core/mhcquant "Golden Eagle" - 2019/01/19

### `Added`
Subset FDR refinement option
Fred2 dependency
vcf parser and translation to proteins

### `Fixed`
Documentation

## v1.1.0 nf-core/mhcquant "Black Crow" - 2019/01/04

### `Added`
4 changes: 2 additions & 2 deletions Dockerfile
@@ -6,5 +6,5 @@ COPY environment.yml /
COPY environment-percolator.yml /
RUN conda env create -f /environment.yml && conda clean -a
RUN conda env create -f /environment-percolator.yml && conda clean -a
ENV PATH /opt/conda/envs/nf-core-mhcquant-1.1.0/bin:$PATH
ENV PATH /opt/conda/envs/nf-core-mhcquant-percolator-1.0dev/bin:$PATH
ENV PATH /opt/conda/envs/nf-core-mhcquant-1.2.0/bin:$PATH
ENV PATH /opt/conda/envs/nf-core-mhcquant-percolator-1.2.0/bin:$PATH
4 changes: 2 additions & 2 deletions Singularity
@@ -3,10 +3,10 @@ Bootstrap:docker

%labels
DESCRIPTION Singularity image containing all requirements for the nf-core/mhcquant pipeline
VERSION 1.1.0
VERSION 1.2.0

%environment
PATH=/opt/conda/envs/nf-core-mhcquant-1.1.0/bin:$PATH
PATH=/opt/conda/envs/nf-core-mhcquant-1.2.0/bin:$PATH
PATH=/opt/conda/envs/nf-core-mhcquant-percolator-1.0dev/bin:$PATH
export PATH

53 changes: 53 additions & 0 deletions bin/mhcflurry_predict_mztab_for_filtering.py
@@ -0,0 +1,53 @@
#!/usr/bin/env python
from mhcflurry import Class1AffinityPredictor
import sys

#List of alleles supported by mhcflurry
supported_alleles="A*01:01,A*02:01,A*02:02,A*02:03,A*02:05,A*02:06,A*02:07,A*02:11,A*02:12,A*02:16,A*02:17,A*02:19,A*02:50,A*03:01,A*11:01,A*23:01,A*24:02,A*24:03,A*25:01,A*26:01,A*26:02,A*26:03,A*29:02,A*30:01,A*30:02,A*31:01,A*32:01,A*33:01,A*66:01,A*68:01,A*68:02,A*68:23,A*69:01,A*80:01,B*07:01,B*07:02,B*08:01,B*08:02,B*08:03,B*14:02,B*15:01,B*15:02,B*15:03,B*15:09,B*15:17,B*18:01,B*27:02,B*27:03,B*27:04,B*27:05,B*27:06,B*35:01,B*35:03,B*37:01,B*38:01,B*39:01,B*39:06,B*40:01,B*40:02,B*42:01,B*44:02,B*44:03,B*45:01,B*46:01,B*48:01,B*51:01,B*53:01,B*54:01,B*57:01,B*58:01,B*83:01,C*03:03,C*04:01,C*05:01,C*06:02,C*07:02,C*08:02,C*12:03,C*14:02,C*15:02".split(",")

#read provided allotypes
op=open(sys.argv[-4])
alleles=op.read().split("\n")
op.close()

alleles=[a for a in alleles if a in supported_alleles]

#read identified peptides with q-value < threshold
mztab=open(sys.argv[-3])
mztab_read=mztab.readlines()
mztab.close()
seqs=[l.split()[1] for l in mztab_read if l.startswith("PSM")]
seqs_new_smaller_qval=list(set(seqs))

#read all remaining peptides with q-value > threshold
mztab=open(sys.argv[-2])
mztab_read=mztab.readlines()
mztab.close()
seqs=[l.split()[1] for l in mztab_read if l.startswith("PSM") if l.split()[1] not in seqs_new_smaller_qval]
seqs_new_greater_qval=list(set(seqs))
seqs_new_greater_qval=[s for s in seqs_new_greater_qval if 7<len(s)<13]

#call mhcflurry
#subprocess.call("mhcflurry-predict --peptides {p} --alleles {a} --out {o}".format(p=" ".join(seqs_new), a=" ".join(alleles), o=sys.argv[-1]))
seqs_filtered=[]
for allele in alleles:
    print allele
    predictor = Class1AffinityPredictor.load()
    df_pred=predictor.predict_to_dataframe(allele=allele, peptides=seqs_new_greater_qval)
    #the affinity threshold (nM) arrives as a command-line string, so cast it before comparing
    seqs_filtered+=df_pred[df_pred['prediction']<=float(sys.argv[-5])]['peptide'].values.tolist()

#merge sequence lists and append decoys
seqs_new_all=list(set(seqs_new_smaller_qval+seqs_filtered))
seqs_new_all=seqs_new_all+[s[::-1] for s in seqs_new_all]

#write idXML for filtering
op=open(sys.argv[-1],'w')
op.write('<PeptideIdentification score_type="q-value" higher_score_better="false">' + '\n')
for pep in seqs_new_all:
    op.write('\t\t\t' + '<PeptideHit sequence="' + pep + '" score="0" charge="0" >' + '\n')
    op.write('\t\t\t' + '</PeptideHit>' + '\n')
op.write('</PeptideIdentification>' + '\n')
op.close()
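
For context, the script above rescues peptides that miss the q-value cutoff by predicted MHC binding affinity. Below is a minimal standalone sketch of that prediction step, assuming the default mhcflurry class I models have been fetched locally (`mhcflurry-downloads fetch`); the peptides, allele, and 500 nM cutoff are illustrative values, not taken from the pipeline.

```python
# Sketch of the affinity-based rescue step (illustrative inputs only)
from mhcflurry import Class1AffinityPredictor

peptides = ["GILGFVFTL", "LLDVTAAVL", "NLVPMVATV"]  # hypothetical 9mer candidates
allele = "A*02:01"                                  # must be in the supported_alleles list above
threshold_nm = 500.0                                # affinity cutoff in nM (argv[-5] in the script)

predictor = Class1AffinityPredictor.load()
df = predictor.predict_to_dataframe(allele=allele, peptides=peptides)
rescued = df[df["prediction"] <= threshold_nm]["peptide"].tolist()
print(rescued)
```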



237 changes: 237 additions & 0 deletions bin/variants2fasta.py
@@ -0,0 +1,237 @@
#!/usr/bin/env python
import time
import sys
import argparse

from Fred2.Core import Protein, Peptide, Allele, MutationSyntax, Variant
from Fred2.Core.Variant import VariationType
from Fred2.IO import read_lines, MartsAdapter, read_annovar_exonic
from Fred2.EpitopePrediction import EpitopePredictorFactory
from Fred2.Core import generate_peptides_from_proteins, generate_peptides_from_variants, generate_transcripts_from_variants, generate_proteins_from_transcripts
from Fred2.IO.ADBAdapter import EIdentifierTypes, EAdapterFields
from Fred2.IO.FileReader import read_vcf


MARTDBURL = {"GRCH37": "http://grch37.ensembl.org/biomart/martservice?query=",
             "GRCH38": "http://www.ensembl.org/biomart/martservice?query="}  # is currently set to GRCh38

def read_variant_effect_predictor(file, gene_filter=None):
    """
    Reads a VCF (v4.1) file generated by Variant Effect Predictor and generates variant objects
    :param str file: Path to vcf file
    :param list gene_filter: List of proteins (in HGNC) of interest. Variants are filtered according to this list
    :return: list(Variant) - a list of Fred2.Core.Variant objects
    """
    vars = []
    def get_type(ref, alt):
        """
        returns the variant type
        """
        if len(ref)==1 and len(alt)==1:
            return VariationType.SNP
        if len(ref)>0 and len(alt)==0:
            if len(ref)%3 == 0:
                return VariationType.DEL
            else:
                return VariationType.FSDEL
        if len(ref) == 0 and len(alt)>0:
            if len(alt)% 3 == 0:
                return VariationType.INS
            else:
                return VariationType.FSINS
        return VariationType.UNKNOWN

    coding_types = set(["3_prime_UTR_variant", "5_prime_UTR_variant", "start_lost", "stop_gained",
                        "frameshift_variant", "start_lost", "inframe_insertion", "inframe_deletion", "missense_variant",
                        "protein_altering_variant", "splice_region_variant", "incomplete_terminal_codon_variant", "stop_retained_variant",
                        "synonymous_variant", "coding_sequence_variant"])

    with open(file, "r") as f:
        for i,l in enumerate(f):

            #skip comments
            if l.startswith("#") or l.strip() == "":
                continue

            chrom, gene_pos,var_id,ref,alt,_,filter_flag,info= l.strip().split("\t")[:8]
            coding = {}
            isSynonymous = False

            for co in info.split(","):
                #Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|TSL|APPRIS|SIFT|PolyPhen|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE">
                _,var_type,_,gene,_,transcript_type,transcript_id,_,_,_,_,_,_,transcript_pos,prot_pos,aa_mutation = co.strip().split("|")[:16]
                HGNC_ID=co.strip().split("|")[22]

                #pass every other feature type except Transcript (RegulatoryFeature, MotifFeature, ...)
                #pass genes that are uninteresting for us
                if transcript_type != "Transcript" or (HGNC_ID not in gene_filter and gene_filter):
                    continue

                #pass all intronic and other mutations that do not directly influence the protein sequence
                if any(t in coding_types for t in var_type.split("&")):
                    #generate mutation syntax

                    #positioning in Fred2 is 0-based!!!
                    if transcript_pos != "":
                        coding[transcript_id] = MutationSyntax(transcript_id, int(transcript_pos)-1,
                                                               -1 if prot_pos == "" else int(prot_pos)-1, co, "", geneID=HGNC_ID)

                #is variant synonymous?
                isSynonymous = any(t == "synonymous_variant" for t in var_type.split("&"))
            if coding:
                vars.append(Variant(var_id, get_type(ref, alt), chrom, int(gene_pos), ref.upper(), alt.upper(), coding, False, isSynonymous))
    return vars


def main():

    model = argparse.ArgumentParser(description='Neoepitope protein fasta generation from variant vcf')

    model.add_argument(
        '-v', '--vcf',
        type=str,
        default=None,
        help='Path to the vcf input file'
    )

    model.add_argument(
        '-t', '--type',
        type=str,
        choices=["VEP", "ANNOVAR", "SNPEFF"],
        default="VEP",
        help='Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)'
    )

    model.add_argument(
        '-f', '--fasta_ref',
        type=str,
        default=None,
        help='Path to the fasta input file'
    )

    model.add_argument(
        '-p','--proteins',
        type=str,
        default=None,
        help='Path to the protein ID input file (in HGNC-ID)'
    )

    model.add_argument(
        '-r' ,'--reference',
        type=str,
        default='GRCh38',
        help='The reference genome used for variant annotation and calling.'
    )

    model.add_argument(
        '-fINDEL' ,'--filterINDEL',
        action="store_true",
        help='Filter insertions and deletions (including frameshifts)'
    )

    model.add_argument(
        '-fFS' ,'--filterFSINDEL',
        action="store_true",
        help='Filter frameshift INDELs'
    )

    model.add_argument(
        '-fSNP' ,'--filterSNP',
        action="store_true",
        help='Filter SNPs'
    )

    model.add_argument(
        '-o','--output',
        type=str,
        required=True,
        help='Path to the output file'
    )


    args = model.parse_args()

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])


    if args.vcf is None:
        sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n")
        return -1

    # if a vcf file is given: generate variants and filter them if HGNC IDs are given
    if args.vcf is not None:
        protein_ids = []
        if args.proteins is not None:
            with open(args.proteins, "r") as f:
                for l in f:
                    l = l.strip()
                    if l != "":
                        protein_ids.append(l)

        if args.type == "VEP":
            variants = read_variant_effect_predictor(args.vcf, gene_filter=protein_ids)

        elif args.type == "SNPEFF":
            variants = read_vcf(args.vcf)[0]

        else:
            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)


        if args.filterSNP:
            variants = filter(lambda x: x.type != VariationType.SNP, variants)

        if args.filterINDEL:
            variants = filter(lambda x: x.type not in [VariationType.INS,
                                                       VariationType.DEL,
                                                       VariationType.FSDEL,
                                                       VariationType.FSINS], variants)

        if args.filterFSINDEL:
            variants = filter(lambda x: x.type not in [VariationType.FSDEL, VariationType.FSINS], variants)

        if not variants:
            sys.stderr.write("No variants left after filtering. Please refine your filtering criteria.\n")
            return -1

        #generate transcripts
        transcripts = generate_transcripts_from_variants(variants, martDB, EIdentifierTypes.ENSEMBL)

        #generate proteins
        proteins = generate_proteins_from_transcripts(transcripts)

        print proteins

        #write fasta file
        with open(args.output.replace('.fasta','_vcf.fasta'), "w") as f:
            for p in proteins:
                f.write('>' + str(p.transcript_id) + '|' + str(p.vars) + '_var_' + '\n')
                f.write(str(p)+ '\n')


        #concatenate fasta file with fasta reference
        op=open(args.output.replace('.fasta','_vcf.fasta'))
        opr1=op.read()
        op.close()

        op=open(args.fasta_ref)
        opr2=op.read()
        op.close()

        concat=opr1+'\n'+opr2

        op=open(args.output,'w')
        op.write(concat)
        op.close()

    else:
        sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n")
        return -1

    return 0



if __name__ == "__main__":
    sys.exit(main())
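
For orientation, the chain this script wires together is: annotated VCF → Fred2 Variant objects → mutated transcripts (via BioMart lookups) → mutated protein sequences written as FASTA. A condensed sketch of the SnpEff branch of that chain is shown below, using only Fred2 calls that appear above; the input file name and GRCh38 BioMart URL are illustrative.

```python
# Condensed sketch of the SnpEff branch of variants2fasta.py (illustrative paths)
from Fred2.IO import MartsAdapter
from Fred2.IO.FileReader import read_vcf
from Fred2.IO.ADBAdapter import EIdentifierTypes
from Fred2.Core import generate_transcripts_from_variants, generate_proteins_from_transcripts

mart = MartsAdapter(biomart="http://www.ensembl.org/biomart/martservice?query=")  # GRCh38 BioMart endpoint
variants = read_vcf("GRIAFFLKY_snpeff.vcf")[0]  # SnpEff-annotated VCF, name as in the test profile

transcripts = generate_transcripts_from_variants(variants, mart, EIdentifierTypes.ENSEMBL)

# each mutated protein becomes one FASTA entry keyed by transcript id and its variants
for protein in generate_proteins_from_transcripts(transcripts):
    print('>' + str(protein.transcript_id) + '|' + str(protein.vars) + '_var_')
    print(str(protein))
```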
1 change: 1 addition & 0 deletions conf/test.config
@@ -15,4 +15,5 @@ params {
fasta = ['https://github.com/nf-core/test-datasets/raw/mhcquant/test.fasta']
mzmls = ['https://github.com/nf-core/test-datasets/raw/mhcquant/MM15_Melanom_1_A_1.mzML', 'https://github.com/nf-core/test-datasets/raw/mhcquant/MM15_Melanom_1_B_1.mzML', 'https://github.com/nf-core/test-datasets/raw/mhcquant/MM15_Melanom_2_A_1.mzML', 'https://github.com/nf-core/test-datasets/raw/mhcquant/MM15_Melanom_2_B_1.mzML']
alleles = ['https://github.com/nf-core/test-datasets/raw/mhcquant/alleles.tsv']
vcf = ['https://github.com/nf-core/test-datasets/raw/mhcquant/GRIAFFLKY_snpeff.vcf']
}
