-
Notifications
You must be signed in to change notification settings - Fork 27
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #29 from nf-core/dev
PR for 1.2.0 Release
- Loading branch information
Showing
12 changed files
with
713 additions
and
94 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#!/usr/bin/env python | ||
from mhcflurry import Class1AffinityPredictor | ||
import sys | ||
|
||
#List of alleles supported by mhclurry | ||
supported_alleles="A*01:01,A*02:01,A*02:02,A*02:03,A*02:05,A*02:06,A*02:07,A*02:11,A*02:12,A*02:16,A*02:17,A*02:19,A*02:50,A*03:01,A*11:01,A*23:01,A*24:02,A*24:03,A*25:01,A*26:01,A*26:02,A*26:03,A*29:02,A*30:01,A*30:02,A*31:01,A*32:01,A*33:01,A*66:01,A*68:01,A*68:02,A*68:23,A*69:01,A*80:01,B*07:01,B*07:02,B*08:01,B*08:02,B*08:03,B*14:02,B*15:01,B*15:02,B*15:03,B*15:09,B*15:17,B*18:01,B*27:02,B*27:03,B*27:04,B*27:05,B*27:06,B*35:01,B*35:03,B*37:01,B*38:01,B*39:01,B*39:06,B*40:01,B*40:02,B*42:01,B*44:02,B*44:03,B*45:01,B*46:01,B*48:01,B*51:01,B*53:01,B*54:01,B*57:01,B*58:01,B*83:01,C*03:03,C*04:01,C*05:01,C*06:02,C*07:02,C*08:02,C*12:03,C*14:02,C*15:02".split(",") | ||
|
||
#read provided allotypes | ||
op=open(sys.argv[-4]) | ||
alleles=op.read().split("\n") | ||
op.close() | ||
|
||
alleles=[a for a in alleles if a in supported_alleles] | ||
|
||
#read identified peptides with q-value < threshold | ||
mztab=open(sys.argv[-3]) | ||
mztab_read=mztab.readlines() | ||
mztab.close() | ||
seqs=[l.split()[1] for l in mztab_read if l.startswith("PSM")] | ||
seqs_new_smaller_qval=list(set(seqs)) | ||
|
||
#read all remaining peptides with q-value > threshold | ||
mztab=open(sys.argv[-2]) | ||
mztab_read=mztab.readlines() | ||
mztab.close() | ||
seqs=[l.split()[1] for l in mztab_read if l.startswith("PSM") if l.split()[1] not in seqs_new_smaller_qval] | ||
seqs_new_greater_qval=list(set(seqs)) | ||
seqs_new_greater_qval=[s for s in seqs_new_greater_qval if 7<len(s)<13] | ||
|
||
#call mhcflurry | ||
#subprocess.call("mhcflurry-predict --peptides {p} --alleles {a} --out {o}".format(p=" ".join(seqs_new), a=" ".join(alleles), o=sys.argv[-1])) | ||
seqs_filtered=[] | ||
for allele in alleles: | ||
print allele | ||
predictor = Class1AffinityPredictor.load() | ||
df_pred=predictor.predict_to_dataframe(allele=allele, peptides=seqs_new_greater_qval) | ||
seqs_filtered+=df_pred[df_pred['prediction']<=sys.argv[-5]]['peptide'].values.tolist() | ||
|
||
#merge sequence lists and append decoys | ||
seqs_new_all=list(set(seqs_new_smaller_qval+seqs_filtered)) | ||
seqs_new_all=seqs_new_all+[s[::-1] for s in seqs_new_all] | ||
|
||
#write idXML for filtering | ||
op=open(sys.argv[-1],'w') | ||
op.write('<PeptideIdentification score_type="q-value" higher_score_better="false">' + '\n') | ||
for pep in seqs_new_all: | ||
op.write('\t\t\t' + '<PeptideHit sequence="' + pep + '" score="0" charge="0" >' + '\n') | ||
op.write('\t\t\t' + '</PeptideHit>' + '\n') | ||
op.write('</PeptideIdentification>' + '\n') | ||
op.close() | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,237 @@ | ||
#!/usr/bin/env python | ||
import time | ||
import sys | ||
import argparse | ||
|
||
from Fred2.Core import Protein, Peptide, Allele, MutationSyntax, Variant | ||
from Fred2.Core.Variant import VariationType | ||
from Fred2.IO import read_lines, MartsAdapter, read_annovar_exonic | ||
from Fred2.EpitopePrediction import EpitopePredictorFactory | ||
from Fred2.Core import generate_peptides_from_proteins, generate_peptides_from_variants, generate_transcripts_from_variants, generate_proteins_from_transcripts | ||
from Fred2.IO.ADBAdapter import EIdentifierTypes, EAdapterFields | ||
from Fred2.IO.FileReader import read_vcf | ||
|
||
|
||
MARTDBURL = {"GRCH37": "http://grch37.ensembl.org/biomart/martservice?query=", | ||
"GRCH38": "http://www.ensembl.org/biomart/martservice?query="} # is corrently set to GRCh38 | ||
|
||
def read_variant_effect_predictor(file, gene_filter=None): | ||
""" | ||
Reads a VCF (v4.1) file generatede by variant effect predictor and generates variant objects | ||
:param str file: Path to vcf file | ||
:param list gene_filter: List of proteins (in HGNC) of inerrest. Variants are filter according to this list | ||
:return: list(Variant) - a list of Fred2.Core.Variant objects | ||
""" | ||
vars = [] | ||
def get_type(ref, alt): | ||
""" | ||
returns the variant type | ||
""" | ||
if len(ref)==1 and len(alt)==1: | ||
return VariationType.SNP | ||
if len(ref)>0 and len(alt)==0: | ||
if len(ref)%3 == 0: | ||
return VariationType.DEL | ||
else: | ||
return VariationType.FSDEL | ||
if len(ref) == 0 and len(alt)>0: | ||
if len(alt)% 3 == 0: | ||
return VariationType.INS | ||
else: | ||
return VariationType.FSINS | ||
return VariationType.UNKNOWN | ||
|
||
coding_types = set(["3_prime_UTR_variant", "5_prime_UTR_variant", "start_lost", "stop_gained", | ||
"frameshift_variant", "start_lost", "inframe_insertion", "inframe_deletion", "missense_variant", | ||
"protein_altering_variant", "splice_region_variant", "incomplete_terminal_codon_variant", "stop_retained_variant", | ||
"synonymous_variant", "coding_sequence_variant"]) | ||
|
||
with open(file, "r") as f: | ||
for i,l in enumerate(f): | ||
|
||
#skip comments | ||
if l.startswith("#") or l.strip() == "": | ||
continue | ||
|
||
chrom, gene_pos,var_id,ref,alt,_,filter_flag,info= l.strip().split("\t")[:8] | ||
coding = {} | ||
isSynonymous = False | ||
|
||
for co in info.split(","): | ||
#Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|TSL|APPRIS|SIFT|PolyPhen|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE"> | ||
_,var_type,_,gene,_,transcript_type,transcript_id,_,_,_,_,_,_,transcript_pos,prot_pos,aa_mutation = co.strip().split("|")[:16] | ||
HGNC_ID=co.strip().split("|")[22] | ||
|
||
#pass every other feature type except Transcript (RegulatoryFeature, MotifFeature.) | ||
#pass genes that are uninterresting for us | ||
if transcript_type != "Transcript" or (HGNC_ID not in gene_filter and gene_filter): | ||
continue | ||
|
||
#pass all intronic and other mutations that do not directly influence the protein sequence | ||
if any(t in coding_types for t in var_type.split("&")): | ||
#generate mutation syntax | ||
|
||
#positioning in Fred2 is 0-based!!! | ||
if transcript_pos != "": | ||
coding[transcript_id] = MutationSyntax(transcript_id, int(transcript_pos)-1, | ||
-1 if prot_pos == "" else int(prot_pos)-1, co, "", geneID=HGNC_ID) | ||
|
||
#is variant synonymous? | ||
isSynonymous = any(t == "synonymous_variant" for t in var_type.split("&")) | ||
if coding: | ||
vars.append(Variant(var_id, get_type(ref, alt), chrom, int(gene_pos), ref.upper(), alt.upper(), coding, False, isSynonymous)) | ||
return vars | ||
|
||
|
||
def main(): | ||
|
||
model = argparse.ArgumentParser(description='Neoepitope protein fasta generation from variant vcf') | ||
|
||
model.add_argument( | ||
'-v', '--vcf', | ||
type=str, | ||
default=None, | ||
help='Path to the vcf input file' | ||
) | ||
|
||
model.add_argument( | ||
'-t', '--type', | ||
type=str, | ||
choices=["VEP", "ANNOVAR", "SNPEFF"], | ||
default="VEP", | ||
help='Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)' | ||
) | ||
|
||
model.add_argument( | ||
'-f', '--fasta_ref', | ||
type=str, | ||
default=None, | ||
help='Path to the fasta input file' | ||
) | ||
|
||
model.add_argument( | ||
'-p','--proteins', | ||
type=str, | ||
default=None, | ||
help='Path to the protein ID input file (in HGNC-ID)' | ||
) | ||
|
||
model.add_argument( | ||
'-r' ,'--reference', | ||
type=str, | ||
default='GRCh38', | ||
help='The reference genome used for varinat annotation and calling.' | ||
) | ||
|
||
model.add_argument( | ||
'-fINDEL' ,'--filterINDEL', | ||
action="store_true", | ||
help='Filter insertions and deletions (including frameshifts)' | ||
) | ||
|
||
model.add_argument( | ||
'-fFS' ,'--filterFSINDEL', | ||
action="store_true", | ||
help='Filter frameshift INDELs' | ||
) | ||
|
||
model.add_argument( | ||
'-fSNP' ,'--filterSNP', | ||
action="store_true", | ||
help='Filter SNPs' | ||
) | ||
|
||
model.add_argument( | ||
'-o','--output', | ||
type=str, | ||
required=True, | ||
help='Path to the output file' | ||
) | ||
|
||
|
||
args = model.parse_args() | ||
|
||
martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()]) | ||
|
||
|
||
if args.vcf is None: | ||
sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n") | ||
return -1 | ||
|
||
# if vcf file is given: generate variants and filter them if HGNC IDs ar given | ||
if args.vcf is not None: | ||
protein_ids = [] | ||
if args.proteins is not None: | ||
with open(args.proteins, "r") as f: | ||
for l in f: | ||
l = l.strip() | ||
if l != "": | ||
protein_ids.append(l) | ||
|
||
if args.type == "VEP": | ||
variants = read_variant_effect_predictor(args.vcf, gene_filter=protein_ids) | ||
|
||
elif args.type == "SNPEFF": | ||
variants = read_vcf(args.vcf)[0] | ||
|
||
else: | ||
variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids) | ||
|
||
|
||
if args.filterSNP: | ||
variants = filter(lambda x: x.type != VariationType.SNP, variants) | ||
|
||
if args.filterINDEL: | ||
variants = filter(lambda x: x.type not in [VariationType.INS, | ||
VariationType.DEL, | ||
VariationType.FSDEL, | ||
VariationType.FSINS], variants) | ||
|
||
if args.filterFSINDEL: | ||
variants = filter(lambda x: x.type not in [VariationType.FSDEL, VariationType.FSINS], variants) | ||
|
||
if not variants: | ||
sys.stderr.write("No variants left after filtering. Please refine your filtering criteria.\n") | ||
return -1 | ||
|
||
#generate transcripts | ||
transcripts = generate_transcripts_from_variants(variants, martDB, EIdentifierTypes.ENSEMBL) | ||
|
||
#generate proteins | ||
proteins = generate_proteins_from_transcripts(transcripts) | ||
|
||
print proteins | ||
|
||
#write fasta file | ||
with open(args.output.replace('.fasta','_vcf.fasta'), "w") as f: | ||
for p in proteins: | ||
f.write('>' + str(p.transcript_id) + '|' + str(p.vars) + '_var_' + '\n') | ||
f.write(str(p)+ '\n') | ||
|
||
|
||
#concatenate fasta file with fasta reference | ||
op=open(args.output.replace('.fasta','_vcf.fasta')) | ||
opr1=op.read() | ||
op.close() | ||
|
||
op=open(args.fasta_ref) | ||
opr2=op.read() | ||
op.close() | ||
|
||
concat=opr1+'\n'+opr2 | ||
|
||
op=open(args.output,'w') | ||
op.write(concat) | ||
op.close() | ||
|
||
else: | ||
sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n") | ||
return -1 | ||
|
||
return 0 | ||
|
||
|
||
|
||
if __name__ == "__main__": | ||
sys.exit(main()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.