CATE/parameters.json at main · theLongLab/CATE · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
{
    # Cuda device
    "CUDA Device ID":0,

    # IO directory listings and universal file paths
    # IT IS RECOMMENDED TO CREATE SEPARATE PARAMETER FILES WITH SEPARATE OUTPUT AND INTERMEDIATE PATHS FOR INDIVIDUAL PROJECTS
    "Input path":"ON_neutral",
    "Output path":"fa_test",
    "Intermediate path":"intermediate_test",

    # Prometheus settings
    # Prometheus is a high performance mode in CATE. It is greedy on hardware resources but has extremely fast processing speeds.
    # The user can set the number of gene combinations and the number of SNPs to be processed by CATE.
    # When setting these parameters be mindful of system hardware availability.
    # WARNING: Multi read CAN CAUSE A BOTTLENECK IF HARDWARE DOES NOT SUPPORT IT. NVMe DRIVES CONNECTED VIA PCIe BUS WILL SUPPORT MULTI READ. TYPICAL HDD DRIVES DO NOT.
    "Prometheus activate":"No",
    "CPU cores":10,
    "SNPs per time":1000,
    "Number of genes":2,
    "Multi read":"Yes",

    # VCF sample details
    "Ploidy":1,

    # Protein information
    # DNA INTERPRETATION SHOULD BE USED FOR START AND STOP CODONS
    "Start codon(s)":"ATG",
    "Stop codon(s)":"TAA,TAG,TGA",
    "Genetic code":"A|GCT,GCC,GCA,GCG;R|CGT,CGC,CGA,CGG,AGA,AGG;N|AAT,AAC;D|GAT,GAC;B|AAT,AAC,GAC;C|TGT,TGC;Q|CAA,CAG;E|GAA,GAG;Z|CAA,CAG,GAA;G|GGT,GGC,GGA,GGG;H|CAT,CAC;M|ATG;I|ATT,ATC,ATA;L|CTT,CTC,CTA,CTG,TTA,TTG;K|AAA,AAG;F|TTT,TTC;P|CCT,CCC,CCA,CCG;S|TCT,TCC,TCA,TCG,AGT,AGC;T|ACT,ACC,ACA,ACG;W|TGG;Y|TAT,TAC;V|GTT,GTC,GTA,GTG;X|TAA,TGA,TAG",

    # Neutrality and FST calculation mode
    # Calculation mode can be FILE (to calculate the tests for predefined regions) or WINDOW
    "Calculation mode":"FILE",

    # WINDOW mode parameters
    "Window size":10000,
    "Step size":10000,

    # FILE mode parameters
    # Any tab-delimited text-based format, such as but not limited to *.txt or *.csv
    "Universal gene list":"cov.txt",

    # VCF split parameters
    # The type of Split mode can be either CHR or CTSPLIT.
    # The CHR mode will split the VCF file by chromosomes and extract the GT column only.
    # The CTSPLIT mode will create CATE's proprietary file structure. For this the parent VCFs must only contain one chromosome's data and have only the GT column.
    # Column numbers are one-based, i.e. column numbering starts at one.
    "Split mode":"CHR",
    "CHR individual summary":"YES",
    "Split cores":10,
    "Split SNPs per_time_CPU":10000,
    "Split SNPs per_time_GPU":5000,
    "Population file path":"/mnt/d/Deshan/Books/population/sample_population_codes_2.tsv",
    "Sample_ID Column number":1,
    "Population_ID Column number":6,
    "Reference allele count":1,
    "Alternate allele count":1,
    "SNP count per file":10000,
    "MAF frequency":"0",
    "Frequency logic":">=",

    # FASTA split parameters
    # REQUIRES SEPARATE INPUT, SINCE IT IS A SINGULAR INPUT
    # IF Sequence IS SET TO All THEN THE WHOLE FILE IS SPLIT. ELSE STATE THE SEQUENCE ID OF THE SEQUENCE THAT NEEDS TO BE SEPARATED
    "Sequence":"All",
    "Raw FASTA file":"/mnt/d/Deshan/Books/University of Calgary/Experiments/Neutrality_Linux/HIV_sequences/Selected_genomes_concatenated.fna",

    # FASTA merge parameters
    # REQUIRES SEPARATE INPUT AND OUTPUT LOCATION
    # ENSURE THE FASTA FILES HAVE THE APPROPRIATE EXTENSION: .fasta, .fna, .ffn, .faa, .frn, .fa
    "FASTA files folder":"/mnt/d/Deshan/Books/University of Calgary/Experiments/Neutrality_Linux/reference_Split/3",
    "Merge FASTA path":"/mnt/d/Deshan/Books/University of Calgary/Experiments/Neutrality_Linux/reference_Split/Three.fasta",

    # Extract genes parameters
    # Extract gene FASTA sequences from REFERENCE FASTA file and outputs them as separate FASTA files
    "Reference genome ex":"/mnt/d/Deshan/Books/University of Calgary/Experiments/Neutrality_Linux/reference_merge/One.fasta",
    "Extract gene list":"universal",

    # GFF to Genes parameters
    # REQUIRES SEPARATE INPUT, SINCE IT IS A SINGULAR INPUT
    "GFF file":"TAIR10_GFF3_genes.gff",

    # Haplotype INFO extract
    # Extract haplotype information from VCFs
    # IF Population out is set to YES then the entire population's FASTA configuration will be generated. ELSE only the unique haplotypes FASTA will be generated.
    "Reference genome hap":"/mnt/d/Deshan/Books/University of Calgary/Experiments/Neutrality_Linux/reference_merge/One.fasta",
    "Population out":"Yes",
    "Hap extract gene list":"universal",

    # MAP file to Gene file parameters
    # REQUIRES SEPARATE INPUT, SINCE IT IS A SINGULAR INPUT
    "MAP file":"map/wgs.map",
    "SNP prefix":"S",

    # All Neutrality tests
    # Calculates all three neutrality tests (Tajima's D, Fay and Wu's and Fu and Li's) at once
    # File created would have the extension *.nt (tab-delimited text file). File Name: CountryName_GeneListFileName.nt
    "Neutrality gene list":"universal",

    # Tajima's D
    # File created would have the extension *.td (tab-delimited text file). File Name: CountryName_GeneListFileName.td
    "Tajima gene list":"universal",

    # Fu and Li
    # File created would have the extension *.fl (tab-delimited text file). File Name: CountryName_GeneListFileName.fl
    "Fu and Li gene list":"universal",

    # Fay and Wu
    # File created would have the extension *.fw (tab-delimited text file). File Name: CountryName_GeneListFileName.fw
    "Fay and Wu gene list":"universal",

    # McDonald–Kreitman
    # REFERENCE GENOME FILE SHOULD BE THE SAME AS THAT USED TO GENERATE THE VCF FILE
    # ALIGNMENT FILE SHOULD BE A PAIRWISE ALIGNMENT OF THE REFERENCE GENOME TO THE OUTGROUP GENOME. SHOULD BE IN .maf FORMAT
    # Alignment can be gene wide (GENE mode) or chromosome (CHROM mode) wide.
    # GENE mode would require each gene to be separately aligned and the location of each alignment file to be added as a third column to the gene list file.
    # CHROM mode would require the outgroup species chromosome's genome to be aligned with the entire chromosome of the query species reference genome.
    # File created would have the extension *.mk (tab-delimited text file). File Name: CountryName_GeneListFileName.mk
    "Alignment mode":"GENE",
    "ORF known":"Yes",
    "Reference genome mk":"/mnt/d/Deshan/Books/University of Calgary/Experiments/Neutrality_Linux/reference_merge/One.fasta",
    "Alignment file":"extract_5/align/alignment.txt",
    "McDonald–Kreitman gene list":"universal",

    # Fst or Fixation Index
    # Indexed population file should be a THREE column tab-delimited *.txt file with headings: Sample_name	population_ID	Super_population.
    # Ensure population_ID values are unique to each super population and do not overlap.
    "Population index file path":"fst_pop/population.txt",
    "Fst gene list":"genelist2.txt",
    "Population ID":"AFR,XKY,ZXY",

    # EHH or Extended Haplotype Homozygosity
    # Define the extended haplotype region specification mode by either using a separate gene list file (FILE mode) or stating the standard displacement value (FIXED mode) in bases.
    # Degradation of EHH around a SNP can be calculated by SNP or BP mode.
    "Range mode":"BP",
    "EHH FILE path":"genelist_ehh.txt",
    "FIXED mode":"+1000",
    "SNP default count":53,
    "SNP BP displacement":100000,
    "EHH CPU cores":10
}