9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,15 @@
# Changelog
All new changes are documented here.

## [v1.1.5]
### Added
- Mutation profile of the individual variants composing the SDMs (first and second changes)
- Allele frequencies in the different populations considered

## [v1.1.4]
### Added
- Process to compute the binned allele frequencies for the derived alleles in each group

## [v1.1.3]
### Fixed
- `bcftools sort` failing in some single jobs in distributed systems
50 changes: 50 additions & 0 deletions bin/FreqBinner
@@ -0,0 +1,50 @@
#!/usr/bin/env python
import sys, gzip
import numpy as np

# Variable-width frequency bin edges: finer resolution at low frequencies
itv1 = np.round(
    np.concatenate((
        np.arange(0.0, 0.001, 0.0001),
        np.arange(0.001, 0.01, 0.001),
        np.arange(0.01, 0.1, 0.01),
        np.arange(0.1, 1, 0.1) )),
    decimals=5
)
itv2 = np.concatenate((itv1[1:], np.array([1.])))

if sys.argv[1] == '/dev/stdin':
    opn = sys.stdin
elif '.gz' in sys.argv[1]:
    opn = gzip.open(sys.argv[1])
else:
    opn = open(sys.argv[1])

for n, line in enumerate(opn):
    # gzip.open yields bytes; decode to str when needed
    if isinstance(line, (bytes, bytearray)):
        line = line.decode()
    if n == 0:
        # Header: column 5 holds the change type, frequency columns start at index 6
        line = line.strip().split('\t')
        vals = { i: {'all': np.zeros(len(itv1), dtype='uint64')} for i in line[6:] }
        cols = line[6:]
        continue
    line = line.strip().split('\t')
    values = map(float, line[6:])
    for (col, val) in zip(cols, values):
        if val > 0:
            # Find the bin (itv1, itv2] that contains this frequency
            binning = np.where(np.logical_and(val <= itv2, val > itv1))[0][0]
            vals[col]['all'][binning] += 1
            if line[5] in vals[col].keys():
                vals[col][line[5]][binning] += 1
            else:
                vals[col][line[5]] = np.zeros(len(itv1), dtype='uint64')
                vals[col][line[5]][binning] += 1

print('COL\tbinI\tbinE\tChange\ttotChanges\tnChange')
for col in vals:
    allvals = vals[col]['all']
    changes = [k for k in vals[col].keys() if k != 'all']
    for change in changes:
        for n in range(0, len(itv1)):
            print('{}\t{}\t{}\t{}\t{}\t{}'.format(col, itv1[n], itv2[n], change, allvals[n], vals[col][change][n]))
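For reference, a minimal sketch (separate from the patch) of how the variable-width bin edges built in FreqBinner partition (0, 1] and how a single derived-allele frequency lands in a bin; the example frequency 0.042 is an arbitrary illustration.

import numpy as np

# Same edge construction as in bin/FreqBinner above
itv1 = np.round(
    np.concatenate((
        np.arange(0.0, 0.001, 0.0001),   # steps of 0.0001 below 0.001
        np.arange(0.001, 0.01, 0.001),   # steps of 0.001 below 0.01
        np.arange(0.01, 0.1, 0.01),      # steps of 0.01 below 0.1
        np.arange(0.1, 1, 0.1))),        # steps of 0.1 up to 1
    decimals=5,
)
itv2 = np.concatenate((itv1[1:], np.array([1.])))

val = 0.042  # arbitrary example frequency
binning = np.where(np.logical_and(val <= itv2, val > itv1))[0][0]
print(itv1[binning], itv2[binning])  # -> 0.04 0.05, i.e. the bin (0.04, 0.05]
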
11 changes: 11 additions & 0 deletions bin/getCounts3
@@ -19,6 +19,17 @@ load(inRdata)
dat<-dat[which(dat$SameCodon == 1), ]
uniqued<-dat[!duplicated(dat[c("Id1","Id2","Cons1","Cons2")]),]

# Save uniqued positions for both pos1 and pos2
pos1 <- dat %>%
    select(Chr, Pos1) %>%
    unique()
write_delim(pos1, paste("uniquePos1.", popu, ".tsv", sep=""), delim="\t", col_names=FALSE)
pos2 <- dat %>%
    select(Chr, Pos2) %>%
    unique()
write_delim(pos2, paste("uniquePos2.", popu, ".tsv", sep=""), delim="\t", col_names=FALSE)

# Continue with the analyses
indCounts<-vector()
indCounts_uniqued<-vector()
doubleCounts<-vector()
111 changes: 111 additions & 0 deletions bin/maf2snp.py
@@ -0,0 +1,111 @@
#!/usr/bin/env python
import argparse
from itertools import zip_longest


def parse_args():
    # Argument definition
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m",
        "--maf",
        metavar="maffile.maf",
        type=str,
        help="Input alignments",
        required=True,
    )
    parser.add_argument(
        "-o",
        "--out",
        metavar="prefix",
        type=str,
        help="Output file prefix.",
        required=False,
        default="snps",
    )
    parser.add_argument(
        "-d",
        "--delim",
        metavar="char",
        type=str,
        help="Delimiter.",
        required=False,
        default=",",
    )
    parser.add_argument(
        "--genomes",
        type=str,
        help="Genomes to consider.",
        nargs="+",
        required=True,
    )
    return parser.parse_args()


def reader(maf):
    """Yield the 's' lines of each alignment block in a MAF file."""
    lines = []
    infile = open(maf)
    for line in infile:
        if line.startswith("#"):
            continue
        elif line[0] == "a":
            continue
        elif line[0] == "s":
            lines.append(line)
            continue
        elif line.strip() == "" and len(lines) != 0:
            yield lines
            lines = []
        else:
            lines = []
    if len(lines) > 0:
        yield lines


SFX = {
    ",": "csv",
    ";": "csv",
    "\t": "tsv",
    " ": "txt",
}


def get_matching_prefix(string, prefixes):
    for prefix in prefixes:
        if string.startswith(prefix):
            return prefix
    raise Exception(f'Prefix not found for: {string}')


def main():
    # Get arguments
    args = parse_args()
    # Define unique list of input genomes
    genomes = set(args.genomes)

    # Prepare output
    if args.out == '-' or args.out == '/dev/stdout':
        ofname = '/dev/stdout'
    else:
        ofname = f"{args.out}.{SFX.get(args.delim, 'txt')}"
    outfile = open(ofname, "w")

    # Process the data
    outfile.write("CHROM\tPOS\t{}\n".format("\t".join(args.genomes)))
    for aln in reader(args.maf):
        # MAF 's' line fields: s, src, start, size, strand, srcSize, text
        _, chrxspp, bpi, alnlen, _, _, _ = aln[0].split()
        try:
            _, chromosome = chrxspp.split(".", 1)
        except ValueError:
            raise Exception(f"Impossible to split: {chrxspp}")
        splits = [a.split() for a in aln]
        sequences = {get_matching_prefix(a[1], args.genomes): a[-1] for a in splits}
        # Genomes absent from this block get an all-gap sequence
        for genome in genomes.difference(set(sequences.keys())):
            sequences[genome] = "-" * int(alnlen)
        zipped = list(zip_longest(*[sequences[g] for g in args.genomes]))
        for n, bp in enumerate(range(int(bpi), int(bpi) + int(alnlen))):
            outfile.write("{}\t{}\t{}\n".format(chromosome, bp, "\t".join(zipped[n])))


if __name__ == "__main__":
    main()
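For reference, a small self-contained sketch (separate from the patch) of the MAF 's'-line layout that maf2snp.py expects and of the per-position rows it derives from one alignment block; the genome names hg38 and panTro6 are hypothetical placeholders.

# Toy alignment block made of MAF 's' lines: s, src, start, size, strand, srcSize, text
toy_block = [
    "s hg38.chr1    10 4 + 248956422 ACGT\n",
    "s panTro6.chr1 22 4 + 224244399 ACGA\n",
]

# maf2snp.py keeps the part after the first '.' as the chromosome name
_, chrxspp, bpi, alnlen, _, _, _ = toy_block[0].split()
_, chromosome = chrxspp.split(".", 1)

# One CHROM/POS row per alignment column, one base per genome
# (the real script matches genomes with get_matching_prefix() against --genomes)
seqs = [line.split()[-1] for line in toy_block]
for offset, column in enumerate(zip(*seqs)):
    print(chromosome, int(bpi) + offset, "\t".join(column), sep="\t")
# chr1    10      A       A
# chr1    11      C       C
# chr1    12      G       G
# chr1    13      T       A
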
123 changes: 123 additions & 0 deletions bin/sfs2allele.py
@@ -0,0 +1,123 @@
#!/usr/bin/env python
import argparse
import sys

import numpy as np


def parse_args():
    # Argument definition
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--sfs",
        metavar="TXT",
        type=str,
        help="SFS probabilities",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input",
        metavar="TXT",
        type=str,
        help="Input file for EST-SFS",
        required=True,
    )
    parser.add_argument(
        "-S",
        "--sites",
        metavar="TXT",
        type=str,
        help="Site positions for the SFS input",
        required=False,
    )
    parser.add_argument(
        "-o",
        "--out",
        metavar="prefix",
        type=str,
        help="Output ancestral allele files.",
        required=False,
        default="-",
    )
    return parser.parse_args()


NTS = ['A', 'C', 'G', 'T']


def get_ancestral(maj_p, maj_a, nodes, counts):
    """Define the ancestral allele."""
    # If the major allele is more likely than the others, return it
    if maj_p > 0.5:
        return maj_a
    else:
        alleles = np.array(NTS)
        counts = np.array([int(c) for c in counts])
        # Minor alleles: observed (count > 0) but not the most frequent
        min_allele_idx = (counts != np.max(counts)) & (counts != 0)
        min_alleles = set(alleles[min_allele_idx])
        isecs = set(nodes).intersection(min_alleles)
        # If only one nucleotide is possible, return it
        if len(isecs) == 1:
            return next(iter(isecs))
        # Otherwise, return the oldest node in the tree
        else:
            return nodes[-1]


def main():
    # Get arguments
    args = parse_args()

    # Get major allele for each site
    majors = {}
    with open(args.input) as inputfile:
        for n, line in enumerate(inputfile):
            allele_cnts = [int(i) for i in line.strip().split()[0].split(',')]
            majors[str(n+1)] = {
                'major': NTS[np.argmax(allele_cnts)],
                'vector': line.strip().split()[0],
                'chrom': None,
                'pos': None
            }

    # Add site mapping
    if args.sites:
        with open(args.sites) as sitesfile:
            for n, line in enumerate(sitesfile):
                chrom, pos = line.strip().split()
                majors[str(n+1)]['chrom'] = chrom
                majors[str(n+1)]['pos'] = pos

    # Define output
    if args.out == '-' or args.out == '/dev/stdout':
        out = sys.stdout
    else:
        out = open(args.out, 'w')

    # Define the ancestral allele based on the probabilities
    nts_combinations = [NT1 + NT2 for NT1 in NTS for NT2 in NTS]
    with open(args.sfs) as sfs:
        for line in sfs:
            if line[0] == '0':
                continue
            site_n, _, maj_prob, probs = line.strip().split(' ', 3)
            maj_prob = float(maj_prob)
            outgr_probs = tuple(map(float, probs.split(' ')))
            max_og_idx = np.argmax(outgr_probs)
            maj_allele = majors[site_n]['major']
            vect = majors[site_n]['vector']
            nodes = nts_combinations[max_og_idx]
            ancestral = get_ancestral(maj_prob, maj_allele, nodes, vect.split(','))
            if args.sites:
                chrom = majors[site_n]['chrom']
                pos = majors[site_n]['pos']
                out.write(f"{site_n}\t{chrom}\t{pos}\t{maj_prob}\t{outgr_probs[max_og_idx]}\t{maj_allele}\t{vect}\t{nodes}\t{ancestral}\n")
            else:
                out.write(f"{site_n}\t{maj_prob}\t{outgr_probs[max_og_idx]}\t{maj_allele}\t{vect}\t{nodes}\t{ancestral}\n")


if __name__ == "__main__":
    main()
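For reference, a small sketch (separate from the patch) of how get_ancestral() behaves on toy inputs; it assumes bin/ is on PYTHONPATH so that sfs2allele can be imported as a module, and the allele counts and node strings are made up for illustration.

from sfs2allele import get_ancestral  # assumes bin/ is importable

# Major allele clearly supported (prob > 0.5): returned as ancestral
print(get_ancestral(0.9, 'A', 'CT', ['5', '1', '0', '0']))  # -> A

# Major allele uncertain: the single minor allele also seen in the outgroup nodes wins
print(get_ancestral(0.3, 'A', 'TA', ['2', '2', '0', '1']))  # -> T

# No unique candidate among the minor alleles: fall back to the oldest node (last character)
print(get_ancestral(0.3, 'A', 'GG', ['2', '2', '0', '1']))  # -> G
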