From 23c66b241f1992f63afa0f0247214ef2aaecbeff Mon Sep 17 00:00:00 2001
From: Andrea Talenti <23279528+RenzoTale88@users.noreply.github.com>
Date: Wed, 1 Oct 2025 12:37:36 +0100
Subject: [PATCH 1/8] Add individual changes counts

---
 CHANGELOG.md                |  4 +++
 bin/FreqBinner              | 51 ++++++++++++++++++++++++++++++++++++++
 bin/getCounts3              | 11 ++++++++
 include/process/mutyper.nf  | 33 ++++++++++++++++++++++--
 include/process/sdm.nf      | 40 +++++++++++++++++++++++++++--
 include/workflow/mutyper.nf |  2 +-
 include/workflow/sdm.nf     | 26 ++++++++++++++++---
 main.nf                     |  4 +--
 nextflow.config             |  2 +-
 9 files changed, 161 insertions(+), 12 deletions(-)
 create mode 100755 bin/FreqBinner

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 763d934..cad410b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,10 @@
 # Changelog
 All new changes are documented here.
 
+## [v1.1.4]
+### Added
+- Add process to compute the binned allele frequencies for the derived alleles in each group
+
 ## [v1.1.3]
 ### Fixed
 - `bcftools sort` failing in some single jobs in distributed systems
diff --git a/bin/FreqBinner b/bin/FreqBinner
new file mode 100755
index 0000000..2b9e449
--- /dev/null
+++ b/bin/FreqBinner
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+import sys, gzip
+import numpy as np
+
+itv1 = np.round(
+    np.concatenate((
+        np.arange(0.0, 0.001, 0.0001),
+        np.arange(0.001, 0.01, 0.001),
+        np.arange(0.01, 0.1, 0.01),
+        np.arange(0.1, 1, 0.1) )),
+    decimals=5
+)
+itv2 = np.concatenate((itv1[1:], np.array([1.])))
+
+if sys.argv[1] == '/dev/stdin':
+    opn = sys.stdin
+elif '.gz' in sys.argv[1]:
+    opn = gzip.open(sys.argv[1])
+else:
+    opn = open(sys.argv[1])
+
+for n,line in enumerate(opn):
+    try:
+        line = line.decode()
+    except:
+        pass
+    if n == 0:
+        line = line.strip().split('\t')
+        #vals = { i: [np.zeros(len(itv1), dtype = 'uint64'), np.zeros(len(itv1), dtype = 'uint64')] for i in line[5:] }
+        vals = { i: {'all' : np.zeros(len(itv1), dtype = 'uint64') } for i in line[6:] }
+        cols = line[6:]
+        continue
+    line = line.strip().split('\t')
+    values = map(float, line[6:])
+    for (col, val) in zip(cols, values):
+        if val > 0:
+            binning = np.where(np.logical_and(val <= itv2, val > itv1))[0][0]
+            vals[col]['all'][ binning ] += 1
+            if line[5] in vals[col].keys():
+                vals[col][line[5]][binning] += 1
+            else:
+                vals[col][line[5]] = np.zeros(len(itv1), dtype = 'uint64')
+                vals[col][line[5]][binning] += 1
+
+print('COL\tbinI\tbinE\tChange\ttotChanges\tnChange')
+for col in vals:
+    allvals = vals[col]['all']
+    changes = [k for k in vals[col].keys() if k!='all']
+    for change in changes:
+        for n in range(0, len(itv1)):
+            print( '{}\t{}\t{}\t{}\t{}\t{}'.format(col, itv1[n], itv2[n], change, allvals[n], vals[col][change][n]) )
\ No newline at end of file
diff --git a/bin/getCounts3 b/bin/getCounts3
index dd7ec5f..5fb935b 100755
--- a/bin/getCounts3
+++ b/bin/getCounts3
@@ -19,6 +19,17 @@ load(inRdata)
 dat<-dat[which(dat$SameCodon == 1), ]
 uniqued<-dat[!duplicated(dat[c("Id1","Id2","Cons1","Cons2")]),]
 
+# Save uniqued positions for both pos1 and pos2
+pos1 <- uniqued %>%
+    select(Chr, Pos1) %>%
+    unique()
+write_delim(pos1, paste("uniquePos1.", popu, ".tsv", sep=""), delim="\t", col_names=FALSE)
+pos2 <- uniqued %>%
+    select(Chr, Pos2) %>%
+    unique()
+write_delim(pos2, paste("uniquePos2.", popu, ".tsv", sep=""), delim="\t", col_names=FALSE)
+
+# Continue with the analyses
 indCounts<-vector()
 indCounts_uniqued<-vector()
 doubleCounts<-vector()
diff --git a/include/process/mutyper.nf b/include/process/mutyper.nf
index 05c6939..ec30b38 100755
--- a/include/process/mutyper.nf
+++ b/include/process/mutyper.nf
@@ -150,10 +150,10 @@ process consequence_table {
 process count_mutations {
     tag "medium"
     label "medium"
-    publishDir "${params.outdir}/mutyper/full_counts", mode: "${params.publish_dir_mode}", overwrite: true
+    publishDir "${params.outdir}/${outpath}", mode: "${params.publish_dir_mode}", overwrite: true
 
     input:
-    tuple val(k), path(vcf), path(tbi), path(levels)
+    tuple val(k), path(vcf), path(tbi), path(levels), val(outpath)
 
     output:
     tuple val(k), path("mutationSpectra_${params.species.capitalize()}_${k}.tsv")
@@ -374,4 +374,33 @@ process normalize_results {
     touch ${counts.baseName}.Knorm.csv
     touch ${counts.baseName}.KCnorm.csv
     """
+}
+
+process allele_frequencies {
+    tag 'kcnt'
+    label 'medium'
+    publishDir "${params.outdir}/mutyper/af_bins", mode: "${params.publish_dir_mode}", overwrite: true
+
+    input:
+    tuple val(k), path(vcf), path(tbi)
+    path groups
+    val af_fields
+
+    output:
+    path "frequency_bins_${params.species.capitalize()}.K${k}.tsv", emit: bins
+    path "frequencies_${params.species.capitalize()}.K${k}.tsv.gz", emit: afs
+
+    script:
+    """
+    bcftools +fill-tags ${vcf} -- -S ${groups} -t AF | \
+        bcftools query -H -f "%CHROM\t%POS\t%REF\t%ALT\t%FILTER\t%mutation_type\t%${af_fields}\n" | \
+        tee >( FreqBinner /dev/stdin > frequency_bins_${params.species.capitalize()}.K${k}.tsv ) | \
+        gzip -c > frequencies_${params.species.capitalize()}.K${k}.tsv.gz
+    """
+
+    stub:
+    """
+    touch frequencies_${params.species.capitalize()}.K${k}.tsv.gz
+    touch frequency_bins_${params.species.capitalize()}.K${k}.tsv
+    """
 }
\ No newline at end of file
diff --git a/include/process/sdm.nf b/include/process/sdm.nf
index 72f31d3..ca22b9a 100755
--- a/include/process/sdm.nf
+++ b/include/process/sdm.nf
@@ -90,8 +90,10 @@ process count_sdm {
 
 
     output:
-    path "*.txt"
-    path "indChanges.${samplename}.RData"
+    path "*.txt", emit: 'counts'
+    path "indChanges.${samplename}.RData", emit: 'rdata'
+    path "uniquePos1.${samplename}.tsv", emit: 'first_change'
+    path "uniquePos2.${samplename}.tsv", emit: 'second_change'
 
     script:
     """
@@ -109,6 +111,8 @@ process count_sdm {
     touch changeCounts_uniqued.${samplename}.txt
     touch doubleCounts.${samplename}.txt
    touch doubleCounts_uniqued.${samplename}.txt
+    touch uniquePos1.${samplename}.tsv
+    touch uniquePos2.${samplename}.tsv
     """
 }
 
@@ -222,4 +226,36 @@ process sdm_matrix {
     """
     touch sdm_${params.species.capitalize()}_K3.csv
     """
+}
+
+
+process fetch_sites {
+    tag "sdm"
+    publishDir "${params.outdir}/sdm/single_changes/", mode: "${params.publish_dir_mode}", overwrite: true
+    conda {params.enable_conda ? "${baseDir}/envs/sdm-environment.yml" : null}
+
+
+    input:
+    tuple val(chroms), path("vcfs/*"), path("vcfs/*")
+    path ancfasta
+    path ancfai
+    path positions
+
+    output:
+    tuple path("${positions.baseName}.vcf.gz"), path("${positions.baseName}.vcf.gz.tbi")
+
+    script:
+    """
+    bcftools concat vcfs/*.vcf.gz -a -O u | \
+        bcftools sort -O u -T ./ -m 2G | \
+        bcftools view -R ${positions} -Ov | \
+        mutyper variants --k 3 ${ancfasta} | \
+        bcftools view -Oz --write-index=tbi -o ${positions.baseName}.vcf.gz
+    """
+
+    stub:
+    """
+    touch ${positions.baseName}.vcf.gz
+    touch ${positions.baseName}.vcf.gz.tbi
+    """
 }
\ No newline at end of file
diff --git a/include/workflow/mutyper.nf b/include/workflow/mutyper.nf
index ed625e6..f6be509 100755
--- a/include/workflow/mutyper.nf
+++ b/include/workflow/mutyper.nf
@@ -56,7 +56,7 @@ workflow MUTYPER {
     | filter{ it[0].toInteger() < 8 }
     | map{
         k, vcf_fn, tbi_fn ->
-            [k, vcf_fn, tbi_fn, file("${baseDir}/assets/K${k}_mutations.txt")]
+            [k, vcf_fn, tbi_fn, file("${baseDir}/assets/K${k}_mutations.txt"), "mutyper/full_counts"]
     }
     | count_mutations
 
diff --git a/include/workflow/sdm.nf b/include/workflow/sdm.nf
index 2b9ab44..bdbfb64 100755
--- a/include/workflow/sdm.nf
+++ b/include/workflow/sdm.nf
@@ -2,13 +2,17 @@ include { get_individuals; get_breeds; } from "../process/prerun"
 include { sdm; filter_sdm; count_sdm } from "../process/sdm"
 include { make_ksfs; sdm_plot } from "../process/sdm"
 include { repeat_mask_split_sdm} from "../process/sdm"
+include { fetch_sites } from '../process/sdm.nf'
 include { sdm_matrix } from '../process/sdm.nf'
+include { count_mutations } from '../process/mutyper.nf'
 
 workflow SDM {
     take:
         vcf_by_chr
         reffasta
         reffai
+        ancfasta
+        ancfai
         masks_ch
         chromosomeList
 
@@ -45,16 +49,30 @@ workflow SDM {
     repeat_mask_split_sdm(filtered_ch.bed.combine(masks_ch))
 
     // Generate outputs
-    count_sdm( filtered_ch.rdata )
+    sdmcounts_ch = count_sdm( filtered_ch.rdata )
 
     // Create matrix of SDMs
-    count_sdm.out[0] | collect | sdm_matrix
+    sdmcounts_ch.counts | collect | sdm_matrix
+
+    // We collect first and second changes in a full list
+    first_change = sdmcounts_ch.first_change
+    | collectFile("${params.outdir}/sdm_first_change.txt")
+    second_change = sdmcounts_ch.second_change
+    | collectFile("${params.outdir}/sdm_second_change.txt")
+    // Extract the changes
+    fetch_sites( vcf_by_chr | collect(), ancfasta, ancfai, first_change | mix(second_change) )
+    | map {
+        vcf, tbi ->
+            tuple( "3", vcf, tbi, file("${baseDir}/assets/K3_mutations.txt"), "sdm/${vcf.simpleName}" )
+    }
+    // Count individual mutations spectrums
+    | count_mutations
 
     // Prepare Ksfs files
     raw_sdm | make_ksfs
 
     // Make plots for sdm results
-    all_counts = count_sdm.out[0]
+    all_counts = sdmcounts_ch.counts
     //sdm_plot( breeds_ch, all_counts.collect() )
-    sdm_plot( all_counts.collect() )
+    sdm_plot( all_counts.collect() )
 }
\ No newline at end of file
diff --git a/main.nf b/main.nf
index 096454f..49a809b 100755
--- a/main.nf
+++ b/main.nf
@@ -179,9 +179,9 @@ workflow {
         }
         if (params.mutyper){
             MUTYPER( ancestral_fna, ancestral_fai, ch_masks, vcf_by_chr )
-        }
+        }
         if (params.sdm){
-            SDM( vcf_by_chr, reference_fna, reference_fai, ch_masks, ch_chr_lists )
+            SDM( vcf_by_chr, reference_fna, reference_fai, ancestral_fna, ancestral_fai, ch_masks, ch_chr_lists )
         }
     }
 }
\ No newline at end of file
diff --git a/nextflow.config b/nextflow.config
index 8a5a6ec..be1623c 100755
--- a/nextflow.config
+++ b/nextflow.config
@@ -148,5 +148,5 @@ manifest {
     description = 'Nextflow mutation spectra analysis workflow.'
     mainScript = 'main.nf'
     nextflowVersion = '>=21.04.0'
-    version = '1.1.3'
+    version = '1.1.4'
 }
\ No newline at end of file

From 66ce97c410208959fa1e863923f10896eb06ab66 Mon Sep 17 00:00:00 2001
From: Andrea Talenti <23279528+RenzoTale88@users.noreply.github.com>
Date: Wed, 1 Oct 2025 12:38:43 +0100
Subject: [PATCH 2/8] Up version

---
 CHANGELOG.md    | 4 ++++
 nextflow.config | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cad410b..6841a7e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,10 @@
 # Changelog
 All new changes are documented here.
 
+## [v1.1.5]
+### Added
+- Add mutation profile of individual elements of the SDMs
+
 ## [v1.1.4]
 ### Added
 - Add process to compute the binned allele frequencies for the derived alleles in each group
diff --git a/nextflow.config b/nextflow.config
index be1623c..dea2f64 100755
--- a/nextflow.config
+++ b/nextflow.config
@@ -148,5 +148,5 @@ manifest {
     description = 'Nextflow mutation spectra analysis workflow.'
     mainScript = 'main.nf'
     nextflowVersion = '>=21.04.0'
-    version = '1.1.4'
+    version = '1.1.5'
 }
\ No newline at end of file

From 7704cc4d373352f94d212f0e2acc80658bc35037 Mon Sep 17 00:00:00 2001
From: Andrea Talenti <23279528+RenzoTale88@users.noreply.github.com>
Date: Thu, 2 Oct 2025 09:51:01 +0100
Subject: [PATCH 3/8] Bop

---
 bin/getCounts3 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/getCounts3 b/bin/getCounts3
index 5fb935b..a8822bb 100755
--- a/bin/getCounts3
+++ b/bin/getCounts3
@@ -20,11 +20,11 @@ dat<-dat[which(dat$SameCodon == 1), ]
 uniqued<-dat[!duplicated(dat[c("Id1","Id2","Cons1","Cons2")]),]
 
 # Save uniqued positions for both pos1 and pos2
-pos1 <- uniqued %>%
+pos1 <- dat %>%
     select(Chr, Pos1) %>%
     unique()
 write_delim(pos1, paste("uniquePos1.", popu, ".tsv", sep=""), delim="\t", col_names=FALSE)
-pos2 <- uniqued %>%
+pos2 <- dat %>%
     select(Chr, Pos2) %>%
     unique()
 write_delim(pos2, paste("uniquePos2.", popu, ".tsv", sep=""), delim="\t", col_names=FALSE)

From d6bca7dd4162fb456edbc407e10ca020e2dfd0cf Mon Sep 17 00:00:00 2001
From: Andrea Talenti <23279528+RenzoTale88@users.noreply.github.com>
Date: Thu, 2 Oct 2025 09:59:30 +0100
Subject: [PATCH 4/8] Add allelic frequencies calculation

---
 include/process/mutyper.nf  |  3 ++-
 include/workflow/mutyper.nf | 27 +++++++++++++++++++++++++--
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/include/process/mutyper.nf b/include/process/mutyper.nf
index ec30b38..e4a0cdc 100755
--- a/include/process/mutyper.nf
+++ b/include/process/mutyper.nf
@@ -391,9 +391,10 @@ process allele_frequencies {
     path "frequencies_${params.species.capitalize()}.K${k}.tsv.gz", emit: afs
 
     script:
+    def fields = af_fields.join("\t")
     """
     bcftools +fill-tags ${vcf} -- -S ${groups} -t AF | \
-        bcftools query -H -f "%CHROM\t%POS\t%REF\t%ALT\t%FILTER\t%mutation_type\t%${af_fields}\n" | \
+        bcftools query -H -f "%CHROM\t%POS\t%REF\t%ALT\t%FILTER\t%mutation_type\t${fields}\n" | \
         tee >( FreqBinner /dev/stdin > frequency_bins_${params.species.capitalize()}.K${k}.tsv ) | \
         gzip -c > frequencies_${params.species.capitalize()}.K${k}.tsv.gz
     """
diff --git a/include/workflow/mutyper.nf b/include/workflow/mutyper.nf
index f6be509..bce8dcc 100755
--- a/include/workflow/mutyper.nf
+++ b/include/workflow/mutyper.nf
@@ -3,7 +3,7 @@ include { chromosomeList } from '../process/prerun'
 include { group_results; plot_results; ksfs } from '../process/mutyper'
 include { mutyper_variant; mutyper_spectra; mutyper_concat } from '../process/mutyper'
 include { count_mutations; count_mutations_csq; consequence_table } from '../process/mutyper'
-include { kmercount; normalize_results } from '../process/mutyper'
+include { kmercount; normalize_results; allele_frequencies } from '../process/mutyper'
 
 
 // Workflow
@@ -16,7 +16,8 @@ workflow MUTYPER {
 
     main:
     // check if there is a popfile, otherwise create one
-    breeds_file = Channel.fromPath("${params.pops_folder}/*.txt")
+    popfiles_ch = Channel.fromPath("${params.pops_folder}/*.txt")
+    breeds_file = popfiles_ch
     | map {
         fname ->
             [fname.baseName, fname]
@@ -51,6 +52,28 @@ workflow MUTYPER {
     // In the meanwhile, group the mutyper VCFs and concatenate them
     mutyper_vcf_ch = mutyper_variant.out | groupTuple(by: 0) | mutyper_concat
 
+    // Compute allele frequencies
+    bcftools_pops_ch = popfiles_ch
+    | map {
+        fname ->
+            [fname.baseName, fname]
+    }
+    | splitCsv
+    | map {
+        pop, iid -> [pop] + iid
+    }
+    | collectFile {
+        pop, sample ->
+            [ "bcftools_meta.txt", "${sample}\t${pop}\n" ]
+    }
+    colnames_ch = popfiles_ch
+    | map {
+        fname ->
+            ["%AF_${fname.baseName}"]
+    }
+    | collect
+    allele_frequencies(mutyper_vcf_ch, bcftools_pops_ch, colnames_ch)
+
     /* Generate the counts manually */
     mutyper_vcf_ch
     | filter{ it[0].toInteger() < 8 }

From a8b703b6260bfe63a897558cd1c03eb3677b5c1d Mon Sep 17 00:00:00 2001
From: Andrea Talenti <23279528+RenzoTale88@users.noreply.github.com>
Date: Thu, 2 Oct 2025 10:00:52 +0100
Subject: [PATCH 5/8] C.log

---
 CHANGELOG.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6841a7e..1ed397f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,8 @@ All new changes are documented here.
 
 ## [v1.1.5]
 ### Added
-- Add mutation profile of individual elements of the SDMs
+- Mutation profile of the individual variants composing the SDMs (first and second changes)
+- Allele frequencies in the different populations considered
 
 ## [v1.1.4]
 ### Added

From 6a671648018366b72615b7304ca4e74080e49049 Mon Sep 17 00:00:00 2001
From: Andrea Talenti <23279528+RenzoTale88@users.noreply.github.com>
Date: Thu, 2 Oct 2025 13:02:34 +0100
Subject: [PATCH 6/8] Fix variant extraction

---
 include/workflow/sdm.nf | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/workflow/sdm.nf b/include/workflow/sdm.nf
index bdbfb64..b5347c5 100755
--- a/include/workflow/sdm.nf
+++ b/include/workflow/sdm.nf
@@ -37,7 +37,7 @@ workflow SDM {
     // Run dinuc pipeline
     raw_sdm = breeds_ch
     | combine(
-        sdm( combined_ch, reffasta, reffai )
+        sdm( combined_ch, ancfasta, ancfai )
         | groupTuple(by: 0),
         by: 0
     )
@@ -56,17 +56,18 @@ workflow SDM {
 
     // We collect first and second changes in a full list
     first_change = sdmcounts_ch.first_change
-    | collectFile("${params.outdir}/sdm_first_change.txt")
+    | splitCsv(sep: "\t")
+    | collectFile(name: "sdm_first_change.txt", storeDir: "${params.outdir}/sdm/single_changes/", newLine: true)
     second_change = sdmcounts_ch.second_change
-    | collectFile("${params.outdir}/sdm_second_change.txt")
+    | splitCsv(sep: "\t")
+    | collectFile(name: "sdm_second_change.txt", storeDir: "${params.outdir}/sdm/single_changes/", newLine: true)
+
     // Extract the changes
-    fetch_sites( vcf_by_chr | collect(), ancfasta, ancfai, first_change | mix(second_change) )
+    fetch_sites( vcf_by_chr | collect, ancfasta, ancfai, first_change | mix(second_change) )
     | map {
         vcf, tbi ->
             tuple( "3", vcf, tbi, file("${baseDir}/assets/K3_mutations.txt"), "sdm/${vcf.simpleName}" )
-    }
-    // Count individual mutations spectrums
-    | count_mutations
+    } | count_mutations // Count individual mutations spectrums
 
     // Prepare Ksfs files
     raw_sdm | make_ksfs

From 0da231be4047c8fa788dfae3c09954aa4e7f431e Mon Sep 17 00:00:00 2001
From: Andrea Talenti <23279528+RenzoTale88@users.noreply.github.com>
Date: Thu, 2 Oct 2025 13:03:01 +0100
Subject: [PATCH 7/8] Reverse change

---
 bin/maf2snp.py          | 111 ++++++++++++++++++
 bin/sfs2allele.py       | 123 ++++++++++++++++++++
 bin/snps2ests.py        | 245 ++++++++++++++++++++++++++++++++++++++++
 include/workflow/sdm.nf |   2 +-
 4 files changed, 480 insertions(+), 1 deletion(-)
 create mode 100644 bin/maf2snp.py
 create mode 100644 bin/sfs2allele.py
 create mode 100644 bin/snps2ests.py

diff --git a/bin/maf2snp.py b/bin/maf2snp.py
new file mode 100644
index 0000000..b0554e7
--- /dev/null
+++ b/bin/maf2snp.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+import argparse
+from itertools import zip_longest
+
+
+def parse_args():
+    # Argument definition
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-m",
+        "--maf",
+        metavar="maffile.maf",
+        type=str,
+        help="Input alignments",
+        required=True,
+    )
+    parser.add_argument(
+        "-o",
+        "--out",
+        metavar="prefix",
+        type=str,
+        help="Output file prefix.",
+        required=False,
+        default="snps",
+    )
+    parser.add_argument(
+        "-d",
+        "--delim",
+        metavar="char",
+        type=str,
+        help="Delimiter.",
+        required=False,
+        default=",",
+    )
+    parser.add_argument(
+        "--genomes",
+        type=str,
+        help="Genomes to consider.",
+        nargs="+",
+        required=True,
+    )
+    return parser.parse_args()
+
+
+def reader(maf):
+    lines = []
+    infile = open(maf)
+    for line in infile:
+        if "#" in line:
+            continue
+        elif line[0] == "a":
+            continue
+        elif line[0] == "s":
+            lines.append(line)
+            continue
+        elif line.strip() == "" and len(lines) != 0:
+            yield lines
+            lines = []
+        else:
+            lines = []
+    if len(lines) > 0:
+        yield lines
+
+
+SFX = {
+    ",": "csv",
+    ";": "csv",
+    "\t": "tsv",
+    " ": "txt",
+}
+
+
+def get_matching_prefix(string, prefixes):
+    for prefix in prefixes:
+        if string.startswith(prefix):
+            return prefix
+    raise Exception(f'Prefix not found for: {string}')
+
+
+def main():
+    # Get arguments
+    args = parse_args()
+    # Define unique list of input genomes
+    genomes = set(args.genomes)
+
+    # Prepare output
+    if args.out == '-' or args.out == '/dev/stdout':
+        ofname = '/dev/stdout'
+    else:
+        ofname = f"{args.out}.{SFX.get(args.delim, 'txt')}"
+    outfile = open(ofname, "w")
+
+    # Process the data
+    outfile.write("CHROM\tPOS\t{}\n".format("\t".join(args.genomes)))
+    for n, aln in enumerate(reader(args.maf)):
+        _, chrxspp, bpi, alnlen, _, _, _ = aln[0].split()
+        try:
+            _, chromosome = chrxspp.split(".", 1)
+        except:
+            raise Exception(f"Impossible to split: {chrxspp}")
+        splits = [a.split() for a in aln]
+        sequences = {get_matching_prefix(a[1], args.genomes): a[-1] for a in splits}
+        for genome in genomes.difference(set(sequences.keys())):
+            sequences[genome] = "-" * int(alnlen)
+        zipped = list(zip_longest(*[sequences[g] for g in args.genomes]))
+        for n, bp in enumerate(range(int(bpi), int(bpi) + int(alnlen))):
+            outfile.write("{}\t{}\t{}\n".format(chromosome, bp, "\t".join(zipped[n])))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/sfs2allele.py b/bin/sfs2allele.py
new file mode 100644
index 0000000..5fe58cf
--- /dev/null
+++ b/bin/sfs2allele.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+import argparse
+import sys
+
+import numpy as np
+
+
+def parse_args():
+    # Argument definition
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-s",
+        "--sfs",
+        metavar="TXT",
+        type=str,
+        help="SFS probabilities",
probabilities", + required=True, + ) + parser.add_argument( + "-i", + "--input", + metavar="TXT", + type=str, + help="Input file for EST-SFS", + required=True, + ) + parser.add_argument( + "-S", + "--sites", + metavar="TXT", + type=str, + help="Site positions for the SFS input", + required=False, + ) + parser.add_argument( + "-o", + "--out", + metavar="prefix", + type=str, + help="Output ancestral allele files.", + required=False, + default="-", + ) + return parser.parse_args() + + +NTS = ['A', 'C', 'G', 'T'] + + +def get_ancestral(maj_p, maj_a, nodes, counts): + """Define the ancestral allele.""" + # If major allele is more likely than the others, return it + if maj_p > 0.5: + return maj_a + else: + alleles = np.array(NTS) + counts = np.array(map(int, counts)) + min_allele_idx = (counts != np.max(counts)) & (counts != 0) + min_alleles = set(alleles[min_allele_idx]) + isecs = set(nodes).intersection(min_alleles) + # If one Nt only is possible, return it + if len(isecs) == 1: + return isecs[0] + # Otherwise, return the oldest node in the tree + else: + return nodes[-1] + + + +def main(): + # Get arguments + args = parse_args() + + # Get major allele for each site + majors = {} + with open(args.input) as inputfile: + for n, line in enumerate(inputfile): + allele_cnts = [int(i) for i in line.strip().split()[0].split(',')] + majors[str(n+1)] = { + 'major': NTS[np.argmax(allele_cnts)], + 'vector' : line.strip().split()[0], + 'chrom': None, + 'pos': None + } + + # Add site mapping + if args.sites: + with open(args.sites) as sitesfile: + for n, line in enumerate(sitesfile): + chrom, pos = line.strip().split() + majors[str(n+1)]['chrom'] = chrom + majors[str(n+1)]['pos'] = pos + + # Define output + if args.out == '-' or args.out == '/dev/stdout': + out = sys.stdout + else: + out = open(args.out, 'w') + + # Define the ancestral based on the probs + nts_combinations = [ NT1+NT2 for NT1 in NTS for NT2 in NTS ] + with open(args.sfs) as sfs: + for line in sfs: + if line[0] == '0': + continue + site_n, _, maj_prob, probs = line.strip().split(' ', 3) + maj_prob = float(maj_prob) + outgr_probs = tuple(map(float, probs.split(' '))) + max_og_idx = np.argmax(outgr_probs) + maj_allele = majors[site_n]['major'] + vect = majors[site_n]['vector'] + nodes = nts_combinations[max_og_idx] + ancestral = get_ancestral(maj_prob, maj_allele, nodes, vect.split(',')) + if args.sites: + chrom = majors[site_n]['chrom'] + pos = majors[site_n]['pos'] + out.write(f"{site_n}\t{chrom}\t{pos}\t{maj_prob}\t{outgr_probs[max_og_idx]}\t{maj_allele}\t{vect}\t{nodes}\t{ancestral}\n") + else: + out.write(f"{site_n}\t{maj_prob}\t{outgr_probs[max_og_idx]}\t{maj_allele}\t{vect}\t{nodes}\t{ancestral}\n") + + +if __name__ == "__main__": + main() diff --git a/bin/snps2ests.py b/bin/snps2ests.py new file mode 100644 index 0000000..bc60de6 --- /dev/null +++ b/bin/snps2ests.py @@ -0,0 +1,245 @@ +#!/usr/bin/env python +import argparse +import gzip + +from tqdm import tqdm + + +def parse_args(): + # Argument definition + parser = argparse.ArgumentParser() + parser.add_argument( + "-i", + "--input", + metavar="TXT", + type=str, + help="Input txt file with the positions (coordinates are zero-based)", + required=True, + ) + parser.add_argument( + "-s", + "--sites", + metavar="TXT", + type=str, + help="Input list of sites to keep (coordinates are zero-based)", + required=False, + ) + parser.add_argument( + "--sitefile", + metavar="TXT", + type=str, + help="List of sites that have been found.", + required=True, + ) + parser.add_argument( + "-d", + 
"--datafile", + metavar="prefix", + type=str, + help="Output data file name.", + required=False, + default="data-file.txt", + ) + parser.add_argument( + "-c", + "--configfile", + metavar="prefix", + type=str, + help="Output config file name.", + required=False, + default="config-file.txt", + ) + parser.add_argument( + "--targets", + type=str, + help="Genomes for the target species.", + nargs="+", + required=True, + ) + parser.add_argument( + "--outgroup1", + type=str, + help="Outgroup 1 name.", + required=True, + ) + parser.add_argument( + "--outgroup2", + type=str, + help="Outgroup 2 name.", + required=False, + ) + parser.add_argument( + "--outgroup3", + type=str, + help="Outgroup 3 name.", + required=False, + ) + parser.add_argument( + "--model", + type=int, + help="Model for ESFS.", + default=0, + choices=[0, 1, 2], + required=False, + ) + parser.add_argument( + "--nrandom", + type=int, + help="Random iteration for the ESFS.", + default=0, + required=False, + ) + return parser.parse_args() + + +def parse_gzipped_line(line): + return line.decode().strip().split('\t') + + +def parse_simple_line(line): + return line.strip().split('\t') + + +def main(): + # Run main + args = parse_args() + + # Define input variables + n_fields = 0 + targets = set(args.targets) + outgroup1 = args.outgroup1 + outgroup2 = args.outgroup2 + outgroup3 = args.outgroup3 + if outgroup2 is None and outgroup3 is not None: + raise Exception("Provided outgroup 3 without outgroup 2") + n_outgroups = 1 + if outgroup2 is not None: + n_outgroups = 2 + if outgroup3 is not None: + n_outgroups = 3 + # Variable indexes instantiate + tgt_idxs = [] + og1_idx = None + og2_idx = None + og3_idx = None + # Define the values to return + # as counts of [A, C, G, T] + cnt_idxs = {'a': 0, 'A': 0, 'c': 1, 'C': 1, 'g': 2, 'G': 2, 't': 3, 'T': 3} + target_cnt = [0, 0, 0, 0] + outgroup1_cnt = [0, 0, 0, 0] + outgroup2_cnt = [0, 0, 0, 0] + outgroup3_cnt = [0, 0, 0, 0] + + # Load sites + sites = {} + if args.sites: + print("Loading target sites...") + for site in open(args.sites): + site = site.strip().split() + sites[f"{site[0]}_{site[1]}"] = True + print("Found {} sites".format(len(sites))) + + # define the loader + if args.input.endswith('.gz'): + buffer = gzip.open + parser = parse_gzipped_line + else: + buffer = open + parser = parse_simple_line + + # Output file + file_counter = 1 + row_counter = 0 + outfile = open(f"{args.datafile}.{file_counter}.txt", 'w') + sitefile = open(f"{args.sitefile}.{file_counter}.txt", 'w') + # Prepare progress bar for the file + # Process the input + print("Processing input file") + print(f"Saving to file #: {file_counter}") + pbar = tqdm(total=1000000) + for n,line in enumerate(buffer(args.input)): + line = parser(line) + # First, we work out the header positions + if n == 0: + n_fields = len(line) + # Prepare the index for the targets + for tgt in targets: + if tgt not in line: + raise Exception(f"Missing key in header: {tgt}") + tgt_idxs.append( line.index(tgt) ) + # At least one outgroup is needed + if outgroup1 not in line: + raise Exception(f"Missing outgroup in header: {outgroup1}") + og1_idx = line.index(outgroup1) + # Add at most three outgroups + if outgroup2: + if outgroup2 not in line: + raise Exception(f"Missing outgroup in header: {outgroup2}") + og2_idx = line.index(outgroup2) + if outgroup3: + if outgroup3 not in line: + raise Exception(f"Missing outgroup in header: {outgroup3}") + og3_idx = line.index(outgroup3) + continue + if sites and not sites.get(f"{line[0]}_{line[1]}", False): + 
+            continue
+        # If there are already 1M rows, close the file and proceed to the next file
+        if row_counter == 1000000:
+            pbar.close()
+            print(f"Closing file {file_counter} with 1M sites")
+            sitefile.close()
+            outfile.close()
+            row_counter = 0
+            file_counter += 1
+            print(f"Saving to file {file_counter}")
+            pbar = tqdm(total=1000000)
+            outfile = open(f"{args.datafile}.{file_counter}.txt", 'w')
+            sitefile = open(f"{args.sitefile}.{file_counter}.txt", 'w')
+        # Write site id in the file
+        sitefile.write("{}\n".format('\t'.join(line[0:2])))
+        while len(line) < n_fields:
+            line.append('')
+        # Then, we process the rest
+        for idx in tgt_idxs:
+            base = line[idx]
+            if base in cnt_idxs.keys():
+                target_cnt[cnt_idxs[base]] += 1
+        base = line[og1_idx]
+        if base in cnt_idxs.keys():
+            outgroup1_cnt[cnt_idxs[base]] += 1
+        if og2_idx is not None:
+            base = line[og2_idx]
+            if base in cnt_idxs.keys():
+                outgroup2_cnt[cnt_idxs[base]] += 1
+        if og3_idx is not None:
+            base = line[og3_idx]
+            if base in cnt_idxs.keys():
+                outgroup3_cnt[cnt_idxs[base]] += 1
+        outfile.write('{} {} {} {}\n'.format(
+            ','.join(map(str, target_cnt)),
+            ','.join(map(str, outgroup1_cnt)),
+            ','.join(map(str, outgroup2_cnt)),
+            ','.join(map(str, outgroup3_cnt)),
+        ))
+        target_cnt = [0, 0, 0, 0]
+        outgroup1_cnt = [0, 0, 0, 0]
+        outgroup2_cnt = [0, 0, 0, 0]
+        outgroup3_cnt = [0, 0, 0, 0]
+        row_counter += 1
+        pbar.update(1)
+
+    # Final closing of the output files & progress bar
+    pbar.close()
+    sitefile.close()
+    outfile.close()
+
+    # Then, save the configuration file
+    print("Save configuration file")
+    with open(args.configfile, 'w') as outcfg:
+        outcfg.write(f"n_outgroup {n_outgroups}\n")
+        outcfg.write(f"model {args.model}\n")
+        outcfg.write(f"nrandom {args.nrandom}\n")
+    print("All done\n\n")
+
+if __name__ == "__main__":
+    main()
diff --git a/include/workflow/sdm.nf b/include/workflow/sdm.nf
index b5347c5..f15a17a 100755
--- a/include/workflow/sdm.nf
+++ b/include/workflow/sdm.nf
@@ -37,7 +37,7 @@ workflow SDM {
     // Run dinuc pipeline
     raw_sdm = breeds_ch
     | combine(
-        sdm( combined_ch, ancfasta, ancfai )
+        sdm( combined_ch, reffasta, reffai )
         | groupTuple(by: 0),
         by: 0
     )

From b783709e9ca74021a56b7c3aa6f8a06aa1b2383e Mon Sep 17 00:00:00 2001
From: Andrea Talenti <23279528+RenzoTale88@users.noreply.github.com>
Date: Tue, 7 Oct 2025 09:38:21 +0100
Subject: [PATCH 8/8] Working single changes

---
 include/process/sdm.nf  | 12 +++++++-----
 include/workflow/sdm.nf | 16 +++++++++++++---
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/include/process/sdm.nf b/include/process/sdm.nf
index ca22b9a..79c1d9f 100755
--- a/include/process/sdm.nf
+++ b/include/process/sdm.nf
@@ -236,7 +236,8 @@ process fetch_sites {
 
 
     input:
-    tuple val(chroms), path("vcfs/*"), path("vcfs/*")
+    path(vcfs, stageAs: "vcfs/*")
+    path(tbis, stageAs: "vcfs/*")
     path ancfasta
     path ancfai
     path positions
@@ -247,10 +248,11 @@ process fetch_sites {
     script:
     """
     bcftools concat vcfs/*.vcf.gz -a -O u | \
-        bcftools sort -O u -T ./ -m 2G | \
-        bcftools view -R ${positions} -Ov | \
-        mutyper variants --k 3 ${ancfasta} | \
-        bcftools view -Oz --write-index=tbi -o ${positions.baseName}.vcf.gz
+        bcftools sort -O u -T ./ -m 2G - | \
+        bcftools view -T <( sort -k1,1 -k2,2n ${positions} ) -Ov - | \
+        mutyper variants --k 3 ${ancfasta} - | \
+        bcftools view -Oz - > ${positions.baseName}.vcf.gz && \
+        tabix -p vcf ${positions.baseName}.vcf.gz
     """
 
     stub:
diff --git a/include/workflow/sdm.nf b/include/workflow/sdm.nf
index f15a17a..a5aae7b 100755
--- a/include/workflow/sdm.nf
+++ b/include/workflow/sdm.nf
@@ -57,13 +57,23 @@ workflow SDM {
     // We collect first and second changes in a full list
     first_change = sdmcounts_ch.first_change
     | splitCsv(sep: "\t")
-    | collectFile(name: "sdm_first_change.txt", storeDir: "${params.outdir}/sdm/single_changes/", newLine: true)
+    | collectFile(storeDir: "${params.outdir}/sdm/single_changes/", sort: true){
+        chrom, pos ->
+            [ "sdm_first_change.txt", "${chrom}\t${pos}\n" ]
+    }
     second_change = sdmcounts_ch.second_change
     | splitCsv(sep: "\t")
-    | collectFile(name: "sdm_second_change.txt", storeDir: "${params.outdir}/sdm/single_changes/", newLine: true)
+    | collectFile(storeDir: "${params.outdir}/sdm/single_changes/", sort: true){
+        chrom, pos ->
+            [ "sdm_second_change.txt", "${chrom}\t${pos}\n" ]
+    }
 
     // Extract the changes
-    fetch_sites( vcf_by_chr | collect, ancfasta, ancfai, first_change | mix(second_change) )
+    fetch_sites(
+        vcf_by_chr | map{_chrom, vcf, _tbi -> [vcf]},
+        vcf_by_chr | map{_chrom, _vcf, tbi -> [tbi]},
+        ancfasta, ancfai,
+        first_change | mix(second_change) )
     | map {
         vcf, tbi ->
             tuple( "3", vcf, tbi, file("${baseDir}/assets/K3_mutations.txt"), "sdm/${vcf.simpleName}" )