From 23c66b241f1992f63afa0f0247214ef2aaecbeff Mon Sep 17 00:00:00 2001
From: Andrea Talenti <23279528+RenzoTale88@users.noreply.github.com>
Date: Wed, 1 Oct 2025 12:37:36 +0100
Subject: [PATCH 1/8] Add individual changes counts

---
 CHANGELOG.md                |  4 +++
 bin/FreqBinner              | 51 ++++++++++++++++++++++++++++++++++++++
 bin/getCounts3              | 11 ++++++++
 include/process/mutyper.nf  | 33 ++++++++++++++++++++++--
 include/process/sdm.nf      | 40 +++++++++++++++++++++++++++--
 include/workflow/mutyper.nf |  2 +-
 include/workflow/sdm.nf     | 26 ++++++++++++++++---
 main.nf                     |  4 +--
 nextflow.config             |  2 +-
 9 files changed, 161 insertions(+), 12 deletions(-)
 create mode 100755 bin/FreqBinner

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 763d934..cad410b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,10 @@
 # Changelog
 All new changes are documented here.
 
+## [v1.1.4]
+### Added
+- Add process to compute the binned allele frequencies for the derived alleles in each group
+
 ## [v1.1.3]
 ### Fixed
 - `bcftools sort` failing in some single jobs in distributed systems
diff --git a/bin/FreqBinner b/bin/FreqBinner
new file mode 100755
index 0000000..2b9e449
--- /dev/null
+++ b/bin/FreqBinner
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+import sys, gzip
+import numpy as np
+
+itv1 = np.round(
+    np.concatenate((
+        np.arange(0.0, 0.001, 0.0001),
+        np.arange(0.001, 0.01, 0.001),
+        np.arange(0.01, 0.1, 0.01),
+        np.arange(0.1, 1, 0.1) )),
+    decimals=5
+)
+itv2 = np.concatenate((itv1[1:], np.array([1.])))
+
+if sys.argv[1] == '/dev/stdin':
+    opn = sys.stdin
+elif '.gz' in sys.argv[1]:
+    opn = gzip.open(sys.argv[1])
+else:
+    opn = open(sys.argv[1])
+
+for n,line in enumerate(opn):
+    try:
+        line = line.decode()
+    except:
+        pass
+    if n == 0:
+        line = line.strip().split('\t')
+        #vals = { i: [np.zeros(len(itv1), dtype = 'uint64'), np.zeros(len(itv1), dtype = 'uint64')] for i in line[5:] }
+        vals = { i: {'all' : np.zeros(len(itv1), dtype = 'uint64') } for i in line[6:] }
+        cols = line[6:]
+        continue
+    line = line.strip().split('\t')
+    values = map(float, line[6:])
+    for (col, val) in zip(cols, values):
+        if val > 0:
+            binning = np.where(np.logical_and(val <= itv2, val > itv1))[0][0]
+            vals[col]['all'][ binning ] += 1
+            if line[5] in vals[col].keys():
+                vals[col][line[5]][binning] += 1
+            else:
+                vals[col][line[5]] = np.zeros(len(itv1), dtype = 'uint64')
+                vals[col][line[5]][binning] += 1
+
+print('COL\tbinI\tbinE\tChange\ttotChanges\tnChange')
+for col in vals:
+    allvals = vals[col]['all']
+    changes = [k for k in vals[col].keys() if k!='all']
+    for change in changes:
+        for n in range(0, len(itv1)):
+            print( '{}\t{}\t{}\t{}\t{}\t{}'.format(col, itv1[n], itv2[n], change, allvals[n], vals[col][change][n]) )
\ No newline at end of file
diff --git a/bin/getCounts3 b/bin/getCounts3
index dd7ec5f..5fb935b 100755
--- a/bin/getCounts3
+++ b/bin/getCounts3
@@ -19,6 +19,17 @@ load(inRdata)
 dat<-dat[which(dat$SameCodon == 1), ]
 uniqued<-dat[!duplicated(dat[c("Id1","Id2","Cons1","Cons2")]),]
 
+# Save uniqued positions for both pos1 and pos2
+pos1 <- uniqued %>%
+    select(Chr, Pos1) %>%
+    unique()
+write_delim(pos1, paste("uniquePos1.", popu, ".tsv", sep=""), delim="\t", col_names=FALSE)
+pos2 <- uniqued %>%
+    select(Chr, Pos2) %>%
+    unique()
+write_delim(pos2, paste("uniquePos2.", popu, ".tsv", sep=""), delim="\t", col_names=FALSE)
+
+# Continue with the analyses
 indCounts<-vector()
 indCounts_uniqued<-vector()
 doubleCounts<-vector()
diff --git a/include/process/mutyper.nf b/include/process/mutyper.nf
index 05c6939..ec30b38 100755
--- a/include/process/mutyper.nf
+++ b/include/process/mutyper.nf
@@ -150,10 +150,10 @@ process consequence_table {
 process count_mutations {
     tag "medium"
     label "medium"
-    publishDir "${params.outdir}/mutyper/full_counts", mode: "${params.publish_dir_mode}", overwrite: true
+    publishDir "${params.outdir}/${outpath}", mode: "${params.publish_dir_mode}", overwrite: true
 
     input:
-    tuple val(k), path(vcf), path(tbi), path(levels)
+    tuple val(k), path(vcf), path(tbi), path(levels), val(outpath)
 
     output:
     tuple val(k), path("mutationSpectra_${params.species.capitalize()}_${k}.tsv")
@@ -374,4 +374,33 @@ process normalize_results {
     touch ${counts.baseName}.Knorm.csv
     touch ${counts.baseName}.KCnorm.csv
     """
+}
+
+process allele_frequencies {
+    tag 'kcnt'
+    label 'medium'
+    publishDir "${params.outdir}/mutyper/af_bins", mode: "${params.publish_dir_mode}", overwrite: true
+
+    input:
+    tuple val(k), path(vcf), path(tbi)
+    path groups
+    val af_fields
+
+    output:
+    path "frequency_bins_${params.species.capitalize()}.K${k}.tsv", emit: bins
+    path "frequencies_${params.species.capitalize()}.K${k}.tsv.gz", emit: afs
+
+    script:
+    """
+    bcftools +fill-tags ${vcf} -- -S ${groups} -t AF | \
+        bcftools query -H -f "%CHROM\t%POS\t%REF\t%ALT\t%FILTER\t%mutation_type\t%${af_fields}\n" | \
+        tee >( FreqBinner /dev/stdin > frequency_bins_${params.species.capitalize()}.K${k}.tsv ) | \
+        gzip -c > frequencies_${params.species.capitalize()}.K${k}.tsv.gz
+    """
+
+    stub:
+    """
+    touch frequencies_${params.species.capitalize()}.K${k}.tsv.gz
+    touch frequency_bins_${params.species.capitalize()}.K${k}.tsv
+    """
 }
\ No newline at end of file
diff --git a/include/process/sdm.nf b/include/process/sdm.nf
index 72f31d3..ca22b9a 100755
--- a/include/process/sdm.nf
+++ b/include/process/sdm.nf
@@ -90,8 +90,10 @@ process count_sdm {
 
 
     output:
-    path "*.txt"
-    path "indChanges.${samplename}.RData"
+    path "*.txt", emit: 'counts'
+    path "indChanges.${samplename}.RData", emit: 'rdata'
+    path "uniquePos1.${samplename}.tsv", emit: 'first_change'
+    path "uniquePos2.${samplename}.tsv", emit: 'second_change'
 
     script:
     """
@@ -109,6 +111,8 @@ process count_sdm {
     touch changeCounts_uniqued.${samplename}.txt
     touch doubleCounts.${samplename}.txt
    touch doubleCounts_uniqued.${samplename}.txt
+    touch uniquePos1.${samplename}.tsv
+    touch uniquePos2.${samplename}.tsv
     """
 }
 
@@ -222,4 +226,36 @@ process sdm_matrix {
     """
     touch sdm_${params.species.capitalize()}_K3.csv
     """
+}
+
+
+process fetch_sites {
+    tag "sdm"
+    publishDir "${params.outdir}/sdm/single_changes/", mode: "${params.publish_dir_mode}", overwrite: true
+    conda {params.enable_conda ? "${baseDir}/envs/sdm-environment.yml" : null}
+
+
+    input:
+    tuple val(chroms), path("vcfs/*"), path("vcfs/*")
+    path ancfasta
+    path ancfai
+    path positions
+
+    output:
+    tuple path("${positions.baseName}.vcf.gz"), path("${positions.baseName}.vcf.gz.tbi")
+
+    script:
+    """
+    bcftools concat vcfs/*.vcf.gz -a -O u | \
+        bcftools sort -O u -T ./ -m 2G | \
+        bcftools view -R ${positions} -Ov | \
+        mutyper variants --k 3 ${ancfasta} | \
+        bcftools view -Oz --write-index=tbi -o ${positions.baseName}.vcf.gz
+    """
+
+    stub:
+    """
+    touch ${positions.baseName}.vcf.gz
+    touch ${positions.baseName}.vcf.gz.tbi
+    """
 }
\ No newline at end of file
diff --git a/include/workflow/mutyper.nf b/include/workflow/mutyper.nf
index ed625e6..f6be509 100755
--- a/include/workflow/mutyper.nf
+++ b/include/workflow/mutyper.nf
@@ -56,7 +56,7 @@ workflow MUTYPER {
     | filter{ it[0].toInteger() < 8 }
     | map{
         k, vcf_fn, tbi_fn ->
-            [k, vcf_fn, tbi_fn, file("${baseDir}/assets/K${k}_mutations.txt")]
+            [k, vcf_fn, tbi_fn, file("${baseDir}/assets/K${k}_mutations.txt"), "mutyper/full_counts"]
     }
     | count_mutations
 
diff --git a/include/workflow/sdm.nf b/include/workflow/sdm.nf
index 2b9ab44..bdbfb64 100755
--- a/include/workflow/sdm.nf
+++ b/include/workflow/sdm.nf
@@ -2,13 +2,17 @@ include { get_individuals; get_breeds; } from "../process/prerun"
 include { sdm; filter_sdm; count_sdm } from "../process/sdm"
 include { make_ksfs; sdm_plot } from "../process/sdm"
 include { repeat_mask_split_sdm} from "../process/sdm"
+include { fetch_sites } from '../process/sdm.nf'
 include { sdm_matrix } from '../process/sdm.nf'
+include { count_mutations } from '../process/mutyper.nf'
 
 workflow SDM {
     take:
         vcf_by_chr
         reffasta
         reffai
+        ancfasta
+        ancfai
         masks_ch
         chromosomeList
 
@@ -45,16 +49,30 @@ workflow SDM {
     repeat_mask_split_sdm(filtered_ch.bed.combine(masks_ch))
 
     // Generate outputs
-    count_sdm( filtered_ch.rdata )
+    sdmcounts_ch = count_sdm( filtered_ch.rdata )
 
     // Create matrix of SDMs
-    count_sdm.out[0] | collect | sdm_matrix
+    sdmcounts_ch.counts | collect | sdm_matrix
+
+    // We collect first and second changes in a full list
+    first_change = sdmcounts_ch.first_change
+    | collectFile("${params.outdir}/sdm_first_change.txt")
+    second_change = sdmcounts_ch.second_change
+    | collectFile("${params.outdir}/sdm_second_change.txt")
+    // Extract the changes
+    fetch_sites( vcf_by_chr | collect(), ancfasta, ancfai, first_change | mix(second_change) )
+    | map {
+        vcf, tbi ->
+            tuple( "3", vcf, tbi, file("${baseDir}/assets/K3_mutations.txt"), "sdm/${vcf.simpleName}" )
+    }
+    // Count individual mutations spectrums
+    | count_mutations
 
     // Prepare Ksfs files
     raw_sdm | make_ksfs
 
     // Make plots for sdm results
-    all_counts = count_sdm.out[0]
+    all_counts = sdmcounts_ch.counts
     //sdm_plot( breeds_ch, all_counts.collect() )
-    sdm_plot( all_counts.collect() )
+    sdm_plot( all_counts.collect() )
 }
\ No newline at end of file
diff --git a/main.nf b/main.nf
index 096454f..49a809b 100755
--- a/main.nf
+++ b/main.nf
@@ -179,9 +179,9 @@ workflow {
         }
         if (params.mutyper){
             MUTYPER( ancestral_fna, ancestral_fai, ch_masks, vcf_by_chr )
-        }
+        }
         if (params.sdm){
-            SDM( vcf_by_chr, reference_fna, reference_fai, ch_masks, ch_chr_lists )
+            SDM( vcf_by_chr, reference_fna, reference_fai, ancestral_fna, ancestral_fai, ch_masks, ch_chr_lists )
         }
     }
 }
\ No newline at end of file
diff --git a/nextflow.config b/nextflow.config
index 8a5a6ec..be1623c 100755
--- a/nextflow.config
+++ b/nextflow.config
@@ -148,5 +148,5 @@ manifest {
     description = 'Nextflow mutation spectra analysis workflow.'
     mainScript = 'main.nf'
     nextflowVersion = '>=21.04.0'
-    version = '1.1.3'
+    version = '1.1.4'
 }
\ No newline at end of file

From 66ce97c410208959fa1e863923f10896eb06ab66 Mon Sep 17 00:00:00 2001
From: Andrea Talenti <23279528+RenzoTale88@users.noreply.github.com>
Date: Wed, 1 Oct 2025 12:38:43 +0100
Subject: [PATCH 2/8] Up version

---
 CHANGELOG.md    | 4 ++++
 nextflow.config | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cad410b..6841a7e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,10 @@
 # Changelog
 All new changes are documented here.
 
+## [v1.1.5]
+### Added
+- Add mutation profile of individual elements of the SDMs
+
 ## [v1.1.4]
 ### Added
 - Add process to compute the binned allele frequencies for the derived alleles in each group
diff --git a/nextflow.config b/nextflow.config
index be1623c..dea2f64 100755
--- a/nextflow.config
+++ b/nextflow.config
@@ -148,5 +148,5 @@ manifest {
     description = 'Nextflow mutation spectra analysis workflow.'
     mainScript = 'main.nf'
     nextflowVersion = '>=21.04.0'
-    version = '1.1.4'
+    version = '1.1.5'
 }
\ No newline at end of file

From 7704cc4d373352f94d212f0e2acc80658bc35037 Mon Sep 17 00:00:00 2001
From: Andrea Talenti <23279528+RenzoTale88@users.noreply.github.com>
Date: Thu, 2 Oct 2025 09:51:01 +0100
Subject: [PATCH 3/8] Bop

---
 bin/getCounts3 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/getCounts3 b/bin/getCounts3
index 5fb935b..a8822bb 100755
--- a/bin/getCounts3
+++ b/bin/getCounts3
@@ -20,11 +20,11 @@ dat<-dat[which(dat$SameCodon == 1), ]
 uniqued<-dat[!duplicated(dat[c("Id1","Id2","Cons1","Cons2")]),]
 
 # Save uniqued positions for both pos1 and pos2
-pos1 <- uniqued %>%
+pos1 <- dat %>%
     select(Chr, Pos1) %>%
     unique()
 write_delim(pos1, paste("uniquePos1.", popu, ".tsv", sep=""), delim="\t", col_names=FALSE)
-pos2 <- uniqued %>%
+pos2 <- dat %>%
     select(Chr, Pos2) %>%
     unique()
 write_delim(pos2, paste("uniquePos2.", popu, ".tsv", sep=""), delim="\t", col_names=FALSE)

From d6bca7dd4162fb456edbc407e10ca020e2dfd0cf Mon Sep 17 00:00:00 2001
From: Andrea Talenti <23279528+RenzoTale88@users.noreply.github.com>
Date: Thu, 2 Oct 2025 09:59:30 +0100
Subject: [PATCH 4/8] Add allelic frequencies calculation

---
 include/process/mutyper.nf  |  3 ++-
 include/workflow/mutyper.nf | 27 +++++++++++++++++++++++++--
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/include/process/mutyper.nf b/include/process/mutyper.nf
index ec30b38..e4a0cdc 100755
--- a/include/process/mutyper.nf
+++ b/include/process/mutyper.nf
@@ -391,9 +391,10 @@ process allele_frequencies {
     path "frequencies_${params.species.capitalize()}.K${k}.tsv.gz", emit: afs
 
     script:
+    def fields = af_fields.join("\t")
     """
     bcftools +fill-tags ${vcf} -- -S ${groups} -t AF | \
-        bcftools query -H -f "%CHROM\t%POS\t%REF\t%ALT\t%FILTER\t%mutation_type\t%${af_fields}\n" | \
+        bcftools query -H -f "%CHROM\t%POS\t%REF\t%ALT\t%FILTER\t%mutation_type\t${fields}\n" | \
         tee >( FreqBinner /dev/stdin > frequency_bins_${params.species.capitalize()}.K${k}.tsv ) | \
         gzip -c > frequencies_${params.species.capitalize()}.K${k}.tsv.gz
     """
diff --git a/include/workflow/mutyper.nf b/include/workflow/mutyper.nf
index f6be509..bce8dcc 100755
--- a/include/workflow/mutyper.nf
+++ b/include/workflow/mutyper.nf
@@ -3,7 +3,7 @@ include { chromosomeList } from '../process/prerun'
 include { group_results; plot_results; ksfs } from '../process/mutyper'
 include { mutyper_variant; mutyper_spectra; mutyper_concat } from '../process/mutyper'
 include { count_mutations; count_mutations_csq; consequence_table } from '../process/mutyper'
-include { kmercount; normalize_results } from '../process/mutyper'
+include { kmercount; normalize_results; allele_frequencies } from '../process/mutyper'
 
 
 // Workflow
@@ -16,7 +16,8 @@ workflow MUTYPER {
 
     main:
     // check if there is a popfile, otherwise create one
-    breeds_file = Channel.fromPath("${params.pops_folder}/*.txt")
+    popfiles_ch = Channel.fromPath("${params.pops_folder}/*.txt")
+    breeds_file = popfiles_ch
     | map {
         fname ->
             [fname.baseName, fname]
@@ -51,6 +52,28 @@ workflow MUTYPER {
     // In the meanwhile, group the mutyper VCFs and concatenate them
     mutyper_vcf_ch = mutyper_variant.out | groupTuple(by: 0) | mutyper_concat
 
+    // Compute allele frequencies
+    bcftools_pops_ch = popfiles_ch
+    | map {
+        fname ->
+            [fname.baseName, fname]
+    }
+    | splitCsv
+    | map {
+        pop, iid -> [pop] + iid
+    }
+    | collectFile {
+        pop, sample ->
+            [ "bcftools_meta.txt", "${sample}\t${pop}\n" ]
+    }
+    colnames_ch = popfiles_ch
+    | map {
+        fname ->
+            ["%AF_${fname.baseName}"]
+    }
+    | collect
+    allele_frequencies(mutyper_vcf_ch, bcftools_pops_ch, colnames_ch)
+
     /* Generate the counts manually */
     mutyper_vcf_ch
     | filter{ it[0].toInteger() < 8 }

From a8b703b6260bfe63a897558cd1c03eb3677b5c1d Mon Sep 17 00:00:00 2001
From: Andrea Talenti <23279528+RenzoTale88@users.noreply.github.com>
Date: Thu, 2 Oct 2025 10:00:52 +0100
Subject: [PATCH 5/8] C.log

---
 CHANGELOG.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6841a7e..1ed397f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,8 @@ All new changes are documented here.
 
 ## [v1.1.5]
 ### Added
-- Add mutation profile of individual elements of the SDMs
+- Mutation profile of the individual variants composing the SDMs (first and second changes)
+- Allele frequencies in the different populations considered
 
 ## [v1.1.4]
 ### Added

From 6a671648018366b72615b7304ca4e74080e49049 Mon Sep 17 00:00:00 2001
From: Andrea Talenti <23279528+RenzoTale88@users.noreply.github.com>
Date: Thu, 2 Oct 2025 13:02:34 +0100
Subject: [PATCH 6/8] Fix variant extraction

---
 include/workflow/sdm.nf | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/workflow/sdm.nf b/include/workflow/sdm.nf
index bdbfb64..b5347c5 100755
--- a/include/workflow/sdm.nf
+++ b/include/workflow/sdm.nf
@@ -37,7 +37,7 @@ workflow SDM {
     // Run dinuc pipeline
     raw_sdm = breeds_ch
     | combine(
-        sdm( combined_ch, reffasta, reffai )
+        sdm( combined_ch, ancfasta, ancfai )
         | groupTuple(by: 0),
         by: 0
     )
@@ -56,17 +56,18 @@ workflow SDM {
 
     // We collect first and second changes in a full list
     first_change = sdmcounts_ch.first_change
-    | collectFile("${params.outdir}/sdm_first_change.txt")
+    | splitCsv(sep: "\t")
+    | collectFile(name: "sdm_first_change.txt", storeDir: "${params.outdir}/sdm/single_changes/", newLine: true)
     second_change = sdmcounts_ch.second_change
-    | collectFile("${params.outdir}/sdm_second_change.txt")
+    | splitCsv(sep: "\t")
+    | collectFile(name: "sdm_second_change.txt", storeDir: "${params.outdir}/sdm/single_changes/", newLine: true)
+
     // Extract the changes
-    fetch_sites( vcf_by_chr | collect(), ancfasta, ancfai, first_change | mix(second_change) )
+    fetch_sites( vcf_by_chr | collect, ancfasta, ancfai, first_change | mix(second_change) )
     | map {
         vcf, tbi ->
             tuple( "3", vcf, tbi, file("${baseDir}/assets/K3_mutations.txt"), "sdm/${vcf.simpleName}" )
-    }
-    // Count individual mutations spectrums
-    | count_mutations
+    } | count_mutations // Count individual mutations spectrums
 
     // Prepare Ksfs files
     raw_sdm | make_ksfs

From 0da231be4047c8fa788dfae3c09954aa4e7f431e Mon Sep 17 00:00:00 2001
From: Andrea Talenti <23279528+RenzoTale88@users.noreply.github.com>
Date: Thu, 2 Oct 2025 13:03:01 +0100
Subject: [PATCH 7/8] Reverse change

---
 bin/maf2snp.py          | 111 ++++++++++++++++++
 bin/sfs2allele.py       | 123 ++++++++++++++++++++
 bin/snps2ests.py        | 245 ++++++++++++++++++++++++++++++++++++++++
 include/workflow/sdm.nf |   2 +-
 4 files changed, 480 insertions(+), 1 deletion(-)
 create mode 100644 bin/maf2snp.py
 create mode 100644 bin/sfs2allele.py
 create mode 100644 bin/snps2ests.py

diff --git a/bin/maf2snp.py b/bin/maf2snp.py
new file mode 100644
index 0000000..b0554e7
--- /dev/null
+++ b/bin/maf2snp.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+import argparse
+from itertools import zip_longest
+
+
+def parse_args():
+    # Argument definition
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-m",
+        "--maf",
+        metavar="maffile.maf",
+        type=str,
+        help="Input alignments",
+        required=True,
+    )
+    parser.add_argument(
+        "-o",
+        "--out",
+        metavar="prefix",
+        type=str,
+        help="Output file prefix.",
+        required=False,
+        default="snps",
+    )
+    parser.add_argument(
+        "-d",
+        "--delim",
+        metavar="char",
+        type=str,
+        help="Delimiter.",
+        required=False,
+        default=",",
+    )
+    parser.add_argument(
+        "--genomes",
+        type=str,
+        help="Genomes to consider.",
+        nargs="+",
+        required=True,
+    )
+    return parser.parse_args()
+
+
+def reader(maf):
+    lines = []
+    infile = open(maf)
+    for line in infile:
+        if "#" in line:
+            continue
+        elif line[0] == "a":
+            continue
+        elif line[0] == "s":
+            lines.append(line)
+            continue
+        elif line.strip() == "" and len(lines) != 0:
+            yield lines
+            lines = []
+        else:
+            lines = []
+    if len(lines) > 0:
+        yield lines
+
+
+SFX = {
+    ",": "csv",
+    ";": "csv",
+    "\t": "tsv",
+    " ": "txt",
+}
+
+
+def get_matching_prefix(string, prefixes):
+    for prefix in prefixes:
+        if string.startswith(prefix):
+            return prefix
+    raise Exception(f'Prefix not found for: {string}')
+
+
+def main():
+    # Get arguments
+    args = parse_args()
+    # Define unique list of input genomes
+    genomes = set(args.genomes)
+
+    # Prepare output
+    if args.out == '-' or args.out == '/dev/stdout':
+        ofname = '/dev/stdout'
+    else:
+        ofname = f"{args.out}.{SFX.get(args.delim, 'txt')}"
+    outfile = open(ofname, "w")
+
+    # Process the data
+    outfile.write("CHROM\tPOS\t{}\n".format("\t".join(args.genomes)))
+    for n, aln in enumerate(reader(args.maf)):
+        _, chrxspp, bpi, alnlen, _, _, _ = aln[0].split()
+        try:
+            _, chromosome = chrxspp.split(".", 1)
+        except:
+            raise Exception(f"Impossible to split: {chrxspp}")
+        splits = [a.split() for a in aln]
+        sequences = {get_matching_prefix(a[1], args.genomes): a[-1] for a in splits}
+        for genome in genomes.difference(set(sequences.keys())):
+            sequences[genome] = "-" * int(alnlen)
+        zipped = list(zip_longest(*[sequences[g] for g in args.genomes]))
+        for n, bp in enumerate(range(int(bpi), int(bpi) + int(alnlen))):
+            outfile.write("{}\t{}\t{}\n".format(chromosome, bp, "\t".join(zipped[n])))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/sfs2allele.py b/bin/sfs2allele.py
new file mode 100644
index 0000000..5fe58cf
--- /dev/null
+++ b/bin/sfs2allele.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+import argparse
+import sys
+
+import numpy as np
+
+
+def parse_args():
+    # Argument definition
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-s",
+        "--sfs",
+        metavar="TXT",
+        type=str,
+        help="SFS probabilities",
probabilities", + required=True, + ) + parser.add_argument( + "-i", + "--input", + metavar="TXT", + type=str, + help="Input file for EST-SFS", + required=True, + ) + parser.add_argument( + "-S", + "--sites", + metavar="TXT", + type=str, + help="Site positions for the SFS input", + required=False, + ) + parser.add_argument( + "-o", + "--out", + metavar="prefix", + type=str, + help="Output ancestral allele files.", + required=False, + default="-", + ) + return parser.parse_args() + + +NTS = ['A', 'C', 'G', 'T'] + + +def get_ancestral(maj_p, maj_a, nodes, counts): + """Define the ancestral allele.""" + # If major allele is more likely than the others, return it + if maj_p > 0.5: + return maj_a + else: + alleles = np.array(NTS) + counts = np.array(map(int, counts)) + min_allele_idx = (counts != np.max(counts)) & (counts != 0) + min_alleles = set(alleles[min_allele_idx]) + isecs = set(nodes).intersection(min_alleles) + # If one Nt only is possible, return it + if len(isecs) == 1: + return isecs[0] + # Otherwise, return the oldest node in the tree + else: + return nodes[-1] + + + +def main(): + # Get arguments + args = parse_args() + + # Get major allele for each site + majors = {} + with open(args.input) as inputfile: + for n, line in enumerate(inputfile): + allele_cnts = [int(i) for i in line.strip().split()[0].split(',')] + majors[str(n+1)] = { + 'major': NTS[np.argmax(allele_cnts)], + 'vector' : line.strip().split()[0], + 'chrom': None, + 'pos': None + } + + # Add site mapping + if args.sites: + with open(args.sites) as sitesfile: + for n, line in enumerate(sitesfile): + chrom, pos = line.strip().split() + majors[str(n+1)]['chrom'] = chrom + majors[str(n+1)]['pos'] = pos + + # Define output + if args.out == '-' or args.out == '/dev/stdout': + out = sys.stdout + else: + out = open(args.out, 'w') + + # Define the ancestral based on the probs + nts_combinations = [ NT1+NT2 for NT1 in NTS for NT2 in NTS ] + with open(args.sfs) as sfs: + for line in sfs: + if line[0] == '0': + continue + site_n, _, maj_prob, probs = line.strip().split(' ', 3) + maj_prob = float(maj_prob) + outgr_probs = tuple(map(float, probs.split(' '))) + max_og_idx = np.argmax(outgr_probs) + maj_allele = majors[site_n]['major'] + vect = majors[site_n]['vector'] + nodes = nts_combinations[max_og_idx] + ancestral = get_ancestral(maj_prob, maj_allele, nodes, vect.split(',')) + if args.sites: + chrom = majors[site_n]['chrom'] + pos = majors[site_n]['pos'] + out.write(f"{site_n}\t{chrom}\t{pos}\t{maj_prob}\t{outgr_probs[max_og_idx]}\t{maj_allele}\t{vect}\t{nodes}\t{ancestral}\n") + else: + out.write(f"{site_n}\t{maj_prob}\t{outgr_probs[max_og_idx]}\t{maj_allele}\t{vect}\t{nodes}\t{ancestral}\n") + + +if __name__ == "__main__": + main() diff --git a/bin/snps2ests.py b/bin/snps2ests.py new file mode 100644 index 0000000..bc60de6 --- /dev/null +++ b/bin/snps2ests.py @@ -0,0 +1,245 @@ +#!/usr/bin/env python +import argparse +import gzip + +from tqdm import tqdm + + +def parse_args(): + # Argument definition + parser = argparse.ArgumentParser() + parser.add_argument( + "-i", + "--input", + metavar="TXT", + type=str, + help="Input txt file with the positions (coordinates are zero-based)", + required=True, + ) + parser.add_argument( + "-s", + "--sites", + metavar="TXT", + type=str, + help="Input list of sites to keep (coordinates are zero-based)", + required=False, + ) + parser.add_argument( + "--sitefile", + metavar="TXT", + type=str, + help="List of sites that have been found.", + required=True, + ) + parser.add_argument( + "-d", + 
"--datafile", + metavar="prefix", + type=str, + help="Output data file name.", + required=False, + default="data-file.txt", + ) + parser.add_argument( + "-c", + "--configfile", + metavar="prefix", + type=str, + help="Output config file name.", + required=False, + default="config-file.txt", + ) + parser.add_argument( + "--targets", + type=str, + help="Genomes for the target species.", + nargs="+", + required=True, + ) + parser.add_argument( + "--outgroup1", + type=str, + help="Outgroup 1 name.", + required=True, + ) + parser.add_argument( + "--outgroup2", + type=str, + help="Outgroup 2 name.", + required=False, + ) + parser.add_argument( + "--outgroup3", + type=str, + help="Outgroup 3 name.", + required=False, + ) + parser.add_argument( + "--model", + type=int, + help="Model for ESFS.", + default=0, + choices=[0, 1, 2], + required=False, + ) + parser.add_argument( + "--nrandom", + type=int, + help="Random iteration for the ESFS.", + default=0, + required=False, + ) + return parser.parse_args() + + +def parse_gzipped_line(line): + return line.decode().strip().split('\t') + + +def parse_simple_line(line): + return line.strip().split('\t') + + +def main(): + # Run main + args = parse_args() + + # Define input variables + n_fields = 0 + targets = set(args.targets) + outgroup1 = args.outgroup1 + outgroup2 = args.outgroup2 + outgroup3 = args.outgroup3 + if outgroup2 is None and outgroup3 is not None: + raise Exception("Provided outgroup 3 without outgroup 2") + n_outgroups = 1 + if outgroup2 is not None: + n_outgroups = 2 + if outgroup3 is not None: + n_outgroups = 3 + # Variable indexes instantiate + tgt_idxs = [] + og1_idx = None + og2_idx = None + og3_idx = None + # Define the values to return + # as counts of [A, C, G, T] + cnt_idxs = {'a': 0, 'A': 0, 'c': 1, 'C': 1, 'g': 2, 'G': 2, 't': 3, 'T': 3} + target_cnt = [0, 0, 0, 0] + outgroup1_cnt = [0, 0, 0, 0] + outgroup2_cnt = [0, 0, 0, 0] + outgroup3_cnt = [0, 0, 0, 0] + + # Load sites + sites = {} + if args.sites: + print("Loading target sites...") + for site in open(args.sites): + site = site.strip().split() + sites[f"{site[0]}_{site[1]}"] = True + print("Found {} sites".format(len(sites))) + + # define the loader + if args.input.endswith('.gz'): + buffer = gzip.open + parser = parse_gzipped_line + else: + buffer = open + parser = parse_simple_line + + # Output file + file_counter = 1 + row_counter = 0 + outfile = open(f"{args.datafile}.{file_counter}.txt", 'w') + sitefile = open(f"{args.sitefile}.{file_counter}.txt", 'w') + # Prepare progress bar for the file + # Process the input + print("Processing input file") + print(f"Saving to file #: {file_counter}") + pbar = tqdm(total=1000000) + for n,line in enumerate(buffer(args.input)): + line = parser(line) + # First, we work out the header positions + if n == 0: + n_fields = len(line) + # Prepare the index for the targets + for tgt in targets: + if tgt not in line: + raise Exception(f"Missing key in header: {tgt}") + tgt_idxs.append( line.index(tgt) ) + # At least one outgroup is needed + if outgroup1 not in line: + raise Exception(f"Missing outgroup in header: {outgroup1}") + og1_idx = line.index(outgroup1) + # Add at most three outgroups + if outgroup2: + if outgroup2 not in line: + raise Exception(f"Missing outgroup in header: {outgroup2}") + og2_idx = line.index(outgroup2) + if outgroup3: + if outgroup3 not in line: + raise Exception(f"Missing outgroup in header: {outgroup3}") + og3_idx = line.index(outgroup3) + continue + if sites and not sites.get(f"{line[0]}_{line[1]}", False): + 
+            continue
+        # If there are already 1M rows, close the file and proceed to the next file
+        if row_counter == 1000000:
+            pbar.close()
+            print(f"Closing file {file_counter} with 1M sites")
+            sitefile.close()
+            outfile.close()
+            row_counter = 0
+            file_counter += 1
+            print(f"Saving to file {file_counter}")
+            pbar = tqdm(total=1000000)
+            outfile = open(f"{args.datafile}.{file_counter}.txt", 'w')
+            sitefile = open(f"{args.sitefile}.{file_counter}.txt", 'w')
+        # Write site id in the file
+        sitefile.write("{}\n".format('\t'.join(line[0:2])))
+        while len(line) < n_fields:
+            line.append('')
+        # Then, we process the rest
+        for idx in tgt_idxs:
+            base = line[idx]
+            if base in cnt_idxs.keys():
+                target_cnt[cnt_idxs[base]] += 1
+        base = line[og1_idx]
+        if base in cnt_idxs.keys():
+            outgroup1_cnt[cnt_idxs[base]] += 1
+        if og2_idx is not None:
+            base = line[og2_idx]
+            if base in cnt_idxs.keys():
+                outgroup2_cnt[cnt_idxs[base]] += 1
+        if og3_idx is not None:
+            base = line[og3_idx]
+            if base in cnt_idxs.keys():
+                outgroup3_cnt[cnt_idxs[base]] += 1
+        outfile.write('{} {} {} {}\n'.format(
+            ','.join(map(str, target_cnt)),
+            ','.join(map(str, outgroup1_cnt)),
+            ','.join(map(str, outgroup2_cnt)),
+            ','.join(map(str, outgroup3_cnt)),
+        ))
+        target_cnt = [0, 0, 0, 0]
+        outgroup1_cnt = [0, 0, 0, 0]
+        outgroup2_cnt = [0, 0, 0, 0]
+        outgroup3_cnt = [0, 0, 0, 0]
+        row_counter += 1
+        pbar.update(1)
+
+    # Final closing of the output files & progress bar
+    pbar.close()
+    sitefile.close()
+    outfile.close()
+
+    # Then, save the configuration file
+    print("Save configuration file")
+    with open(args.configfile, 'w') as outcfg:
+        outcfg.write(f"n_outgroup {n_outgroups}\n")
+        outcfg.write(f"model {args.model}\n")
+        outcfg.write(f"nrandom {args.nrandom}\n")
+    print("All done\n\n")
+
+if __name__ == "__main__":
+    main()
diff --git a/include/workflow/sdm.nf b/include/workflow/sdm.nf
index b5347c5..f15a17a 100755
--- a/include/workflow/sdm.nf
+++ b/include/workflow/sdm.nf
@@ -37,7 +37,7 @@ workflow SDM {
     // Run dinuc pipeline
     raw_sdm = breeds_ch
     | combine(
-        sdm( combined_ch, ancfasta, ancfai )
+        sdm( combined_ch, reffasta, reffai )
         | groupTuple(by: 0),
         by: 0
     )

From b783709e9ca74021a56b7c3aa6f8a06aa1b2383e Mon Sep 17 00:00:00 2001
From: Andrea Talenti <23279528+RenzoTale88@users.noreply.github.com>
Date: Tue, 7 Oct 2025 09:38:21 +0100
Subject: [PATCH 8/8] Working single changes

---
 include/process/sdm.nf  | 12 +++++++-----
 include/workflow/sdm.nf | 16 +++++++++++++---
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/include/process/sdm.nf b/include/process/sdm.nf
index ca22b9a..79c1d9f 100755
--- a/include/process/sdm.nf
+++ b/include/process/sdm.nf
@@ -236,7 +236,8 @@ process fetch_sites {
 
 
     input:
-    tuple val(chroms), path("vcfs/*"), path("vcfs/*")
+    path(vcfs, stageAs: "vcfs/*")
+    path(tbis, stageAs: "vcfs/*")
     path ancfasta
     path ancfai
     path positions
@@ -247,10 +248,11 @@ process fetch_sites {
     script:
     """
     bcftools concat vcfs/*.vcf.gz -a -O u | \
-        bcftools sort -O u -T ./ -m 2G | \
-        bcftools view -R ${positions} -Ov | \
-        mutyper variants --k 3 ${ancfasta} | \
-        bcftools view -Oz --write-index=tbi -o ${positions.baseName}.vcf.gz
+        bcftools sort -O u -T ./ -m 2G - | \
+        bcftools view -T <( sort -k1,1 -k2,2n ${positions} ) -Ov - | \
+        mutyper variants --k 3 ${ancfasta} - | \
+        bcftools view -Oz - > ${positions.baseName}.vcf.gz && \
+        tabix -p vcf ${positions.baseName}.vcf.gz
     """
 
     stub:
diff --git a/include/workflow/sdm.nf b/include/workflow/sdm.nf
index f15a17a..a5aae7b 100755
--- a/include/workflow/sdm.nf
+++ b/include/workflow/sdm.nf
@@ -57,13 +57,23 @@ workflow SDM {
     // We collect first and second changes in a full list
     first_change = sdmcounts_ch.first_change
     | splitCsv(sep: "\t")
-    | collectFile(name: "sdm_first_change.txt", storeDir: "${params.outdir}/sdm/single_changes/", newLine: true)
+    | collectFile(storeDir: "${params.outdir}/sdm/single_changes/", sort: true){
+        chrom, pos ->
+            [ "sdm_first_change.txt", "${chrom}\t${pos}\n" ]
+    }
     second_change = sdmcounts_ch.second_change
     | splitCsv(sep: "\t")
-    | collectFile(name: "sdm_second_change.txt", storeDir: "${params.outdir}/sdm/single_changes/", newLine: true)
+    | collectFile(storeDir: "${params.outdir}/sdm/single_changes/", sort: true){
+        chrom, pos ->
+            [ "sdm_second_change.txt", "${chrom}\t${pos}\n" ]
+    }
 
     // Extract the changes
-    fetch_sites( vcf_by_chr | collect, ancfasta, ancfai, first_change | mix(second_change) )
+    fetch_sites(
+        vcf_by_chr | map{_chrom, vcf, _tbi -> [vcf]},
+        vcf_by_chr | map{_chrom, _vcf, tbi -> [tbi]},
+        ancfasta, ancfai,
+        first_change | mix(second_change) )
     | map {
         vcf, tbi ->
             tuple( "3", vcf, tbi, file("${baseDir}/assets/K3_mutations.txt"), "sdm/${vcf.simpleName}" )