From 3cbd8bd6f9d3b4ea1e0f2fe679eabd30f5571f23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Fri, 20 Oct 2023 18:40:33 +0200 Subject: [PATCH] Add debug logging to weighted distances --- workflow/scripts/weighted_distances.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/weighted_distances.py b/workflow/scripts/weighted_distances.py index af29377..fdea248 100644 --- a/workflow/scripts/weighted_distances.py +++ b/workflow/scripts/weighted_distances.py @@ -87,11 +87,14 @@ def build_cache(variant_table: pd.DataFrame, reference: Seq): if sample_name not in cache["hz"]: cache["hz"][sample_name] = {} cache["freq"][sample_name][position] = get_frequencies_in_position(variant_table, sample_name, position, reference) + logging.debug(f"Frequencies for '{sample_name}':{position} = {cache['freq'][sample_name][position]}") cache["hz"][sample_name][position] = heterozygosity(cache["freq"][sample_name][position]) + logging.debug(f"Heterozygosity for '{sample_name}':{position} = {cache['hz'][sample_name][position]}") return cache def calc_heterozygosities(sample1_name: str, sample2_name: str, pos: int, cache: dict): + logging.debug(f"Calculating heterozygosities at position {pos} for '{sample1_name}' and '{sample2_name}'") # Retrieve pre-computed values freqs1 = cache["freq"][sample1_name][pos] freqs2 = cache["freq"][sample2_name][pos] @@ -129,21 +132,28 @@ def calculate_distance_matrix(variant_table: pd.DataFrame, samples: List[str], r def main(): - logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO) + logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.DEBUG) logging.info("Reading input FASTA files") ancestor = read_monofasta(snakemake.input.ancestor) + logging.debug(f"Ancestor: '{ancestor.description}', length={len(ancestor.seq)}") reference = read_monofasta(snakemake.input.reference) + 
logging.debug(f"Reference: '{reference.description}', length={len(reference.seq)}") logging.info("Reading input tables") masked_positions = read_masked_sites(snakemake.input.vcf, snakemake.params.mask_class) + logging.debug(f"Read {len(masked_positions)} masked positions") input_table = pd.read_table(snakemake.input.tsv, sep="\t") + logging.debug(f"Read {len(input_table)} rows in input TSV") ancestor_table = build_ancestor_variant_table(ancestor.seq, reference.seq, reference.id, masked_positions) + logging.debug(f"Ancestor has {len(ancestor_table)} variants") variant_table = pd.concat([input_table, ancestor_table], ignore_index=True) + logging.debug(f"Combined table has {len(variant_table)} variants") logging.info(f"Calculating distance matrix") sample_names = snakemake.params.samples + [reference.id] distances = calculate_distance_matrix(variant_table, sample_names, ancestor.seq) + logging.debug(f"Distance matrix has shape: {distances.shape}") logging.info("Writing results") distances.to_csv(snakemake.output.distances)