From cd52952b51e276557091f84af022522ae523b385 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 24 Apr 2024 09:46:00 +0100 Subject: [PATCH] Data structures for looping --- src/Newickform.c | 94 +++++++++++++++++++++++++++++++++++++++--- src/branch_sequences.c | 56 ++++++++++++++----------- src/branch_sequences.h | 2 +- 3 files changed, 122 insertions(+), 30 deletions(-) diff --git a/src/Newickform.c b/src/Newickform.c index 3eab66c6..2c1666c1 100644 --- a/src/Newickform.c +++ b/src/Newickform.c @@ -176,6 +176,72 @@ int max_distance_to_tips(newick_node *root) { return max_distance; } +// Recursive function to traverse the tree and populate the parents list +void populate_parents(newick_node *node, newick_node** nodeArray, int * parents, int num_nodes) { + + // Get index of parent + int parent_index; + for (int i = 0; i < num_nodes; ++i) + { + if (nodeArray[i]->taxon == node->taxon) + { + parent_index = i; + break; + } + } + + // Get indices of children + if (node->child != NULL) { + newick_child *child = node->child; + while (child != NULL) { + for (int j = 0; j < num_nodes; ++j) + { + if (nodeArray[j]->taxon == child->node->taxon) + { + parents[j] = parent_index; + break; + } + } + child = child->next; + } + } + + // Recursively traverse child nodes + if (node->child != NULL) { + newick_child *child = node->child; + while (child != NULL) { + populate_parents(child->node, nodeArray, parents, num_nodes); + child = child->next; + } + } +} + +// Function to initialize the parents list and call the recursive function +int **get_parents(newick_node *root, newick_node** nodeArray, int num_nodes) { + + // Initialise parent node array + int * parents = calloc(num_nodes,sizeof(int)); + + // Initialize all elements to NULL + for (int i = 0; i < num_nodes; i++) { + parents[i] = -1; + } + + // Populate the parents list recursively + populate_parents(root, nodeArray, parents, num_nodes); + + // Set root parent to NULL + for (int i = 0; i < num_nodes; i++) { + if (parents[i] == -1) + { + parents[i] = NULL; + break; + } + } + + return parents; +} + newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns,int length_of_original_genome,int min_snps, int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag, int num_threads) { int iLen, iMaxLen; @@ -254,6 +320,9 @@ newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp } fill_nodeArray(current_node,nodeArray,num_nodes); + // get parent of each node + int * parents = get_parents(root, nodeArray, num_nodes); + // get depths of each node from tips int max_depth = max_distance_to_tips(root); int *node_depths = malloc(num_nodes * sizeof(int)); @@ -356,11 +425,14 @@ newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp for (int node_num_index = 0; node_num_index < num_jobs; ++node_num_index) { int node_index = jobNodeIndexArray[node_num_index]; - fill_in_recombinations_with_gaps(nodeArray[node_index], - parent_recombinations_array[node_index], - parent_num_recombinations_array[node_index], - current_total_snps_array[node_index], - num_blocks_array[node_index], + int parent_node_index = parents[node_index]; + fill_in_recombinations_with_gaps(nodeArray, + node_index, + parent_node_index, + parent_recombinations_array, + parent_num_recombinations_array, + current_total_snps_array, + num_blocks_array, nodeArray[node_index]->block_coordinates, length_of_original_genome, snp_locations, @@ -368,9 +440,19 @@ newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp } } - // Free arrays + // Free gaps arrays + free(parent_num_recombinations_array); + free(current_total_snps_array); + free(num_blocks_array); + for (int i = 0; i < num_nodes; ++i) { + free(parent_recombinations_array[i]); + } + free(parent_recombinations_array); + + // Free general arrays free(nodeArray); free(node_depths); + free(parents); fclose(block_file_pointer); fclose(gff_file_pointer); diff --git a/src/branch_sequences.c b/src/branch_sequences.c index 19bd6e3c..b34bd26f 100644 --- a/src/branch_sequences.c +++ b/src/branch_sequences.c @@ -105,19 +105,17 @@ int get_list_of_snp_indices_which_fall_in_downstream_recombinations(int ** curre // Go through the tree and build up the recombinations list from root to branch. Print out each sample name and a list of recombinations -void fill_in_recombinations_with_gaps(newick_node *root, int * parent_recombinations, int parent_num_recombinations, int current_total_snps,int num_blocks, int ** current_block_coordinates,int length_of_original_genome,int * snp_locations, int number_of_snps ) +void fill_in_recombinations_with_gaps(newick_node ** nodeArray, int node_index, int parent_node_index, int ** parent_recombinations, int * parent_num_recombinations, int * current_total_snps, int * num_blocks, int ** current_block_coordinates,int length_of_original_genome,int * snp_locations, int number_of_snps) { - newick_child *child; - int * current_recombinations; - int num_current_recombinations = 0 ; + newick_node * root = nodeArray[node_index]; char * child_sequence = (char *) calloc((length_of_original_genome +1),sizeof(char)); - current_recombinations = (int *) calloc((root->num_recombinations+1+parent_num_recombinations),sizeof(int)); - num_current_recombinations = copy_and_concat_integer_arrays(root->recombinations, + parent_recombinations[node_index] = (int *) calloc((root->num_recombinations+1+parent_num_recombinations[parent_node_index]),sizeof(int)); + parent_num_recombinations[node_index] = copy_and_concat_integer_arrays(root->recombinations, root->num_recombinations, - parent_recombinations, - parent_num_recombinations, - current_recombinations); + parent_recombinations[parent_node_index], + parent_num_recombinations[parent_node_index], + parent_recombinations[node_index]); // overwrite the bases of snps with N's int i; @@ -129,50 +127,60 @@ void fill_in_recombinations_with_gaps(newick_node *root, int * parent_recombinat get_sequence_for_sample_name(child_sequence, root->taxon); int genome_length_excluding_blocks_and_gaps = calculate_genome_length_excluding_blocks_and_gaps(child_sequence, - length_of_original_genome, - current_block_coordinates, - num_blocks); + length_of_original_genome, + current_block_coordinates, + num_blocks[parent_node_index]); set_genome_length_excluding_blocks_and_gaps_for_sample(root->taxon, genome_length_excluding_blocks_and_gaps); + // Merge block coordinates putting most recent blocks first int ** merged_block_coordinates; merged_block_coordinates = (int **) calloc(3,sizeof(int *)); - merged_block_coordinates[0] = (int*) calloc((num_blocks + root->number_of_blocks+1),sizeof(int )); - merged_block_coordinates[1] = (int*) calloc((num_blocks + root->number_of_blocks+1),sizeof(int )); + merged_block_coordinates[0] = (int*) calloc((num_blocks[parent_node_index] + root->number_of_blocks+1),sizeof(int )); + merged_block_coordinates[1] = (int*) calloc((num_blocks[parent_node_index] + root->number_of_blocks+1),sizeof(int )); copy_and_concat_2d_integer_arrays(current_block_coordinates, - num_blocks, + num_blocks[parent_node_index], root->block_coordinates, root->number_of_blocks, merged_block_coordinates ); + // Set the number of recombination blocks for the sample set_number_of_blocks_for_sample(root->taxon, root->number_of_blocks); - set_number_of_branch_bases_in_recombinations(root->taxon, + + // Set number of branch bases in recombination by iterating through + // the first part of merged blocks (i.e. only blocks on the branch to this node) + set_number_of_branch_bases_in_recombinations(root->taxon, calculate_number_of_bases_in_recombations_excluding_gaps(merged_block_coordinates, root->number_of_blocks, child_sequence, snp_locations, - current_total_snps) + current_total_snps[parent_node_index]) +// root->number_of_snps) ); +// current_total_snps[node_index] = current_total_snps[parent_node_index] + root->number_of_snps; // restore? + // Set number of total bases in recombination by iterating through + // all merged blocks leading to this node set_number_of_bases_in_recombinations(root->taxon, calculate_number_of_bases_in_recombations_excluding_gaps(merged_block_coordinates, - (num_blocks + root->number_of_blocks), + (num_blocks[parent_node_index] + root->number_of_blocks), child_sequence, snp_locations, - current_total_snps) + current_total_snps[parent_node_index]) +// current_total_snps[node_index]) ); free(child_sequence); - for(i = 0; i < num_current_recombinations; i++) + for(i = 0; i < parent_num_recombinations[node_index]; i++) { - update_sequence_base('N', sequence_index, current_recombinations[i]); + update_sequence_base('N', sequence_index, parent_recombinations[node_index][i]); } // TODO: The stats for the number of snps in recombinations will need to be updated. int * snps_in_recombinations = (int *) calloc((number_of_snps +1),sizeof(int)); int num_snps_in_recombinations = get_list_of_snp_indices_which_fall_in_downstream_recombinations(merged_block_coordinates, - (num_blocks + root->number_of_blocks), + (num_blocks[parent_node_index] + root->number_of_blocks), snp_locations, number_of_snps, snps_in_recombinations); @@ -187,6 +195,9 @@ void fill_in_recombinations_with_gaps(newick_node *root, int * parent_recombinat { // child = root->child; set_internal_node(1,sequence_index); + // Update number of SNPs + current_total_snps[node_index] = current_total_snps[parent_node_index] + root->number_of_snps; + num_blocks[node_index] = num_blocks[parent_node_index] + root->number_of_blocks; // while (child != NULL) // { @@ -208,7 +219,6 @@ void fill_in_recombinations_with_gaps(newick_node *root, int * parent_recombinat { set_internal_node(0,sequence_index); } - free(current_recombinations); free(merged_block_coordinates[0]); free(merged_block_coordinates[1]); free(merged_block_coordinates); diff --git a/src/branch_sequences.h b/src/branch_sequences.h index 8e9164ec..30a1b1e3 100644 --- a/src/branch_sequences.h +++ b/src/branch_sequences.h @@ -30,7 +30,7 @@ int calculate_window_size(int branch_genome_size, int number_of_branch_snps,int double calculate_threshold(int branch_genome_size, int window_size, float uncorrected_p_value); int p_value_test(int branch_genome_size, int window_size, int num_branch_snps, int block_snp_count, int min_snps, float uncorrected_p_value); double reduce_factorial(int l, int i); -void fill_in_recombinations_with_gaps(newick_node *root, int * parent_recombinations, int parent_num_recombinations, int current_total_snps,int num_blocks, int ** current_block_coordinates,int length_of_original_genome,int * snp_locations, int number_of_snps); +void fill_in_recombinations_with_gaps(newick_node ** nodeArray, int node_index, int parent_node_index, int ** parent_recombinations, int * parent_num_recombinations, int * current_total_snps,int * num_blocks, int ** current_block_coordinates,int length_of_original_genome,int * snp_locations, int number_of_snps); int copy_and_concat_integer_arrays(int * array_1, int array_1_size, int * array_2, int array_2_size, int * output_array); int copy_and_concat_2d_integer_arrays(int ** array_1, int array_1_size, int ** array_2, int array_2_size, int ** output_array); double snp_density(int length_of_sequence, int number_of_snps);