Skip to content

Commit

Permalink
Data structures for looping
Browse files Browse the repository at this point in the history
  • Loading branch information
nickjcroucher committed Apr 24, 2024
1 parent ac194d1 commit cd52952
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 30 deletions.
94 changes: 88 additions & 6 deletions src/Newickform.c
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,72 @@ int max_distance_to_tips(newick_node *root) {
return max_distance;
}

// Recursive function to traverse the tree and populate the parents list
void populate_parents(newick_node *node, newick_node** nodeArray, int * parents, int num_nodes) {

// Get index of parent
int parent_index;
for (int i = 0; i < num_nodes; ++i)
{
if (nodeArray[i]->taxon == node->taxon)
{
parent_index = i;
break;
}
}

// Get indices of children
if (node->child != NULL) {
newick_child *child = node->child;
while (child != NULL) {
for (int j = 0; j < num_nodes; ++j)
{
if (nodeArray[j]->taxon == child->node->taxon)
{
parents[j] = parent_index;
break;
}
}
child = child->next;
}
}

// Recursively traverse child nodes
if (node->child != NULL) {
newick_child *child = node->child;
while (child != NULL) {
populate_parents(child->node, nodeArray, parents, num_nodes);
child = child->next;
}
}
}

// Function to initialize the parents list and call the recursive function
int **get_parents(newick_node *root, newick_node** nodeArray, int num_nodes) {

// Initialise parent node array
int * parents = calloc(num_nodes,sizeof(int));

// Initialize all elements to NULL
for (int i = 0; i < num_nodes; i++) {
parents[i] = -1;
}

// Populate the parents list recursively
populate_parents(root, nodeArray, parents, num_nodes);

// Set root parent to NULL
for (int i = 0; i < num_nodes; i++) {
if (parents[i] == -1)
{
parents[i] = NULL;
break;
}
}

return parents;
}

newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns,int length_of_original_genome,int min_snps, int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag, int num_threads)
{
int iLen, iMaxLen;
Expand Down Expand Up @@ -254,6 +320,9 @@ newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp
}
fill_nodeArray(current_node,nodeArray,num_nodes);

// get parent of each node
int * parents = get_parents(root, nodeArray, num_nodes);

// get depths of each node from tips
int max_depth = max_distance_to_tips(root);
int *node_depths = malloc(num_nodes * sizeof(int));
Expand Down Expand Up @@ -356,21 +425,34 @@ newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp
for (int node_num_index = 0; node_num_index < num_jobs; ++node_num_index)
{
int node_index = jobNodeIndexArray[node_num_index];
fill_in_recombinations_with_gaps(nodeArray[node_index],
parent_recombinations_array[node_index],
parent_num_recombinations_array[node_index],
current_total_snps_array[node_index],
num_blocks_array[node_index],
int parent_node_index = parents[node_index];
fill_in_recombinations_with_gaps(nodeArray,
node_index,
parent_node_index,
parent_recombinations_array,
parent_num_recombinations_array,
current_total_snps_array,
num_blocks_array,
nodeArray[node_index]->block_coordinates,
length_of_original_genome,
snp_locations,
number_of_snps);
}
}

// Free arrays
// Free gaps arrays
free(parent_num_recombinations_array);
free(current_total_snps_array);
free(num_blocks_array);
for (int i = 0; i < num_nodes; ++i) {
free(parent_recombinations_array[i]);
}
free(parent_recombinations_array);

// Free general arrays
free(nodeArray);
free(node_depths);
free(parents);

fclose(block_file_pointer);
fclose(gff_file_pointer);
Expand Down
56 changes: 33 additions & 23 deletions src/branch_sequences.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,19 +105,17 @@ int get_list_of_snp_indices_which_fall_in_downstream_recombinations(int ** curre


// Go through the tree and build up the recombinations list from root to branch. Print out each sample name and a list of recombinations
void fill_in_recombinations_with_gaps(newick_node *root, int * parent_recombinations, int parent_num_recombinations, int current_total_snps,int num_blocks, int ** current_block_coordinates,int length_of_original_genome,int * snp_locations, int number_of_snps )
void fill_in_recombinations_with_gaps(newick_node ** nodeArray, int node_index, int parent_node_index, int ** parent_recombinations, int * parent_num_recombinations, int * current_total_snps, int * num_blocks, int ** current_block_coordinates,int length_of_original_genome,int * snp_locations, int number_of_snps)
{
newick_child *child;
int * current_recombinations;
int num_current_recombinations = 0 ;
newick_node * root = nodeArray[node_index];
char * child_sequence = (char *) calloc((length_of_original_genome +1),sizeof(char));

current_recombinations = (int *) calloc((root->num_recombinations+1+parent_num_recombinations),sizeof(int));
num_current_recombinations = copy_and_concat_integer_arrays(root->recombinations,
parent_recombinations[node_index] = (int *) calloc((root->num_recombinations+1+parent_num_recombinations[parent_node_index]),sizeof(int));
parent_num_recombinations[node_index] = copy_and_concat_integer_arrays(root->recombinations,
root->num_recombinations,
parent_recombinations,
parent_num_recombinations,
current_recombinations);
parent_recombinations[parent_node_index],
parent_num_recombinations[parent_node_index],
parent_recombinations[node_index]);

// overwrite the bases of snps with N's
int i;
Expand All @@ -129,50 +127,60 @@ void fill_in_recombinations_with_gaps(newick_node *root, int * parent_recombinat

get_sequence_for_sample_name(child_sequence, root->taxon);
int genome_length_excluding_blocks_and_gaps = calculate_genome_length_excluding_blocks_and_gaps(child_sequence,
length_of_original_genome,
current_block_coordinates,
num_blocks);
length_of_original_genome,
current_block_coordinates,
num_blocks[parent_node_index]);

set_genome_length_excluding_blocks_and_gaps_for_sample(root->taxon,
genome_length_excluding_blocks_and_gaps);

// Merge block coordinates putting most recent blocks first
int ** merged_block_coordinates;
merged_block_coordinates = (int **) calloc(3,sizeof(int *));
merged_block_coordinates[0] = (int*) calloc((num_blocks + root->number_of_blocks+1),sizeof(int ));
merged_block_coordinates[1] = (int*) calloc((num_blocks + root->number_of_blocks+1),sizeof(int ));
merged_block_coordinates[0] = (int*) calloc((num_blocks[parent_node_index] + root->number_of_blocks+1),sizeof(int ));
merged_block_coordinates[1] = (int*) calloc((num_blocks[parent_node_index] + root->number_of_blocks+1),sizeof(int ));
copy_and_concat_2d_integer_arrays(current_block_coordinates,
num_blocks,
num_blocks[parent_node_index],
root->block_coordinates,
root->number_of_blocks,
merged_block_coordinates
);

// Set the number of recombination blocks for the sample
set_number_of_blocks_for_sample(root->taxon, root->number_of_blocks);
set_number_of_branch_bases_in_recombinations(root->taxon,

// Set number of branch bases in recombination by iterating through
// the first part of merged blocks (i.e. only blocks on the branch to this node)
set_number_of_branch_bases_in_recombinations(root->taxon,
calculate_number_of_bases_in_recombations_excluding_gaps(merged_block_coordinates,
root->number_of_blocks,
child_sequence,
snp_locations,
current_total_snps)
current_total_snps[parent_node_index])
// root->number_of_snps)
);
// current_total_snps[node_index] = current_total_snps[parent_node_index] + root->number_of_snps; // restore?
// Set number of total bases in recombination by iterating through
// all merged blocks leading to this node
set_number_of_bases_in_recombinations(root->taxon,
calculate_number_of_bases_in_recombations_excluding_gaps(merged_block_coordinates,
(num_blocks + root->number_of_blocks),
(num_blocks[parent_node_index] + root->number_of_blocks),
child_sequence,
snp_locations,
current_total_snps)
current_total_snps[parent_node_index])
// current_total_snps[node_index])
);
free(child_sequence);

for(i = 0; i < num_current_recombinations; i++)
for(i = 0; i < parent_num_recombinations[node_index]; i++)
{
update_sequence_base('N', sequence_index, current_recombinations[i]);
update_sequence_base('N', sequence_index, parent_recombinations[node_index][i]);
}

// TODO: The stats for the number of snps in recombinations will need to be updated.
int * snps_in_recombinations = (int *) calloc((number_of_snps +1),sizeof(int));
int num_snps_in_recombinations = get_list_of_snp_indices_which_fall_in_downstream_recombinations(merged_block_coordinates,
(num_blocks + root->number_of_blocks),
(num_blocks[parent_node_index] + root->number_of_blocks),
snp_locations,
number_of_snps,
snps_in_recombinations);
Expand All @@ -187,6 +195,9 @@ void fill_in_recombinations_with_gaps(newick_node *root, int * parent_recombinat
{
// child = root->child;
set_internal_node(1,sequence_index);
// Update number of SNPs
current_total_snps[node_index] = current_total_snps[parent_node_index] + root->number_of_snps;
num_blocks[node_index] = num_blocks[parent_node_index] + root->number_of_blocks;

// while (child != NULL)
// {
Expand All @@ -208,7 +219,6 @@ void fill_in_recombinations_with_gaps(newick_node *root, int * parent_recombinat
{
set_internal_node(0,sequence_index);
}
free(current_recombinations);
free(merged_block_coordinates[0]);
free(merged_block_coordinates[1]);
free(merged_block_coordinates);
Expand Down
2 changes: 1 addition & 1 deletion src/branch_sequences.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ int calculate_window_size(int branch_genome_size, int number_of_branch_snps,int
double calculate_threshold(int branch_genome_size, int window_size, float uncorrected_p_value);
int p_value_test(int branch_genome_size, int window_size, int num_branch_snps, int block_snp_count, int min_snps, float uncorrected_p_value);
double reduce_factorial(int l, int i);
void fill_in_recombinations_with_gaps(newick_node *root, int * parent_recombinations, int parent_num_recombinations, int current_total_snps,int num_blocks, int ** current_block_coordinates,int length_of_original_genome,int * snp_locations, int number_of_snps);
void fill_in_recombinations_with_gaps(newick_node ** nodeArray, int node_index, int parent_node_index, int ** parent_recombinations, int * parent_num_recombinations, int * current_total_snps,int * num_blocks, int ** current_block_coordinates,int length_of_original_genome,int * snp_locations, int number_of_snps);
int copy_and_concat_integer_arrays(int * array_1, int array_1_size, int * array_2, int array_2_size, int * output_array);
int copy_and_concat_2d_integer_arrays(int ** array_1, int array_1_size, int ** array_2, int array_2_size, int ** output_array);
double snp_density(int length_of_sequence, int number_of_snps);
Expand Down

0 comments on commit cd52952

Please sign in to comment.