From 11a9bbea325a3b455ec42fd933d65be7701b5f47 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Fri, 23 Feb 2024 11:28:02 +1100 Subject: [PATCH 01/24] Indices: optimise _calc_endemism_absolute Take advantage of the label hash global precalc, and use hash aliases instead of refs. --- lib/Biodiverse/Indices/Endemism.pm | 31 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/lib/Biodiverse/Indices/Endemism.pm b/lib/Biodiverse/Indices/Endemism.pm index 97d5a4f81..348ef6abb 100644 --- a/lib/Biodiverse/Indices/Endemism.pm +++ b/lib/Biodiverse/Indices/Endemism.pm @@ -6,6 +6,9 @@ use 5.020; our $VERSION = '4.99_002'; +use experimental 'refaliasing'; + + my $metadata_class = 'Biodiverse::Metadata::Indices'; sub get_metadata_calc_endemism_central_normalised { @@ -874,8 +877,9 @@ sub get_metadata__calc_endemism_absolute { my %metadata = ( description => $desc, name => 'Absolute endemism, internals', - uses_nbr_lists => 1, # how many sets of lists it must have - pre_calc => ['calc_abc2'], + uses_nbr_lists => 1, # how many sets of lists it must have + pre_calc => [ 'calc_abc2' ], + pre_calc_global => ['get_label_range_hash'] ); # add to if needed return $metadata_class->new(\%metadata); @@ -889,9 +893,11 @@ sub _calc_endemism_absolute { my $bd = $self->get_basedata_ref; - my $local_ranges = $args{label_hash_all}; - my $l_hash1 = $args{label_hash1}; - my $l_hash2 = $args{label_hash2}; + \my %local_ranges = $args{label_hash_all}; + \my %l_hash1 = $args{label_hash1}; + \my %l_hash2 = $args{label_hash2}; + + \my %ranges = $args{label_range_hash}; # allows us to use this for any other basedata get_* function my $function = 'get_range'; @@ -899,27 +905,28 @@ sub _calc_endemism_absolute { my ($end1, $end2, $end_all) = (0, 0, 0); my (%eh1, %eh2, %eh_all); - while (my ($sub_label, $local_range) = each %{$local_ranges}) { - my $range = $bd->$function (element => $sub_label); + foreach my $sub_label (keys %local_ranges) { + my $local_range = $local_ranges{$sub_label}; + my $range = $ranges{$sub_label}; next if $range > $local_range; # cannot be absolutely endemic $end_all++; $eh_all{$sub_label} = $local_range; - if (exists $l_hash1->{$sub_label} and $range <= $l_hash1->{$sub_label}) { + if ($l_hash1{$sub_label} and $range <= $l_hash1{$sub_label}) { $end1++; $eh1{$sub_label} = $local_range; } - if (exists $l_hash2->{$sub_label} and $range <= $l_hash2->{$sub_label}) { + if ($l_hash2{$sub_label} and $range <= $l_hash2{$sub_label}) { $end2++; $eh2{$sub_label} = $local_range; } } - my $end1_p = eval {$end1 / scalar keys %$l_hash1}; - my $end2_p = eval {$end2 / scalar keys %$l_hash2}; - my $end_all_p = eval {$end_all / scalar keys %$local_ranges}; + my $end1_p = eval {$end1 / scalar keys %l_hash1}; + my $end2_p = eval {$end2 / scalar keys %l_hash2}; + my $end_all_p = eval {$end_all / scalar keys %local_ranges}; my %results = ( END_ABS1 => $end1, From 501bd7bde1ce93881bcb0f8307bcc3f6b3547eab Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Fri, 23 Feb 2024 11:52:19 +1100 Subject: [PATCH 02/24] Minor optimisations in _calc_endemism_hier_part --- lib/Biodiverse/Indices/Endemism.pm | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/lib/Biodiverse/Indices/Endemism.pm b/lib/Biodiverse/Indices/Endemism.pm index 348ef6abb..a659a1ecb 100644 --- a/lib/Biodiverse/Indices/Endemism.pm +++ b/lib/Biodiverse/Indices/Endemism.pm @@ -433,15 +433,18 @@ sub _calc_endemism_hier_part { my @hash_ref_array = (); my @count_array = (); my $total_count = 0; - while (my ($label, $wt) = each %$wt_list) { + + foreach my $label (keys %$wt_list) { + my $wt = $wt_list->{$label}; my $contribution = $wt / $we; $total_count ++; - my $node_ref = $tree->get_node_ref (node => $label); + my $node_ref = $tree->get_node_ref_aa ($label); my $node_name = $label; - # climb the tree and add the contributions + # Climb the tree and add the contributions. + # Depth is off by one so the root is $i==-1. my $i = $depth; - while (! $node_ref->is_root_node) { + while ($i >= 0) { $hash_ref_array[$i]{$node_name} += $contribution; $count_array[$i]{$node_name} ++; $i--; From f9bcf98d18da19169c0847e8fa99f9e2d0cee220 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Fri, 23 Feb 2024 12:13:09 +1100 Subject: [PATCH 03/24] TreeNode.pm: use a linear scan for get_hash_lists_below Might as well avoid any recursion overheads. --- lib/Biodiverse/TreeNode.pm | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/lib/Biodiverse/TreeNode.pm b/lib/Biodiverse/TreeNode.pm index ef78c7455..c361b9c87 100644 --- a/lib/Biodiverse/TreeNode.pm +++ b/lib/Biodiverse/TreeNode.pm @@ -1559,15 +1559,12 @@ sub get_shared_ancestor { # get the list of hashes in the nodes sub get_hash_lists { my $self = shift; - my %args = @_; - - my @list; - foreach my $tmp (keys %{$self}) { - next if $tmp =~ /^_/; # skip the internals - push @list, $tmp if is_hashref($self->{$tmp}); - } - return @list if wantarray; - return \@list; + + my @list + = grep {$_ !~ /^_/ and is_hashref $self->{$_}} + keys %$self; + + return wantarray ? @list : \@list; } sub get_hash_lists_below { @@ -1577,9 +1574,11 @@ sub get_hash_lists_below { my %hash_list; @hash_list{@list} = undef; - foreach my $child ($self->get_children) { - my $list_below = $child->get_hash_lists_below; + my @children = $self->get_children; + while (my $child = shift @children) { + my $list_below = $child->get_hash_lists; @hash_list{@$list_below} = undef; + push @children, $child->get_children; } return wantarray From 1e7deb8488660a94d81b8dfd80639c1d040d5fd3 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Fri, 23 Feb 2024 13:14:23 +1100 Subject: [PATCH 04/24] Add an array args version of set_basedata_ref And stop throwing errors when ref is undefined in get_basedata_ref. --- lib/Biodiverse/Common.pm | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/lib/Biodiverse/Common.pm b/lib/Biodiverse/Common.pm index cf34d7ba8..e79968d9b 100644 --- a/lib/Biodiverse/Common.pm +++ b/lib/Biodiverse/Common.pm @@ -266,12 +266,18 @@ sub load_yaml_file { croak 'Loading from a YAML file is no longer supported'; } +# Orig should never have used a hash. Oh well. +sub set_basedata_ref_aa { + my ($self, $ref) = @_; + $self->set_basedata_ref(BASEDATA_REF => $ref); +} + sub set_basedata_ref { my $self = shift; my %args = @_; $self->set_param (BASEDATA_REF => $args{BASEDATA_REF}); - $self->weaken_basedata_ref; + $self->weaken_basedata_ref if defined $args{BASEDATA_REF}; return; } @@ -279,10 +285,7 @@ sub set_basedata_ref { sub get_basedata_ref { my $self = shift; - my $bd = $self->get_param ('BASEDATA_REF') - || Biodiverse::MissingBasedataRef->throw ( - message => 'Parameter BASEDATA_REF not set' - ); + my $bd = $self->get_param ('BASEDATA_REF'); return $bd; } From 4dec15ff874e4c2f5083512558c4387dfc58b5a7 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Fri, 23 Feb 2024 13:25:47 +1100 Subject: [PATCH 05/24] Trees: clone_without_caches also clears parameters This was we avoid cloning basedata refs, analysis args and the like. --- lib/Biodiverse/Randomise.pm | 5 ++++- lib/Biodiverse/Tree.pm | 12 +++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/lib/Biodiverse/Randomise.pm b/lib/Biodiverse/Randomise.pm index 22a12387e..0439ec988 100644 --- a/lib/Biodiverse/Randomise.pm +++ b/lib/Biodiverse/Randomise.pm @@ -973,12 +973,15 @@ sub compare_cluster_calcs_per_node { # Cloning via newick format clears all the params, # so avoids lingering basedata refs and the like require Biodiverse::ReadNexus; - + my $read_nexus = Biodiverse::ReadNexus->new; $read_nexus->import_newick (data => $orig_analysis->to_newick); my @tree_array = $read_nexus->get_tree_array; my $clone = $tree_array[0]; bless $clone, blessed ($orig_analysis); + # This is more direct but actually takes longer under profiling. + # Also does not clean out the previous lists. + # my $clone = $orig_analysis->clone_without_caches; $clone->rename (new_name => $orig_analysis->get_param ('NAME') . ' rand sp_calc' . $args{rand_iter}); my %clone_analysis_args = %$analysis_args; diff --git a/lib/Biodiverse/Tree.pm b/lib/Biodiverse/Tree.pm index 396013404..f97687552 100644 --- a/lib/Biodiverse/Tree.pm +++ b/lib/Biodiverse/Tree.pm @@ -3115,19 +3115,29 @@ sub clone_without_caches { # maybe should generate a new version but blessing and parenting might take longer my %saved_node_caches; + my %params = $self->get_params_hash; + my $new_tree = do { # we have to delete the new tree's caches so avoid cloning them in the first place delete local $self->{_cache}; + delete local $self->{PARAMS} or say STDERR 'woap?'; # seem not to be able to use delete local on compound structure # or maybe it is the foreach loop even though postfix $saved_node_caches{$_} = delete $self->{TREE_BY_NAME}{$_}{_cache} foreach keys %{$self->{TREE_BY_NAME}}; $self->clone; }; - # reinstate the caches + + # reinstate the caches and other settings on the original tree + # could be done as a defer block with a more recent perl $self->{TREE_BY_NAME}{$_}{_cache} = $saved_node_caches{$_} foreach keys %{$self->{TREE_BY_NAME}}; + # assign the basic params + foreach my $param (qw /OUTSUFFIX OUTSUFFIX_YAML/) { + $new_tree->set_param($param => $params{$param}); + } + # reset all the total length values $new_tree->reset_total_length; From 965c1addf93ec7626d2c9ef799e79982561efd7e Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Fri, 23 Feb 2024 13:35:17 +1100 Subject: [PATCH 06/24] Common::get_zscore_from_comp_results - avoid a lot of grepping No need to find the index names when they are in the base_list_ref already. Also use refaliasing to avoid some derefs and declutter loop variables. --- lib/Biodiverse/Common.pm | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/lib/Biodiverse/Common.pm b/lib/Biodiverse/Common.pm index e79968d9b..01ba862d3 100644 --- a/lib/Biodiverse/Common.pm +++ b/lib/Biodiverse/Common.pm @@ -2516,35 +2516,33 @@ sub get_zscore_from_comp_results { my %args = @_; # could alias this - my $comp_list_ref = $args{comp_list_ref} + \my %comp_list_ref = $args{comp_list_ref} // croak "comp_list_ref argument not specified\n"; # need the observed values - my $base_list_ref = $args{base_list_ref} + \my %base_list_ref = $args{base_list_ref} // croak "base_list_ref argument not specified\n"; my $results_list_ref = $args{results_list_ref} // {}; KEY: - foreach my $q_key (grep {$_ =~ /^Q_/} keys %$comp_list_ref) { - my $index_name = substr $q_key, 2; + foreach my $index_name (keys %base_list_ref) { - my $n = $comp_list_ref->{$q_key}; + my $n = $comp_list_ref{'Q_' . $index_name}; next KEY if !$n; my $x_key = 'SUMX_' . $index_name; my $xx_key = 'SUMXX_' . $index_name; # sum of x vals and x vals squared - my $sumx = $comp_list_ref->{$x_key}; - my $sumxx = $comp_list_ref->{$xx_key}; + my $sumx = $comp_list_ref{$x_key}; + my $sumxx = $comp_list_ref{$xx_key}; - my $z_key = $index_name; # n better be large, as we do not use n-1 - my $variance = max (0, ($sumxx - ($sumx**2) / $n) / $n); - my $obs = $base_list_ref->{$index_name}; - $results_list_ref->{$z_key} - = $variance - ? ($obs - ($sumx / $n)) / sqrt ($variance) + my $variance = ($sumxx - ($sumx**2) / $n) / $n; + + $results_list_ref->{$index_name} + = $variance > 0 + ? ($base_list_ref{$index_name} - ($sumx / $n)) / sqrt ($variance) : 0; } From 6ee11093f7ff7f1fbd5ad248fc6746d36aa0e726 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Fri, 23 Feb 2024 14:46:11 +1100 Subject: [PATCH 07/24] optimise Tree::convert_comparisons_to_significances Passing in the base list allows fewer grep comparisons. This makes a large difference when there are many lists with many keys. --- lib/Biodiverse/Common.pm | 12 ++++++++---- lib/Biodiverse/Tree.pm | 9 +++++++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/lib/Biodiverse/Common.pm b/lib/Biodiverse/Common.pm index 01ba862d3..1db264151 100644 --- a/lib/Biodiverse/Common.pm +++ b/lib/Biodiverse/Common.pm @@ -2667,15 +2667,19 @@ sub get_sig_rank_from_comp_results { my $self = shift; my %args = @_; - # could alias this \my %comp_list_ref = $args{comp_list_ref} // croak "comp_list_ref argument not specified\n"; \my %results_list_ref = $args{results_list_ref} // {}; - foreach my $c_key (grep {$_ =~ /^C_/} keys %comp_list_ref) { - - my $index_name = substr $c_key, 2; + # base_list_ref will usually be shorter so fewer comparisons will be needed + my @keys = $args{base_list_ref} + ? grep {exists $comp_list_ref{'C_' . $_}} keys %{$args{base_list_ref}} + : map {substr $_, 2} grep {$_ =~ /^C_/} keys %comp_list_ref; + + foreach my $index_name (@keys) { + + my $c_key = 'C_' . $index_name; if (!defined $comp_list_ref{$c_key}) { $results_list_ref{$index_name} = undef; diff --git a/lib/Biodiverse/Tree.pm b/lib/Biodiverse/Tree.pm index f97687552..3d20b53ba 100644 --- a/lib/Biodiverse/Tree.pm +++ b/lib/Biodiverse/Tree.pm @@ -2,7 +2,7 @@ package Biodiverse::Tree; # Package to build and store trees. # includes clustering methods -use 5.010; +use 5.020; use Carp; use strict; @@ -2251,7 +2251,12 @@ sub convert_comparisons_to_significances { ); } + # this will result in fewer greps inside the sig rank sub + my $base_ref_name = $list_name =~ s/.+>>//r; + my $base_list_ref = $base_node->get_list_ref_aa( $base_ref_name ); + $self->get_sig_rank_from_comp_results( + base_list_ref => $base_list_ref, comp_list_ref => $comp_ref, results_list_ref => $result_list_ref, # do it in-place ); @@ -2268,7 +2273,7 @@ sub convert_comparisons_to_zscores { if !defined $result_list_pfx; my $progress = Biodiverse::Progress->new(); - my $progress_text = "Calculating significances"; + my $progress_text = "Calculating z-scores"; $progress->update( $progress_text, 0 ); # find all the relevant lists for this target name From 80bc505bd822153743e7b4287a7c528d02c652ae Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Fri, 23 Feb 2024 14:49:52 +1100 Subject: [PATCH 08/24] optimise Spatial::convert_comparisons_to_significances Passing in the base list allows fewer grep comparisons. This makes a large difference when there are many lists with many keys. --- lib/Biodiverse/Spatial.pm | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/Biodiverse/Spatial.pm b/lib/Biodiverse/Spatial.pm index f9fb567ed..f84d8fe79 100644 --- a/lib/Biodiverse/Spatial.pm +++ b/lib/Biodiverse/Spatial.pm @@ -511,7 +511,16 @@ sub convert_comparisons_to_significances { list => $result_list_name, ); + # this will result in fewer greps inside the sig rank sub + my $base_ref_name = $list_name =~ s/.+>>//r; + my $base_list_ref = $self->get_list_ref( + element => $element, + list => $base_ref_name, + autovivify => 0, + ); + $self->get_sig_rank_from_comp_results ( + base_list_ref => $base_list_ref, comp_list_ref => $comp_ref, results_list_ref => $result_list_ref, # do it in-place ); From 81f38b307c86546a6e7ac982fb08136c3d386096 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Fri, 23 Feb 2024 15:29:38 +1100 Subject: [PATCH 09/24] Indices: add a hierarchical mode flag This allows future optimisations when calculating indices for cluster trees. --- lib/Biodiverse/Cluster.pm | 6 +++++- lib/Biodiverse/Indices.pm | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/lib/Biodiverse/Cluster.pm b/lib/Biodiverse/Cluster.pm index 002a47c0c..2778c822e 100644 --- a/lib/Biodiverse/Cluster.pm +++ b/lib/Biodiverse/Cluster.pm @@ -2777,7 +2777,9 @@ sub sp_calc { ); # drop out if we have none to do - return if $indices_object->get_valid_calculation_count == 0; + return if $indices_object->get_valid_calculation_count == 0; + + $indices_object->set_hierarchical_mode(1); delete $args{calculations}; # saves passing it onwards when we call the calculations delete $args{analyses}; # for backwards compat @@ -2830,6 +2832,8 @@ sub sp_calc { $self->delete_cached_metadata; + $indices_object->set_hierarchical_mode(0); + return 1; } diff --git a/lib/Biodiverse/Indices.pm b/lib/Biodiverse/Indices.pm index b4650b51e..e957cd486 100644 --- a/lib/Biodiverse/Indices.pm +++ b/lib/Biodiverse/Indices.pm @@ -1663,6 +1663,9 @@ sub set_pairwise_mode { $self->{pairwise_mode} = $mode; + croak "Cannot have both pairwise and hierarchical modes on at the same time" + if $mode && $self->get_hierarchical_mode; + return $mode; } @@ -1671,6 +1674,24 @@ sub get_pairwise_mode { $_[0]->{pairwise_mode}; } +sub set_hierarchical_mode { + my ( $self, $mode ) = @_; + + $self->{hierarchical_mode} = $mode; + + croak "Cannot have both pairwise and hierarchical modes on at the same time" + if $mode && $self->get_pairwise_mode; + + return $mode; +} + +# potential hot path so optimise to avoid arg handling +sub get_hierarchical_mode { + $_[0]->{hierarchical_mode}; +} + + + 1; __END__ From 1c6a509ab6c86c775dd7e8312bcc695599d55c38 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Fri, 23 Feb 2024 17:58:43 +1100 Subject: [PATCH 10/24] Indices: Support hierarchical calculations This allows several indices to be optimised when calculated for cluster nodes, providing they are done starting from the tips. PE has been optimised in this commit. --- lib/Biodiverse/Cluster.pm | 8 +- .../Indices/Phylogenetic/RefAlias.pm | 108 +++++++++++++++++- t/26-Cluster2.t | 49 ++++++++ 3 files changed, 161 insertions(+), 4 deletions(-) diff --git a/lib/Biodiverse/Cluster.pm b/lib/Biodiverse/Cluster.pm index 2778c822e..a09eb0bab 100644 --- a/lib/Biodiverse/Cluster.pm +++ b/lib/Biodiverse/Cluster.pm @@ -2779,7 +2779,7 @@ sub sp_calc { # drop out if we have none to do return if $indices_object->get_valid_calculation_count == 0; - $indices_object->set_hierarchical_mode(1); + $indices_object->set_hierarchical_mode(!$args{no_hierarchical_mode}); delete $args{calculations}; # saves passing it onwards when we call the calculations delete $args{analyses}; # for backwards compat @@ -2811,9 +2811,13 @@ sub sp_calc { $count / $to_do, ); + my @child_names = map {$_->get_name} $node->get_children; + my %sp_calc_values = $indices_object->run_calculations( %args, - element_list1 => [keys %{$node->get_terminal_elements}] + element_list1 => [ keys %{$node->get_terminal_elements} ], + current_node_name => $node->get_name, + current_node_child_names => \@child_names, ); foreach my $key (keys %sp_calc_values) { diff --git a/lib/Biodiverse/Indices/Phylogenetic/RefAlias.pm b/lib/Biodiverse/Indices/Phylogenetic/RefAlias.pm index 1ed764e44..878064139 100644 --- a/lib/Biodiverse/Indices/Phylogenetic/RefAlias.pm +++ b/lib/Biodiverse/Indices/Phylogenetic/RefAlias.pm @@ -1,6 +1,9 @@ package Biodiverse::Indices::Phylogenetic::RefAlias; use strict; use warnings; +use 5.022; + +use Carp qw/croak/; our $VERSION = '4.99_002'; @@ -11,11 +14,17 @@ use List::Util qw /sum/; sub _calc_pe { my $self = shift; - my %args = @_; + my %args = @_; + + my $element_list_all = $args{element_list_all}; + + return $self->_calc_pe_hierarchical(%args) + if defined $args{current_node_name} + && $self->get_hierarchical_mode + && @$element_list_all > 1; my $tree_ref = $args{trimmed_tree}; my $results_cache = $args{PE_RESULTS_CACHE}; - my $element_list_all = $args{element_list_all}; \my %node_ranges = $args{node_range}; \my %rw_node_lengths = $args{inverse_range_weighted_node_lengths}; @@ -148,6 +157,101 @@ sub _calc_pe { return wantarray ? %results : \%results; } +# _calc_pe but taking advantage of hierarchical structures +# requires it be called bottom up +sub _calc_pe_hierarchical { + my ($self, %args) = @_; + + my $element_list_all = $args{element_list_all}; + + my $current_node_name = $args{current_node_name} + // croak 'Must pass the current node name when in hierarchical mode'; + my $child_names = $args{current_node_child_names}; + + my $tree_ref = $args{trimmed_tree}; + my $results_cache = $args{PE_RESULTS_CACHE}; + \my %node_ranges = $args{node_range}; + # \my %rw_node_lengths = $args{inverse_range_weighted_node_lengths}; + + # my $bd = $args{basedata_ref} || $self->get_basedata_ref; + + # default these to undef - more meaningful than zero + my ($PE_WE, $PE_WE_P); + + my (%wts, %local_ranges, %results); + + foreach my $group (@$child_names) { + my $results_this_gp; + # use the cached results for a group if present + if (exists $results_cache->{$group}) { + $results_this_gp = $results_cache->{$group}; + } + else { + # do it the hard way + delete local $args{current_node_name}; + $results_cache->{$group} = $self->_calc_pe (%args); + } + + if (defined $results_this_gp->{PE_WE}) { + $PE_WE += $results_this_gp->{PE_WE}; + } + + # Avoid some redundant slicing and dicing when we have only one group + # Pays off when processing large data sets + if (scalar @$element_list_all == 1) { + # no need to collate anything so make a shallow copy + @results{keys %$results_this_gp} = values %$results_this_gp; + # but we do need to add to the local range hash + my $hashref = $results_this_gp->{PE_WTLIST}; + @local_ranges{keys %$hashref} = (1) x scalar keys %$hashref; + } + else { + # refalias might be a nano-optimisation here... + \my %wt_hash = $results_this_gp->{PE_WTLIST}; + + # weights need to be summed, + # unless we are starting from a blank slate + if (keys %wts) { + foreach my $node (keys %wt_hash) { + $wts{$node} += $wt_hash{$node}; + $local_ranges{$node}++; + } + } + else { + %wts = %wt_hash; + @local_ranges{keys %wt_hash} = (1) x scalar keys %wt_hash; + } + } + } + + { + no warnings 'uninitialized'; + my $total_tree_length = $tree_ref->get_total_tree_length; + + #Phylogenetic endemism = sum for all nodes of: + # (branch length/total tree length) / node range + $PE_WE_P = eval {$PE_WE / $total_tree_length}; + } + # need the collated versions for multiple elements + if (scalar @$element_list_all > 1) { + $results{PE_WE} = $PE_WE; + $results{PE_WTLIST} = \%wts; + my %nranges = %node_ranges{keys %wts}; + $results{PE_RANGELIST} = \%nranges; + } + else { + my %nranges = %node_ranges{keys %{$results{PE_WTLIST}}}; + $results{PE_RANGELIST} = \%nranges; + } + + # need to set these + $results{PE_WE_P} = $PE_WE_P; + $results{PE_LOCAL_RANGELIST} = \%local_ranges; + + $results_cache->{$current_node_name} = {%results{qw/PE_WE PE_WTLIST/}}; + + return wantarray ? %results : \%results; +} 1; diff --git a/t/26-Cluster2.t b/t/26-Cluster2.t index 22b52830e..44415de71 100644 --- a/t/26-Cluster2.t +++ b/t/26-Cluster2.t @@ -216,6 +216,55 @@ sub test_rw_turnover_mx { } +sub test_cluster_node_calcs { + + my %args = @_; + my $bd = $args{basedata_ref} || get_basedata_object_from_site_data(CELL_SIZES => [300000, 300000]); + + my $prng_seed = $args{prng_seed} || $default_prng_seed; + my $tree_ref = $args{tree_ref} || get_tree_object_from_sample_data(); + + my $cl1 = $bd->add_cluster_output (name => 'cl1'); + $cl1->run_analysis ( + prng_seed => $prng_seed, + ); + $cl1->run_spatial_calculations ( + tree_ref => $tree_ref, + spatial_calculations => [ qw/calc_pe/ ], + no_hierarchical_mode => 1, + ); + my $cl2 = $bd->add_cluster_output (name => 'cl2'); + $cl2->run_analysis ( + prng_seed => $prng_seed, + ); + $cl2->run_spatial_calculations ( + tree_ref => $tree_ref, + spatial_calculations => [ qw/calc_pe/ ], + no_hierarchical_mode => 0, + ); + + my $node_hash1 = $cl1->get_node_hash; + my $node_hash2 = $cl2->get_node_hash; + + is [sort keys %$node_hash1], [sort keys %$node_hash2], 'paranoia check: same node names'; + + my (%aggregate1, %aggregate2); + foreach my $node_name (sort keys %$node_hash1) { + my $node1 = $node_hash1->{$node_name}; + my $node2 = $node_hash2->{$node_name}; + + foreach my $list_name (sort grep {$_ !~ /NODE_VALUES/}$node1->get_hash_lists) { + my $ref1 = $node1->get_list_ref_aa($list_name); + my $ref2 = $node2->get_list_ref_aa($list_name); + my $snapped1 = {map {$_ => sprintf "%.10f", $ref1->{$_}} keys %$ref1}; + my $snapped2 = {map {$_ => sprintf "%.10f", $ref2->{$_}} keys %$ref2}; + $aggregate1{$node_name}{$list_name} = $snapped1; + $aggregate2{$node_name}{$list_name} = $snapped2; + } + } + is \%aggregate2, \%aggregate1, 'same per-node index results with and without hierarchical mode'; +} + __DATA__ @@ CLUSTER_MINI_DATA From 5d6a8b44e00f42c960dded3aa425a54fa8831af8 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Sat, 24 Feb 2024 08:35:16 +1100 Subject: [PATCH 11/24] Indices: calc_labels_not_on_tree: return early if nothing to work with Avoids a lot of hash creation and deletion with large datasets. --- lib/Biodiverse/Indices/Phylogenetic.pm | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/Biodiverse/Indices/Phylogenetic.pm b/lib/Biodiverse/Indices/Phylogenetic.pm index 707bcdb05..518183a80 100644 --- a/lib/Biodiverse/Indices/Phylogenetic.pm +++ b/lib/Biodiverse/Indices/Phylogenetic.pm @@ -1627,6 +1627,15 @@ sub calc_labels_not_on_tree { my $not_on_tree = $args{labels_not_on_tree}; + if (!keys %$not_on_tree) { + my $res = { + PHYLO_LABELS_NOT_ON_TREE => {}, + PHYLO_LABELS_NOT_ON_TREE_N => 0, + PHYLO_LABELS_NOT_ON_TREE_P => 0, + }; + return wantarray ? %$res : $res; + } + my %labels1 = %{$args{label_hash_all}}; my $richness = scalar keys %labels1; delete @labels1{keys %$not_on_tree}; From 67f655e1bcb8d264f3d104fd19e6b5e2e55d7345 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Sat, 24 Feb 2024 10:02:23 +1100 Subject: [PATCH 12/24] Indices: add a hierarchical variant of get_path_lengths_to_root_node Speeds up PD calcs for cluster trees. --- lib/Biodiverse/Indices/Phylogenetic.pm | 42 +++++++++++++++++++++++--- t/26-Cluster2.t | 6 ++-- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/lib/Biodiverse/Indices/Phylogenetic.pm b/lib/Biodiverse/Indices/Phylogenetic.pm index 518183a80..9e2f02985 100644 --- a/lib/Biodiverse/Indices/Phylogenetic.pm +++ b/lib/Biodiverse/Indices/Phylogenetic.pm @@ -576,14 +576,24 @@ sub get_path_lengths_to_root_node { my $cache = !$args{no_cache}; #$cache = 0; # turn it off for debug my $el_list = $args{el_list} // []; - + + # if ($self->get_hierarchical_mode && $args{current_node_name}) { + # say STDERR 'KOO ' . ($args{current_node_name} // 'undef') . ' ' + # . scalar @{$args{element_list1} //[]}; + # } + return $self->_get_path_lengths_to_root_node_hierarchical(%args) + if defined $args{current_node_name} + && $self->get_hierarchical_mode + && scalar @{$args{element_list1} //[]} > 1; + # have we cached it? - #my $use_path_cache = $cache && $self->get_pairwise_mode(); + # caching makes sense only if we have + # only one element (group) containing labels + # or we are in hierarchical mode, but that is handled separately my $use_path_cache = $cache && $self->get_param('USE_PATH_LENGTH_CACHE_BY_GROUP') - && scalar @$el_list == 1; # caching makes sense only if we have - # only one element (group) containing labels + && scalar @$el_list == 1; if ($use_path_cache) { my $cache_h = $args{path_length_cache}; @@ -679,6 +689,30 @@ sub get_path_lengths_to_root_node { return wantarray ? %$path_hash : $path_hash; } +sub _get_path_lengths_to_root_node_hierarchical { + my ($self, %args) = @_; + + my $current_node_name = $args{current_node_name} + // croak 'Must pass the current node name when in hierarchical mode'; + my $child_names = $args{current_node_child_names}; + my $cache_h = $args{path_length_cache}; + + my %path_combined; + + foreach my $child (@$child_names) { + my $path = $cache_h->{$child}; + if (!$path) { + # need to calculate it + delete local $args{$current_node_name}; + $path = $self->get_path_lengths_to_root_node(%args); + } + @path_combined{keys %$path} = values %$path; + } + $cache_h->{$current_node_name} = \%path_combined; + + return wantarray ? %path_combined : \%path_combined; +} + sub get_metadata_calc_pe { diff --git a/t/26-Cluster2.t b/t/26-Cluster2.t index 44415de71..0100c5278 100644 --- a/t/26-Cluster2.t +++ b/t/26-Cluster2.t @@ -224,13 +224,15 @@ sub test_cluster_node_calcs { my $prng_seed = $args{prng_seed} || $default_prng_seed; my $tree_ref = $args{tree_ref} || get_tree_object_from_sample_data(); + my $calcs = [qw/calc_pe calc_pd/]; + my $cl1 = $bd->add_cluster_output (name => 'cl1'); $cl1->run_analysis ( prng_seed => $prng_seed, ); $cl1->run_spatial_calculations ( tree_ref => $tree_ref, - spatial_calculations => [ qw/calc_pe/ ], + spatial_calculations => $calcs, no_hierarchical_mode => 1, ); my $cl2 = $bd->add_cluster_output (name => 'cl2'); @@ -239,7 +241,7 @@ sub test_cluster_node_calcs { ); $cl2->run_spatial_calculations ( tree_ref => $tree_ref, - spatial_calculations => [ qw/calc_pe/ ], + spatial_calculations => $calcs, no_hierarchical_mode => 0, ); From 6d9c5539e44c7f9e54bc613e67a0129b4788f7b1 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Sat, 24 Feb 2024 10:16:46 +1100 Subject: [PATCH 13/24] Indices: _calc_endemism_hier_part: avoid some method calls Use a treenode method that caches, rather than repeatedly calling methods to get the same answer. --- lib/Biodiverse/Indices/Endemism.pm | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/Biodiverse/Indices/Endemism.pm b/lib/Biodiverse/Indices/Endemism.pm index a659a1ecb..16a01056b 100644 --- a/lib/Biodiverse/Indices/Endemism.pm +++ b/lib/Biodiverse/Indices/Endemism.pm @@ -439,17 +439,18 @@ sub _calc_endemism_hier_part { my $contribution = $wt / $we; $total_count ++; my $node_ref = $tree->get_node_ref_aa ($label); - my $node_name = $label; # Climb the tree and add the contributions. # Depth is off by one so the root is $i==-1. my $i = $depth; - while ($i >= 0) { + # this call caches + my $path = $node_ref->get_path_name_array_to_root_node_aa; + + foreach my $node_name (@$path) { $hash_ref_array[$i]{$node_name} += $contribution; $count_array[$i]{$node_name} ++; $i--; - $node_ref = $node_ref->get_parent; - $node_name = $node_ref->get_name; + last if $i < 0; } } From 8a69ac2b9362f87aa946d977612abefe11403fb5 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Sat, 24 Feb 2024 10:22:07 +1100 Subject: [PATCH 14/24] Indices: _calc_endemism_hier_part: refactor some variables --- lib/Biodiverse/Indices/Endemism.pm | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/Biodiverse/Indices/Endemism.pm b/lib/Biodiverse/Indices/Endemism.pm index 16a01056b..4a6512f08 100644 --- a/lib/Biodiverse/Indices/Endemism.pm +++ b/lib/Biodiverse/Indices/Endemism.pm @@ -432,12 +432,11 @@ sub _calc_endemism_hier_part { my @hash_ref_array = (); my @count_array = (); - my $total_count = 0; + my $total_count = keys %$wt_list; foreach my $label (keys %$wt_list) { - my $wt = $wt_list->{$label}; - my $contribution = $wt / $we; - $total_count ++; + my $contribution = $wt_list->{$label} / $we; + my $node_ref = $tree->get_node_ref_aa ($label); # Climb the tree and add the contributions. From a37dbb330175f97d953e94c8bf3b63085b75e90d Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Sat, 24 Feb 2024 10:23:31 +1100 Subject: [PATCH 15/24] delete commented code --- lib/Biodiverse/Indices/Phylogenetic.pm | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/Biodiverse/Indices/Phylogenetic.pm b/lib/Biodiverse/Indices/Phylogenetic.pm index 9e2f02985..24a198bb6 100644 --- a/lib/Biodiverse/Indices/Phylogenetic.pm +++ b/lib/Biodiverse/Indices/Phylogenetic.pm @@ -577,10 +577,6 @@ sub get_path_lengths_to_root_node { #$cache = 0; # turn it off for debug my $el_list = $args{el_list} // []; - # if ($self->get_hierarchical_mode && $args{current_node_name}) { - # say STDERR 'KOO ' . ($args{current_node_name} // 'undef') . ' ' - # . scalar @{$args{element_list1} //[]}; - # } return $self->_get_path_lengths_to_root_node_hierarchical(%args) if defined $args{current_node_name} && $self->get_hierarchical_mode From 9b1846d9543352f97d7f14a9aa194063934f727e Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Sat, 24 Feb 2024 11:08:12 +1100 Subject: [PATCH 16/24] Indices: refactor hierarchical node details It is cleaner to pack the node and child names in their own structure. That also enables later additions without adding yet more top level arguments. --- lib/Biodiverse/Cluster.pm | 11 ++++++---- lib/Biodiverse/Indices/Phylogenetic.pm | 17 ++++++++------- .../Indices/Phylogenetic/RefAlias.pm | 21 ++++++++++--------- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/lib/Biodiverse/Cluster.pm b/lib/Biodiverse/Cluster.pm index a09eb0bab..1fdd58ae8 100644 --- a/lib/Biodiverse/Cluster.pm +++ b/lib/Biodiverse/Cluster.pm @@ -2811,13 +2811,16 @@ sub sp_calc { $count / $to_do, ); - my @child_names = map {$_->get_name} $node->get_children; + # needs a better name + my $current_node_details = { + name => $node->get_name, + child_names => [map {$_->get_name} $node->get_children], + }; my %sp_calc_values = $indices_object->run_calculations( %args, - element_list1 => [ keys %{$node->get_terminal_elements} ], - current_node_name => $node->get_name, - current_node_child_names => \@child_names, + element_list1 => [ keys %{$node->get_terminal_elements} ], + current_node_details => $current_node_details, ); foreach my $key (keys %sp_calc_values) { diff --git a/lib/Biodiverse/Indices/Phylogenetic.pm b/lib/Biodiverse/Indices/Phylogenetic.pm index 24a198bb6..ab6e17918 100644 --- a/lib/Biodiverse/Indices/Phylogenetic.pm +++ b/lib/Biodiverse/Indices/Phylogenetic.pm @@ -578,7 +578,7 @@ sub get_path_lengths_to_root_node { my $el_list = $args{el_list} // []; return $self->_get_path_lengths_to_root_node_hierarchical(%args) - if defined $args{current_node_name} + if defined $args{current_node_details} && $self->get_hierarchical_mode && scalar @{$args{element_list1} //[]} > 1; @@ -688,10 +688,13 @@ sub get_path_lengths_to_root_node { sub _get_path_lengths_to_root_node_hierarchical { my ($self, %args) = @_; - my $current_node_name = $args{current_node_name} - // croak 'Must pass the current node name when in hierarchical mode'; - my $child_names = $args{current_node_child_names}; - my $cache_h = $args{path_length_cache}; + my $node_data = $args{current_node_details} + // croak 'Must pass the current node details when in hierarchical mode'; + my $node_name = $node_data->{name} + // croak 'Missing current node name in hierarchical mode'; + my $child_names = $node_data->{child_names}; + + my $cache_h = $args{path_length_cache}; my %path_combined; @@ -699,12 +702,12 @@ sub _get_path_lengths_to_root_node_hierarchical { my $path = $cache_h->{$child}; if (!$path) { # need to calculate it - delete local $args{$current_node_name}; + delete local $args{current_node_details}; $path = $self->get_path_lengths_to_root_node(%args); } @path_combined{keys %$path} = values %$path; } - $cache_h->{$current_node_name} = \%path_combined; + $cache_h->{$node_name} = \%path_combined; return wantarray ? %path_combined : \%path_combined; } diff --git a/lib/Biodiverse/Indices/Phylogenetic/RefAlias.pm b/lib/Biodiverse/Indices/Phylogenetic/RefAlias.pm index 878064139..c4f42c8a3 100644 --- a/lib/Biodiverse/Indices/Phylogenetic/RefAlias.pm +++ b/lib/Biodiverse/Indices/Phylogenetic/RefAlias.pm @@ -19,7 +19,7 @@ sub _calc_pe { my $element_list_all = $args{element_list_all}; return $self->_calc_pe_hierarchical(%args) - if defined $args{current_node_name} + if defined $args{current_node_details} && $self->get_hierarchical_mode && @$element_list_all > 1; @@ -164,16 +164,15 @@ sub _calc_pe_hierarchical { my $element_list_all = $args{element_list_all}; - my $current_node_name = $args{current_node_name} - // croak 'Must pass the current node name when in hierarchical mode'; - my $child_names = $args{current_node_child_names}; + my $node_data = $args{current_node_details} + // croak 'Must pass the current node details when in hierarchical mode'; + my $node_name = $node_data->{name} + // croak 'Missing current node name in hierarchical mode'; + my $child_names = $node_data->{child_names}; my $tree_ref = $args{trimmed_tree}; my $results_cache = $args{PE_RESULTS_CACHE}; \my %node_ranges = $args{node_range}; - # \my %rw_node_lengths = $args{inverse_range_weighted_node_lengths}; - - # my $bd = $args{basedata_ref} || $self->get_basedata_ref; # default these to undef - more meaningful than zero my ($PE_WE, $PE_WE_P); @@ -188,8 +187,10 @@ sub _calc_pe_hierarchical { } else { # do it the hard way - delete local $args{current_node_name}; - $results_cache->{$group} = $self->_calc_pe (%args); + delete local $args{current_node_details}; + $results_this_gp + = $results_cache->{$group} + = $self->_calc_pe (%args); } if (defined $results_this_gp->{PE_WE}) { @@ -249,7 +250,7 @@ sub _calc_pe_hierarchical { $results{PE_WE_P} = $PE_WE_P; $results{PE_LOCAL_RANGELIST} = \%local_ranges; - $results_cache->{$current_node_name} = {%results{qw/PE_WE PE_WTLIST/}}; + $results_cache->{$node_name} = {%results{qw/PE_WE PE_WTLIST/}}; return wantarray ? %results : \%results; } From 744c430fd23410a8d53b1cca28655bcccee60434 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Sat, 24 Feb 2024 12:08:49 +1100 Subject: [PATCH 17/24] Squeeze a little more performance out of compare_lists_by_item Maybe. --- lib/Biodiverse/Common.pm | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/Biodiverse/Common.pm b/lib/Biodiverse/Common.pm index 1db264151..16cd4d377 100644 --- a/lib/Biodiverse/Common.pm +++ b/lib/Biodiverse/Common.pm @@ -2423,7 +2423,7 @@ sub compare_lists_by_item { foreach my $index (keys %base_ref) { next COMP_BY_ITEM - if !(defined $base_ref{$index} && defined $comp_ref{$index}); + if !(defined $comp_ref{$index} && defined $base_ref{$index}); # compare at 10 decimal place precision # this also allows for serialisation which @@ -2441,18 +2441,18 @@ sub compare_lists_by_item { # SUMX is the sum of compared values # SUMXX is the sum of squared compared values # The latter two are used in z-score calcs - $results{"C_$index"} += $increment; - $results{"Q_$index"} ++; - $results{"P_$index"} = $results{"C_$index"} - / $results{"Q_$index"}; + # obfuscated to squeeze as much speed as we can + # $results{"C_$index"} += $increment; + # $results{"Q_$index"} ++; + $results{"P_$index"} = ($results{"C_$index"} += $increment) + / (++$results{"Q_$index"}); # use original vals for sums - $results{"SUMX_$index"} += $comp_ref{$index}; - $results{"SUMXX_$index"} += ($comp_ref{$index}**2); + $results{"SUMX_$index"} += $comp_ref{$index}; + $results{"SUMXX_$index"} += ($comp_ref{$index}**2); # track the number of ties - if (abs($diff) <= DEFAULT_PRECISION_SMALL) { - $results{"T_$index"} ++; - } + $results{"T_$index"} ++ + if (abs($diff) <= DEFAULT_PRECISION_SMALL); } return; From 758652828a9d2a4f879795057df0c5dc502f3904 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Sat, 24 Feb 2024 12:18:58 +1100 Subject: [PATCH 18/24] TreeNode::add_to_lists: optimise Use direct assignment if starting with empty list. --- lib/Biodiverse/TreeNode.pm | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/lib/Biodiverse/TreeNode.pm b/lib/Biodiverse/TreeNode.pm index c361b9c87..db297d372 100644 --- a/lib/Biodiverse/TreeNode.pm +++ b/lib/Biodiverse/TreeNode.pm @@ -2445,14 +2445,22 @@ sub add_to_lists { $self->{$list} = $values; } elsif (is_hashref($values)) { - $self->{$list} = {} if ! exists $self->{$list}; - next if ! scalar keys %$values; - @{$self->{$list}}{keys %$values} = values %$values; # add using a slice - } + if (!$self->{$list}) { + $self->{$list} = {%$values}; + } + else { + next if !scalar keys %$values; + @{$self->{$list}}{keys %$values} = values %$values; + } + } elsif (is_arrayref($values)) { - $self->{$list} = [] if ! exists $self->{$list}; - next if ! scalar @$values; - push @{$self->{$list}}, @$values; + if (!$self->{$list}) { + $self->{$list} = [@$values]; + } + else { + next if !scalar @$values; + push @{$self->{$list}}, @$values; + } } else { croak "add_to_lists warning, no valid list ref passed\n"; From 0c34d12c3fef05591025543a2c769172c38e8013 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Sat, 24 Feb 2024 12:21:45 +1100 Subject: [PATCH 19/24] Cluster spatial calcs: add lists by ref Avoids a lot of copying. --- lib/Biodiverse/Cluster.pm | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/Biodiverse/Cluster.pm b/lib/Biodiverse/Cluster.pm index 1fdd58ae8..61c3a2593 100644 --- a/lib/Biodiverse/Cluster.pm +++ b/lib/Biodiverse/Cluster.pm @@ -2827,7 +2827,10 @@ sub sp_calc { if (is_arrayref($sp_calc_values{$key}) || is_hashref($sp_calc_values{$key})) { - $node->add_to_lists ($key => $sp_calc_values{$key}); + $node->add_to_lists ( + $key => $sp_calc_values{$key}, + use_ref => 1, + ); delete $sp_calc_values{$key}; } } From 41685467e60578063f487d34dbfb267970cad42a Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Sat, 24 Feb 2024 12:44:17 +1100 Subject: [PATCH 20/24] compare_lists_by_item: lift a var outside the loop This can be a _very_ hot loop so even small differences add up. --- lib/Biodiverse/Common.pm | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/Biodiverse/Common.pm b/lib/Biodiverse/Common.pm index 16cd4d377..feb2d0f38 100644 --- a/lib/Biodiverse/Common.pm +++ b/lib/Biodiverse/Common.pm @@ -2418,6 +2418,7 @@ sub compare_lists_by_item { \my %base_ref = $args{base_list_ref}; \my %comp_ref = $args{comp_list_ref}; \my %results = $args{results_list_ref}; + my ($diff, $increment); COMP_BY_ITEM: foreach my $index (keys %base_ref) { @@ -2428,8 +2429,8 @@ sub compare_lists_by_item { # compare at 10 decimal place precision # this also allows for serialisation which # rounds the numbers to 15 decimals - my $diff = $base_ref{$index} - $comp_ref{$index}; - my $increment = $diff > DEFAULT_PRECISION_SMALL ? 1 : 0; + $diff = $base_ref{$index} - $comp_ref{$index}; + $increment = $diff > DEFAULT_PRECISION_SMALL ? 1 : 0; # for debug, but leave just in case #carp "$element, $op\n$comp\n$base " . ($comp - $base) if $increment; From b2ee3eddab6174829674a5124d7c6ceeba62d95b Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Sat, 24 Feb 2024 14:01:29 +1100 Subject: [PATCH 21/24] Indices: cache the current results from each sub These are cleared as we go to avoid leakage. --- lib/Biodiverse/Indices.pm | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/Biodiverse/Indices.pm b/lib/Biodiverse/Indices.pm index e957cd486..d30c65d20 100644 --- a/lib/Biodiverse/Indices.pm +++ b/lib/Biodiverse/Indices.pm @@ -1533,11 +1533,16 @@ sub run_dependencies { my $tmp = $self->get_param('AS_RESULTS_FROM_GLOBAL') || {}; my %as_results_from_global = %$tmp; # make a copy + state $cache_name_local_results = 'AS_RESULTS_FROM_LOCAL'; + # Now we run the calculations at this level. # We also keep track of what has been run # to avoid repetition through multiple dependencies. my %results; my %as_results_from; + # make sure this is new each iteration + $self->set_cached_value ($cache_name_local_results => \%as_results_from); + foreach my $calc (@$calc_list) { my $calc_results; @@ -1574,6 +1579,9 @@ sub run_dependencies { $results{$calc} = $calc_results; } + # We refresh each call above, but this ensures last one is cleaned up. + $self->delete_cached_value($cache_name_local_results); + if ( $type eq 'pre_calc_global' ) { $self->set_param( AS_RESULTS_FROM_GLOBAL => \%as_results_from_global ); } From e173a475ad317c05ed74dbd0d4f284811ae975b4 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Sat, 24 Feb 2024 14:21:03 +1100 Subject: [PATCH 22/24] Indices: reuse whole and central endemism results when appropriate If the second neighbour set is empty then the whole and central variants return the same results. So short circuit in these cases. --- lib/Biodiverse/Indices/Endemism.pm | 66 +++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/lib/Biodiverse/Indices/Endemism.pm b/lib/Biodiverse/Indices/Endemism.pm index 4a6512f08..03cd22ce4 100644 --- a/lib/Biodiverse/Indices/Endemism.pm +++ b/lib/Biodiverse/Indices/Endemism.pm @@ -298,11 +298,24 @@ sub get_metadata_calc_endemism_central_hier_part { } sub calc_endemism_central_hier_part { - my $self = shift; + my ($self, %args) = @_; + + # If we have no nbrs in set 2 then we are the same as the "whole" variant. + # So just grab its values if it has already been calculated. + if (!keys %{$args{label_hash2}}) { + my $cache_hash = $self->get_cached_value('AS_RESULTS_FROM_LOCAL'); + my $cached = $cache_hash->{calc_endemism_whole_hier_part}; + if ($cached) { + my %remapped; + @remapped{map {$_ =~ s/ENDW/ENDC/r} keys %$cached} + = values %$cached; + return wantarray ? %remapped : \%remapped; + } + } return $self->_calc_endemism_hier_part ( - @_, - prefix => 'ENDC_HPART_', + %args, + prefix => 'ENDC_HPART_', ); } @@ -317,11 +330,25 @@ sub get_metadata_calc_endemism_whole_hier_part { } sub calc_endemism_whole_hier_part { - my $self = shift; + my ($self, %args) = @_; + + # If we have no nbrs in set 2 then we are the same as the "central" variant. + # So just grab its values if it has already been calculated. + if (!keys %{$args{label_hash2}}) { + my $cache_hash = $self->get_cached_value('AS_RESULTS_FROM_LOCAL'); + my $cached = $cache_hash->{calc_endemism_central_hier_part}; + if ($cached) { + # say STDERR join ' ', sort keys %$cached; + my %remapped; + @remapped{map {$_ =~ s/ENDC/ENDW/r} keys %$cached} + = values %$cached; + return wantarray ? %remapped : \%remapped; + } + } return $self->_calc_endemism_hier_part ( - @_, - prefix => 'ENDW_HPART_', + %args, + prefix => 'ENDW_HPART_', ); } @@ -407,6 +434,7 @@ sub metadata_for_calc_endemism_hier_part { formula => $formula, pre_calc => [ "_calc_endemism_$endemism_type", + 'calc_abc2', ], pre_calc_global => 'get_basedata_labels_as_tree', uses_nbr_lists => 1, # how many sets of lists it must have @@ -555,6 +583,15 @@ sub _calc_endemism_central { my $self = shift; my %args = @_; + # If we have no nbrs in set 2 then we are the same as the "whole" variant. + # So just grab its values if it has already been calculated. + if (!keys %{$args{label_hash2}}) { + my $cache_hash = $self->get_cached_value('AS_RESULTS_FROM_LOCAL'); + my $cached = $cache_hash->{_calc_endemism_whole}; + return wantarray ? %$cached : $cached + if $cached; + } + return $self->_calc_endemism(%args, end_central => 1); } @@ -694,8 +731,18 @@ sub get_metadata__calc_endemism_whole { # wrapper sub sub _calc_endemism_whole { - my $self = shift; - return $self->_calc_endemism(@_, end_central => 0); + my ($self, %args) = @_; + + # If we have no nbrs in set 2 then we are the same as the "central" variant. + # So just grab its values if it has already been calculated. + if (!keys %{$args{label_hash2}}) { + my $cache_hash = $self->get_cached_value('AS_RESULTS_FROM_LOCAL'); + my $cached = $cache_hash->{_calc_endemism_central}; + return wantarray ? %$cached : $cached + if $cached; + } + + return $self->_calc_endemism(%args, end_central => 0); } # Calculate endemism. Private method called by others @@ -707,8 +754,7 @@ sub _calc_endemism { my $bd = $self->get_basedata_ref; # if element_list2 is specified and end_central == 1, - # then it will consider those elements in the local range calculations, - # but only use those labels that occur in the element_list1 + # then it will use the local ranges across sets 1 and 2 my $local_ranges = $args{label_hash_all}; my $label_list = $args{end_central} From 16a4f86f0cd535e0b701e075bc54874214172661 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Sat, 24 Feb 2024 16:18:13 +1100 Subject: [PATCH 23/24] formatting --- lib/Biodiverse/Indices.pm | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/Biodiverse/Indices.pm b/lib/Biodiverse/Indices.pm index d30c65d20..337a234d3 100644 --- a/lib/Biodiverse/Indices.pm +++ b/lib/Biodiverse/Indices.pm @@ -794,13 +794,11 @@ sub parse_dependencies_for_calc { $self->_convert_to_array( input => $required_args ); foreach my $required_arg ( sort @$reqd_args_a ) { - my $re = qr /^($required_arg)$/ - ; # match is used in the grep? Was used in now-removed code. + my $re = qr /^($required_arg)$/; my $is_defined; CALC_ARG: foreach - my $calc_arg ( sort grep { $_ =~ $re } keys %$calc_args ) - { + my $calc_arg ( sort grep { $_ =~ $re } keys %$calc_args ) { if ( defined $calc_args->{$calc_arg} ) { $is_defined++; last CALC_ARG; From 62fd8a1eab94b9f7d43e2024627f3ac74dcee807 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Sat, 24 Feb 2024 16:36:06 +1100 Subject: [PATCH 24/24] simplify by using List::Util::any --- lib/Biodiverse/Indices.pm | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/lib/Biodiverse/Indices.pm b/lib/Biodiverse/Indices.pm index 337a234d3..48cbec882 100644 --- a/lib/Biodiverse/Indices.pm +++ b/lib/Biodiverse/Indices.pm @@ -12,7 +12,7 @@ use warnings; #use Data::Dumper; use Scalar::Util qw /blessed weaken/; use List::MoreUtils qw /uniq/; -use List::Util qw /sum/; +use List::Util qw /sum any/; use English ( -no_match_vars ); use Ref::Util qw { :all }; use JSON::MaybeXS; @@ -795,15 +795,9 @@ sub parse_dependencies_for_calc { foreach my $required_arg ( sort @$reqd_args_a ) { my $re = qr /^($required_arg)$/; - my $is_defined; - CALC_ARG: - foreach - my $calc_arg ( sort grep { $_ =~ $re } keys %$calc_args ) { - if ( defined $calc_args->{$calc_arg} ) { - $is_defined++; - last CALC_ARG; - } - } + my $is_defined + = any { $_ =~ $re && defined $calc_args->{$_}} + sort keys %$calc_args; if ( !$is_defined ) { Biodiverse::Indices::MissingRequiredArguments->throw(