From bb0e7abf2532d4ae0b2b631965f011541b676d75 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Tue, 20 Feb 2024 19:55:46 +1100 Subject: [PATCH 01/15] Micro-optimise a few Matrix methods --- lib/Biodiverse/Matrix.pm | 5 ++- lib/Biodiverse/Matrix/Base.pm | 71 ++++++++++++++++++++------------- lib/Biodiverse/Matrix/LowMem.pm | 57 ++++++++++++++++---------- 3 files changed, 81 insertions(+), 52 deletions(-) diff --git a/lib/Biodiverse/Matrix.pm b/lib/Biodiverse/Matrix.pm index 4e93fa030..178866240 100644 --- a/lib/Biodiverse/Matrix.pm +++ b/lib/Biodiverse/Matrix.pm @@ -467,8 +467,9 @@ sub delete_element { my $exists = $self->element_pair_exists(@_) || return 0; - croak "element1 and/or element2 not defined\n" - if !( defined $args{element1} && defined $args{element2} ); + # handled in the exists check + # croak "element1 and/or element2 not defined\n" + # if !( defined $args{element1} && defined $args{element2} ); my ( $element1, $element2 ) = $exists == 1 diff --git a/lib/Biodiverse/Matrix/Base.pm b/lib/Biodiverse/Matrix/Base.pm index 81f4850b1..feb3e0de0 100644 --- a/lib/Biodiverse/Matrix/Base.pm +++ b/lib/Biodiverse/Matrix/Base.pm @@ -36,6 +36,10 @@ sub element_is_in_matrix { return exists $self->{ELEMENTS}{$element}; } +sub element_is_in_matrix_aa { + exists $_[0]->{ELEMENTS}{$_[1]}; +} + # syntactic sugar sub set_value { my $self = shift; @@ -45,32 +49,24 @@ sub set_value { sub get_value { # return the value of a pair of elements. argument checking is done by element_pair_exists. my $self = shift; my %args = @_; - - my ($element1, $element2); - my $exists = $args{pair_exists} || $self->element_pair_exists (@_); - if ($exists == 1) { - $element1 = $args{element1}; - $element2 = $args{element2}; - return $self->{BYELEMENT}{$element1}{$element2}; - } - elsif ($exists == 2) { # elements exist, but in different order - switch them - $element1 = $args{element2}; - $element2 = $args{element1}; - return $self->{BYELEMENT}{$element1}{$element2}; - } - elsif (! $exists) { - if ($args{element1} eq $args{element2} - and $self->element_is_in_matrix (element => $args{element1}) - ) { - return $self->get_param ('SELF_SIMILARITY'); # defaults to undef - } + my $exists = $args{pair_exists} // $self->element_pair_exists (@_); - return; # if we get this far then the combination does not exist - cannot get the value - } + return $self->{BYELEMENT}{$args{element1}}{$args{element2}} + if $exists == 1; + # elements exist, but in different order - switch them + return $self->{BYELEMENT}{$args{element2}}{$args{element1}} + if $exists == 2; + + # defaults to undef + return $self->get_param ('SELF_SIMILARITY') + if !$exists + and $args{element1} eq $args{element2} + and $self->element_is_in_matrix_aa ($args{element1}); - croak "[MATRICES] You seem to have added an extra result (value $exists) to" . - " sub element_pair_exists. What were you thinking?\n"; + # if we get this far then the combination does not exist + # and we cannot get the value + return; } # Same as get_value except it does not check for existence or self-similarity @@ -120,6 +116,23 @@ sub element_pair_exists { : 0; } +sub element_pair_exists_aa { + my ($self, $element1, $element2) = @_; + + Biodiverse::MissingArgument->throw ('element1 and/or element2 not defined') + if ! (defined $element1 && defined $element2); + + # avoid some excess hash lookups + my $hash_ref = $self->{BYELEMENT}; + + # need to stop autovivification of element1 or 2 + no autovivification; + return exists $hash_ref->{$element1}{$element2} ? 1 + : exists $hash_ref->{$element2}{$element1} ? 2 + : 0; +} + + sub get_element_pair_count { my $self = shift; @@ -128,9 +141,9 @@ sub get_element_pair_count { my $elements = $self->{BYELEMENT}; my $count = 0; - foreach my $subhash (values %$elements) { - $count += scalar keys %$subhash; - } + # postfix for speed + $count += scalar keys %$_ + foreach values %$elements; return $count; } @@ -141,8 +154,10 @@ sub delete_all_pairs_with_element { my $self = shift; my %args = @_; - croak "element not specified\n" if ! defined $args{element}; - croak "element does not exist\n" if ! $self->element_is_in_matrix (element => $args{element}); + croak "element not specified\n" + if ! defined $args{element}; + croak "element does not exist\n" + if ! $self->element_is_in_matrix_aa ($args{element}); my $element = $args{element}; diff --git a/lib/Biodiverse/Matrix/LowMem.pm b/lib/Biodiverse/Matrix/LowMem.pm index 2087a4d8b..c81341a39 100644 --- a/lib/Biodiverse/Matrix/LowMem.pm +++ b/lib/Biodiverse/Matrix/LowMem.pm @@ -51,16 +51,32 @@ sub new { return $self; } -sub element_is_in_matrix { +sub element_is_in_matrix { my $self = shift; my %args = @_; - + croak "element not defined\n" if ! defined $args{element}; my $element = $args{element}; return 1 if exists $self->{BYELEMENT}{$element}; - + + my $el_hash = $self->{BYELEMENT}; + foreach my $hashref (values %$el_hash) { + return 1 if exists $hashref->{$element}; + } + + return; +} + +# no real speedup with array args but we need symmetry with Base.pm +sub element_is_in_matrix_aa { + my ($self, $element) = @_; + + croak "element not defined\n" if ! defined $element; + + return 1 if exists $self->{BYELEMENT}{$element}; + my $el_hash = $self->{BYELEMENT}; foreach my $hashref (values %$el_hash) { return 1 if exists $hashref->{$element}; @@ -113,34 +129,31 @@ sub add_element_aa { # add an element pair to the object sub delete_element { # should be called delete_element_pair, but need to find where it's used first my $self = shift; my %args = @_; - croak "element1 or element2 not defined\n" - if ! defined $args{element1} - || ! defined $args{element2}; + + # taken care of in the exists sub + # croak "element1 or element2 not defined\n" + # if ! defined $args{element1} + # || ! defined $args{element2}; my $element1 = $args{element1}; my $element2 = $args{element2}; - my $exists = $self->element_pair_exists (@_); + my $exists = $self->element_pair_exists_aa ($element1, $element2) + || return 0; - if (! $exists) { - #print "WARNING: element combination does not exist\n"; - return 0; # combination does not exist - cannot delete it - } - elsif ($exists == 2) { # elements exist, but in different order - switch them - #print "DELETE ELEMENTS SWITCHING $element1 $element2\n"; - $element1 = $args{element2}; - $element2 = $args{element1}; - } + # elements exist but in different order - switch them + ($element1, $element2) = ($element2, $element1) + if $exists == 2; # now we get to the cleanup, including the containing hashes if they are now empty # all the undef - delete pairs are to ensure they get deleted properly # the hash ref must be empty (undef) or it won't be deleted # autovivification of $self->{BYELEMENT}{$element1} is avoided by $exists above - delete $self->{BYELEMENT}{$element1}{$element2}; # if exists $self->{BYELEMENT}{$element1}{$element2}; - if (scalar keys %{$self->{BYELEMENT}{$element1}} == 0) { + my $href = $self->{BYELEMENT}; + delete $href->{$element1}{$element2}; # if exists $self->{BYELEMENT}{$element1}{$element2}; + if (!keys %{$href->{$element1}}) { #print "Deleting BYELEMENT{$element1}\n"; #undef $self->{BYELEMENT}{$element1}; - defined (delete $self->{BYELEMENT}{$element1}) - || warn "ISSUES BYELEMENT $element1 $element2\n"; + delete $href->{$element1} // warn "ISSUES BYELEMENT $element1 $element2\n"; } return 1; # return success if we get this far @@ -189,8 +202,8 @@ sub get_element_pairs_with_value { while (my ($el1, $hash_ref) = each %$element_hash) { foreach my $el2 (keys %$hash_ref) { my $value = $self->get_value (element1 => $el1, element2 => $el2); - next if $val ne $value; - $results{$el1}{$el2} ++; + $results{$el1}{$el2} ++ + if $val eq $value; } } From 0cd7ab8d14916913601f308f08841123a2f6ab10 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Wed, 21 Feb 2024 12:31:52 +1100 Subject: [PATCH 02/15] Matrices: move get_element_values into Base.pm And optimise. Plus some smaller changes. --- lib/Biodiverse/Matrix.pm | 36 ++--------------------------------- lib/Biodiverse/Matrix/Base.pm | 18 ++++++++++++++++++ t/25-Matrix.t | 4 ++++ 3 files changed, 24 insertions(+), 34 deletions(-) diff --git a/lib/Biodiverse/Matrix.pm b/lib/Biodiverse/Matrix.pm index 178866240..352f79256 100644 --- a/lib/Biodiverse/Matrix.pm +++ b/lib/Biodiverse/Matrix.pm @@ -216,8 +216,7 @@ sub rebuild_value_index { # we want pairs in their stored order next EL2 - if 1 != - $self->element_pair_exists( element1 => $el1, element2 => $el2 ); + if 1 != $self->element_pair_exists_aa( $el1, $el2 ); my $val = $self->get_value( element1 => $el1, element2 => $el2 ); @@ -553,7 +552,7 @@ sub get_elements { sub get_elements_ref { my $self = shift; - return $self->{ELEMENTS} // do { $self->{ELEMENTS} = {} }; + return $self->{ELEMENTS} //= {}; } sub get_elements_as_array { @@ -598,40 +597,9 @@ sub get_element_pairs_with_value { return wantarray ? %results : \%results; } -sub get_element_values { # get all values associated with one element - my $self = shift; - my %args = @_; - - croak "element not specified (matrix)\n" if !defined $args{element}; - croak "matrix element does not exist\n" - if !$self->element_is_in_matrix( element => $args{element} ); - - my @elements = $self->get_elements_as_array; - - my %values; - foreach my $el (@elements) { - if ( - $self->element_pair_exists( - element1 => $el, - element2 => $args{element} - ) - ) - { - $values{$el} = $self->get_value( - element1 => $el, - element2 => $args{element} - ); - } - } - - return wantarray ? %values : \%values; -} - sub delete_all_elements { my $self = shift; - no autovivification; - $self->{BYVALUE} = undef; $self->{BYELEMENT} = undef; $self->{ELEMENTS} = undef; diff --git a/lib/Biodiverse/Matrix/Base.pm b/lib/Biodiverse/Matrix/Base.pm index feb3e0de0..9d20f0770 100644 --- a/lib/Biodiverse/Matrix/Base.pm +++ b/lib/Biodiverse/Matrix/Base.pm @@ -92,7 +92,25 @@ sub get_defined_value_aa { $el_ref->{$_[1]}{$_[2]} // $el_ref->{$_[2]}{$_[1]}; } +sub get_element_values { # get all values associated with one element + my $self = shift; + my %args = @_; + + croak "element not specified (matrix)\n" if !defined $args{element}; + croak "matrix element does not exist\n" + if !$self->element_is_in_matrix_aa( $args{element} ); + + my $elements = $self->get_elements_as_array; + my %values; + foreach my $el (@$elements) { + my $v = $self->get_defined_value_aa($el, $args{element}) + // next; + $values{$el} = $v; + } + + return wantarray ? %values : \%values; +} # check an element pair exists, returning: # 1 if yes, diff --git a/t/25-Matrix.t b/t/25-Matrix.t index dc4a9ee51..e4d6a5166 100644 --- a/t/25-Matrix.t +++ b/t/25-Matrix.t @@ -484,6 +484,10 @@ sub run_main_tests { } }; + my $vals = $mx->get_element_values (element => 'a'); + my $exp_vals = {b => 101, d => 104, f => 101}; + is $vals, $exp_vals, 'get_element_values'; + } From 46fa4202b093a6705e66e320d928e028d345d626 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Wed, 21 Feb 2024 12:34:00 +1100 Subject: [PATCH 03/15] Matrix.pm: avoid a while-each loop --- lib/Biodiverse/Matrix.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Biodiverse/Matrix.pm b/lib/Biodiverse/Matrix.pm index 352f79256..3d252690c 100644 --- a/lib/Biodiverse/Matrix.pm +++ b/lib/Biodiverse/Matrix.pm @@ -584,8 +584,8 @@ sub get_element_pairs_with_value { # could special case $val_key == 0 when index precision is %.2g # and we know we only have defined values - while ( my ( $el1, $hash_ref ) = each %$element_hash ) { - foreach my $el2 ( keys %$hash_ref ) { + foreach my $el1 (keys %$element_hash) { + foreach my $el2 ( keys %{$element_hash->{$el1}} ) { # Deliberately micro-optimised code # to reduce book-keeping overheads. # Note that stringification implicitly uses %.15f precision From ad6257a2b7d3838c6a477874a8158b8ae4744da7 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Wed, 21 Feb 2024 12:39:37 +1100 Subject: [PATCH 04/15] Matrix.pm: add array args variant of get_value_index_key --- lib/Biodiverse/Matrix.pm | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/lib/Biodiverse/Matrix.pm b/lib/Biodiverse/Matrix.pm index 3d252690c..bbb7e6f60 100644 --- a/lib/Biodiverse/Matrix.pm +++ b/lib/Biodiverse/Matrix.pm @@ -220,7 +220,7 @@ sub rebuild_value_index { my $val = $self->get_value( element1 => $el1, element2 => $el2 ); - my $index_val = $self->get_value_index_key( value => $val ); + my $index_val = $self->get_value_index_key_aa( $val ); $self->{BYVALUE}{$index_val}{$el1}{$el2}++; } @@ -246,6 +246,18 @@ sub get_value_index_key { return $val; } +sub get_value_index_key_aa { + my ($self, $val) = @_; + + $val // return 'undef'; + + if ( my $prec = $self->get_param('VAL_INDEX_PRECISION') ) { + $val = sprintf $prec, $val; + } + + return $val; +} + # need to flesh this out - total number of elements, symmetry, summary stats etc sub _describe { my $self = shift; @@ -447,7 +459,7 @@ sub add_element { return; } - my $index_val = $self->get_value_index_key( value => $val ); + my $index_val = $self->get_value_index_key_aa( $val ); $self->{BYELEMENT}{$element1}{$element2} = $val; $self->{BYVALUE}{$index_val}{$element1}{$element2}++; @@ -496,7 +508,7 @@ sub delete_element { // warn "ISSUES BYELEMENT $element1 $element2\n"; } - my $index_val = $self->get_value_index_key( value => $value ); + my $index_val = $self->get_value_index_key_aa( $value ); if ( !$val_index->{$index_val} ) { # a bit underhanded, but this ensures we upgrade old matrices $self->rebuild_value_index; @@ -574,7 +586,7 @@ sub get_element_pairs_with_value { my $val = $args{value}; - my $val_key = $self->get_value_index_key (value => $val); + my $val_key = $self->get_value_index_key_aa ($val); my %results; From 22b409abf46560346ea0a8a00d39b48a38f8be2c Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Wed, 21 Feb 2024 13:02:58 +1100 Subject: [PATCH 05/15] _calc_abc: properly disable redundant code block --- lib/Biodiverse/Indices/Indices.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Biodiverse/Indices/Indices.pm b/lib/Biodiverse/Indices/Indices.pm index 9b6fbf0e3..e7647065f 100644 --- a/lib/Biodiverse/Indices/Indices.pm +++ b/lib/Biodiverse/Indices/Indices.pm @@ -1894,7 +1894,7 @@ sub _calc_abc { # required by all the other indices, as it gets the labels in } # set the counts to one if using plain old abc, as the elements section doesn't obey it properly - if (0 || !($count_labels || $count_samples)) { + if (0 && !($count_labels || $count_samples)) { @label_list_master{keys %label_list_master} = (1) x scalar keys %label_list_master; @{$label_list{1}}{keys %{$label_list{1}}} = (1) x scalar keys %{$label_list{1}}; @{$label_list{2}}{keys %{$label_list{2}}} = (1) x scalar keys %{$label_list{2}}; From 2f394475935ebcb1018e011134ca82ae606ca7aa Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Wed, 21 Feb 2024 13:06:18 +1100 Subject: [PATCH 06/15] Indices.pm: pre/post calcs: avoid creating %args when it is not used Makes a difference for matrices where these are called many times. --- lib/Biodiverse/Indices.pm | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/lib/Biodiverse/Indices.pm b/lib/Biodiverse/Indices.pm index faf3cbafa..eba8abf4c 100644 --- a/lib/Biodiverse/Indices.pm +++ b/lib/Biodiverse/Indices.pm @@ -1631,32 +1631,28 @@ sub get_results_from_pre_calc_global { sub run_precalc_globals { my $self = shift; - my %args = @_; - my $results = $self->run_dependencies( %args, type => 'pre_calc_global', ); + my $results = $self->run_dependencies( @_, type => 'pre_calc_global', ); return wantarray ? %$results : $results; } sub run_precalc_locals { my $self = shift; - my %args = @_; - return $self->run_dependencies( %args, type => 'pre_calc', ); + return $self->run_dependencies( @_, type => 'pre_calc', ); } sub run_postcalc_locals { my $self = shift; - my %args = @_; - return $self->run_dependencies( %args, type => 'post_calc', ); + return $self->run_dependencies( @_, type => 'post_calc' ); } sub run_postcalc_globals { my $self = shift; - my %args = @_; - return $self->run_dependencies( %args, type => 'post_calc_global', ); + return $self->run_dependencies( @_, type => 'post_calc_global' ); } sub set_pairwise_mode { From b4ae3e1830ec337eb0dacb776d9c9482fe1542c8 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Wed, 21 Feb 2024 13:07:31 +1100 Subject: [PATCH 07/15] Cluster.pm: faster checks for existing value in other matrices --- lib/Biodiverse/Cluster.pm | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/lib/Biodiverse/Cluster.pm b/lib/Biodiverse/Cluster.pm index 1c2b7698e..730d5388a 100644 --- a/lib/Biodiverse/Cluster.pm +++ b/lib/Biodiverse/Cluster.pm @@ -1098,13 +1098,11 @@ sub build_matrix_elements { foreach my $mx (@$matrices) { # second is shadow matrix, if given #last MX if $ofh; - $value = $mx->get_defined_value_aa ($element1, $element2); - if (defined $value) { # don't redo them... - $exists ++; - } - else { - $not_exists_iter{$iter} = 1; - } + # $value = $mx->get_defined_value_aa ($element1, $element2); + # don't redo them... + defined $mx->get_defined_value_aa ($element1, $element2) + ? ($exists++) + : ($not_exists_iter{$iter} = 1); $iter ++; } From 310f9d7d70bc4e661e443375404a2b0125dc18d8 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Wed, 21 Feb 2024 13:18:36 +1100 Subject: [PATCH 08/15] Localise some variables And fix an overly zealous change from a recent commit. We need $value in more than one place. --- lib/Biodiverse/Cluster.pm | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/lib/Biodiverse/Cluster.pm b/lib/Biodiverse/Cluster.pm index 730d5388a..002a47c0c 100644 --- a/lib/Biodiverse/Cluster.pm +++ b/lib/Biodiverse/Cluster.pm @@ -1070,7 +1070,6 @@ sub build_matrix_elements { $n++; { - no autovivification; # save a bit of memory next ELEMENT2 if $already_calculated{$element2}; next ELEMENT2 if $element1 eq $element2; @@ -1087,20 +1086,20 @@ sub build_matrix_elements { # If we already have this value then get it and assign it. # Some of these contortions appear to be due to an old approach # where all matrices were built in one loop. - # Could probably drop out sooner now. - my $exists = 0; - my $iter = 0; - my %not_exists_iter; - my $value; - + # Could probably drop out sooner now. if (!$ofh) { + my $iter = 0; + my $exists = 0; + my %not_exists_iter; + my $value; + MX: foreach my $mx (@$matrices) { # second is shadow matrix, if given #last MX if $ofh; - # $value = $mx->get_defined_value_aa ($element1, $element2); + $value = $mx->get_defined_value_aa ($element1, $element2); # don't redo them... - defined $mx->get_defined_value_aa ($element1, $element2) + defined $value ? ($exists++) : ($not_exists_iter{$iter} = 1); $iter ++; From 57c9f9a73ab2db43649a67b43b19abcaf773adf2 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Wed, 21 Feb 2024 14:08:04 +1100 Subject: [PATCH 09/15] Indices: no need to call post_calc dependencies if none are defined Avoids a lot of calls when building matrices. --- lib/Biodiverse/Indices.pm | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/lib/Biodiverse/Indices.pm b/lib/Biodiverse/Indices.pm index eba8abf4c..bbbd669d3 100644 --- a/lib/Biodiverse/Indices.pm +++ b/lib/Biodiverse/Indices.pm @@ -1589,12 +1589,13 @@ sub run_calculations { my $self = shift; my %args = @_; - $self - ->reset_results; # clear any previous local results - poss redundant now + # clear any previous local results - poss redundant now + $self->reset_results; my $pre_calc_local_results = $self->run_precalc_locals(%args); - my %calcs_to_run = $self->get_valid_calculations_to_run; + use experimental qw/refaliasing/; + \my %calcs_to_run = $self->get_valid_calculations_to_run; my %results; # stores the results foreach my $calc ( keys %calcs_to_run ) { @@ -1646,6 +1647,12 @@ sub run_precalc_locals { sub run_postcalc_locals { my $self = shift; + # Most cases do not have local post calcs so we can save some time, + # especially when building pairwise matrices. + # Should perhaps be a method with caching - has_post_calc_locals + my $validated_calcs = $self->get_param('VALID_CALCULATIONS'); + return if !$validated_calcs->{calc_lists_by_type}{post_calc_local}; + return $self->run_dependencies( @_, type => 'post_calc' ); } From 42bf1a4da6077ee14398bc2efcc875091908d43b Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Wed, 21 Feb 2024 14:20:28 +1100 Subject: [PATCH 10/15] Indices.pm: streamline some dependency handling in run_dependencies --- lib/Biodiverse/Indices.pm | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/lib/Biodiverse/Indices.pm b/lib/Biodiverse/Indices.pm index bbbd669d3..b4650b51e 100644 --- a/lib/Biodiverse/Indices.pm +++ b/lib/Biodiverse/Indices.pm @@ -1547,23 +1547,19 @@ sub run_dependencies { } else { my %dep_results; - if ( exists $dep_list->{$calc} ) { - my $deps = $dep_list->{$calc} || []; + if (my $deps = $dep_list->{$calc} ) { + LOCAL_DEP: foreach my $dep (@$deps) { - my $dep_res = - exists $as_results_from{$dep} - ? $as_results_from{$dep} - : {}; + my $dep_res = $as_results_from{$dep} + || next LOCAL_DEP; @dep_results{ keys %$dep_res } = values %$dep_res; } } - if ( exists $dep_list_global->{$calc} ) { - my $deps = $dep_list_global->{$calc} || []; + if (my $deps = $dep_list_global->{$calc}) { + GLOBAL_DEP: foreach my $dep (@$deps) { - my $dep_res = - exists $as_results_from_global{$dep} - ? $as_results_from_global{$dep} - : {}; + my $dep_res = $as_results_from_global{$dep} + || next GLOBAL_DEP; @dep_results{ keys %$dep_res } = values %$dep_res; } } From be0565533fe91f27e1623f2588da8c6140e40a1f Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Wed, 21 Feb 2024 14:24:48 +1100 Subject: [PATCH 11/15] micro-optimise Matrix::get_value_index_key_aa --- lib/Biodiverse/Matrix.pm | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/Biodiverse/Matrix.pm b/lib/Biodiverse/Matrix.pm index bbb7e6f60..585b29832 100644 --- a/lib/Biodiverse/Matrix.pm +++ b/lib/Biodiverse/Matrix.pm @@ -251,11 +251,11 @@ sub get_value_index_key_aa { $val // return 'undef'; - if ( my $prec = $self->get_param('VAL_INDEX_PRECISION') ) { - $val = sprintf $prec, $val; - } + my $prec = $self->get_param('VAL_INDEX_PRECISION'); - return $val; + return $prec + ? sprintf $prec, $val + : $val; } # need to flesh this out - total number of elements, symmetry, summary stats etc From cf18e674dcc90658120850ed86c2b47eb97c146c Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Wed, 21 Feb 2024 17:03:37 +1100 Subject: [PATCH 12/15] Indices: add pairwise calc_abc variant for singleton element lists In such cases caching becomes beneficial as we are often building a matrix and repeatedly calling subs to get the same answer. --- lib/Biodiverse/Indices/Indices.pm | 108 ++++++++++++++++++++++++++++-- 1 file changed, 101 insertions(+), 7 deletions(-) diff --git a/lib/Biodiverse/Indices/Indices.pm b/lib/Biodiverse/Indices/Indices.pm index e7647065f..88310d069 100644 --- a/lib/Biodiverse/Indices/Indices.pm +++ b/lib/Biodiverse/Indices/Indices.pm @@ -1626,17 +1626,24 @@ sub calc_abc { # wrapper for _calc_abc - use the other wrappers for actual GUI my ($self, %args) = @_; delete @args{qw/count_samples count_labels}/}; + my $have_lb_lists = defined ( + $args{label_hash1} + // $args{label_hash2} + // $args{label_list1} + // $args{label_list2} + ); + + return $self->_calc_abc_pairwise_mode(%args) + if $self->get_pairwise_mode + && @{$args{element_list1} // []} == 1 + && @{$args{element_list2} // []} == 1 + && !$have_lb_lists; return $self->_calc_abc(%args) if is_hashref($args{element_list1}) || @{$args{element_list1} // []} != 1 - || defined( - $args{element_list2} - // $args{label_hash1} - // $args{label_hash2} - // $args{label_list1} - // $args{label_list2} - ); + || defined $args{element_list2} + || $have_lb_lists; return $self->_calc_abc_one_element(%args); } @@ -1746,6 +1753,93 @@ sub _calc_abc_one_element { return wantarray ? %results : \%results; } +# If we are in pairwise mode and only processing two elements +# then we can cache some of the results. +# Assumes only one of each of element1 and element2 passed. +sub _calc_abc_pairwise_mode { + my ($self, %args) = @_; + + my $element1 = $args{element_list1}[0]; + my $element2 = $args{element_list2}[0]; + + my $count_samples = $args{count_samples}; + my $count_labels = !$count_samples && $args{count_labels}; + + my (%label_hash1, %label_hash2); + + my $cache = $self->get_cached_value_dor_set_default_href ( + '_calc_abc_pairwise_mode_' . ($count_labels || '_') . ($count_samples || '_') + ); + + if (!$cache->{$element1}) { + \my %labels = $self->get_basedata_ref->get_labels_in_group_as_hash_aa($element1); + if ($count_labels) { + %label_hash1 = %labels; + } + else { + @label_hash1{keys %labels} = (1) x keys %labels; + } + $cache->{$element1} = \%label_hash1; + } + else { + \%label_hash1 = $cache->{$element1}; + } + + if (!$cache->{$element2}) { + \my %labels = $self->get_basedata_ref->get_labels_in_group_as_hash_aa($element2); + if ($count_labels) { + %label_hash2 = %labels; + } + else { + @label_hash2{keys %labels} = (1) x keys %labels; + } + $cache->{$element2} = \%label_hash2; + } + else { + \%label_hash2 = $cache->{$element2}; + } + + # now merge + my %label_list_master; + if ($count_samples || $count_labels) { + %label_list_master = %label_hash1; + pairmap {$label_list_master{$a} += $b} %label_hash2; + } + else { + %label_list_master = (%label_hash1, %label_hash2); + } + + my $abc = scalar keys %label_list_master; + + # a, b and c are simply differences of the lists + # doubled letters are to avoid clashes with globals $a and $b + my $aa + = (scalar keys %label_hash1) + + (scalar keys %label_hash2) + - $abc; + my $bb = $abc - (scalar keys %label_hash2); + my $cc = $abc - (scalar keys %label_hash1); + + my %results = ( + A => $aa, + B => $bb, + C => $cc, + ABC => $abc, + + label_hash_all => \%label_list_master, + label_hash1 => \%label_hash1, + label_hash2 => \%label_hash2, + element_list1 => {$element1 => 1}, + element_list2 => {$element2 => 1}, + element_list_all => [$element1, $element2], + element_count1 => 1, + element_count2 => 1, + element_count_all => 2, + ); + + return wantarray ? %results : \%results; +} + sub _calc_abc { # required by all the other indices, as it gets the labels in the elements my $self = shift; my %args = @_; From 12b69e4d42a837e8063732df6b8c78d263e554fe Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Wed, 21 Feb 2024 18:42:37 +1100 Subject: [PATCH 13/15] Indices: use pairwise calcs with abc2&3, centralise via a dispatcher The pairwise abc will make all the other calcs faster. The dispatcher allows centralisation of all the logic controlling which inner sub is used, thus removing a lot of repetition. --- lib/Biodiverse/Indices/Indices.pm | 77 +++++++++++++------------------ 1 file changed, 31 insertions(+), 46 deletions(-) diff --git a/lib/Biodiverse/Indices/Indices.pm b/lib/Biodiverse/Indices/Indices.pm index 88310d069..308be12a1 100644 --- a/lib/Biodiverse/Indices/Indices.pm +++ b/lib/Biodiverse/Indices/Indices.pm @@ -1626,26 +1626,8 @@ sub calc_abc { # wrapper for _calc_abc - use the other wrappers for actual GUI my ($self, %args) = @_; delete @args{qw/count_samples count_labels}/}; - my $have_lb_lists = defined ( - $args{label_hash1} - // $args{label_hash2} - // $args{label_list1} - // $args{label_list2} - ); - - return $self->_calc_abc_pairwise_mode(%args) - if $self->get_pairwise_mode - && @{$args{element_list1} // []} == 1 - && @{$args{element_list2} // []} == 1 - && !$have_lb_lists; - return $self->_calc_abc(%args) - if is_hashref($args{element_list1}) - || @{$args{element_list1} // []} != 1 - || defined $args{element_list2} - || $have_lb_lists; - - return $self->_calc_abc_one_element(%args); + return $self->_calc_abc_dispatcher(%args); } sub get_metadata_calc_abc2 { @@ -1662,22 +1644,11 @@ sub get_metadata_calc_abc2 { return $metadata_class->new(\%metadata); } +# run calc_abc, but keep a track of the label counts across groups sub calc_abc2 { - # run calc_abc, but keep a track of the label counts across groups - my ($self, %args) = @_; - - return $self->_calc_abc(%args, count_labels => 1) - if is_hashref($args{element_list1}) - || @{$args{element_list1} // []} != 1 - || defined( - $args{element_list2} - // $args{label_hash1} - // $args{label_hash2} - // $args{label_list1} - // $args{label_list2} - ); + my $self = shift; - return $self->_calc_abc_one_element(%args, count_labels => 1); + return $self->_calc_abc_dispatcher(@_, count_labels => 1); } sub get_metadata_calc_abc3 { @@ -1697,20 +1668,35 @@ sub get_metadata_calc_abc3 { # run calc_abc, but keep a track of the label counts and samples across groups sub calc_abc3 { + my $self = shift; + + return $self->_calc_abc_dispatcher(@_, count_samples => 1); +} + +# keep a lot of logic in one place +sub _calc_abc_dispatcher { my ($self, %args) = @_; - return $self->_calc_abc(%args, count_samples => 1) + my $have_lb_lists = defined ( + $args{label_hash1} + // $args{label_hash2} + // $args{label_list1} + // $args{label_list2} + ); + + return $self->_calc_abc_pairwise_mode(%args) + if $self->get_pairwise_mode + && @{$args{element_list1} // []} == 1 + && @{$args{element_list2} // []} == 1 + && !$have_lb_lists; + + return $self->_calc_abc(%args) if is_hashref($args{element_list1}) || @{$args{element_list1} // []} != 1 - || defined( - $args{element_list2} - // $args{label_hash1} - // $args{label_hash2} - // $args{label_list1} - // $args{label_list2} - ); + || defined $args{element_list2} + || $have_lb_lists; - return $self->_calc_abc_one_element(%args, count_samples => 1); + return $self->_calc_abc_one_element(%args); } # A simplified version of _calc_abc for a single element. @@ -1766,14 +1752,13 @@ sub _calc_abc_pairwise_mode { my $count_labels = !$count_samples && $args{count_labels}; my (%label_hash1, %label_hash2); - my $cache = $self->get_cached_value_dor_set_default_href ( - '_calc_abc_pairwise_mode_' . ($count_labels || '_') . ($count_samples || '_') + '_calc_abc_pairwise_mode_' . ($count_labels ? 2 : $count_samples ? 3 : 1) ); if (!$cache->{$element1}) { \my %labels = $self->get_basedata_ref->get_labels_in_group_as_hash_aa($element1); - if ($count_labels) { + if ($count_samples) { %label_hash1 = %labels; } else { @@ -1787,7 +1772,7 @@ sub _calc_abc_pairwise_mode { if (!$cache->{$element2}) { \my %labels = $self->get_basedata_ref->get_labels_in_group_as_hash_aa($element2); - if ($count_labels) { + if ($count_samples) { %label_hash2 = %labels; } else { From 17a344d7fca6a02be93253416b4637f7ace3887b Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Thu, 22 Feb 2024 07:30:14 +1100 Subject: [PATCH 14/15] calc_rw_turnover: use calc_abc2 as a prereq This avoids needless calls to calc_abc and thus less processing when building cluster matrices. --- lib/Biodiverse/Indices/RWTurnover.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Biodiverse/Indices/RWTurnover.pm b/lib/Biodiverse/Indices/RWTurnover.pm index c2241a29e..c4a4fd188 100644 --- a/lib/Biodiverse/Indices/RWTurnover.pm +++ b/lib/Biodiverse/Indices/RWTurnover.pm @@ -21,7 +21,7 @@ sub get_metadata_calc_rw_turnover { name => 'Range weighted Sorenson', reference => 'Laffan et al. (2016) https://doi.org/10.1111/2041-210X.12513', type => 'Taxonomic Dissimilarity and Comparison', - pre_calc => [qw /calc_endemism_whole_lists calc_abc/], + pre_calc => [qw /calc_endemism_whole_lists calc_abc2/], uses_nbr_lists => 2, # how many lists it must have distribution => 'nonnegative', # for A, B and C indices => { From efd90ff334f6ea54dbcd3bc5bd7a4c4f9d6e4080 Mon Sep 17 00:00:00 2001 From: shawnlaffan Date: Thu, 22 Feb 2024 11:37:35 +1100 Subject: [PATCH 15/15] Indices: simplify dependencies for RW turnover No need to calculate calc_abc for the phylo variant. This has the knock-on effect of needing to handle arrays and hashes for the element_list[12] args. The intent is to clean that up as part of issue #919. --- lib/Biodiverse/Indices/RWTurnover.pm | 30 +++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/lib/Biodiverse/Indices/RWTurnover.pm b/lib/Biodiverse/Indices/RWTurnover.pm index c4a4fd188..71c47752c 100644 --- a/lib/Biodiverse/Indices/RWTurnover.pm +++ b/lib/Biodiverse/Indices/RWTurnover.pm @@ -9,6 +9,7 @@ no warnings 'experimental::refaliasing'; use Carp; use List::Util qw /sum reduce/; +use Ref::Util qw /is_hashref/; our $VERSION = '4.99_002'; @@ -65,12 +66,20 @@ sub calc_rw_turnover { # or inverse of ranges my $cache = $self->get_cached_value_dor_set_default_href ('_calc_phylo_rwt_pairwise_branch_sum_cache'); + # use postfix idiom? + # ideally we would only be passed arrays, but see issue #919 + my $key1 = is_hashref ($args{element_list1}) + ? ((keys %{$args{element_list1}})[0]) + : (${$args{element_list1}}[0]); + my $key2 = is_hashref ($args{element_list2}) + ? ((keys %{$args{element_list2}})[0]) + : (${$args{element_list2} //[]}[0]); # Could use a reduce call to collapse the "sum map {} @list" idiom, # thus avoiding a list generation. These are only run once per group, # though, so it might not matter. - my $sum_i = $cache->{(keys %{$args{element_list1}})[0]} # use postfix deref? + my $sum_i = $cache->{$key1} //= (sum map {1 / $_} @ranges{keys %list1}) // 0; - my $sum_j = $cache->{(keys %{$args{element_list2}})[0]} + my $sum_j = $cache->{$key2} //= (sum map {1 / $_} @ranges{keys %list2}) // 0; # save some looping, mainly when there are large differences in key counts if (keys %list1 <= keys %list2) { @@ -121,7 +130,7 @@ sub get_metadata_calc_phylo_rw_turnover { name => 'Phylo Range weighted Turnover', reference => 'Laffan et al. (2016) https://doi.org/10.1111/2041-210X.12513', type => 'Phylogenetic Turnover', - pre_calc => [qw /calc_abc _calc_pe_lists_per_element_set/], + pre_calc => [qw /_calc_pe_lists_per_element_set/], # pre_calc_global => [qw / # get_node_range_hash_as_lists # get_trimmed_tree_parent_name_hash @@ -164,9 +173,20 @@ sub calc_phylo_rw_turnover { # simplify the calcs as we only need to find $aa my $cache = $self->get_cached_value_dor_set_default_href ('_calc_phylo_rwt_pairwise_branch_sum_cache'); - my $sum_i = $cache->{(keys %{$args{element_list1}})[0]} # use postfix deref? + # use postfix idiom? + # ideally we would only be passed arrays, but see issue #919 + my $key1 = is_hashref ($args{element_list1}) + ? ((keys %{$args{element_list1}})[0]) + : (${$args{element_list1}}[0]); + my $key2 = is_hashref ($args{element_list2}) + ? ((keys %{$args{element_list2}})[0]) + : (${$args{element_list2} //[]}[0]); + # Could use a reduce call to collapse the "sum map {} @list" idiom, + # thus avoiding a list generation. These are only run once per group, + # though, so it might not matter. + my $sum_i = $cache->{$key1} //= (sum values %list1) // 0; - my $sum_j = $cache->{(keys %{$args{element_list2}})[0]} + my $sum_j = $cache->{$key2} //= (sum values %list2) // 0; # save some looping, mainly when there are large differences in key counts if (keys %list1 <= keys %list2) {