Merge pull request #920 from shawnlaffan/matrices_2024

Faster cluster matrix creation Initially focused on the matrix classes but then extended to the index calculations.
shawnlaffan · Feb 22, 2024 · 865f15d · 865f15d
2 parents da1e625 + efd90ff
commit 865f15d
Show file tree

Hide file tree

Showing 8 changed files with 296 additions and 170 deletions.
diff --git a/lib/Biodiverse/Cluster.pm b/lib/Biodiverse/Cluster.pm
@@ -1070,7 +1070,6 @@ sub build_matrix_elements {
         $n++;
 
         {
-            no autovivification;  #  save a bit of memory
             next ELEMENT2 if $already_calculated{$element2};
             next ELEMENT2 if $element1 eq $element2;
 
@@ -1087,24 +1086,22 @@ sub build_matrix_elements {
         #  If we already have this value then get it and assign it.
         #  Some of these contortions appear to be due to an old approach
         #  where all matrices were built in one loop.
-        #  Could probably drop out sooner now. 
-        my $exists = 0;
-        my $iter   = 0;
-        my %not_exists_iter;
-        my $value;
-
+        #  Could probably drop out sooner now.
         if (!$ofh) {
+            my $iter   = 0;
+            my $exists = 0;
+            my %not_exists_iter;
+            my $value;
+
           MX:
             foreach my $mx (@$matrices) {  #  second is shadow matrix, if given
                 #last MX if $ofh;
 
                 $value = $mx->get_defined_value_aa ($element1, $element2);
-                if (defined $value) {  #  don't redo them...
-                    $exists ++;
-                }
-                else {
-                    $not_exists_iter{$iter} = 1;
-                }
+                #  don't redo them...
+                defined $value
+                    ? ($exists++)
+                    : ($not_exists_iter{$iter} = 1);
                 $iter ++;
             }
 

diff --git a/lib/Biodiverse/Indices.pm b/lib/Biodiverse/Indices.pm
@@ -1547,23 +1547,19 @@ sub run_dependencies {
         }
         else {
             my %dep_results;
-            if ( exists $dep_list->{$calc} ) {
-                my $deps = $dep_list->{$calc} || [];
+            if (my $deps = $dep_list->{$calc} ) {
+              LOCAL_DEP:
                 foreach my $dep (@$deps) {
-                    my $dep_res =
-                      exists $as_results_from{$dep}
-                      ? $as_results_from{$dep}
-                      : {};
+                    my $dep_res = $as_results_from{$dep}
+                        || next LOCAL_DEP;
                     @dep_results{ keys %$dep_res } = values %$dep_res;
                 }
             }
-            if ( exists $dep_list_global->{$calc} ) {
-                my $deps = $dep_list_global->{$calc} || [];
+            if (my $deps = $dep_list_global->{$calc}) {
+              GLOBAL_DEP:
                 foreach my $dep (@$deps) {
-                    my $dep_res =
-                      exists $as_results_from_global{$dep}
-                      ? $as_results_from_global{$dep}
-                      : {};
+                    my $dep_res = $as_results_from_global{$dep}
+                        || next GLOBAL_DEP;
                     @dep_results{ keys %$dep_res } = values %$dep_res;
                 }
             }
@@ -1589,12 +1585,13 @@ sub run_calculations {
     my $self = shift;
     my %args = @_;
 
-    $self
-      ->reset_results;  #  clear any previous local results - poss redundant now
+    #  clear any previous local results - poss redundant now
+    $self->reset_results;
 
     my $pre_calc_local_results = $self->run_precalc_locals(%args);
 
-    my %calcs_to_run = $self->get_valid_calculations_to_run;
+    use experimental qw/refaliasing/;
+    \my %calcs_to_run = $self->get_valid_calculations_to_run;
 
     my %results;        #  stores the results
     foreach my $calc ( keys %calcs_to_run ) {
@@ -1631,32 +1628,34 @@ sub get_results_from_pre_calc_global {
 
 sub run_precalc_globals {
     my $self = shift;
-    my %args = @_;
 
-    my $results = $self->run_dependencies( %args, type => 'pre_calc_global', );
+    my $results = $self->run_dependencies( @_, type => 'pre_calc_global', );
 
     return wantarray ? %$results : $results;
 }
 
 sub run_precalc_locals {
     my $self = shift;
-    my %args = @_;
 
-    return $self->run_dependencies( %args, type => 'pre_calc', );
+    return $self->run_dependencies( @_, type => 'pre_calc', );
 }
 
 sub run_postcalc_locals {
     my $self = shift;
-    my %args = @_;
 
-    return $self->run_dependencies( %args, type => 'post_calc', );
+    #  Most cases do not have local post calcs so we can save some time,
+    #  especially when building pairwise matrices.
+    #  Should perhaps be a method with caching - has_post_calc_locals
+    my $validated_calcs = $self->get_param('VALID_CALCULATIONS');
+    return if !$validated_calcs->{calc_lists_by_type}{post_calc_local};
+
+    return $self->run_dependencies( @_, type => 'post_calc' );
 }
 
 sub run_postcalc_globals {
     my $self = shift;
-    my %args = @_;
 
-    return $self->run_dependencies( %args, type => 'post_calc_global', );
+    return $self->run_dependencies( @_, type => 'post_calc_global' );
 }
 
 sub set_pairwise_mode {

diff --git a/lib/Biodiverse/Indices/Indices.pm b/lib/Biodiverse/Indices/Indices.pm
@@ -1627,18 +1627,7 @@ sub calc_abc {  #  wrapper for _calc_abc - use the other wrappers for actual GUI
 
     delete @args{qw/count_samples count_labels}/};
 
-    return $self->_calc_abc(%args)
-        if is_hashref($args{element_list1})
-            || @{$args{element_list1} // []} != 1
-            || defined(
-                   $args{element_list2}
-                // $args{label_hash1}
-                // $args{label_hash2}
-                // $args{label_list1}
-                // $args{label_list2}
-        );
-
-    return $self->_calc_abc_one_element(%args);
+    return $self->_calc_abc_dispatcher(%args);
 }
 
 sub get_metadata_calc_abc2 {
@@ -1655,22 +1644,11 @@ sub get_metadata_calc_abc2 {
     return $metadata_class->new(\%metadata);
 }
 
+#  run calc_abc, but keep a track of the label counts across groups
 sub calc_abc2 {
-    #  run calc_abc, but keep a track of the label counts across groups
-    my ($self, %args) = @_;
-
-    return $self->_calc_abc(%args, count_labels => 1)
-        if is_hashref($args{element_list1})
-            || @{$args{element_list1} // []} != 1
-            || defined(
-                   $args{element_list2}
-                // $args{label_hash1}
-                // $args{label_hash2}
-                // $args{label_list1}
-                // $args{label_list2}
-        );
+    my $self = shift;
 
-    return $self->_calc_abc_one_element(%args, count_labels => 1);
+    return $self->_calc_abc_dispatcher(@_, count_labels => 1);
 }
 
 sub get_metadata_calc_abc3 {
@@ -1690,20 +1668,35 @@ sub get_metadata_calc_abc3 {
 
 #  run calc_abc, but keep a track of the label counts and samples across groups
 sub calc_abc3 {
+    my $self = shift;
+
+    return $self->_calc_abc_dispatcher(@_, count_samples => 1);
+}
+
+#  keep a lot of logic in one place
+sub _calc_abc_dispatcher {
     my ($self, %args) = @_;
 
-    return $self->_calc_abc(%args, count_samples => 1)
+    my $have_lb_lists = defined (
+           $args{label_hash1}
+        // $args{label_hash2}
+        // $args{label_list1}
+        // $args{label_list2}
+    );
+
+    return $self->_calc_abc_pairwise_mode(%args)
+        if $self->get_pairwise_mode
+            && @{$args{element_list1} // []} == 1
+            && @{$args{element_list2} // []} == 1
+            && !$have_lb_lists;
+
+    return $self->_calc_abc(%args)
         if is_hashref($args{element_list1})
             || @{$args{element_list1} // []} != 1
-            || defined(
-                   $args{element_list2}
-                // $args{label_hash1}
-                // $args{label_hash2}
-                // $args{label_list1}
-                // $args{label_list2}
-        );
+            || defined $args{element_list2}
+            || $have_lb_lists;
 
-    return $self->_calc_abc_one_element(%args, count_samples => 1);
+    return $self->_calc_abc_one_element(%args);
 }
 
 #  A simplified version of _calc_abc for a single element.
@@ -1746,6 +1739,92 @@ sub _calc_abc_one_element {
     return wantarray ? %results : \%results;
 }
 
+#  If we are in pairwise mode and only processing two elements
+#  then we can cache some of the results.
+#  Assumes only one of each of element1 and element2 passed.
+sub _calc_abc_pairwise_mode {
+    my ($self, %args) = @_;
+
+    my $element1 = $args{element_list1}[0];
+    my $element2 = $args{element_list2}[0];
+
+    my $count_samples = $args{count_samples};
+    my $count_labels  = !$count_samples && $args{count_labels};
+
+    my (%label_hash1, %label_hash2);
+    my $cache = $self->get_cached_value_dor_set_default_href (
+        '_calc_abc_pairwise_mode_' . ($count_labels ? 2 : $count_samples ? 3 : 1)
+    );
+
+    if (!$cache->{$element1}) {
+        \my %labels = $self->get_basedata_ref->get_labels_in_group_as_hash_aa($element1);
+        if ($count_samples) {
+            %label_hash1 = %labels;
+        }
+        else {
+            @label_hash1{keys %labels} = (1) x keys %labels;
+        }
+        $cache->{$element1} = \%label_hash1;
+    }
+    else {
+        \%label_hash1 = $cache->{$element1};
+    }
+
+    if (!$cache->{$element2}) {
+        \my %labels = $self->get_basedata_ref->get_labels_in_group_as_hash_aa($element2);
+        if ($count_samples) {
+            %label_hash2 = %labels;
+        }
+        else {
+            @label_hash2{keys %labels} = (1) x keys %labels;
+        }
+        $cache->{$element2} = \%label_hash2;
+    }
+    else {
+        \%label_hash2 = $cache->{$element2};
+    }
+
+    #  now merge
+    my %label_list_master;
+    if ($count_samples || $count_labels) {
+        %label_list_master = %label_hash1;
+        pairmap {$label_list_master{$a} += $b} %label_hash2;
+    }
+    else {
+        %label_list_master = (%label_hash1, %label_hash2);
+    }
+
+    my $abc = scalar keys %label_list_master;
+
+    #  a, b and c are simply differences of the lists
+    #  doubled letters are to avoid clashes with globals $a and $b
+    my $aa
+        = (scalar keys %label_hash1)
+        + (scalar keys %label_hash2)
+        - $abc;
+    my $bb = $abc - (scalar keys %label_hash2);
+    my $cc = $abc - (scalar keys %label_hash1);
+
+    my %results = (
+        A   => $aa,
+        B   => $bb,
+        C   => $cc,
+        ABC => $abc,
+
+        label_hash_all    => \%label_list_master,
+        label_hash1       => \%label_hash1,
+        label_hash2       => \%label_hash2,
+        element_list1     => {$element1 => 1},
+        element_list2     => {$element2 => 1},
+        element_list_all  => [$element1, $element2],
+        element_count1    => 1,
+        element_count2    => 1,
+        element_count_all => 2,
+    );
+
+    return wantarray ? %results : \%results;
+}
+
 sub _calc_abc {  #  required by all the other indices, as it gets the labels in the elements
     my $self = shift;
     my %args = @_;
@@ -1894,7 +1973,7 @@ sub _calc_abc {  #  required by all the other indices, as it gets the labels in
     }
 
     #  set the counts to one if using plain old abc, as the elements section doesn't obey it properly
-    if (0 || !($count_labels || $count_samples)) {
+    if (0 && !($count_labels || $count_samples)) {
         @label_list_master{keys %label_list_master} = (1) x scalar keys %label_list_master;
         @{$label_list{1}}{keys %{$label_list{1}}}   = (1) x scalar keys %{$label_list{1}};
         @{$label_list{2}}{keys %{$label_list{2}}}   = (1) x scalar keys %{$label_list{2}};

diff --git a/lib/Biodiverse/Indices/RWTurnover.pm b/lib/Biodiverse/Indices/RWTurnover.pm
@@ -9,6 +9,7 @@ no warnings 'experimental::refaliasing';
 
 use Carp;
 use List::Util qw /sum reduce/;
+use Ref::Util qw /is_hashref/;
 
 our $VERSION = '4.99_002';
 
@@ -21,7 +22,7 @@ sub get_metadata_calc_rw_turnover {
         name            => 'Range weighted Sorenson',
         reference       => 'Laffan et al. (2016) https://doi.org/10.1111/2041-210X.12513',
         type            => 'Taxonomic Dissimilarity and Comparison',
-        pre_calc        => [qw /calc_endemism_whole_lists calc_abc/],
+        pre_calc        => [qw /calc_endemism_whole_lists calc_abc2/],
         uses_nbr_lists  => 2,  #  how many lists it must have
         distribution    => 'nonnegative',  # for A, B and C
         indices         => {
@@ -65,12 +66,20 @@ sub calc_rw_turnover {
         #  or inverse of ranges
         my $cache
             = $self->get_cached_value_dor_set_default_href ('_calc_phylo_rwt_pairwise_branch_sum_cache');
+        #  use postfix idiom?
+        #  ideally we would only be passed arrays, but see issue #919
+        my $key1 = is_hashref ($args{element_list1})
+            ? ((keys %{$args{element_list1}})[0])
+            : (${$args{element_list1}}[0]);
+        my $key2 = is_hashref ($args{element_list2})
+            ? ((keys %{$args{element_list2}})[0])
+            : (${$args{element_list2} //[]}[0]);
         #  Could use a reduce call to collapse the "sum map {} @list" idiom,
         #  thus avoiding a list generation.  These are only run once per group,
         #  though, so it might not matter.
-        my $sum_i = $cache->{(keys %{$args{element_list1}})[0]}  # use postfix deref?
+        my $sum_i = $cache->{$key1}
             //= (sum map {1 / $_} @ranges{keys %list1}) // 0;
-        my $sum_j = $cache->{(keys %{$args{element_list2}})[0]}
+        my $sum_j = $cache->{$key2}
             //= (sum map {1 / $_} @ranges{keys %list2}) // 0;
         #  save some looping, mainly when there are large differences in key counts
         if (keys %list1 <= keys %list2) {
@@ -121,7 +130,7 @@ sub get_metadata_calc_phylo_rw_turnover {
         name            => 'Phylo Range weighted Turnover',
         reference       => 'Laffan et al. (2016) https://doi.org/10.1111/2041-210X.12513',
         type            => 'Phylogenetic Turnover',
-        pre_calc        => [qw /calc_abc _calc_pe_lists_per_element_set/],
+        pre_calc        => [qw /_calc_pe_lists_per_element_set/],
         # pre_calc_global => [qw /
         #     get_node_range_hash_as_lists
         #     get_trimmed_tree_parent_name_hash
@@ -164,9 +173,20 @@ sub calc_phylo_rw_turnover {
         #  simplify the calcs as we only need to find $aa
         my $cache
             = $self->get_cached_value_dor_set_default_href ('_calc_phylo_rwt_pairwise_branch_sum_cache');
-        my $sum_i = $cache->{(keys %{$args{element_list1}})[0]}  # use postfix deref?
+        #  use postfix idiom?
+        #  ideally we would only be passed arrays, but see issue #919
+        my $key1 = is_hashref ($args{element_list1})
+            ? ((keys %{$args{element_list1}})[0])
+            : (${$args{element_list1}}[0]);
+        my $key2 = is_hashref ($args{element_list2})
+            ? ((keys %{$args{element_list2}})[0])
+            : (${$args{element_list2} //[]}[0]);
+        #  Could use a reduce call to collapse the "sum map {} @list" idiom,
+        #  thus avoiding a list generation.  These are only run once per group,
+        #  though, so it might not matter.
+        my $sum_i = $cache->{$key1}
             //= (sum values %list1) // 0;
-        my $sum_j = $cache->{(keys %{$args{element_list2}})[0]}
+        my $sum_j = $cache->{$key2}
             //= (sum values %list2) // 0;
         #  save some looping, mainly when there are large differences in key counts
         if (keys %list1 <= keys %list2) {