Skip to content

Commit

Permalink
Merge pull request #920 from shawnlaffan/matrices_2024
Browse files Browse the repository at this point in the history
Faster cluster matrix creation

Initially focused on the matrix classes but then extended to the index calculations.
  • Loading branch information
shawnlaffan authored Feb 22, 2024
2 parents da1e625 + efd90ff commit 865f15d
Show file tree
Hide file tree
Showing 8 changed files with 296 additions and 170 deletions.
23 changes: 10 additions & 13 deletions lib/Biodiverse/Cluster.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1070,7 +1070,6 @@ sub build_matrix_elements {
$n++;

{
no autovivification; # save a bit of memory
next ELEMENT2 if $already_calculated{$element2};
next ELEMENT2 if $element1 eq $element2;

Expand All @@ -1087,24 +1086,22 @@ sub build_matrix_elements {
# If we already have this value then get it and assign it.
# Some of these contortions appear to be due to an old approach
# where all matrices were built in one loop.
# Could probably drop out sooner now.
my $exists = 0;
my $iter = 0;
my %not_exists_iter;
my $value;

# Could probably drop out sooner now.
if (!$ofh) {
my $iter = 0;
my $exists = 0;
my %not_exists_iter;
my $value;

MX:
foreach my $mx (@$matrices) { # second is shadow matrix, if given
#last MX if $ofh;

$value = $mx->get_defined_value_aa ($element1, $element2);
if (defined $value) { # don't redo them...
$exists ++;
}
else {
$not_exists_iter{$iter} = 1;
}
# don't redo them...
defined $value
? ($exists++)
: ($not_exists_iter{$iter} = 1);
$iter ++;
}

Expand Down
45 changes: 22 additions & 23 deletions lib/Biodiverse/Indices.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1547,23 +1547,19 @@ sub run_dependencies {
}
else {
my %dep_results;
if ( exists $dep_list->{$calc} ) {
my $deps = $dep_list->{$calc} || [];
if (my $deps = $dep_list->{$calc} ) {
LOCAL_DEP:
foreach my $dep (@$deps) {
my $dep_res =
exists $as_results_from{$dep}
? $as_results_from{$dep}
: {};
my $dep_res = $as_results_from{$dep}
|| next LOCAL_DEP;
@dep_results{ keys %$dep_res } = values %$dep_res;
}
}
if ( exists $dep_list_global->{$calc} ) {
my $deps = $dep_list_global->{$calc} || [];
if (my $deps = $dep_list_global->{$calc}) {
GLOBAL_DEP:
foreach my $dep (@$deps) {
my $dep_res =
exists $as_results_from_global{$dep}
? $as_results_from_global{$dep}
: {};
my $dep_res = $as_results_from_global{$dep}
|| next GLOBAL_DEP;
@dep_results{ keys %$dep_res } = values %$dep_res;
}
}
Expand All @@ -1589,12 +1585,13 @@ sub run_calculations {
my $self = shift;
my %args = @_;

$self
->reset_results; # clear any previous local results - poss redundant now
# clear any previous local results - poss redundant now
$self->reset_results;

my $pre_calc_local_results = $self->run_precalc_locals(%args);

my %calcs_to_run = $self->get_valid_calculations_to_run;
use experimental qw/refaliasing/;
\my %calcs_to_run = $self->get_valid_calculations_to_run;

my %results; # stores the results
foreach my $calc ( keys %calcs_to_run ) {
Expand Down Expand Up @@ -1631,32 +1628,34 @@ sub get_results_from_pre_calc_global {

sub run_precalc_globals {
my $self = shift;
my %args = @_;

my $results = $self->run_dependencies( %args, type => 'pre_calc_global', );
my $results = $self->run_dependencies( @_, type => 'pre_calc_global', );

return wantarray ? %$results : $results;
}

sub run_precalc_locals {
my $self = shift;
my %args = @_;

return $self->run_dependencies( %args, type => 'pre_calc', );
return $self->run_dependencies( @_, type => 'pre_calc', );
}

sub run_postcalc_locals {
my $self = shift;
my %args = @_;

return $self->run_dependencies( %args, type => 'post_calc', );
# Most cases do not have local post calcs so we can save some time,
# especially when building pairwise matrices.
# Should perhaps be a method with caching - has_post_calc_locals
my $validated_calcs = $self->get_param('VALID_CALCULATIONS');
return if !$validated_calcs->{calc_lists_by_type}{post_calc_local};

return $self->run_dependencies( @_, type => 'post_calc' );
}

sub run_postcalc_globals {
my $self = shift;
my %args = @_;

return $self->run_dependencies( %args, type => 'post_calc_global', );
return $self->run_dependencies( @_, type => 'post_calc_global' );
}

sub set_pairwise_mode {
Expand Down
151 changes: 115 additions & 36 deletions lib/Biodiverse/Indices/Indices.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1627,18 +1627,7 @@ sub calc_abc { # wrapper for _calc_abc - use the other wrappers for actual GUI

delete @args{qw/count_samples count_labels}/};

return $self->_calc_abc(%args)
if is_hashref($args{element_list1})
|| @{$args{element_list1} // []} != 1
|| defined(
$args{element_list2}
// $args{label_hash1}
// $args{label_hash2}
// $args{label_list1}
// $args{label_list2}
);

return $self->_calc_abc_one_element(%args);
return $self->_calc_abc_dispatcher(%args);
}

sub get_metadata_calc_abc2 {
Expand All @@ -1655,22 +1644,11 @@ sub get_metadata_calc_abc2 {
return $metadata_class->new(\%metadata);
}

# run calc_abc, but keep a track of the label counts across groups
sub calc_abc2 {
# run calc_abc, but keep a track of the label counts across groups
my ($self, %args) = @_;

return $self->_calc_abc(%args, count_labels => 1)
if is_hashref($args{element_list1})
|| @{$args{element_list1} // []} != 1
|| defined(
$args{element_list2}
// $args{label_hash1}
// $args{label_hash2}
// $args{label_list1}
// $args{label_list2}
);
my $self = shift;

return $self->_calc_abc_one_element(%args, count_labels => 1);
return $self->_calc_abc_dispatcher(@_, count_labels => 1);
}

sub get_metadata_calc_abc3 {
Expand All @@ -1690,20 +1668,35 @@ sub get_metadata_calc_abc3 {

# run calc_abc, but keep a track of the label counts and samples across groups
sub calc_abc3 {
my $self = shift;

return $self->_calc_abc_dispatcher(@_, count_samples => 1);
}

# keep a lot of logic in one place
sub _calc_abc_dispatcher {
my ($self, %args) = @_;

return $self->_calc_abc(%args, count_samples => 1)
my $have_lb_lists = defined (
$args{label_hash1}
// $args{label_hash2}
// $args{label_list1}
// $args{label_list2}
);

return $self->_calc_abc_pairwise_mode(%args)
if $self->get_pairwise_mode
&& @{$args{element_list1} // []} == 1
&& @{$args{element_list2} // []} == 1
&& !$have_lb_lists;

return $self->_calc_abc(%args)
if is_hashref($args{element_list1})
|| @{$args{element_list1} // []} != 1
|| defined(
$args{element_list2}
// $args{label_hash1}
// $args{label_hash2}
// $args{label_list1}
// $args{label_list2}
);
|| defined $args{element_list2}
|| $have_lb_lists;

return $self->_calc_abc_one_element(%args, count_samples => 1);
return $self->_calc_abc_one_element(%args);
}

# A simplified version of _calc_abc for a single element.
Expand Down Expand Up @@ -1746,6 +1739,92 @@ sub _calc_abc_one_element {
return wantarray ? %results : \%results;
}

# If we are in pairwise mode and only processing two elements
# then we can cache some of the results.
# Assumes only one of each of element1 and element2 passed.
sub _calc_abc_pairwise_mode {
my ($self, %args) = @_;

my $element1 = $args{element_list1}[0];
my $element2 = $args{element_list2}[0];

my $count_samples = $args{count_samples};
my $count_labels = !$count_samples && $args{count_labels};

my (%label_hash1, %label_hash2);
my $cache = $self->get_cached_value_dor_set_default_href (
'_calc_abc_pairwise_mode_' . ($count_labels ? 2 : $count_samples ? 3 : 1)
);

if (!$cache->{$element1}) {
\my %labels = $self->get_basedata_ref->get_labels_in_group_as_hash_aa($element1);
if ($count_samples) {
%label_hash1 = %labels;
}
else {
@label_hash1{keys %labels} = (1) x keys %labels;
}
$cache->{$element1} = \%label_hash1;
}
else {
\%label_hash1 = $cache->{$element1};
}

if (!$cache->{$element2}) {
\my %labels = $self->get_basedata_ref->get_labels_in_group_as_hash_aa($element2);
if ($count_samples) {
%label_hash2 = %labels;
}
else {
@label_hash2{keys %labels} = (1) x keys %labels;
}
$cache->{$element2} = \%label_hash2;
}
else {
\%label_hash2 = $cache->{$element2};
}

# now merge
my %label_list_master;
if ($count_samples || $count_labels) {
%label_list_master = %label_hash1;
pairmap {$label_list_master{$a} += $b} %label_hash2;
}
else {
%label_list_master = (%label_hash1, %label_hash2);
}

my $abc = scalar keys %label_list_master;

# a, b and c are simply differences of the lists
# doubled letters are to avoid clashes with globals $a and $b
my $aa
= (scalar keys %label_hash1)
+ (scalar keys %label_hash2)
- $abc;
my $bb = $abc - (scalar keys %label_hash2);
my $cc = $abc - (scalar keys %label_hash1);

my %results = (
A => $aa,
B => $bb,
C => $cc,
ABC => $abc,

label_hash_all => \%label_list_master,
label_hash1 => \%label_hash1,
label_hash2 => \%label_hash2,
element_list1 => {$element1 => 1},
element_list2 => {$element2 => 1},
element_list_all => [$element1, $element2],
element_count1 => 1,
element_count2 => 1,
element_count_all => 2,
);

return wantarray ? %results : \%results;
}

sub _calc_abc { # required by all the other indices, as it gets the labels in the elements
my $self = shift;
my %args = @_;
Expand Down Expand Up @@ -1894,7 +1973,7 @@ sub _calc_abc { # required by all the other indices, as it gets the labels in
}

# set the counts to one if using plain old abc, as the elements section doesn't obey it properly
if (0 || !($count_labels || $count_samples)) {
if (0 && !($count_labels || $count_samples)) {
@label_list_master{keys %label_list_master} = (1) x scalar keys %label_list_master;
@{$label_list{1}}{keys %{$label_list{1}}} = (1) x scalar keys %{$label_list{1}};
@{$label_list{2}}{keys %{$label_list{2}}} = (1) x scalar keys %{$label_list{2}};
Expand Down
32 changes: 26 additions & 6 deletions lib/Biodiverse/Indices/RWTurnover.pm
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ no warnings 'experimental::refaliasing';

use Carp;
use List::Util qw /sum reduce/;
use Ref::Util qw /is_hashref/;

our $VERSION = '4.99_002';

Expand All @@ -21,7 +22,7 @@ sub get_metadata_calc_rw_turnover {
name => 'Range weighted Sorenson',
reference => 'Laffan et al. (2016) https://doi.org/10.1111/2041-210X.12513',
type => 'Taxonomic Dissimilarity and Comparison',
pre_calc => [qw /calc_endemism_whole_lists calc_abc/],
pre_calc => [qw /calc_endemism_whole_lists calc_abc2/],
uses_nbr_lists => 2, # how many lists it must have
distribution => 'nonnegative', # for A, B and C
indices => {
Expand Down Expand Up @@ -65,12 +66,20 @@ sub calc_rw_turnover {
# or inverse of ranges
my $cache
= $self->get_cached_value_dor_set_default_href ('_calc_phylo_rwt_pairwise_branch_sum_cache');
# use postfix idiom?
# ideally we would only be passed arrays, but see issue #919
my $key1 = is_hashref ($args{element_list1})
? ((keys %{$args{element_list1}})[0])
: (${$args{element_list1}}[0]);
my $key2 = is_hashref ($args{element_list2})
? ((keys %{$args{element_list2}})[0])
: (${$args{element_list2} //[]}[0]);
# Could use a reduce call to collapse the "sum map {} @list" idiom,
# thus avoiding a list generation. These are only run once per group,
# though, so it might not matter.
my $sum_i = $cache->{(keys %{$args{element_list1}})[0]} # use postfix deref?
my $sum_i = $cache->{$key1}
//= (sum map {1 / $_} @ranges{keys %list1}) // 0;
my $sum_j = $cache->{(keys %{$args{element_list2}})[0]}
my $sum_j = $cache->{$key2}
//= (sum map {1 / $_} @ranges{keys %list2}) // 0;
# save some looping, mainly when there are large differences in key counts
if (keys %list1 <= keys %list2) {
Expand Down Expand Up @@ -121,7 +130,7 @@ sub get_metadata_calc_phylo_rw_turnover {
name => 'Phylo Range weighted Turnover',
reference => 'Laffan et al. (2016) https://doi.org/10.1111/2041-210X.12513',
type => 'Phylogenetic Turnover',
pre_calc => [qw /calc_abc _calc_pe_lists_per_element_set/],
pre_calc => [qw /_calc_pe_lists_per_element_set/],
# pre_calc_global => [qw /
# get_node_range_hash_as_lists
# get_trimmed_tree_parent_name_hash
Expand Down Expand Up @@ -164,9 +173,20 @@ sub calc_phylo_rw_turnover {
# simplify the calcs as we only need to find $aa
my $cache
= $self->get_cached_value_dor_set_default_href ('_calc_phylo_rwt_pairwise_branch_sum_cache');
my $sum_i = $cache->{(keys %{$args{element_list1}})[0]} # use postfix deref?
# use postfix idiom?
# ideally we would only be passed arrays, but see issue #919
my $key1 = is_hashref ($args{element_list1})
? ((keys %{$args{element_list1}})[0])
: (${$args{element_list1}}[0]);
my $key2 = is_hashref ($args{element_list2})
? ((keys %{$args{element_list2}})[0])
: (${$args{element_list2} //[]}[0]);
# Could use a reduce call to collapse the "sum map {} @list" idiom,
# thus avoiding a list generation. These are only run once per group,
# though, so it might not matter.
my $sum_i = $cache->{$key1}
//= (sum values %list1) // 0;
my $sum_j = $cache->{(keys %{$args{element_list2}})[0]}
my $sum_j = $cache->{$key2}
//= (sum values %list2) // 0;
# save some looping, mainly when there are large differences in key counts
if (keys %list1 <= keys %list2) {
Expand Down
Loading

0 comments on commit 865f15d

Please sign in to comment.