Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Index and other optimisations #922

Merged
merged 24 commits into from
Feb 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
11a9bbe
Indices: optimise _calc_endemism_absolute
shawnlaffan Feb 23, 2024
501bd7b
Minor optimisations in _calc_endemism_hier_part
shawnlaffan Feb 23, 2024
f9bcf98
TreeNode.pm: use a linear scan for get_hash_lists_below
shawnlaffan Feb 23, 2024
1e7deb8
Add an array args version of set_basedata_ref
shawnlaffan Feb 23, 2024
4dec15f
Trees: clone_without_caches also clears parameters
shawnlaffan Feb 23, 2024
965c1ad
Common::get_zscore_from_comp_results - avoid a lot of grepping
shawnlaffan Feb 23, 2024
6ee1109
optimise Tree::convert_comparisons_to_significances
shawnlaffan Feb 23, 2024
80bc505
optimise Spatial::convert_comparisons_to_significances
shawnlaffan Feb 23, 2024
81f38b3
Indices: add a hierarchical mode flag
shawnlaffan Feb 23, 2024
1c6a509
Indices: Support hierarchical calculations
shawnlaffan Feb 23, 2024
5d6a8b4
Indices: calc_labels_not_on_tree: return early if nothing to work with
shawnlaffan Feb 23, 2024
67f655e
Indices: add a hierarchical variant of get_path_lengths_to_root_node
shawnlaffan Feb 23, 2024
6d9c553
Indices: _calc_endemism_hier_part: avoid some method calls
shawnlaffan Feb 23, 2024
8a69ac2
Indices: _calc_endemism_hier_part: refactor some variables
shawnlaffan Feb 23, 2024
a37dbb3
delete commented code
shawnlaffan Feb 23, 2024
9b1846d
Indices: refactor hierarchical node details
shawnlaffan Feb 24, 2024
744c430
Squeeze a little more performance out of compare_lists_by_item
shawnlaffan Feb 24, 2024
7586528
TreeNode::add_to_lists: optimise
shawnlaffan Feb 24, 2024
0c34d12
Cluster spatial calcs: add lists by ref
shawnlaffan Feb 24, 2024
4168546
compare_lists_by_item: lift a var outside the loop
shawnlaffan Feb 24, 2024
b2ee3ed
Indices: cache the current results from each sub
shawnlaffan Feb 24, 2024
e173a47
Indices: reuse whole and central endemism results when appropriate
shawnlaffan Feb 24, 2024
16a4f86
formatting
shawnlaffan Feb 24, 2024
62fd8a1
simplify by using List::Util::any
shawnlaffan Feb 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions lib/Biodiverse/Cluster.pm
Original file line number Diff line number Diff line change
Expand Up @@ -2777,7 +2777,9 @@ sub sp_calc {
);

# drop out if we have none to do
return if $indices_object->get_valid_calculation_count == 0;
return if $indices_object->get_valid_calculation_count == 0;

$indices_object->set_hierarchical_mode(!$args{no_hierarchical_mode});

delete $args{calculations}; # saves passing it onwards when we call the calculations
delete $args{analyses}; # for backwards compat
Expand Down Expand Up @@ -2809,16 +2811,26 @@ sub sp_calc {
$count / $to_do,
);

# needs a better name
my $current_node_details = {
name => $node->get_name,
child_names => [map {$_->get_name} $node->get_children],
};

my %sp_calc_values = $indices_object->run_calculations(
%args,
element_list1 => [keys %{$node->get_terminal_elements}]
element_list1 => [ keys %{$node->get_terminal_elements} ],
current_node_details => $current_node_details,
);

foreach my $key (keys %sp_calc_values) {
if (is_arrayref($sp_calc_values{$key})
|| is_hashref($sp_calc_values{$key})) {

$node->add_to_lists ($key => $sp_calc_values{$key});
$node->add_to_lists (
$key => $sp_calc_values{$key},
use_ref => 1,
);
delete $sp_calc_values{$key};
}
}
Expand All @@ -2830,6 +2842,8 @@ sub sp_calc {

$self->delete_cached_metadata;

$indices_object->set_hierarchical_mode(0);

return 1;
}

Expand Down
74 changes: 40 additions & 34 deletions lib/Biodiverse/Common.pm
Original file line number Diff line number Diff line change
Expand Up @@ -266,23 +266,26 @@ sub load_yaml_file {
croak 'Loading from a YAML file is no longer supported';
}

# Orig should never have used a hash. Oh well.
sub set_basedata_ref_aa {
my ($self, $ref) = @_;
$self->set_basedata_ref(BASEDATA_REF => $ref);
}

sub set_basedata_ref {
my $self = shift;
my %args = @_;

$self->set_param (BASEDATA_REF => $args{BASEDATA_REF});
$self->weaken_basedata_ref;
$self->weaken_basedata_ref if defined $args{BASEDATA_REF};

return;
}

sub get_basedata_ref {
my $self = shift;

my $bd = $self->get_param ('BASEDATA_REF')
|| Biodiverse::MissingBasedataRef->throw (
message => 'Parameter BASEDATA_REF not set'
);
my $bd = $self->get_param ('BASEDATA_REF');

return $bd;
}
Expand Down Expand Up @@ -2415,18 +2418,19 @@ sub compare_lists_by_item {
\my %base_ref = $args{base_list_ref};
\my %comp_ref = $args{comp_list_ref};
\my %results = $args{results_list_ref};
my ($diff, $increment);

COMP_BY_ITEM:
foreach my $index (keys %base_ref) {

next COMP_BY_ITEM
if !(defined $base_ref{$index} && defined $comp_ref{$index});
if !(defined $comp_ref{$index} && defined $base_ref{$index});

# compare at 10 decimal place precision
# this also allows for serialisation which
# rounds the numbers to 15 decimals
my $diff = $base_ref{$index} - $comp_ref{$index};
my $increment = $diff > DEFAULT_PRECISION_SMALL ? 1 : 0;
$diff = $base_ref{$index} - $comp_ref{$index};
$increment = $diff > DEFAULT_PRECISION_SMALL ? 1 : 0;

# for debug, but leave just in case
#carp "$element, $op\n$comp\n$base " . ($comp - $base) if $increment;
Expand All @@ -2438,18 +2442,18 @@ sub compare_lists_by_item {
# SUMX is the sum of compared values
# SUMXX is the sum of squared compared values
# The latter two are used in z-score calcs
$results{"C_$index"} += $increment;
$results{"Q_$index"} ++;
$results{"P_$index"} = $results{"C_$index"}
/ $results{"Q_$index"};
# obfuscated to squeeze as much speed as we can
# $results{"C_$index"} += $increment;
# $results{"Q_$index"} ++;
$results{"P_$index"} = ($results{"C_$index"} += $increment)
/ (++$results{"Q_$index"});
# use original vals for sums
$results{"SUMX_$index"} += $comp_ref{$index};
$results{"SUMXX_$index"} += ($comp_ref{$index}**2);
$results{"SUMX_$index"} += $comp_ref{$index};
$results{"SUMXX_$index"} += ($comp_ref{$index}**2);

# track the number of ties
if (abs($diff) <= DEFAULT_PRECISION_SMALL) {
$results{"T_$index"} ++;
}
$results{"T_$index"} ++
if (abs($diff) <= DEFAULT_PRECISION_SMALL);
}

return;
Expand Down Expand Up @@ -2513,35 +2517,33 @@ sub get_zscore_from_comp_results {
my %args = @_;

# could alias this
my $comp_list_ref = $args{comp_list_ref}
\my %comp_list_ref = $args{comp_list_ref}
// croak "comp_list_ref argument not specified\n";
# need the observed values
my $base_list_ref = $args{base_list_ref}
\my %base_list_ref = $args{base_list_ref}
// croak "base_list_ref argument not specified\n";

my $results_list_ref = $args{results_list_ref} // {};

KEY:
foreach my $q_key (grep {$_ =~ /^Q_/} keys %$comp_list_ref) {
my $index_name = substr $q_key, 2;
foreach my $index_name (keys %base_list_ref) {

my $n = $comp_list_ref->{$q_key};
my $n = $comp_list_ref{'Q_' . $index_name};
next KEY if !$n;

my $x_key = 'SUMX_' . $index_name;
my $xx_key = 'SUMXX_' . $index_name;

# sum of x vals and x vals squared
my $sumx = $comp_list_ref->{$x_key};
my $sumxx = $comp_list_ref->{$xx_key};
my $sumx = $comp_list_ref{$x_key};
my $sumxx = $comp_list_ref{$xx_key};

my $z_key = $index_name;
# n better be large, as we do not use n-1
my $variance = max (0, ($sumxx - ($sumx**2) / $n) / $n);
my $obs = $base_list_ref->{$index_name};
$results_list_ref->{$z_key}
= $variance
? ($obs - ($sumx / $n)) / sqrt ($variance)
my $variance = ($sumxx - ($sumx**2) / $n) / $n;

$results_list_ref->{$index_name}
= $variance > 0
? ($base_list_ref{$index_name} - ($sumx / $n)) / sqrt ($variance)
: 0;
}

Expand Down Expand Up @@ -2666,15 +2668,19 @@ sub get_sig_rank_from_comp_results {
my $self = shift;
my %args = @_;

# could alias this
\my %comp_list_ref = $args{comp_list_ref}
// croak "comp_list_ref argument not specified\n";

\my %results_list_ref = $args{results_list_ref} // {};

foreach my $c_key (grep {$_ =~ /^C_/} keys %comp_list_ref) {

my $index_name = substr $c_key, 2;
# base_list_ref will usually be shorter so fewer comparisons will be needed
my @keys = $args{base_list_ref}
? grep {exists $comp_list_ref{'C_' . $_}} keys %{$args{base_list_ref}}
: map {substr $_, 2} grep {$_ =~ /^C_/} keys %comp_list_ref;

foreach my $index_name (@keys) {

my $c_key = 'C_' . $index_name;

if (!defined $comp_list_ref{$c_key}) {
$results_list_ref{$index_name} = undef;
Expand Down
47 changes: 34 additions & 13 deletions lib/Biodiverse/Indices.pm
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use warnings;
#use Data::Dumper;
use Scalar::Util qw /blessed weaken/;
use List::MoreUtils qw /uniq/;
use List::Util qw /sum/;
use List::Util qw /sum any/;
use English ( -no_match_vars );
use Ref::Util qw { :all };
use JSON::MaybeXS;
Expand Down Expand Up @@ -794,18 +794,10 @@ sub parse_dependencies_for_calc {
$self->_convert_to_array( input => $required_args );

foreach my $required_arg ( sort @$reqd_args_a ) {
my $re = qr /^($required_arg)$/
; # match is used in the grep? Was used in now-removed code.
my $is_defined;
CALC_ARG:
foreach
my $calc_arg ( sort grep { $_ =~ $re } keys %$calc_args )
{
if ( defined $calc_args->{$calc_arg} ) {
$is_defined++;
last CALC_ARG;
}
}
my $re = qr /^($required_arg)$/;
my $is_defined
= any { $_ =~ $re && defined $calc_args->{$_}}
sort keys %$calc_args;

if ( !$is_defined ) {
Biodiverse::Indices::MissingRequiredArguments->throw(
Expand Down Expand Up @@ -1533,11 +1525,16 @@ sub run_dependencies {
my $tmp = $self->get_param('AS_RESULTS_FROM_GLOBAL') || {};
my %as_results_from_global = %$tmp; # make a copy

state $cache_name_local_results = 'AS_RESULTS_FROM_LOCAL';

# Now we run the calculations at this level.
# We also keep track of what has been run
# to avoid repetition through multiple dependencies.
my %results;
my %as_results_from;
# make sure this is new each iteration
$self->set_cached_value ($cache_name_local_results => \%as_results_from);

foreach my $calc (@$calc_list) {
my $calc_results;

Expand Down Expand Up @@ -1574,6 +1571,9 @@ sub run_dependencies {
$results{$calc} = $calc_results;
}

# We refresh each call above, but this ensures last one is cleaned up.
$self->delete_cached_value($cache_name_local_results);

if ( $type eq 'pre_calc_global' ) {
$self->set_param( AS_RESULTS_FROM_GLOBAL => \%as_results_from_global );
}
Expand Down Expand Up @@ -1663,6 +1663,9 @@ sub set_pairwise_mode {

$self->{pairwise_mode} = $mode;

croak "Cannot have both pairwise and hierarchical modes on at the same time"
if $mode && $self->get_hierarchical_mode;

return $mode;
}

Expand All @@ -1671,6 +1674,24 @@ sub get_pairwise_mode {
$_[0]->{pairwise_mode};
}

sub set_hierarchical_mode {
my ( $self, $mode ) = @_;

$self->{hierarchical_mode} = $mode;

croak "Cannot have both pairwise and hierarchical modes on at the same time"
if $mode && $self->get_pairwise_mode;

return $mode;
}

# potential hot path so optimise to avoid arg handling
sub get_hierarchical_mode {
$_[0]->{hierarchical_mode};
}



1;

__END__
Expand Down
Loading
Loading