Skip to content

Commit

Permalink
Merge pull request #917 from shawnlaffan/general_changes_20240212
Browse files Browse the repository at this point in the history
A set of general optimisations and cleanups
  • Loading branch information
shawnlaffan authored Feb 13, 2024
2 parents 61bbcde + 22a6e83 commit f9078c1
Show file tree
Hide file tree
Showing 10 changed files with 183 additions and 152 deletions.
45 changes: 31 additions & 14 deletions lib/Biodiverse/BaseData.pm
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use Carp;
#use Data::Dumper;
use POSIX qw {fmod floor};
use Scalar::Util qw /looks_like_number blessed/;
use List::Util 1.45 qw /max min sum pairs uniq/;
use List::Util 1.45 qw /max min sum pairs uniq pairmap/;
use List::MoreUtils qw /first_index/;
use Path::Tiny qw /path/;
use Geo::Converter::dms2dd qw {dms2dd};
Expand Down Expand Up @@ -1361,26 +1361,38 @@ sub add_elements_collated {

# simplified array args version for speed
sub add_elements_collated_simple_aa {
my ( $self, $gp_lb_hash, $csv_object, $allow_empty_groups ) = @_;
my ( $self, $gp_lb_hash, $csv_object, $allow_empty_groups, $transpose ) = @_;

croak "csv_object arg not passed\n"
if !$csv_object;

# blank slate so set directly
return $self->_set_elements_collated_simple_aa($gp_lb_hash, $csv_object, $allow_empty_groups)
if (!$self->get_group_count && !$self->get_label_count);
return $self->_set_elements_collated_simple_aa($gp_lb_hash, $csv_object, $allow_empty_groups, $transpose)
if !$self->get_group_count && !$self->get_label_count;

# now add the collated data
foreach my $gp_lb_pair ( pairs %$gp_lb_hash ) {
my ( $gp, $lb_hash ) = @$gp_lb_pair;
# duplicated loops to avoid conditions inside them
if (!$transpose) {
foreach my $gp_lb_pair (pairs % $gp_lb_hash) {
my ($gp, $lb_hash) = @$gp_lb_pair;

if ( $allow_empty_groups && !scalar %$lb_hash ) {
$self->add_element_simple_aa ( undef, $gp, 0, $csv_object );
if ($allow_empty_groups && !scalar %$lb_hash) {
$self->add_element_simple_aa(undef, $gp, 0, $csv_object);
}
else {
pairmap {$self->add_element_simple_aa($a, $gp, $b, $csv_object)} %$lb_hash;
}
}
else {
foreach my $lb_count_pair ( pairs %$lb_hash ) {
my ( $lb, $count ) = @$lb_count_pair;
$self->add_element_simple_aa( $lb, $gp, $count, $csv_object );
}
else {
foreach my $pair (pairs % $gp_lb_hash) {
my ($lb, $gp_hash) = @$pair;

if ($allow_empty_groups && !scalar %$gp_hash) {
$self->add_element_simple_aa($lb, undef, 0, $csv_object);
}
else {
pairmap {$self->add_element_simple_aa($lb, $a, $b, $csv_object)} %$gp_hash;
}
}
}
Expand All @@ -1391,7 +1403,7 @@ sub add_elements_collated_simple_aa {
# currently an internal sub as we might later take ownership of the input data
# using refaliasing to squeeze a bit more speed
sub _set_elements_collated_simple_aa {
my ( $self, $gp_lb_hash, $csv_object, $allow_empty_groups ) = @_;
my ( $self, $gp_lb_hash, $csv_object, $allow_empty_groups, $transpose ) = @_;

croak "csv_object arg not passed\n"
if !$csv_object;
Expand All @@ -1400,6 +1412,9 @@ sub _set_elements_collated_simple_aa {

my $groups_ref = $self->get_groups_ref;
my $labels_ref = $self->get_labels_ref;
if ($transpose) {
($groups_ref, $labels_ref) = ($labels_ref, $groups_ref);
}

# now add the collated data to the groups object
foreach \my @gp_lb_pair ( pairs %$gp_lb_hash ) {
Expand Down Expand Up @@ -1606,7 +1621,9 @@ sub transfer_element_properties {
my $to_name = $to_bd->get_param('NAME');
my $text = "Transferring $type properties from $name to $to_name";

my $progress_bar = Biodiverse::Progress->new();
my $progress_bar = Biodiverse::Progress->new(
no_gui_progress => $args{no_gui_progress},
);
my $total_to_do = $elements_ref->get_element_count;
print "[BASEDATA] Transferring properties for $total_to_do $type\n";

Expand Down
73 changes: 28 additions & 45 deletions lib/Biodiverse/BaseStruct.pm
Original file line number Diff line number Diff line change
Expand Up @@ -362,21 +362,33 @@ sub get_element_hash {
sub get_element_name_as_array_aa {
my ($self, $element, $csv_object) = @_;

# caching saves a little time for large data sets
# but needs to be shared with a "parent" object to make a difference
# e.g. a spatial object copies from a groups object
my $arr = $self->{ELEMENTS}{$element}{_ELEMENT_ARRAY};
return wantarray ? @$arr : $arr
if $arr;

state $el_list_ref_cache_name = '_ELEMENT_ARRAY_REF_CACHE';
my $element_list_ref_cache = $self->get_cached_value_dor_set_default_href ($el_list_ref_cache_name);

$self->{ELEMENTS}{$element}{_ELEMENT_ARRAY} = $element_list_ref_cache->{$element};
$arr = $self->{ELEMENTS}{$element}{_ELEMENT_ARRAY} = $element_list_ref_cache->{$element};

return wantarray
? @{$element_list_ref_cache->{$element}}
: $element_list_ref_cache->{$element}
if $element_list_ref_cache->{$element};
return wantarray ? @$arr : $arr
if $arr;

# package level cache
state $_el_array_cache = {};
my $element_list_ref = $_el_array_cache->{$element};

if ($element_list_ref) {
# work with a copy of the package array but cache the copy on $self
my $copy = [ @$element_list_ref ];
$self->{ELEMENTS}{$element}{_ELEMENT_ARRAY}
= $element_list_ref_cache->{$element}
= $copy;
return wantarray ? @$copy : $copy;
}

my $quote_char = $self->get_param('QUOTES');
my $element_list_ref = $self->csv2list(
$element_list_ref = $self->csv2list(
string => $element,
sep_char => $self->get_param('JOIN_CHAR'),
quote_char => $quote_char,
Expand All @@ -392,10 +404,13 @@ sub get_element_name_as_array_aa {
}
}

$_el_array_cache->{$element} = $element_list_ref;
my $copy = [@$element_list_ref];

$self->{ELEMENTS}{$element}{_ELEMENT_ARRAY}
= $element_list_ref_cache->{$element}
= $element_list_ref;
return wantarray ? @$element_list_ref : $element_list_ref;
= $copy; # work with a copy
return wantarray ? @$copy : $copy;
}

sub get_element_name_as_array {
Expand All @@ -405,7 +420,6 @@ sub get_element_name_as_array {
my $element = $args{element} //
croak "element not specified\n";
return $self->get_element_name_as_array_aa ($element, $args{csv_object});
# return $self->get_array_list_values_aa ($element, '_ELEMENT_ARRAY');
}

# get a list of the unique values for one axis
Expand Down Expand Up @@ -631,44 +645,13 @@ sub add_element {
my $element = $args{element} //
croak "element not specified\n";

$self->{ELEMENTS}{$element} //= {};

# don't re-create the element array
return if $self->{ELEMENTS}{$element}{_ELEMENT_ARRAY};

$self->get_element_name_as_array_aa ($element, $args{csv_object});
return;

# caching saves a little time for large data sets
# but needs to be shared with a "parent" object to make a difference
# e.g. a spatial object copies from a groups object
state $el_list_ref_cache_name = '_ELEMENT_ARRAY_REF_CACHE';
my $element_list_ref_cache = $self->get_cached_value_dor_set_default_href ($el_list_ref_cache_name);

$self->{ELEMENTS}{$element}{_ELEMENT_ARRAY} = $element_list_ref_cache->{$element};

return if $element_list_ref_cache->{$element};

my $quote_char = $self->get_param('QUOTES');
my $element_list_ref = $self->csv2list(
string => $element,
sep_char => $self->get_param('JOIN_CHAR'),
quote_char => $quote_char,
csv_object => $args{csv_object},
);

if (scalar @$element_list_ref == 1) {
$element_list_ref->[0] //= ($quote_char . $quote_char)
}
else {
for my $el (@$element_list_ref) {
$el //= $EMPTY_STRING;
}
}

$self->{ELEMENTS}{$element}{_ELEMENT_ARRAY}
= $element_list_ref_cache->{$element}
= $element_list_ref;

return;
}

sub add_sub_element { # add a subelement to a BaseStruct element. create the element if it does not exist
Expand Down
4 changes: 2 additions & 2 deletions lib/Biodiverse/Indices/Phylogenetic.pm
Original file line number Diff line number Diff line change
Expand Up @@ -604,15 +604,15 @@ sub get_path_lengths_to_root_node {
# We could use a global precalc, but that won't scale well with
# massive trees where we only need a subset.
my $path_cache_master
= $self->get_cached_value_dor_set_default_aa (PATH_LENGTH_CACHE_PER_TERMINAL => {});
= $self->get_cached_value_dor_set_default_href ('PATH_LENGTH_CACHE_PER_TERMINAL');
my $path_cache = $path_cache_master->{$tree_ref} //= {};

# get a hash of node refs
my $all_nodes = $tree_ref->get_node_hash;

# now loop through the labels and get the path to the root node
my $path_hash = {};
my @collected_paths; # used if we have B::Utils 1.07 or greater
my @collected_paths; # used if we have Bd::Utils 1.07 or greater
foreach my $label (grep exists $all_nodes->{$_}, keys %$label_list) {
# Could assign to $current_node here, but profiling indicates it
# takes meaningful chunks of time for large data sets
Expand Down
3 changes: 2 additions & 1 deletion lib/Biodiverse/Progress.pm
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ sub new {

return $self
if $Biodiverse::Config::progress_no_use_gui
|| !$Biodiverse::Config::running_under_gui;
|| !$Biodiverse::Config::running_under_gui
|| $args{no_gui_progress};

# if we are to use the GUI
#print "RUNNING UNDER GUI: $Biodiverse::Config::running_under_gui\n";
Expand Down
50 changes: 31 additions & 19 deletions lib/Biodiverse/Randomise.pm
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ use Carp;
use POSIX qw { ceil floor };
use Time::HiRes qw { time gettimeofday tv_interval };
use Scalar::Util qw { blessed looks_like_number };
use List::Util qw /any all none minstr max/;
use List::Util qw /any all none minstr max pairmap/;
use List::MoreUtils::XS; # paranoia to ensure we have this loaded
use List::MoreUtils 0.425 qw /first_index uniq binsert bremove/;
use Ref::Util qw /is_ref is_arrayref is_hashref/;
Expand Down Expand Up @@ -511,6 +511,10 @@ sub run_randomisation {

my $progress_bar = Biodiverse::Progress->new(text => 'Randomisation');

my %progress_timers;
# arbitrary threshold but should be OK
my $no_gui_progress_thresh = 1;

# do stuff here
ITERATION:
foreach my $i (1 .. $iterations) {
Expand All @@ -530,22 +534,26 @@ sub run_randomisation {
"Randomisation iteration $i of $iterations this run",
($i / $iterations),
);

my $start_time_get_rand_bd = [gettimeofday];

my $rand_bd = eval {
$self->get_randomised_basedata (
%args,
rand_object => $rand_object,
rand_iter => $$total_iterations,
rand_function => $function,
rand_object => $rand_object,
rand_iter => $$total_iterations,
rand_function => $function,
no_gui_progress => (($progress_timers{gen_bd} // 1000) / $i < $no_gui_progress_thresh),
);
};
croak $EVAL_ERROR if $EVAL_ERROR || ! defined $rand_bd;

my $time_taken = sprintf "%.3f", tv_interval ($start_time_get_rand_bd);

my $t_diff = tv_interval ($start_time_get_rand_bd);
my $time_taken = sprintf "%.3f", $t_diff;
say "[RANDOMISE] Time taken to randomise basedata: $time_taken seconds";

$progress_timers{gen_bd} += $t_diff;

$rand_bd->rename (
name => join ('_', $bd->get_param ('NAME'), $function, $$total_iterations),
);
Expand Down Expand Up @@ -603,13 +611,19 @@ sub run_randomisation {
}

if ($generate_rand_analysis) {
my $start_time_analysis = [gettimeofday()];
my %prog_args = (
no_gui_progress => (($progress_timers{$target} // 1000) / $i < $no_gui_progress_thresh)
);

eval {
$self->override_object_analysis_args (
%args,
randomised_arg_object_cache => \%randomised_arg_object_cache,
object => $rand_analysis,
rand_object => $rand_object,
iteration => $$total_iterations,
%prog_args,
);
};
croak $EVAL_ERROR if $EVAL_ERROR;
Expand All @@ -618,6 +632,7 @@ sub run_randomisation {
$rand_analysis->run_analysis (
progress_text => $progress_text,
use_nbrs_from => $target,
%prog_args,
);
};
croak $EVAL_ERROR if $EVAL_ERROR;
Expand All @@ -626,9 +641,12 @@ sub run_randomisation {
$target->compare (
comparison => $rand_analysis,
result_list_name => $results_list_name,
%prog_args,
)
};
croak $EVAL_ERROR if $EVAL_ERROR;

$progress_timers{$target} += tv_interval ($start_time_analysis);
}

# Does nothing if not a cluster type analysis
Expand Down Expand Up @@ -1249,7 +1267,9 @@ sub rand_csr_by_group {

my $bd = $args{basedata_ref} || $self->get_param ('BASEDATA_REF');

my $progress_bar = Biodiverse::Progress->new();
my $progress_bar = Biodiverse::Progress->new(
no_gui_progress => $args{no_gui_progress},
);

# can't store MRMA objects to all output formats and then recreate
my $rand = delete $args{rand_object};
Expand Down Expand Up @@ -1303,16 +1323,8 @@ sub rand_csr_by_group {
);

# get the labels from the original group and assign them to the random group
my %tmp = $bd->get_labels_in_group_as_hash_aa ($orig_groups[$i]);

while (my ($label, $counts) = each %tmp) {
$new_bd->add_element(
label => $label,
group => $rand_order->[$i],
count => $counts,
csv_object => $csv_object,
);
}
my $labels = $bd->get_labels_in_group_as_hash_aa ($orig_groups[$i]);
pairmap {$new_bd->add_element_simple_aa ($a, $rand_order->[$i], $b, $csv_object)} %$labels;
}

$bd->transfer_label_properties (
Expand Down Expand Up @@ -1614,7 +1626,7 @@ sub rand_structured {
);


my $progress_bar = Biodiverse::Progress->new();
my $progress_bar = Biodiverse::Progress->new(no_gui_progress => $args{no_gui_progress});

# need to get these from the ARGS param if available
# - should also croak if negative
Expand Down
Loading

0 comments on commit f9078c1

Please sign in to comment.