@@ -435,13 +435,8 @@ void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled
435
435
#ifdef _OPENACC
436
436
const unsigned int acc_vector_size = SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::acc_vector_size;
437
437
#pragma acc parallel loop gang vector collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,lengths,zcheck,sums) async
438
- #else
439
- // use dynamic scheduling due to non-homogeneity in the loop
440
- #pragma omp parallel for default(shared) schedule(dynamic,1)
441
- #endif
442
438
for (uint64_t sk = 0 ; sk < sample_steps ; sk++) {
443
439
for (uint64_t stripe = start_idx; stripe < stop_idx; stripe++) {
444
- #ifdef _OPENACC
445
440
// SIMT-based GPU work great one at a time (HW will deal with parallelism)
446
441
for (uint64_t ik = 0 ; ik < step_size ; ik++) {
447
442
const uint64_t k = sk*step_size + ik;
@@ -457,7 +452,22 @@ void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled
457
452
filled_embs,idx, n_samples_r,
458
453
k, l1);
459
454
} // for ik
455
+ } // for stripe
456
+ } // for sk
460
457
#else
458
+ // tiling helps with better cache reuse without the need of multiple cores
459
+ const uint64_t stripe_steps = ((stop_idx-start_idx)+(step_size-1 ))/step_size; // round up
460
+
461
+ // use dynamic scheduling due to non-homogeneity in the loop
462
+ // Use a moderate block to prevent thrashing but still have some cache reuse
463
+ #pragma omp parallel for collapse(2) schedule(dynamic,step_size) default(shared)
464
+ for (uint64_t ss = 0 ; ss < stripe_steps ; ss++) {
465
+ for (uint64_t sk = 0 ; sk < sample_steps ; sk++) {
466
+ // tile to maximize cache reuse
467
+ for (uint64_t is = 0 ; is < step_size ; is++) {
468
+ const uint64_t stripe = start_idx+ss*step_size + is;
469
+ if (stripe<stop_idx) { // else past limit
470
+
461
471
// SIMD-based CPUs need help with vectorization
462
472
const uint64_t idx = (stripe-start_idx) * n_samples_r;
463
473
uint64_t ks = sk*step_size;
@@ -493,9 +503,12 @@ void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled
493
503
ks, ls);
494
504
ls = (ls + 1 )%n_samples; // wraparound
495
505
} // for ks
506
+
507
+ } // if stripe
508
+ } // for is
509
+ } // for sk
510
+ } // for ss
496
511
#endif
497
- } // for stripe
498
- } // for sk
499
512
500
513
#ifdef _OPENACC
501
514
// next iteration will use the alternative space
@@ -868,13 +881,8 @@ void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_e
868
881
#ifdef _OPENACC
869
882
const unsigned int acc_vector_size = SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::acc_vector_size;
870
883
#pragma acc parallel loop gang vector collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,lengths,zcheck,sums) async
871
- #else
872
- // use dynamic scheduling due to non-homogeneity in the loop
873
- #pragma omp parallel for schedule(dynamic,1) default(shared)
874
- #endif
875
884
for (uint64_t sk = 0 ; sk < sample_steps ; sk++) {
876
885
for (uint64_t stripe = start_idx; stripe < stop_idx; stripe++) {
877
- #ifdef _OPENACC
878
886
// SIMT-based GPU work great one at a time (HW will deal with parallelism)
879
887
for (uint64_t ik = 0 ; ik < step_size ; ik++) {
880
888
const uint64_t k = sk*step_size + ik;
@@ -890,7 +898,22 @@ void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_e
890
898
filled_embs,idx, n_samples_r,
891
899
k, l1);
892
900
} // for ik
901
+ } // for stripe
902
+ } // for sk
893
903
#else
904
+ // tiling helps with better cache reuse without the need of multiple cores
905
+ const uint64_t stripe_steps = ((stop_idx-start_idx)+(step_size-1 ))/step_size; // round up
906
+
907
+ // use dynamic scheduling due to non-homogeneity in the loop
908
+ // Use a moderate block to prevent thrashing but still have some cache reuse
909
+ #pragma omp parallel for collapse(2) schedule(dynamic,step_size) default(shared)
910
+ for (uint64_t ss = 0 ; ss < stripe_steps ; ss++) {
911
+ for (uint64_t sk = 0 ; sk < sample_steps ; sk++) {
912
+ // tile to maximize cache reuse
913
+ for (uint64_t is = 0 ; is < step_size ; is++) {
914
+ const uint64_t stripe = start_idx+ss*step_size + is;
915
+ if (stripe<stop_idx) { // else past limit
916
+
894
917
// SIMD-based CPUs need help with vectorization
895
918
const uint64_t idx = (stripe-start_idx) * n_samples_r;
896
919
uint64_t ks = sk*step_size;
@@ -926,9 +949,12 @@ void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_e
926
949
ks, ls);
927
950
ls = (ls + 1 )%n_samples; // wraparound
928
951
} // for ks
952
+
953
+ } // if stripe
954
+ } // for is
955
+ } // for sk
956
+ } // for ss
929
957
#endif
930
- } // for stripe
931
- } // for sk
932
958
933
959
#ifdef _OPENACC
934
960
// next iteration will use the alternative space
0 commit comments