@@ -435,13 +435,8 @@ void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled
#ifdef _OPENACC
    const unsigned int acc_vector_size = SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::acc_vector_size;
#pragma acc parallel loop gang vector collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,lengths,zcheck,sums) async
- #else
-     // use dynamic scheduling due to non-homogeneity in the loop
- #pragma omp parallel for default(shared) schedule(dynamic,1)
- #endif
    for (uint64_t sk = 0 ; sk < sample_steps ; sk++) {
      for (uint64_t stripe = start_idx; stripe < stop_idx; stripe++) {
- #ifdef _OPENACC
        // SIMT-based GPU work great one at a time (HW will deal with parallelism)
        for (uint64_t ik = 0 ; ik < step_size ; ik++) {
          const uint64_t k = sk*step_size + ik;
@@ -457,7 +452,22 @@ void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled
                      filled_embs,idx, n_samples_r,
                      k, l1);
        } // for ik
+      } // for stripe
+    } // for sk
#else
+    // tiling helps cache reuse without the need for multiple cores
+    const uint64_t stripe_steps = ((stop_idx-start_idx)+(step_size-1))/step_size; // round up
+
+    // use dynamic scheduling due to non-homogeneity in the loop
+    // use a moderate block to prevent thrashing while still getting some cache reuse
+ #pragma omp parallel for collapse(2) schedule(dynamic,step_size) default(shared)
+    for (uint64_t ss = 0 ; ss < stripe_steps ; ss++) {
+     for (uint64_t sk = 0 ; sk < sample_steps ; sk++) {
+      // tile to maximize cache reuse
+      for (uint64_t is = 0 ; is < step_size ; is++) {
+       const uint64_t stripe = start_idx + ss*step_size + is;
+       if (stripe<stop_idx) { // else past the limit
+
        // SIMD-based CPUs need help with vectorization
        const uint64_t idx = (stripe-start_idx) * n_samples_r;
        uint64_t ks = sk*step_size;
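The CPU path added above rests on two small idioms: the tile count is rounded up with the usual `(n + (block-1))/block` ceiling division, and the `if (stripe<stop_idx)` guard discards the overhang of the last, possibly partial, tile. The guard is what keeps the iteration space rectangular, which `collapse(2)` requires. A minimal standalone sketch of the pattern, with a made-up range, tile width, and output buffer (not the UniFrac buffers):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint64_t start_idx = 0, stop_idx = 1000; // hypothetical stripe range
  const uint64_t step_size = 64;                 // hypothetical tile width
  // ceiling division: a trailing partial tile is still covered
  const uint64_t stripe_steps = ((stop_idx - start_idx) + (step_size - 1)) / step_size;

  std::vector<double> out(stop_idx - start_idx, 0.0);

  // collapse(2) merges the tile loops into one work pool for the scheduler;
  // the inner guard drops iterations that fall past the real range
  #pragma omp parallel for collapse(2) schedule(dynamic,1) default(shared)
  for (uint64_t ss = 0; ss < stripe_steps; ss++) {
    for (uint64_t is = 0; is < step_size; is++) {
      const uint64_t stripe = start_idx + ss * step_size + is;
      if (stripe < stop_idx) {  // else past the limit
        out[stripe - start_idx] += 1.0;
      }
    }
  }
  std::printf("touched %zu stripes\n", out.size());
  return 0;
}
```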
@@ -493,9 +503,12 @@ void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled
                      ks, ls);
          ls = (ls + 1 )%n_samples; // wraparound
        } // for ks
+
+       } // if stripe
+      } // for is
+     } // for sk
+    } // for ss
#endif
-      } // for stripe
-    } // for sk

#ifdef _OPENACC
    // next iteration will use the alternative space
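The GPU branch, by contrast, keeps the flat three-loop nest: as the comment says, SIMT hardware handles the parallelism, so the pragma only maps the collapsed loops onto gangs and vector lanes, `present(...)` asserts the buffers were staged on the device earlier, and `async` queues the kernel without blocking the host. A self-contained sketch of that OpenACC pattern; the arrays and the update body are invented for illustration:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t n = 1 << 20;
  double *a = new double[n];
  double *b = new double[n];
  for (uint64_t i = 0; i < n; i++) { a[i] = 1.0; b[i] = 2.0; }

  // stage the arrays on the device once, ahead of the kernel
  #pragma acc enter data copyin(a[0:n], b[0:n])

  // present() asserts residency instead of re-copying, as in the
  // UniFrac pragmas; async queues the kernel without blocking the host
  #pragma acc parallel loop gang vector present(a[0:n], b[0:n]) async
  for (uint64_t i = 0; i < n; i++) {
    a[i] += 0.5 * b[i];
  }
  #pragma acc wait
  #pragma acc exit data copyout(a[0:n]) delete(b[0:n])

  std::printf("a[0] = %f\n", a[0]);
  delete[] a; delete[] b;
  return 0;
}
```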
@@ -868,13 +881,8 @@ void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_e
#ifdef _OPENACC
    const unsigned int acc_vector_size = SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::acc_vector_size;
#pragma acc parallel loop gang vector collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,lengths,zcheck,sums) async
- #else
-     // use dynamic scheduling due to non-homogeneity in the loop
- #pragma omp parallel for schedule(dynamic,1) default(shared)
- #endif
    for (uint64_t sk = 0 ; sk < sample_steps ; sk++) {
      for (uint64_t stripe = start_idx; stripe < stop_idx; stripe++) {
- #ifdef _OPENACC
        // SIMT-based GPU work great one at a time (HW will deal with parallelism)
        for (uint64_t ik = 0 ; ik < step_size ; ik++) {
          const uint64_t k = sk*step_size + ik;
@@ -890,7 +898,22 @@ void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_e
                      filled_embs,idx, n_samples_r,
                      k, l1);
        } // for ik
+      } // for stripe
+    } // for sk
#else
+    // tiling helps cache reuse without the need for multiple cores
+    const uint64_t stripe_steps = ((stop_idx-start_idx)+(step_size-1))/step_size; // round up
+
+    // use dynamic scheduling due to non-homogeneity in the loop
+    // use a moderate block to prevent thrashing while still getting some cache reuse
+ #pragma omp parallel for collapse(2) schedule(dynamic,step_size) default(shared)
+    for (uint64_t ss = 0 ; ss < stripe_steps ; ss++) {
+     for (uint64_t sk = 0 ; sk < sample_steps ; sk++) {
+      // tile to maximize cache reuse
+      for (uint64_t is = 0 ; is < step_size ; is++) {
+       const uint64_t stripe = start_idx + ss*step_size + is;
+       if (stripe<stop_idx) { // else past the limit
+
        // SIMD-based CPUs need help with vectorization
        const uint64_t idx = (stripe-start_idx) * n_samples_r;
        uint64_t ks = sk*step_size;
@@ -926,9 +949,12 @@ void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_e
                      ks, ls);
          ls = (ls + 1 )%n_samples; // wraparound
        } // for ks
+
+       } // if stripe
+      } // for is
+     } // for sk
+    } // for ss
#endif
-      } // for stripe
-    } // for sk

#ifdef _OPENACC
    // next iteration will use the alternative space
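The normalized variant receives the same restructuring. The scheduling change deserves a note: `schedule(dynamic,1)` hands each thread one iteration at a time, which balances non-homogeneous work best but pays a scheduler transaction per iteration, while `schedule(dynamic,step_size)` hands out blocks, cutting that overhead and improving locality at a small cost in balance. A toy comparison of the two, with synthetic skewed work (sizes and the busy-loop are made up):

```cpp
#include <chrono>
#include <cstdint>
#include <cstdio>

// synthetic non-homogeneous work: early iterations cost more than late ones
static double busy(uint64_t i, uint64_t n) {
  double acc = 0.0;
  for (uint64_t j = 0; j < (n - i); j++) acc += 1e-9 * (double)j;
  return acc;
}

int main() {
  const uint64_t n = 4096;
  double sink = 0.0;

  auto t0 = std::chrono::steady_clock::now();
  #pragma omp parallel for schedule(dynamic,1) reduction(+:sink)
  for (uint64_t i = 0; i < n; i++) sink += busy(i, n);
  auto t1 = std::chrono::steady_clock::now();

  #pragma omp parallel for schedule(dynamic,64) reduction(+:sink)
  for (uint64_t i = 0; i < n; i++) sink += busy(i, n);
  auto t2 = std::chrono::steady_clock::now();

  std::printf("sink=%f dynamic,1: %lld us  dynamic,64: %lld us\n", sink,
    (long long)std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count(),
    (long long)std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count());
  return 0;
}
```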
@@ -1566,11 +1592,7 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
    // point of thread
#ifdef _OPENACC
    const unsigned int acc_vector_size = SUCMP_NM::UnifracUnweightedTask<TFloat>::acc_vector_size;
- #pragma acc parallel loop collapse(3) gang vector vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums) async
- #else
-     // use dynamic scheduling due to non-homogeneity in the loop
- #pragma omp parallel for schedule(dynamic,1) default(shared)
- #endif
+ #pragma acc parallel loop collapse(3) gang vector vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums,zcheck,stripe_sums) async
    for (uint64_t sk = 0 ; sk < sample_steps ; sk++) {
      for (uint64_t stripe = start_idx; stripe < stop_idx; stripe++) {
        for (uint64_t ik = 0 ; ik < step_size ; ik++) {
@@ -1591,6 +1613,38 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con

        }
      }
+ #else
+    // tiling helps cache reuse without the need for multiple cores
+    const uint64_t stripe_steps = ((stop_idx-start_idx)+(step_size-1))/step_size; // round up
+
+    // use dynamic scheduling due to non-homogeneity in the loop
+    // use a moderate block to prevent thrashing while still getting some cache reuse
+ #pragma omp parallel for collapse(2) schedule(dynamic,step_size) default(shared)
+    for (uint64_t ss = 0 ; ss < stripe_steps ; ss++) {
+     for (uint64_t sk = 0 ; sk < sample_steps ; sk++) {
+      // tile to maximize cache reuse
+      for (uint64_t is = 0 ; is < step_size ; is++) {
+       const uint64_t stripe = start_idx + ss*step_size + is;
+       if (stripe<stop_idx) { // else past the limit
+        for (uint64_t ik = 0 ; ik < step_size ; ik++) {
+         const uint64_t k = sk*step_size + ik;
+         if (k<n_samples) { // else past the limit
+          const uint64_t idx = (stripe-start_idx) * n_samples_r;
+          const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
+
+          Unweighted1<TFloat>(
+                      dm_stripes_buf,dm_stripes_total_buf,
+                      zcheck, stripe_sums,
+                      sums, embedded_proportions,
+                      filled_embs_els_round,idx, n_samples_r,
+                      k, l1);
+         } // if k
+        } // for ik
+       } // if stripe
+      } // for is
+     } // for sk
+    } // for ss
+ #endif

#ifdef _OPENACC
// next iteration will use the alternative space
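One more detail in the unweighted change: the `present()` list grows by `zcheck` and `stripe_sums` because `Unweighted1` now reads them, and with OpenACC every array dereferenced inside a kernel must already be device-resident; `present()` fails fast at launch when it is not. A minimal sketch of feeding such an auxiliary flag array to a kernel (`fill_flags` and the buffers are hypothetical, not the UniFrac code):

```cpp
#include <cstdint>
#include <cstdio>

// hypothetical helper flagging zero entries; stands in for the role
// the zcheck array plays in the real kernels (skip known-zero work)
static void fill_flags(bool *flags, const double *v, uint64_t n) {
  for (uint64_t i = 0; i < n; i++) flags[i] = (v[i] == 0.0);
}

int main() {
  const uint64_t n = 1024;
  double *v = new double[n]();  // value-initialized to zero
  bool *flags = new bool[n];
  fill_flags(flags, v, n);

  double acc = 0.0;
  // both v and flags must be listed: the kernel dereferences both
  #pragma acc enter data copyin(v[0:n], flags[0:n])
  #pragma acc parallel loop present(v[0:n], flags[0:n]) reduction(+:acc)
  for (uint64_t i = 0; i < n; i++) {
    if (!flags[i]) acc += v[i];  // skip entries known to be zero
  }
  #pragma acc exit data delete(v[0:n], flags[0:n])

  std::printf("acc = %f\n", acc);
  delete[] v; delete[] flags;
  return 0;
}
```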