Skip to content

Commit e9eaffc

Browse files
authored
Merge pull request #14 from sfiligoi/igor_tile2210b
Add tiling to speed CPU code
2 parents 252f3a6 + c761979 commit e9eaffc

File tree

1 file changed

+73
-19
lines changed

1 file changed

+73
-19
lines changed

src/unifrac_task.cpp

+73-19
Original file line numberDiff line numberDiff line change
@@ -435,13 +435,8 @@ void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled
435435
#ifdef _OPENACC
436436
const unsigned int acc_vector_size = SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::acc_vector_size;
437437
#pragma acc parallel loop gang vector collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,lengths,zcheck,sums) async
438-
#else
439-
// use dynamic scheduling due to non-homogeneity in the loop
440-
#pragma omp parallel for default(shared) schedule(dynamic,1)
441-
#endif
442438
for(uint64_t sk = 0; sk < sample_steps ; sk++) {
443439
for(uint64_t stripe = start_idx; stripe < stop_idx; stripe++) {
444-
#ifdef _OPENACC
445440
// SIMT-based GPU work great one at a time (HW will deal with parallelism)
446441
for(uint64_t ik = 0; ik < step_size ; ik++) {
447442
const uint64_t k = sk*step_size + ik;
@@ -457,7 +452,22 @@ void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled
457452
filled_embs,idx, n_samples_r,
458453
k, l1);
459454
} // for ik
455+
} // for stripe
456+
} // for sk
460457
#else
458+
// tiling helps with better cache reuse without the need of multiple cores
459+
const uint64_t stripe_steps = ((stop_idx-start_idx)+(step_size-1))/step_size; // round up
460+
461+
// use dynamic scheduling due to non-homogeneity in the loop
462+
// Use a moderate block to prevent thrashing but still have some cache reuse
463+
#pragma omp parallel for collapse(2) schedule(dynamic,step_size) default(shared)
464+
for(uint64_t ss = 0; ss < stripe_steps ; ss++) {
465+
for(uint64_t sk = 0; sk < sample_steps ; sk++) {
466+
// tile to maximize cache reuse
467+
for(uint64_t is = 0; is < step_size ; is++) {
468+
const uint64_t stripe = start_idx+ss*step_size + is;
469+
if (stripe<stop_idx) { // else past limit
470+
461471
// SIMD-based CPUs need help with vectorization
462472
const uint64_t idx = (stripe-start_idx) * n_samples_r;
463473
uint64_t ks = sk*step_size;
@@ -493,9 +503,12 @@ void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled
493503
ks, ls);
494504
ls = (ls + 1)%n_samples; // wraparound
495505
} // for ks
506+
507+
} // if stripe
508+
} // for is
509+
} // for sk
510+
} // for ss
496511
#endif
497-
} // for stripe
498-
} // for sk
499512

500513
#ifdef _OPENACC
501514
// next iteration will use the alternative space
@@ -868,13 +881,8 @@ void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_e
868881
#ifdef _OPENACC
869882
const unsigned int acc_vector_size = SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::acc_vector_size;
870883
#pragma acc parallel loop gang vector collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,lengths,zcheck,sums) async
871-
#else
872-
// use dynamic scheduling due to non-homogeneity in the loop
873-
#pragma omp parallel for schedule(dynamic,1) default(shared)
874-
#endif
875884
for(uint64_t sk = 0; sk < sample_steps ; sk++) {
876885
for(uint64_t stripe = start_idx; stripe < stop_idx; stripe++) {
877-
#ifdef _OPENACC
878886
// SIMT-based GPU work great one at a time (HW will deal with parallelism)
879887
for(uint64_t ik = 0; ik < step_size ; ik++) {
880888
const uint64_t k = sk*step_size + ik;
@@ -890,7 +898,22 @@ void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_e
890898
filled_embs,idx, n_samples_r,
891899
k, l1);
892900
} // for ik
901+
} // for stripe
902+
} // for sk
893903
#else
904+
// tiling helps with better cache reuse without the need of multiple cores
905+
const uint64_t stripe_steps = ((stop_idx-start_idx)+(step_size-1))/step_size; // round up
906+
907+
// use dynamic scheduling due to non-homogeneity in the loop
908+
// Use a moderate block to prevent thrashing but still have some cache reuse
909+
#pragma omp parallel for collapse(2) schedule(dynamic,step_size) default(shared)
910+
for(uint64_t ss = 0; ss < stripe_steps ; ss++) {
911+
for(uint64_t sk = 0; sk < sample_steps ; sk++) {
912+
// tile to maximize cache reuse
913+
for(uint64_t is = 0; is < step_size ; is++) {
914+
const uint64_t stripe = start_idx+ss*step_size + is;
915+
if (stripe<stop_idx) { // else past limit
916+
894917
// SIMD-based CPUs need help with vectorization
895918
const uint64_t idx = (stripe-start_idx) * n_samples_r;
896919
uint64_t ks = sk*step_size;
@@ -926,9 +949,12 @@ void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_e
926949
ks, ls);
927950
ls = (ls + 1)%n_samples; // wraparound
928951
} // for ks
952+
953+
} // if stripe
954+
} // for is
955+
} // for sk
956+
} // for ss
929957
#endif
930-
} // for stripe
931-
} // for sk
932958

933959
#ifdef _OPENACC
934960
// next iteration will use the alternative space
@@ -1566,11 +1592,7 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
15661592
// point of thread
15671593
#ifdef _OPENACC
15681594
const unsigned int acc_vector_size = SUCMP_NM::UnifracUnweightedTask<TFloat>::acc_vector_size;
1569-
#pragma acc parallel loop collapse(3) gang vector vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums) async
1570-
#else
1571-
// use dynamic scheduling due to non-homogeneity in the loop
1572-
#pragma omp parallel for schedule(dynamic,1) default(shared)
1573-
#endif
1595+
#pragma acc parallel loop collapse(3) gang vector vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums,zcheck,stripe_sums) async
15741596
for(uint64_t sk = 0; sk < sample_steps ; sk++) {
15751597
for(uint64_t stripe = start_idx; stripe < stop_idx; stripe++) {
15761598
for(uint64_t ik = 0; ik < step_size ; ik++) {
@@ -1591,6 +1613,38 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
15911613

15921614
}
15931615
}
1616+
#else
1617+
// tiling helps with better cache reuse without the need of multiple cores
1618+
const uint64_t stripe_steps = ((stop_idx-start_idx)+(step_size-1))/step_size; // round up
1619+
1620+
// use dynamic scheduling due to non-homogeneity in the loop
1621+
// Use a moderate block to prevent thrashing but still have some cache reuse
1622+
#pragma omp parallel for collapse(2) schedule(dynamic,step_size) default(shared)
1623+
for(uint64_t ss = 0; ss < stripe_steps ; ss++) {
1624+
for(uint64_t sk = 0; sk < sample_steps ; sk++) {
1625+
// tile to maximize cache reuse
1626+
for(uint64_t is = 0; is < step_size ; is++) {
1627+
const uint64_t stripe = start_idx+ss*step_size + is;
1628+
if (stripe<stop_idx) { // else past limit
1629+
for(uint64_t ik = 0; ik < step_size ; ik++) {
1630+
const uint64_t k = sk*step_size + ik;
1631+
if (k<n_samples) { // else past the limit
1632+
const uint64_t idx = (stripe-start_idx) * n_samples_r;
1633+
const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
1634+
1635+
Unweighted1<TFloat>(
1636+
dm_stripes_buf,dm_stripes_total_buf,
1637+
zcheck, stripe_sums,
1638+
sums, embedded_proportions,
1639+
filled_embs_els_round,idx, n_samples_r,
1640+
k, l1);
1641+
} // if k
1642+
} // for ik
1643+
} // if stripe
1644+
} // for is
1645+
} // for sk
1646+
} // for ss
1647+
#endif
15941648

15951649
#ifdef _OPENACC
15961650
// next iteration will use the alternative space

0 commit comments

Comments
 (0)