Skip to content

Commit d95bc96

Browse files
authored
Merge pull request #12 from sfiligoi/igor_cpu_vect2210uw
Add zero checking to Unweighted for CPU code
2 parents 10d3b8a + b0f5967 commit d95bc96

File tree

2 files changed

+309
-53
lines changed

2 files changed

+309
-53
lines changed

src/unifrac_task.cpp

+293-51
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,34 @@ static inline void WeightedZerosAndSums(
5252
}
5353
}
5454

55+
// check for zero values
56+
template<class T>
57+
static inline void WeightedZerosAsync(
58+
bool * const __restrict__ zcheck,
59+
const T * const __restrict__ embedded_proportions,
60+
const unsigned int filled_embs,
61+
const uint64_t n_samples,
62+
const uint64_t n_samples_r) {
63+
#ifdef _OPENACC
64+
#pragma acc parallel loop gang vector present(embedded_proportions,zcheck) async
65+
#else
66+
#pragma omp parallel for default(shared)
67+
#endif
68+
for(uint64_t k=0; k<n_samples; k++) {
69+
bool all_zeros=true;
70+
71+
#pragma acc loop seq
72+
for (uint64_t emb=0; emb<filled_embs; emb++) {
73+
const uint64_t offset = n_samples_r * emb;
74+
75+
T u1 = embedded_proportions[offset + k];
76+
all_zeros = all_zeros && (u1==0);
77+
}
78+
79+
zcheck[k] = all_zeros;
80+
}
81+
}
82+
5583

5684
// Single step in computing Weighted part of Unifrac
5785
template<class TFloat>
@@ -1165,6 +1193,254 @@ void SUCMP_NM::UnifracVawGeneralizedTask<TFloat>::_run(unsigned int filled_embs,
11651193
#endif
11661194
}
11671195

1196+
// Single step in computing NormalizedWeighted Unifrac
1197+
#ifdef _OPENACC
1198+
template<class TFloat>
1199+
static inline void Unweighted1(
1200+
TFloat * const __restrict__ dm_stripes_buf,
1201+
TFloat * const __restrict__ dm_stripes_total_buf,
1202+
const TFloat * const __restrict__ sums,
1203+
const uint64_t * const __restrict__ embedded_proportions,
1204+
const unsigned int filled_embs_els_round,
1205+
const uint64_t idx,
1206+
const uint64_t n_samples_r,
1207+
const uint64_t k,
1208+
const uint64_t l1) {
1209+
TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
1210+
TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
1211+
//TFloat *dm_stripe = dm_stripes[stripe];
1212+
//TFloat *dm_stripe_total = dm_stripes_total[stripe];
1213+
1214+
bool did_update = false;
1215+
TFloat my_stripe = 0.0;
1216+
TFloat my_stripe_total = 0.0;
1217+
1218+
for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
1219+
const uint64_t offset = n_samples_r * emb_el;
1220+
const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
1221+
1222+
uint64_t u1 = embedded_proportions[offset + k];
1223+
uint64_t v1 = embedded_proportions[offset + l1];
1224+
uint64_t o1 = u1 | v1;
1225+
1226+
if (o1!=0) { // zeros are prevalent
1227+
did_update=true;
1228+
uint64_t x1 = u1 ^ v1;
1229+
1230+
// Use the pre-computed sums
1231+
// Each range of 8 lengths has already been pre-computed and stored in psum
1232+
// Since embedded_proportions packed format is in 64-bit format for performance reasons
1233+
// we need to add the 8 sums using the four 8-bits for addressing inside psum
1234+
1235+
my_stripe_total += psum[ (uint8_t)(o1) ] +
1236+
psum[0x100+((uint8_t)(o1 >> 8))] +
1237+
psum[0x200+((uint8_t)(o1 >> 16))] +
1238+
psum[0x300+((uint8_t)(o1 >> 24))] +
1239+
psum[0x400+((uint8_t)(o1 >> 32))] +
1240+
psum[0x500+((uint8_t)(o1 >> 40))] +
1241+
psum[0x600+((uint8_t)(o1 >> 48))] +
1242+
psum[0x700+((uint8_t)(o1 >> 56))];
1243+
my_stripe += psum[ (uint8_t)(x1) ] +
1244+
psum[0x100+((uint8_t)(x1 >> 8))] +
1245+
psum[0x200+((uint8_t)(x1 >> 16))] +
1246+
psum[0x300+((uint8_t)(x1 >> 24))] +
1247+
psum[0x400+((uint8_t)(x1 >> 32))] +
1248+
psum[0x500+((uint8_t)(x1 >> 40))] +
1249+
psum[0x600+((uint8_t)(x1 >> 48))] +
1250+
psum[0x700+((uint8_t)(x1 >> 56))];
1251+
}
1252+
}
1253+
1254+
if (did_update) {
1255+
dm_stripe[k] += my_stripe;
1256+
dm_stripe_total[k] += my_stripe_total;
1257+
}
1258+
1259+
}
1260+
#else
1261+
template<class TFloat>
1262+
static inline void Unweighted1(
1263+
TFloat * const __restrict__ dm_stripes_buf,
1264+
TFloat * const __restrict__ dm_stripes_total_buf,
1265+
const bool * const __restrict__ zcheck,
1266+
const TFloat * const __restrict__ sums,
1267+
const uint64_t * const __restrict__ embedded_proportions,
1268+
const unsigned int filled_embs_els_round,
1269+
const uint64_t idx,
1270+
const uint64_t n_samples_r,
1271+
const uint64_t k,
1272+
const uint64_t l1) {
1273+
const bool allzero_k = zcheck[k];
1274+
const bool allzero_l1 = zcheck[l1];
1275+
1276+
if (allzero_k && allzero_l1) {
1277+
// nothing to do, would have to add 0
1278+
} else {
1279+
TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
1280+
TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
1281+
//TFloat *dm_stripe = dm_stripes[stripe];
1282+
//TFloat *dm_stripe_total = dm_stripes_total[stripe];
1283+
1284+
bool did_update = false;
1285+
TFloat my_stripe = 0.0;
1286+
TFloat my_stripe_total = 0.0;
1287+
1288+
if (allzero_k || allzero_l1) {
1289+
// with one side zero, | and ^ are no-ops
1290+
const uint64_t kl = (allzero_k) ? l1 : k; // only use the non-sero one
1291+
for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
1292+
const uint64_t offset = n_samples_r * emb_el;
1293+
const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
1294+
1295+
//uint64_t u1 = embedded_proportions[offset + k];
1296+
//uint64_t v1 = embedded_proportions[offset + l1];
1297+
//uint64_t o1 = u1 | v1;
1298+
// With one of the two being 0, or is just the non-negative one
1299+
uint64_t o1 = embedded_proportions[offset + kl];
1300+
1301+
if (o1==0) { // zeros are prevalent
1302+
// nothing to do
1303+
} else if (((uint32_t)o1)==0) {
1304+
// only high part relevant
1305+
did_update=true;
1306+
//uint64_t x1 = u1 ^ v1;
1307+
// With one of the two being 0, xor is just the non-negative one
1308+
1309+
// Use the pre-computed sums
1310+
// Each range of 8 lengths has already been pre-computed and stored in psum
1311+
// Since embedded_proportions packed format is in 64-bit format for performance reasons
1312+
// we need to add the 8 sums using the four 8-bits for addressing inside psum
1313+
1314+
TFloat esum = psum[0x400+((uint8_t)(o1 >> 32))] +
1315+
psum[0x500+((uint8_t)(o1 >> 40))] +
1316+
psum[0x600+((uint8_t)(o1 >> 48))] +
1317+
psum[0x700+((uint8_t)(o1 >> 56))];
1318+
my_stripe_total += esum;
1319+
my_stripe += esum;
1320+
} else if ((o1>>32)==0) {
1321+
// only low part relevant
1322+
did_update=true;
1323+
//uint64_t x1 = u1 ^ v1;
1324+
// With one of the two being 0, xor is just the non-negative one
1325+
1326+
// Use the pre-computed sums
1327+
// Each range of 8 lengths has already been pre-computed and stored in psum
1328+
// Since embedded_proportions packed format is in 64-bit format for performance reasons
1329+
// we need to add the 8 sums using the four 8-bits for addressing inside psum
1330+
1331+
TFloat esum = psum[ (uint8_t)(o1) ] +
1332+
psum[0x100+((uint8_t)(o1 >> 8))] +
1333+
psum[0x200+((uint8_t)(o1 >> 16))] +
1334+
psum[0x300+((uint8_t)(o1 >> 24))];
1335+
my_stripe_total += esum;
1336+
my_stripe += esum;
1337+
} else {
1338+
did_update=true;
1339+
//uint64_t x1 = u1 ^ v1;
1340+
// With one of the two being 0, xor is just the non-negative one
1341+
1342+
// Use the pre-computed sums
1343+
// Each range of 8 lengths has already been pre-computed and stored in psum
1344+
// Since embedded_proportions packed format is in 64-bit format for performance reasons
1345+
// we need to add the 8 sums using the four 8-bits for addressing inside psum
1346+
1347+
TFloat esum = psum[ (uint8_t)(o1) ] +
1348+
psum[0x100+((uint8_t)(o1 >> 8))] +
1349+
psum[0x200+((uint8_t)(o1 >> 16))] +
1350+
psum[0x300+((uint8_t)(o1 >> 24))] +
1351+
psum[0x400+((uint8_t)(o1 >> 32))] +
1352+
psum[0x500+((uint8_t)(o1 >> 40))] +
1353+
psum[0x600+((uint8_t)(o1 >> 48))] +
1354+
psum[0x700+((uint8_t)(o1 >> 56))];
1355+
my_stripe_total += esum;
1356+
my_stripe += esum;
1357+
}
1358+
}
1359+
} else {
1360+
// we need both sides
1361+
for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
1362+
const uint64_t offset = n_samples_r * emb_el;
1363+
const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
1364+
1365+
uint64_t u1 = embedded_proportions[offset + k];
1366+
uint64_t v1 = embedded_proportions[offset + l1];
1367+
uint64_t o1 = u1 | v1;
1368+
uint64_t x1 = u1 ^ v1;
1369+
1370+
if (o1==0) { // zeros are prevalent
1371+
// nothing to do
1372+
} else if (((uint32_t)o1)==0) {
1373+
// only high part relevant
1374+
did_update=true;
1375+
1376+
// Use the pre-computed sums
1377+
// Each range of 8 lengths has already been pre-computed and stored in psum
1378+
// Since embedded_proportions packed format is in 64-bit format for performance reasons
1379+
// we need to add the 8 sums using the four 8-bits for addressing inside psum
1380+
1381+
my_stripe_total += psum[0x400+((uint8_t)(o1 >> 32))] +
1382+
psum[0x500+((uint8_t)(o1 >> 40))] +
1383+
psum[0x600+((uint8_t)(o1 >> 48))] +
1384+
psum[0x700+((uint8_t)(o1 >> 56))];
1385+
my_stripe += psum[0x400+((uint8_t)(x1 >> 32))] +
1386+
psum[0x500+((uint8_t)(x1 >> 40))] +
1387+
psum[0x600+((uint8_t)(x1 >> 48))] +
1388+
psum[0x700+((uint8_t)(x1 >> 56))];
1389+
} else if ((o1>>32)==0) {
1390+
// only low part relevant
1391+
did_update=true;
1392+
1393+
// Use the pre-computed sums
1394+
// Each range of 8 lengths has already been pre-computed and stored in psum
1395+
// Since embedded_proportions packed format is in 64-bit format for performance reasons
1396+
// we need to add the 8 sums using the four 8-bits for addressing inside psum
1397+
1398+
my_stripe_total += psum[ (uint8_t)(o1) ] +
1399+
psum[0x100+((uint8_t)(o1 >> 8))] +
1400+
psum[0x200+((uint8_t)(o1 >> 16))] +
1401+
psum[0x300+((uint8_t)(o1 >> 24))];
1402+
my_stripe += psum[ (uint8_t)(x1) ] +
1403+
psum[0x100+((uint8_t)(x1 >> 8))] +
1404+
psum[0x200+((uint8_t)(x1 >> 16))] +
1405+
psum[0x300+((uint8_t)(x1 >> 24))];
1406+
1407+
} else {
1408+
did_update=true;
1409+
1410+
// Use the pre-computed sums
1411+
// Each range of 8 lengths has already been pre-computed and stored in psum
1412+
// Since embedded_proportions packed format is in 64-bit format for performance reasons
1413+
// we need to add the 8 sums using the four 8-bits for addressing inside psum
1414+
1415+
my_stripe_total += psum[ (uint8_t)(o1) ] +
1416+
psum[0x100+((uint8_t)(o1 >> 8))] +
1417+
psum[0x200+((uint8_t)(o1 >> 16))] +
1418+
psum[0x300+((uint8_t)(o1 >> 24))] +
1419+
psum[0x400+((uint8_t)(o1 >> 32))] +
1420+
psum[0x500+((uint8_t)(o1 >> 40))] +
1421+
psum[0x600+((uint8_t)(o1 >> 48))] +
1422+
psum[0x700+((uint8_t)(o1 >> 56))];
1423+
my_stripe += psum[ (uint8_t)(x1) ] +
1424+
psum[0x100+((uint8_t)(x1 >> 8))] +
1425+
psum[0x200+((uint8_t)(x1 >> 16))] +
1426+
psum[0x300+((uint8_t)(x1 >> 24))] +
1427+
psum[0x400+((uint8_t)(x1 >> 32))] +
1428+
psum[0x500+((uint8_t)(x1 >> 40))] +
1429+
psum[0x600+((uint8_t)(x1 >> 48))] +
1430+
psum[0x700+((uint8_t)(x1 >> 56))];
1431+
}
1432+
}
1433+
} // (allzero_k || allzero_l1)
1434+
1435+
if (did_update) {
1436+
dm_stripe[k] += my_stripe;
1437+
dm_stripe_total[k] += my_stripe_total;
1438+
}
1439+
} // (allzero_k && allzero_l1)
1440+
1441+
}
1442+
#endif
1443+
11681444
template<class TFloat>
11691445
void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, const TFloat * __restrict__ lengths) {
11701446
const uint64_t start_idx = this->task_p->start;
@@ -1187,6 +1463,14 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
11871463

11881464
const uint64_t filled_embs_els_round = (filled_embs+63)/64;
11891465

1466+
#ifndef _OPENACC
1467+
// not effective for GPUs, but helps a lot on the CPUs
1468+
bool * const __restrict__ zcheck = this->zcheck;
1469+
// check for zero values
1470+
WeightedZerosAsync(zcheck,
1471+
embedded_proportions,
1472+
filled_embs_els_round, n_samples, n_samples_r);
1473+
#endif
11901474

11911475
// pre-compute sums of length elements, since they are likely to be accessed many times
11921476
// We will use a 8-bit map, to keep it small enough to keep in L1 cache
@@ -1247,7 +1531,7 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
12471531
#ifdef _OPENACC
12481532
#pragma acc wait
12491533
const unsigned int acc_vector_size = SUCMP_NM::UnifracUnweightedTask<TFloat>::acc_vector_size;
1250-
#pragma acc parallel loop collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums) async
1534+
#pragma acc parallel loop collapse(3) gang vector vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums) async
12511535
#else
12521536
// use dynamic scheduling due to non-homogeneity in the loop
12531537
#pragma omp parallel for schedule(dynamic,1) default(shared)
@@ -1257,61 +1541,19 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
12571541
for(uint64_t ik = 0; ik < step_size ; ik++) {
12581542
const uint64_t k = sk*step_size + ik;
12591543
const uint64_t idx = (stripe-start_idx) * n_samples_r;
1260-
TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
1261-
TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
1262-
//TFloat *dm_stripe = dm_stripes[stripe];
1263-
//TFloat *dm_stripe_total = dm_stripes_total[stripe];
12641544

12651545
if (k>=n_samples) continue; // past the limit
12661546

12671547
const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
12681548

1269-
bool did_update = false;
1270-
TFloat my_stripe = 0.0;
1271-
TFloat my_stripe_total = 0.0;
1272-
1273-
#pragma acc loop seq
1274-
for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
1275-
const uint64_t offset = n_samples_r * emb_el;
1276-
const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
1277-
1278-
uint64_t u1 = embedded_proportions[offset + k];
1279-
uint64_t v1 = embedded_proportions[offset + l1];
1280-
uint64_t o1 = u1 | v1;
1281-
1282-
if (o1!=0) { // zeros are prevalent
1283-
did_update=true;
1284-
uint64_t x1 = u1 ^ v1;
1285-
1286-
// Use the pre-computed sums
1287-
// Each range of 8 lengths has already been pre-computed and stored in psum
1288-
// Since embedded_proportions packed format is in 64-bit format for performance reasons
1289-
// we need to add the 8 sums using the four 8-bits for addressing inside psum
1290-
1291-
my_stripe += psum[ (x1 & 0xff)] +
1292-
psum[0x100+((x1 >> 8) & 0xff)] +
1293-
psum[0x200+((x1 >> 16) & 0xff)] +
1294-
psum[0x300+((x1 >> 24) & 0xff)] +
1295-
psum[0x400+((x1 >> 32) & 0xff)] +
1296-
psum[0x500+((x1 >> 40) & 0xff)] +
1297-
psum[0x600+((x1 >> 48) & 0xff)] +
1298-
psum[0x700+((x1 >> 56) )];
1299-
my_stripe_total += psum[ (o1 & 0xff)] +
1300-
psum[0x100+((o1 >> 8) & 0xff)] +
1301-
psum[0x200+((o1 >> 16) & 0xff)] +
1302-
psum[0x300+((o1 >> 24) & 0xff)] +
1303-
psum[0x400+((o1 >> 32) & 0xff)] +
1304-
psum[0x500+((o1 >> 40) & 0xff)] +
1305-
psum[0x600+((o1 >> 48) & 0xff)] +
1306-
psum[0x700+((o1 >> 56) )];
1307-
}
1308-
}
1309-
1310-
if (did_update) {
1311-
dm_stripe[k] += my_stripe;
1312-
dm_stripe_total[k] += my_stripe_total;
1313-
}
1314-
1549+
Unweighted1<TFloat>(
1550+
dm_stripes_buf,dm_stripes_total_buf,
1551+
#ifndef _OPENACC
1552+
zcheck,
1553+
#endif
1554+
sums, embedded_proportions,
1555+
filled_embs_els_round,idx, n_samples_r,
1556+
k, l1);
13151557
}
13161558

13171559
}

0 commit comments

Comments
 (0)