@@ -52,6 +52,34 @@ static inline void WeightedZerosAndSums(
52
52
}
53
53
}
54
54
55
+ // check for zero values
56
+ template <class T >
57
+ static inline void WeightedZerosAsync (
58
+ bool * const __restrict__ zcheck,
59
+ const T * const __restrict__ embedded_proportions,
60
+ const unsigned int filled_embs,
61
+ const uint64_t n_samples,
62
+ const uint64_t n_samples_r) {
63
+ #ifdef _OPENACC
64
+ #pragma acc parallel loop gang vector present(embedded_proportions,zcheck) async
65
+ #else
66
+ #pragma omp parallel for default(shared)
67
+ #endif
68
+ for (uint64_t k=0 ; k<n_samples; k++) {
69
+ bool all_zeros=true ;
70
+
71
+ #pragma acc loop seq
72
+ for (uint64_t emb=0 ; emb<filled_embs; emb++) {
73
+ const uint64_t offset = n_samples_r * emb;
74
+
75
+ T u1 = embedded_proportions[offset + k];
76
+ all_zeros = all_zeros && (u1==0 );
77
+ }
78
+
79
+ zcheck[k] = all_zeros;
80
+ }
81
+ }
82
+
55
83
56
84
// Single step in computing Weighted part of Unifrac
57
85
template <class TFloat >
@@ -1165,6 +1193,254 @@ void SUCMP_NM::UnifracVawGeneralizedTask<TFloat>::_run(unsigned int filled_embs,
1165
1193
#endif
1166
1194
}
1167
1195
1196
+ // Single step in computing NormalizedWeighted Unifrac
1197
+ #ifdef _OPENACC
1198
+ template <class TFloat >
1199
+ static inline void Unweighted1 (
1200
+ TFloat * const __restrict__ dm_stripes_buf,
1201
+ TFloat * const __restrict__ dm_stripes_total_buf,
1202
+ const TFloat * const __restrict__ sums,
1203
+ const uint64_t * const __restrict__ embedded_proportions,
1204
+ const unsigned int filled_embs_els_round,
1205
+ const uint64_t idx,
1206
+ const uint64_t n_samples_r,
1207
+ const uint64_t k,
1208
+ const uint64_t l1) {
1209
+ TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
1210
+ TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
1211
+ // TFloat *dm_stripe = dm_stripes[stripe];
1212
+ // TFloat *dm_stripe_total = dm_stripes_total[stripe];
1213
+
1214
+ bool did_update = false ;
1215
+ TFloat my_stripe = 0.0 ;
1216
+ TFloat my_stripe_total = 0.0 ;
1217
+
1218
+ for (uint64_t emb_el=0 ; emb_el<filled_embs_els_round; emb_el++) {
1219
+ const uint64_t offset = n_samples_r * emb_el;
1220
+ const TFloat * __restrict__ psum = &(sums[emb_el*0x800 ]);
1221
+
1222
+ uint64_t u1 = embedded_proportions[offset + k];
1223
+ uint64_t v1 = embedded_proportions[offset + l1];
1224
+ uint64_t o1 = u1 | v1;
1225
+
1226
+ if (o1!=0 ) { // zeros are prevalent
1227
+ did_update=true ;
1228
+ uint64_t x1 = u1 ^ v1;
1229
+
1230
+ // Use the pre-computed sums
1231
+ // Each range of 8 lengths has already been pre-computed and stored in psum
1232
+ // Since embedded_proportions packed format is in 64-bit format for performance reasons
1233
+ // we need to add the 8 sums using the four 8-bits for addressing inside psum
1234
+
1235
+ my_stripe_total += psum[ (uint8_t )(o1) ] +
1236
+ psum[0x100 +((uint8_t )(o1 >> 8 ))] +
1237
+ psum[0x200 +((uint8_t )(o1 >> 16 ))] +
1238
+ psum[0x300 +((uint8_t )(o1 >> 24 ))] +
1239
+ psum[0x400 +((uint8_t )(o1 >> 32 ))] +
1240
+ psum[0x500 +((uint8_t )(o1 >> 40 ))] +
1241
+ psum[0x600 +((uint8_t )(o1 >> 48 ))] +
1242
+ psum[0x700 +((uint8_t )(o1 >> 56 ))];
1243
+ my_stripe += psum[ (uint8_t )(x1) ] +
1244
+ psum[0x100 +((uint8_t )(x1 >> 8 ))] +
1245
+ psum[0x200 +((uint8_t )(x1 >> 16 ))] +
1246
+ psum[0x300 +((uint8_t )(x1 >> 24 ))] +
1247
+ psum[0x400 +((uint8_t )(x1 >> 32 ))] +
1248
+ psum[0x500 +((uint8_t )(x1 >> 40 ))] +
1249
+ psum[0x600 +((uint8_t )(x1 >> 48 ))] +
1250
+ psum[0x700 +((uint8_t )(x1 >> 56 ))];
1251
+ }
1252
+ }
1253
+
1254
+ if (did_update) {
1255
+ dm_stripe[k] += my_stripe;
1256
+ dm_stripe_total[k] += my_stripe_total;
1257
+ }
1258
+
1259
+ }
1260
+ #else
1261
+ template <class TFloat >
1262
+ static inline void Unweighted1 (
1263
+ TFloat * const __restrict__ dm_stripes_buf,
1264
+ TFloat * const __restrict__ dm_stripes_total_buf,
1265
+ const bool * const __restrict__ zcheck,
1266
+ const TFloat * const __restrict__ sums,
1267
+ const uint64_t * const __restrict__ embedded_proportions,
1268
+ const unsigned int filled_embs_els_round,
1269
+ const uint64_t idx,
1270
+ const uint64_t n_samples_r,
1271
+ const uint64_t k,
1272
+ const uint64_t l1) {
1273
+ const bool allzero_k = zcheck[k];
1274
+ const bool allzero_l1 = zcheck[l1];
1275
+
1276
+ if (allzero_k && allzero_l1) {
1277
+ // nothing to do, would have to add 0
1278
+ } else {
1279
+ TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
1280
+ TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
1281
+ // TFloat *dm_stripe = dm_stripes[stripe];
1282
+ // TFloat *dm_stripe_total = dm_stripes_total[stripe];
1283
+
1284
+ bool did_update = false ;
1285
+ TFloat my_stripe = 0.0 ;
1286
+ TFloat my_stripe_total = 0.0 ;
1287
+
1288
+ if (allzero_k || allzero_l1) {
1289
+ // with one side zero, | and ^ are no-ops
1290
+ const uint64_t kl = (allzero_k) ? l1 : k; // only use the non-sero one
1291
+ for (uint64_t emb_el=0 ; emb_el<filled_embs_els_round; emb_el++) {
1292
+ const uint64_t offset = n_samples_r * emb_el;
1293
+ const TFloat * __restrict__ psum = &(sums[emb_el*0x800 ]);
1294
+
1295
+ // uint64_t u1 = embedded_proportions[offset + k];
1296
+ // uint64_t v1 = embedded_proportions[offset + l1];
1297
+ // uint64_t o1 = u1 | v1;
1298
+ // With one of the two being 0, or is just the non-negative one
1299
+ uint64_t o1 = embedded_proportions[offset + kl];
1300
+
1301
+ if (o1==0 ) { // zeros are prevalent
1302
+ // nothing to do
1303
+ } else if (((uint32_t )o1)==0 ) {
1304
+ // only high part relevant
1305
+ did_update=true ;
1306
+ // uint64_t x1 = u1 ^ v1;
1307
+ // With one of the two being 0, xor is just the non-negative one
1308
+
1309
+ // Use the pre-computed sums
1310
+ // Each range of 8 lengths has already been pre-computed and stored in psum
1311
+ // Since embedded_proportions packed format is in 64-bit format for performance reasons
1312
+ // we need to add the 8 sums using the four 8-bits for addressing inside psum
1313
+
1314
+ TFloat esum = psum[0x400 +((uint8_t )(o1 >> 32 ))] +
1315
+ psum[0x500 +((uint8_t )(o1 >> 40 ))] +
1316
+ psum[0x600 +((uint8_t )(o1 >> 48 ))] +
1317
+ psum[0x700 +((uint8_t )(o1 >> 56 ))];
1318
+ my_stripe_total += esum;
1319
+ my_stripe += esum;
1320
+ } else if ((o1>>32 )==0 ) {
1321
+ // only low part relevant
1322
+ did_update=true ;
1323
+ // uint64_t x1 = u1 ^ v1;
1324
+ // With one of the two being 0, xor is just the non-negative one
1325
+
1326
+ // Use the pre-computed sums
1327
+ // Each range of 8 lengths has already been pre-computed and stored in psum
1328
+ // Since embedded_proportions packed format is in 64-bit format for performance reasons
1329
+ // we need to add the 8 sums using the four 8-bits for addressing inside psum
1330
+
1331
+ TFloat esum = psum[ (uint8_t )(o1) ] +
1332
+ psum[0x100 +((uint8_t )(o1 >> 8 ))] +
1333
+ psum[0x200 +((uint8_t )(o1 >> 16 ))] +
1334
+ psum[0x300 +((uint8_t )(o1 >> 24 ))];
1335
+ my_stripe_total += esum;
1336
+ my_stripe += esum;
1337
+ } else {
1338
+ did_update=true ;
1339
+ // uint64_t x1 = u1 ^ v1;
1340
+ // With one of the two being 0, xor is just the non-negative one
1341
+
1342
+ // Use the pre-computed sums
1343
+ // Each range of 8 lengths has already been pre-computed and stored in psum
1344
+ // Since embedded_proportions packed format is in 64-bit format for performance reasons
1345
+ // we need to add the 8 sums using the four 8-bits for addressing inside psum
1346
+
1347
+ TFloat esum = psum[ (uint8_t )(o1) ] +
1348
+ psum[0x100 +((uint8_t )(o1 >> 8 ))] +
1349
+ psum[0x200 +((uint8_t )(o1 >> 16 ))] +
1350
+ psum[0x300 +((uint8_t )(o1 >> 24 ))] +
1351
+ psum[0x400 +((uint8_t )(o1 >> 32 ))] +
1352
+ psum[0x500 +((uint8_t )(o1 >> 40 ))] +
1353
+ psum[0x600 +((uint8_t )(o1 >> 48 ))] +
1354
+ psum[0x700 +((uint8_t )(o1 >> 56 ))];
1355
+ my_stripe_total += esum;
1356
+ my_stripe += esum;
1357
+ }
1358
+ }
1359
+ } else {
1360
+ // we need both sides
1361
+ for (uint64_t emb_el=0 ; emb_el<filled_embs_els_round; emb_el++) {
1362
+ const uint64_t offset = n_samples_r * emb_el;
1363
+ const TFloat * __restrict__ psum = &(sums[emb_el*0x800 ]);
1364
+
1365
+ uint64_t u1 = embedded_proportions[offset + k];
1366
+ uint64_t v1 = embedded_proportions[offset + l1];
1367
+ uint64_t o1 = u1 | v1;
1368
+ uint64_t x1 = u1 ^ v1;
1369
+
1370
+ if (o1==0 ) { // zeros are prevalent
1371
+ // nothing to do
1372
+ } else if (((uint32_t )o1)==0 ) {
1373
+ // only high part relevant
1374
+ did_update=true ;
1375
+
1376
+ // Use the pre-computed sums
1377
+ // Each range of 8 lengths has already been pre-computed and stored in psum
1378
+ // Since embedded_proportions packed format is in 64-bit format for performance reasons
1379
+ // we need to add the 8 sums using the four 8-bits for addressing inside psum
1380
+
1381
+ my_stripe_total += psum[0x400 +((uint8_t )(o1 >> 32 ))] +
1382
+ psum[0x500 +((uint8_t )(o1 >> 40 ))] +
1383
+ psum[0x600 +((uint8_t )(o1 >> 48 ))] +
1384
+ psum[0x700 +((uint8_t )(o1 >> 56 ))];
1385
+ my_stripe += psum[0x400 +((uint8_t )(x1 >> 32 ))] +
1386
+ psum[0x500 +((uint8_t )(x1 >> 40 ))] +
1387
+ psum[0x600 +((uint8_t )(x1 >> 48 ))] +
1388
+ psum[0x700 +((uint8_t )(x1 >> 56 ))];
1389
+ } else if ((o1>>32 )==0 ) {
1390
+ // only low part relevant
1391
+ did_update=true ;
1392
+
1393
+ // Use the pre-computed sums
1394
+ // Each range of 8 lengths has already been pre-computed and stored in psum
1395
+ // Since embedded_proportions packed format is in 64-bit format for performance reasons
1396
+ // we need to add the 8 sums using the four 8-bits for addressing inside psum
1397
+
1398
+ my_stripe_total += psum[ (uint8_t )(o1) ] +
1399
+ psum[0x100 +((uint8_t )(o1 >> 8 ))] +
1400
+ psum[0x200 +((uint8_t )(o1 >> 16 ))] +
1401
+ psum[0x300 +((uint8_t )(o1 >> 24 ))];
1402
+ my_stripe += psum[ (uint8_t )(x1) ] +
1403
+ psum[0x100 +((uint8_t )(x1 >> 8 ))] +
1404
+ psum[0x200 +((uint8_t )(x1 >> 16 ))] +
1405
+ psum[0x300 +((uint8_t )(x1 >> 24 ))];
1406
+
1407
+ } else {
1408
+ did_update=true ;
1409
+
1410
+ // Use the pre-computed sums
1411
+ // Each range of 8 lengths has already been pre-computed and stored in psum
1412
+ // Since embedded_proportions packed format is in 64-bit format for performance reasons
1413
+ // we need to add the 8 sums using the four 8-bits for addressing inside psum
1414
+
1415
+ my_stripe_total += psum[ (uint8_t )(o1) ] +
1416
+ psum[0x100 +((uint8_t )(o1 >> 8 ))] +
1417
+ psum[0x200 +((uint8_t )(o1 >> 16 ))] +
1418
+ psum[0x300 +((uint8_t )(o1 >> 24 ))] +
1419
+ psum[0x400 +((uint8_t )(o1 >> 32 ))] +
1420
+ psum[0x500 +((uint8_t )(o1 >> 40 ))] +
1421
+ psum[0x600 +((uint8_t )(o1 >> 48 ))] +
1422
+ psum[0x700 +((uint8_t )(o1 >> 56 ))];
1423
+ my_stripe += psum[ (uint8_t )(x1) ] +
1424
+ psum[0x100 +((uint8_t )(x1 >> 8 ))] +
1425
+ psum[0x200 +((uint8_t )(x1 >> 16 ))] +
1426
+ psum[0x300 +((uint8_t )(x1 >> 24 ))] +
1427
+ psum[0x400 +((uint8_t )(x1 >> 32 ))] +
1428
+ psum[0x500 +((uint8_t )(x1 >> 40 ))] +
1429
+ psum[0x600 +((uint8_t )(x1 >> 48 ))] +
1430
+ psum[0x700 +((uint8_t )(x1 >> 56 ))];
1431
+ }
1432
+ }
1433
+ } // (allzero_k || allzero_l1)
1434
+
1435
+ if (did_update) {
1436
+ dm_stripe[k] += my_stripe;
1437
+ dm_stripe_total[k] += my_stripe_total;
1438
+ }
1439
+ } // (allzero_k && allzero_l1)
1440
+
1441
+ }
1442
+ #endif
1443
+
1168
1444
template <class TFloat >
1169
1445
void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, const TFloat * __restrict__ lengths) {
1170
1446
const uint64_t start_idx = this ->task_p ->start ;
@@ -1187,6 +1463,14 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
1187
1463
1188
1464
const uint64_t filled_embs_els_round = (filled_embs+63 )/64 ;
1189
1465
1466
+ #ifndef _OPENACC
1467
+ // not effective for GPUs, but helps a lot on the CPUs
1468
+ bool * const __restrict__ zcheck = this ->zcheck ;
1469
+ // check for zero values
1470
+ WeightedZerosAsync (zcheck,
1471
+ embedded_proportions,
1472
+ filled_embs_els_round, n_samples, n_samples_r);
1473
+ #endif
1190
1474
1191
1475
// pre-compute sums of length elements, since they are likely to be accessed many times
1192
1476
// We will use a 8-bit map, to keep it small enough to keep in L1 cache
@@ -1247,7 +1531,7 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
1247
1531
#ifdef _OPENACC
1248
1532
#pragma acc wait
1249
1533
const unsigned int acc_vector_size = SUCMP_NM::UnifracUnweightedTask<TFloat>::acc_vector_size;
1250
- #pragma acc parallel loop collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums) async
1534
+ #pragma acc parallel loop collapse(3) gang vector vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums) async
1251
1535
#else
1252
1536
// use dynamic scheduling due to non-homogeneity in the loop
1253
1537
#pragma omp parallel for schedule(dynamic,1) default(shared)
@@ -1257,61 +1541,19 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
1257
1541
for (uint64_t ik = 0 ; ik < step_size ; ik++) {
1258
1542
const uint64_t k = sk*step_size + ik;
1259
1543
const uint64_t idx = (stripe-start_idx) * n_samples_r;
1260
- TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
1261
- TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
1262
- // TFloat *dm_stripe = dm_stripes[stripe];
1263
- // TFloat *dm_stripe_total = dm_stripes_total[stripe];
1264
1544
1265
1545
if (k>=n_samples) continue ; // past the limit
1266
1546
1267
1547
const uint64_t l1 = (k + stripe + 1 )%n_samples; // wraparound
1268
1548
1269
- bool did_update = false ;
1270
- TFloat my_stripe = 0.0 ;
1271
- TFloat my_stripe_total = 0.0 ;
1272
-
1273
- #pragma acc loop seq
1274
- for (uint64_t emb_el=0 ; emb_el<filled_embs_els_round; emb_el++) {
1275
- const uint64_t offset = n_samples_r * emb_el;
1276
- const TFloat * __restrict__ psum = &(sums[emb_el*0x800 ]);
1277
-
1278
- uint64_t u1 = embedded_proportions[offset + k];
1279
- uint64_t v1 = embedded_proportions[offset + l1];
1280
- uint64_t o1 = u1 | v1;
1281
-
1282
- if (o1!=0 ) { // zeros are prevalent
1283
- did_update=true ;
1284
- uint64_t x1 = u1 ^ v1;
1285
-
1286
- // Use the pre-computed sums
1287
- // Each range of 8 lengths has already been pre-computed and stored in psum
1288
- // Since embedded_proportions packed format is in 64-bit format for performance reasons
1289
- // we need to add the 8 sums using the four 8-bits for addressing inside psum
1290
-
1291
- my_stripe += psum[ (x1 & 0xff )] +
1292
- psum[0x100 +((x1 >> 8 ) & 0xff )] +
1293
- psum[0x200 +((x1 >> 16 ) & 0xff )] +
1294
- psum[0x300 +((x1 >> 24 ) & 0xff )] +
1295
- psum[0x400 +((x1 >> 32 ) & 0xff )] +
1296
- psum[0x500 +((x1 >> 40 ) & 0xff )] +
1297
- psum[0x600 +((x1 >> 48 ) & 0xff )] +
1298
- psum[0x700 +((x1 >> 56 ) )];
1299
- my_stripe_total += psum[ (o1 & 0xff )] +
1300
- psum[0x100 +((o1 >> 8 ) & 0xff )] +
1301
- psum[0x200 +((o1 >> 16 ) & 0xff )] +
1302
- psum[0x300 +((o1 >> 24 ) & 0xff )] +
1303
- psum[0x400 +((o1 >> 32 ) & 0xff )] +
1304
- psum[0x500 +((o1 >> 40 ) & 0xff )] +
1305
- psum[0x600 +((o1 >> 48 ) & 0xff )] +
1306
- psum[0x700 +((o1 >> 56 ) )];
1307
- }
1308
- }
1309
-
1310
- if (did_update) {
1311
- dm_stripe[k] += my_stripe;
1312
- dm_stripe_total[k] += my_stripe_total;
1313
- }
1314
-
1549
+ Unweighted1<TFloat>(
1550
+ dm_stripes_buf,dm_stripes_total_buf,
1551
+ #ifndef _OPENACC
1552
+ zcheck,
1553
+ #endif
1554
+ sums, embedded_proportions,
1555
+ filled_embs_els_round,idx, n_samples_r,
1556
+ k, l1);
1315
1557
}
1316
1558
1317
1559
}
0 commit comments