Skip to content

Commit

Permalink
Issue #236
Browse files Browse the repository at this point in the history
Use dpp8
  • Loading branch information
fancyIX committed Oct 13, 2021
1 parent 5925fe9 commit bf0271e
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 110 deletions.
7 changes: 6 additions & 1 deletion driver-opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -1504,11 +1504,16 @@ if (gpu->algorithm.type != ALGO_MTP && gpu->algorithm.type != ALGO_YESCRYPT_NAVI
status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 2, off2, gws, expand, 0, NULL, NULL); // lyra 4w monolithic
}
} else if ((gpu->algorithm.type == ALGO_ALLIUM || gpu->algorithm.type == ALGO_ALLIUM_NAVI) && (i == 2 || i == 6)) {
if (clState->prebuilt || gpu->algorithm.type == ALGO_ALLIUM_NAVI) {
if (clState->prebuilt) {
const size_t off2[] = { 0, 0, *p_global_work_offset };
const size_t gws[] = { 4, 4, globalThreads[0] / 2 };
const size_t expand[] = { 4, 4, 16 };
status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 3, off2, gws, expand, 0, NULL, NULL); // lyra 4w monolithic
} else if (gpu->algorithm.type == ALGO_ALLIUM_NAVI) {
const size_t off2[] = { 0, 0, *p_global_work_offset };
const size_t gws[] = { 4, 2, globalThreads[0] };
const size_t expand[] = { 4, 2, 16 };
status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 3, off2, gws, expand, 0, NULL, NULL); // lyra 4w monolithic
} else {
const size_t off2[] = { 0, *p_global_work_offset };
const size_t gws[] = { 4, globalThreads[0] };
Expand Down
146 changes: 72 additions & 74 deletions kernel/allium_navi.cl
Original file line number Diff line number Diff line change
Expand Up @@ -279,16 +279,15 @@ __kernel void search2(__global uint* hashes, __global uchar* sharedDataBuf)
__attribute__((amdgpu_waves_per_eu(1,1)))
__attribute__((amdgpu_num_vgpr(256)))
__attribute__((amdgpu_num_sgpr(200)))
__attribute__((reqd_work_group_size(4, 4, 16)))
__attribute__((reqd_work_group_size(4, 2, 16)))
__kernel void search3(__global uchar* sharedDataBuf)
{
uint gid = get_global_id(2);
__global lyraState_t *lyraState = (__global lyraState_t *)(sharedDataBuf + ((8 * 4 * 4 * 2) * (gid-get_global_offset(2))));
__global lyraState_t *lyraState2 = (__global lyraState_t *)(sharedDataBuf + ((8 * 4 * 4) + (8 * 4 * 4 * 2) * (gid-get_global_offset(2))));
__global lyraState_t *lyraState = (__global lyraState_t *)(sharedDataBuf + ((8 * 4 * 4) * (gid-get_global_offset(2))));

uint notepad[192];

const int player = get_local_id(1) % 4;
const int player = get_local_id(1) & 1;

uint state[4];
uint si[3];
Expand All @@ -302,26 +301,26 @@ __kernel void search3(__global uchar* sharedDataBuf)
uint ss3;
uint ss;
uint carry;
const uint mindex = (LOCAL_LINEAR & 1) == 0 ? 0 : 1;
const uint mindex = (player & 1) == 0 ? 0 : 1;

//-------------------------------------
// Load Lyra state
if (LOCAL_LINEAR == 0) state[0] = ((uint)(lyraState->h4[2 * player + 2 * 4 * 0]));
if (LOCAL_LINEAR == 0) state[1] = ((uint)(lyraState->h4[2 * player + 2 * 4 * 1]));
if (LOCAL_LINEAR == 0) state[2] = ((uint)(lyraState->h4[2 * player + 2 * 4 * 2]));
if (LOCAL_LINEAR == 0) state[3] = ((uint)(lyraState->h4[2 * player + 2 * 4 * 3]));
if (LOCAL_LINEAR == 1) state[0] = ((uint)(lyraState->h4[2 * player + 2 * 4 * 0 + 1]));
if (LOCAL_LINEAR == 1) state[1] = ((uint)(lyraState->h4[2 * player + 2 * 4 * 1 + 1]));
if (LOCAL_LINEAR == 1) state[2] = ((uint)(lyraState->h4[2 * player + 2 * 4 * 2 + 1]));
if (LOCAL_LINEAR == 1) state[3] = ((uint)(lyraState->h4[2 * player + 2 * 4 * 3 + 1]));
if (LOCAL_LINEAR == 2) state[0] = ((uint)(lyraState2->h4[2 * player + 2 * 4 * 0]));
if (LOCAL_LINEAR == 2) state[1] = ((uint)(lyraState2->h4[2 * player + 2 * 4 * 1]));
if (LOCAL_LINEAR == 2) state[2] = ((uint)(lyraState2->h4[2 * player + 2 * 4 * 2]));
if (LOCAL_LINEAR == 2) state[3] = ((uint)(lyraState2->h4[2 * player + 2 * 4 * 3]));
if (LOCAL_LINEAR == 3) state[0] = ((uint)(lyraState2->h4[2 * player + 2 * 4 * 0 + 1]));
if (LOCAL_LINEAR == 3) state[1] = ((uint)(lyraState2->h4[2 * player + 2 * 4 * 1 + 1]));
if (LOCAL_LINEAR == 3) state[2] = ((uint)(lyraState2->h4[2 * player + 2 * 4 * 2 + 1]));
if (LOCAL_LINEAR == 3) state[3] = ((uint)(lyraState2->h4[2 * player + 2 * 4 * 3 + 1]));
if (LOCAL_LINEAR == 0) state[0] = ((uint)(lyraState->h4[2 * 0 + 2 * 4 * 0 + player]));
if (LOCAL_LINEAR == 0) state[1] = ((uint)(lyraState->h4[2 * 0 + 2 * 4 * 1 + player]));
if (LOCAL_LINEAR == 0) state[2] = ((uint)(lyraState->h4[2 * 0 + 2 * 4 * 2 + player]));
if (LOCAL_LINEAR == 0) state[3] = ((uint)(lyraState->h4[2 * 0 + 2 * 4 * 3 + player]));
if (LOCAL_LINEAR == 1) state[0] = ((uint)(lyraState->h4[2 * 1 + 2 * 4 * 0 + player]));
if (LOCAL_LINEAR == 1) state[1] = ((uint)(lyraState->h4[2 * 1 + 2 * 4 * 1 + player]));
if (LOCAL_LINEAR == 1) state[2] = ((uint)(lyraState->h4[2 * 1 + 2 * 4 * 2 + player]));
if (LOCAL_LINEAR == 1) state[3] = ((uint)(lyraState->h4[2 * 1 + 2 * 4 * 3 + player]));
if (LOCAL_LINEAR == 2) state[0] = ((uint)(lyraState->h4[2 * 2 + 2 * 4 * 0 + player]));
if (LOCAL_LINEAR == 2) state[1] = ((uint)(lyraState->h4[2 * 2 + 2 * 4 * 1 + player]));
if (LOCAL_LINEAR == 2) state[2] = ((uint)(lyraState->h4[2 * 2 + 2 * 4 * 2 + player]));
if (LOCAL_LINEAR == 2) state[3] = ((uint)(lyraState->h4[2 * 2 + 2 * 4 * 3 + player]));
if (LOCAL_LINEAR == 3) state[0] = ((uint)(lyraState->h4[2 * 3 + 2 * 4 * 0 + player]));
if (LOCAL_LINEAR == 3) state[1] = ((uint)(lyraState->h4[2 * 3 + 2 * 4 * 1 + player]));
if (LOCAL_LINEAR == 3) state[2] = ((uint)(lyraState->h4[2 * 3 + 2 * 4 * 2 + player]));
if (LOCAL_LINEAR == 3) state[3] = ((uint)(lyraState->h4[2 * 3 + 2 * 4 * 3 + player]));

write_state(notepad, state, 0, 7);
round_lyra_4way_sw(state);
Expand Down Expand Up @@ -389,22 +388,22 @@ __kernel void search3(__global uchar* sharedDataBuf)

//-------------------------------------
// save lyra state
if (LOCAL_LINEAR == 0) lyraState->h4[2 * player + 2 * 4 * 0] = state[0];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * player + 2 * 4 * 1] = state[1];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * player + 2 * 4 * 2] = state[2];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * player + 2 * 4 * 3] = state[3];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * player + 2 * 4 * 0 + 1] = state[0];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * player + 2 * 4 * 1 + 1] = state[1];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * player + 2 * 4 * 2 + 1] = state[2];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * player + 2 * 4 * 3 + 1] = state[3];
if (LOCAL_LINEAR == 2) lyraState2->h4[2 * player + 2 * 4 * 0] = state[0];
if (LOCAL_LINEAR == 2) lyraState2->h4[2 * player + 2 * 4 * 1] = state[1];
if (LOCAL_LINEAR == 2) lyraState2->h4[2 * player + 2 * 4 * 2] = state[2];
if (LOCAL_LINEAR == 2) lyraState2->h4[2 * player + 2 * 4 * 3] = state[3];
if (LOCAL_LINEAR == 3) lyraState2->h4[2 * player + 2 * 4 * 0 + 1] = state[0];
if (LOCAL_LINEAR == 3) lyraState2->h4[2 * player + 2 * 4 * 1 + 1] = state[1];
if (LOCAL_LINEAR == 3) lyraState2->h4[2 * player + 2 * 4 * 2 + 1] = state[2];
if (LOCAL_LINEAR == 3) lyraState2->h4[2 * player + 2 * 4 * 3 + 1] = state[3];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * 0 + 2 * 4 * 0 + player] = state[0];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * 0 + 2 * 4 * 1 + player] = state[1];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * 0 + 2 * 4 * 2 + player] = state[2];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * 0 + 2 * 4 * 3 + player] = state[3];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * 1 + 2 * 4 * 0 + player] = state[0];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * 1 + 2 * 4 * 1 + player] = state[1];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * 1 + 2 * 4 * 2 + player] = state[2];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * 1 + 2 * 4 * 3 + player] = state[3];
if (LOCAL_LINEAR == 2) lyraState->h4[2 * 2 + 2 * 4 * 0 + player] = state[0];
if (LOCAL_LINEAR == 2) lyraState->h4[2 * 2 + 2 * 4 * 1 + player] = state[1];
if (LOCAL_LINEAR == 2) lyraState->h4[2 * 2 + 2 * 4 * 2 + player] = state[2];
if (LOCAL_LINEAR == 2) lyraState->h4[2 * 2 + 2 * 4 * 3 + player] = state[3];
if (LOCAL_LINEAR == 3) lyraState->h4[2 * 3 + 2 * 4 * 0 + player] = state[0];
if (LOCAL_LINEAR == 3) lyraState->h4[2 * 3 + 2 * 4 * 1 + player] = state[1];
if (LOCAL_LINEAR == 3) lyraState->h4[2 * 3 + 2 * 4 * 2 + player] = state[2];
if (LOCAL_LINEAR == 3) lyraState->h4[2 * 3 + 2 * 4 * 3 + player] = state[3];

barrier(CLK_GLOBAL_MEM_FENCE);
}
Expand Down Expand Up @@ -550,16 +549,15 @@ __kernel void search6(__global uint* hashes, __global uchar* sharedDataBuf)
__attribute__((amdgpu_waves_per_eu(1,1)))
__attribute__((amdgpu_num_vgpr(256)))
__attribute__((amdgpu_num_sgpr(200)))
__attribute__((reqd_work_group_size(4, 4, 16)))
__attribute__((reqd_work_group_size(4, 2, 16)))
__kernel void search7(__global uchar* sharedDataBuf)
{
uint gid = get_global_id(2);
__global lyraState_t *lyraState = (__global lyraState_t *)(sharedDataBuf + ((8 * 4 * 4 * 2) * (gid-get_global_offset(2))));
__global lyraState_t *lyraState2 = (__global lyraState_t *)(sharedDataBuf + ((8 * 4 * 4) + (8 * 4 * 4 * 2) * (gid-get_global_offset(2))));
__global lyraState_t *lyraState = (__global lyraState_t *)(sharedDataBuf + ((8 * 4 * 4) * (gid-get_global_offset(2))));

uint notepad[192];

const int player = get_local_id(1) % 4;
const int player = get_local_id(1) & 1;

uint state[4];
uint si[3];
Expand All @@ -573,26 +571,26 @@ __kernel void search7(__global uchar* sharedDataBuf)
uint ss3;
uint ss;
uint carry;
const uint mindex = (LOCAL_LINEAR & 1) == 0 ? 0 : 1;
const uint mindex = (player & 1) == 0 ? 0 : 1;

//-------------------------------------
// Load Lyra state
if (LOCAL_LINEAR == 0) state[0] = ((uint)(lyraState->h4[2 * player + 2 * 4 * 0]));
if (LOCAL_LINEAR == 0) state[1] = ((uint)(lyraState->h4[2 * player + 2 * 4 * 1]));
if (LOCAL_LINEAR == 0) state[2] = ((uint)(lyraState->h4[2 * player + 2 * 4 * 2]));
if (LOCAL_LINEAR == 0) state[3] = ((uint)(lyraState->h4[2 * player + 2 * 4 * 3]));
if (LOCAL_LINEAR == 1) state[0] = ((uint)(lyraState->h4[2 * player + 2 * 4 * 0 + 1]));
if (LOCAL_LINEAR == 1) state[1] = ((uint)(lyraState->h4[2 * player + 2 * 4 * 1 + 1]));
if (LOCAL_LINEAR == 1) state[2] = ((uint)(lyraState->h4[2 * player + 2 * 4 * 2 + 1]));
if (LOCAL_LINEAR == 1) state[3] = ((uint)(lyraState->h4[2 * player + 2 * 4 * 3 + 1]));
if (LOCAL_LINEAR == 2) state[0] = ((uint)(lyraState2->h4[2 * player + 2 * 4 * 0]));
if (LOCAL_LINEAR == 2) state[1] = ((uint)(lyraState2->h4[2 * player + 2 * 4 * 1]));
if (LOCAL_LINEAR == 2) state[2] = ((uint)(lyraState2->h4[2 * player + 2 * 4 * 2]));
if (LOCAL_LINEAR == 2) state[3] = ((uint)(lyraState2->h4[2 * player + 2 * 4 * 3]));
if (LOCAL_LINEAR == 3) state[0] = ((uint)(lyraState2->h4[2 * player + 2 * 4 * 0 + 1]));
if (LOCAL_LINEAR == 3) state[1] = ((uint)(lyraState2->h4[2 * player + 2 * 4 * 1 + 1]));
if (LOCAL_LINEAR == 3) state[2] = ((uint)(lyraState2->h4[2 * player + 2 * 4 * 2 + 1]));
if (LOCAL_LINEAR == 3) state[3] = ((uint)(lyraState2->h4[2 * player + 2 * 4 * 3 + 1]));
if (LOCAL_LINEAR == 0) state[0] = ((uint)(lyraState->h4[2 * 0 + 2 * 4 * 0 + player]));
if (LOCAL_LINEAR == 0) state[1] = ((uint)(lyraState->h4[2 * 0 + 2 * 4 * 1 + player]));
if (LOCAL_LINEAR == 0) state[2] = ((uint)(lyraState->h4[2 * 0 + 2 * 4 * 2 + player]));
if (LOCAL_LINEAR == 0) state[3] = ((uint)(lyraState->h4[2 * 0 + 2 * 4 * 3 + player]));
if (LOCAL_LINEAR == 1) state[0] = ((uint)(lyraState->h4[2 * 1 + 2 * 4 * 0 + player]));
if (LOCAL_LINEAR == 1) state[1] = ((uint)(lyraState->h4[2 * 1 + 2 * 4 * 1 + player]));
if (LOCAL_LINEAR == 1) state[2] = ((uint)(lyraState->h4[2 * 1 + 2 * 4 * 2 + player]));
if (LOCAL_LINEAR == 1) state[3] = ((uint)(lyraState->h4[2 * 1 + 2 * 4 * 3 + player]));
if (LOCAL_LINEAR == 2) state[0] = ((uint)(lyraState->h4[2 * 2 + 2 * 4 * 0 + player]));
if (LOCAL_LINEAR == 2) state[1] = ((uint)(lyraState->h4[2 * 2 + 2 * 4 * 1 + player]));
if (LOCAL_LINEAR == 2) state[2] = ((uint)(lyraState->h4[2 * 2 + 2 * 4 * 2 + player]));
if (LOCAL_LINEAR == 2) state[3] = ((uint)(lyraState->h4[2 * 2 + 2 * 4 * 3 + player]));
if (LOCAL_LINEAR == 3) state[0] = ((uint)(lyraState->h4[2 * 3 + 2 * 4 * 0 + player]));
if (LOCAL_LINEAR == 3) state[1] = ((uint)(lyraState->h4[2 * 3 + 2 * 4 * 1 + player]));
if (LOCAL_LINEAR == 3) state[2] = ((uint)(lyraState->h4[2 * 3 + 2 * 4 * 2 + player]));
if (LOCAL_LINEAR == 3) state[3] = ((uint)(lyraState->h4[2 * 3 + 2 * 4 * 3 + player]));

write_state(notepad, state, 0, 7);
round_lyra_4way_sw(state);
Expand Down Expand Up @@ -660,22 +658,22 @@ __kernel void search7(__global uchar* sharedDataBuf)

//-------------------------------------
// save lyra state
if (LOCAL_LINEAR == 0) lyraState->h4[2 * player + 2 * 4 * 0] = state[0];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * player + 2 * 4 * 1] = state[1];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * player + 2 * 4 * 2] = state[2];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * player + 2 * 4 * 3] = state[3];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * player + 2 * 4 * 0 + 1] = state[0];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * player + 2 * 4 * 1 + 1] = state[1];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * player + 2 * 4 * 2 + 1] = state[2];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * player + 2 * 4 * 3 + 1] = state[3];
if (LOCAL_LINEAR == 2) lyraState2->h4[2 * player + 2 * 4 * 0] = state[0];
if (LOCAL_LINEAR == 2) lyraState2->h4[2 * player + 2 * 4 * 1] = state[1];
if (LOCAL_LINEAR == 2) lyraState2->h4[2 * player + 2 * 4 * 2] = state[2];
if (LOCAL_LINEAR == 2) lyraState2->h4[2 * player + 2 * 4 * 3] = state[3];
if (LOCAL_LINEAR == 3) lyraState2->h4[2 * player + 2 * 4 * 0 + 1] = state[0];
if (LOCAL_LINEAR == 3) lyraState2->h4[2 * player + 2 * 4 * 1 + 1] = state[1];
if (LOCAL_LINEAR == 3) lyraState2->h4[2 * player + 2 * 4 * 2 + 1] = state[2];
if (LOCAL_LINEAR == 3) lyraState2->h4[2 * player + 2 * 4 * 3 + 1] = state[3];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * 0 + 2 * 4 * 0 + player] = state[0];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * 0 + 2 * 4 * 1 + player] = state[1];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * 0 + 2 * 4 * 2 + player] = state[2];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * 0 + 2 * 4 * 3 + player] = state[3];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * 1 + 2 * 4 * 0 + player] = state[0];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * 1 + 2 * 4 * 1 + player] = state[1];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * 1 + 2 * 4 * 2 + player] = state[2];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * 1 + 2 * 4 * 3 + player] = state[3];
if (LOCAL_LINEAR == 2) lyraState->h4[2 * 2 + 2 * 4 * 0 + player] = state[0];
if (LOCAL_LINEAR == 2) lyraState->h4[2 * 2 + 2 * 4 * 1 + player] = state[1];
if (LOCAL_LINEAR == 2) lyraState->h4[2 * 2 + 2 * 4 * 2 + player] = state[2];
if (LOCAL_LINEAR == 2) lyraState->h4[2 * 2 + 2 * 4 * 3 + player] = state[3];
if (LOCAL_LINEAR == 3) lyraState->h4[2 * 3 + 2 * 4 * 0 + player] = state[0];
if (LOCAL_LINEAR == 3) lyraState->h4[2 * 3 + 2 * 4 * 1 + player] = state[1];
if (LOCAL_LINEAR == 3) lyraState->h4[2 * 3 + 2 * 4 * 2 + player] = state[2];
if (LOCAL_LINEAR == 3) lyraState->h4[2 * 3 + 2 * 4 * 3 + player] = state[3];

barrier(CLK_GLOBAL_MEM_FENCE);
}
Expand Down
67 changes: 32 additions & 35 deletions kernel/lyra2mdzf2_navi.cl
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,12 @@
#define LYRA_ROUNDS 8
#define HYPERMATRIX_COUNT (LYRA_ROUNDS * STATE_BLOCK_COUNT)


#define ADD32_DPP(a, b) \
__asm ( \
"v_add_co_u32 %[daa], vcc_lo, %[bb], %[aa]\n" \
"s_lshl_b32 vcc_lo, vcc_lo, 1\n" \
"s_and_b32 vcc_lo, vcc_lo, 0xAAAAAAAA\n" \
"s_lshl_b32 vcc_lo, vcc_lo, 4\n" \
"s_and_b32 vcc_lo, vcc_lo, 0xF0F0F0F0\n" \
"v_add_co_ci_u32 %[daa], vcc_lo, 0, %[daa], vcc_lo\n" \
: [daa] "=v" (a) \
: [aa] "0" (a), \
Expand All @@ -111,7 +112,7 @@
{ \
__asm ( \
"s_nop 1\n" \
"v_mov_b32_dpp %[p], %[pp] quad_perm:[1,0,3,2]\n" \
"v_mov_b32_dpp %[p], %[pp] dpp8:[4,5,6,7,0,1,2,3]\n" \
"s_nop 1" \
: [p] "=v" (s) \
: [pp] "v" (ss)); \
Expand All @@ -122,7 +123,7 @@
{ \
__asm ( \
"s_nop 1\n" \
"v_mov_b32_dpp %[dpp], %[pp] quad_perm:[1,0,3,2]\n" \
"v_mov_b32_dpp %[dpp], %[pp] dpp8:[4,5,6,7,0,1,2,3]\n" \
"s_nop 1\n" \
"v_alignbyte_b32 %[dp], %[dpp], %[p], 3" \
: [dpp] "=v" (ss), \
Expand All @@ -136,7 +137,7 @@
{ \
__asm ( \
"s_nop 1\n" \
"v_mov_b32_dpp %[dpp], %[pp] quad_perm:[1,0,3,2]\n" \
"v_mov_b32_dpp %[dpp], %[pp] dpp8:[4,5,6,7,0,1,2,3]\n" \
"s_nop 1\n" \
"v_alignbyte_b32 %[dp], %[dpp], %[p], 2" \
: [dpp] "=v" (ss), \
Expand All @@ -150,7 +151,7 @@
{ \
__asm ( \
"s_nop 1\n" \
"v_mov_b32_dpp %[dpp], %[pp] quad_perm:[1,0,3,2]\n" \
"v_mov_b32_dpp %[dpp], %[pp] dpp8:[4,5,6,7,0,1,2,3]\n" \
"s_nop 1\n" \
"v_alignbit_b32 %[dp], %[p], %[dpp], 31" \
: [dpp] "=v" (ss), \
Expand All @@ -170,9 +171,9 @@
#define shflldpp(state) \
__asm ( \
"s_nop 1\n" \
"v_mov_b32_dpp %[dp10], %[p10] row_ror:12\n" \
"v_mov_b32_dpp %[dp20], %[p20] row_ror:8\n" \
"v_mov_b32_dpp %[dp30], %[p30] row_ror:4\n" \
"v_mov_b32_dpp %[dp10], %[p10] quad_perm:[1,2,3,0]\n" \
"v_mov_b32_dpp %[dp20], %[p20] quad_perm:[2,3,0,1]\n" \
"v_mov_b32_dpp %[dp30], %[p30] quad_perm:[3,0,1,2]\n" \
"s_nop 1" \
: [dp10] "=v" (state[1]), \
[dp20] "=v" (state[2]), \
Expand All @@ -184,9 +185,9 @@
#define shflrdpp(state) \
__asm ( \
"s_nop 1\n" \
"v_mov_b32_dpp %[dp10], %[p10] row_ror:4\n" \
"v_mov_b32_dpp %[dp20], %[p20] row_ror:8\n" \
"v_mov_b32_dpp %[dp30], %[p30] row_ror:12\n" \
"v_mov_b32_dpp %[dp10], %[p10] quad_perm:[3,0,1,2]\n" \
"v_mov_b32_dpp %[dp20], %[p20] quad_perm:[2,3,0,1]\n" \
"v_mov_b32_dpp %[dp30], %[p30] quad_perm:[1,2,3,0]\n" \
"s_nop 1" \
: [dp10] "=v" (state[1]), \
[dp20] "=v" (state[2]), \
Expand All @@ -208,28 +209,28 @@
s2 = state[2]; \
__asm ( \
"s_nop 1\n" \
"v_mov_b32_dpp %[dp10], %[p10] row_ror:4\n" \
"v_mov_b32_dpp %[dp20], %[p20] row_ror:4\n" \
"v_mov_b32_dpp %[dp30], %[p30] row_ror:4\n" \
"v_mov_b32_dpp %[dp10], %[p10] quad_perm:[3,0,1,2]\n" \
"v_mov_b32_dpp %[dp20], %[p20] quad_perm:[3,0,1,2]\n" \
"v_mov_b32_dpp %[dp30], %[p30] quad_perm:[3,0,1,2]\n" \
"s_nop 1" \
: [dp10] "=v" (s0), \
[dp20] "=v" (s1), \
[dp30] "=v" (s2) \
: [p10] "0" (s0), \
[p20] "1" (s1), \
[p30] "2" (s2)); \
if ((get_local_id(1) & 3) == 1) sII[0] ^= (s0); \
if ((get_local_id(1) & 3) == 1) sII[1] ^= (s1); \
if ((get_local_id(1) & 3) == 1) sII[2] ^= (s2); \
if ((get_local_id(1) & 3) == 2) sII[0] ^= (s0); \
if ((get_local_id(1) & 3) == 2) sII[1] ^= (s1); \
if ((get_local_id(1) & 3) == 2) sII[2] ^= (s2); \
if ((get_local_id(1) & 3) == 3) sII[0] ^= (s0); \
if ((get_local_id(1) & 3) == 3) sII[1] ^= (s1); \
if ((get_local_id(1) & 3) == 3) sII[2] ^= (s2); \
if ((get_local_id(1) & 3) == 0) sII[0] ^= (s2); \
if ((get_local_id(1) & 3) == 0) sII[1] ^= (s0); \
if ((get_local_id(1) & 3) == 0) sII[2] ^= (s1); \
if ((get_local_id(0) & 3) == 1) sII[0] ^= (s0); \
if ((get_local_id(0) & 3) == 1) sII[1] ^= (s1); \
if ((get_local_id(0) & 3) == 1) sII[2] ^= (s2); \
if ((get_local_id(0) & 3) == 2) sII[0] ^= (s0); \
if ((get_local_id(0) & 3) == 2) sII[1] ^= (s1); \
if ((get_local_id(0) & 3) == 2) sII[2] ^= (s2); \
if ((get_local_id(0) & 3) == 3) sII[0] ^= (s0); \
if ((get_local_id(0) & 3) == 3) sII[1] ^= (s1); \
if ((get_local_id(0) & 3) == 3) sII[2] ^= (s2); \
if ((get_local_id(0) & 3) == 0) sII[0] ^= (s2); \
if ((get_local_id(0) & 3) == 0) sII[1] ^= (s0); \
if ((get_local_id(0) & 3) == 0) sII[2] ^= (s1); \

#define write_state(notepad, state, row, col) \
notepad[24 * row + col * 3] = state[0]; \
Expand Down Expand Up @@ -332,18 +333,14 @@

#define broadcast_zero(s) \
p0 = (s[0] & 7); \
p1 = (s[0] & 7); \
__asm ( \
"s_nop 0\n" \
"v_mov_b32_dpp %[dp0], %[p0] dpp8:[0,0,2,2,0,0,2,2]\n" \
"s_nop 0\n" \
"v_mov_b32_dpp %[dp1], %[dp0] row_ror:8\n" \
"v_mov_b32_dpp %[dp0], %[p0] dpp8:[0,0,0,0,0,0,0,0]\n" \
"s_nop 0" \
: [dp0] "=v" (p0), \
[dp1] "=v" (p1) \
: [dp0] "=v" (p0) \
: [p0] "0" (p0)); \
if ((get_local_id(1) & 2) == 0) modify = p0; \
if ((get_local_id(1) & 2) == 2) modify = p1; \
if ((get_local_id(0) & 2) == 0) modify = p0; \
if ((get_local_id(0) & 2) == 2) modify = p0; \

#define real_matrw_read(sII, bigMat, matrw, off) \
if (matrw == 0) sII[0] = bigMat[24 * 0 + off * 3]; \
Expand Down

0 comments on commit bf0271e

Please sign in to comment.