@@ -97,15 +97,15 @@ static void __forceinline__ __device__ keccak_block(uint2 *s)
97
97
__global__
98
98
void heavyhash_gpu_hash (const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces)
99
99
{
100
- __shared__ ulong2 matrix[1024 ];
100
+ __shared__ uint64_t matrix[1024 * 2 ];
101
101
102
102
uint32_t thread = (blockDim .x * blockIdx .x + threadIdx .x );
103
103
uint32_t nonce = startNonce + thread;
104
104
if (thread < threads)
105
105
{
106
106
uint32_t tid = threadIdx .x ;
107
- ulong2 *cp = (ulong2 *)(& c_matrix[ 0 ][ 0 ] );
108
- for (int i = 0 ; i < 4 ; i++) {
107
+ uint64_t *cp = (uint64_t *)(c_matrix);
108
+ for (int i = 0 ; i < 8 ; i++) {
109
109
matrix[tid + i * 256 ] = cp[tid + i * 256 ];
110
110
}
111
111
@@ -140,26 +140,26 @@ void heavyhash_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint3
140
140
141
141
for (int i = 0 ; i < 64 ; ++i) {
142
142
uint32_t sum = 0 ;
143
- for (int k = 0 ; k < 4 ; k++) {
144
- ulong2 buf0 = matrix[i * 16 + k * 4 + 0 ];
145
- ulong2 buf1 = matrix[i * 16 + k * 4 + 1 ];
146
- ulong2 buf2 = matrix[i * 16 + k * 4 + 2 ];
147
- ulong2 buf3 = matrix[i * 16 + k * 4 + 3 ];
143
+ for (int k = 0 ; k < 8 ; k++) {
144
+ uint64_t buf0 = matrix[i * 32 + k * 4 + 0 ];
145
+ uint64_t buf1 = matrix[i * 32 + k * 4 + 1 ];
146
+ uint64_t buf2 = matrix[i * 32 + k * 4 + 2 ];
147
+ uint64_t buf3 = matrix[i * 32 + k * 4 + 3 ];
148
148
uint32_t *m0 = (uint32_t *)&buf0;
149
- for (int j = 0 ; j < 4 ; j++) {
150
- sum += m0[j] * vector[(k * 4 + 0 ) * 4 + j];
149
+ for (int j = 0 ; j < 2 ; j++) {
150
+ sum += m0[j] * vector[(k * 4 + 0 ) * 2 + j];
151
151
}
152
152
uint32_t *m1 = (uint32_t *)&buf1;
153
- for (int j = 0 ; j < 4 ; j++) {
154
- sum += m1[j] * vector[(k * 4 + 1 ) * 4 + j];
153
+ for (int j = 0 ; j < 2 ; j++) {
154
+ sum += m1[j] * vector[(k * 4 + 1 ) * 2 + j];
155
155
}
156
156
uint32_t *m2 = (uint32_t *)&buf2;
157
- for (int j = 0 ; j < 4 ; j++) {
158
- sum += m2[j] * vector[(k * 4 + 2 ) * 4 + j];
157
+ for (int j = 0 ; j < 2 ; j++) {
158
+ sum += m2[j] * vector[(k * 4 + 2 ) * 2 + j];
159
159
}
160
160
uint32_t *m3 = (uint32_t *)&buf3;
161
- for (int j = 0 ; j < 4 ; j++) {
162
- sum += m3[j] * vector[(k * 4 + 3 ) * 4 + j];
161
+ for (int j = 0 ; j < 2 ; j++) {
162
+ sum += m3[j] * vector[(k * 4 + 3 ) * 2 + j];
163
163
}
164
164
}
165
165
product[i] = (sum >> 10 );
0 commit comments