Skip to content

Commit 8ddb671

Browse files
authored
Merge pull request #38 from fancyIX/feature/#37
Fix #37
2 parents 037686b + 87ea652 commit 8ddb671

File tree

6 files changed

+180
-32
lines changed

6 files changed

+180
-32
lines changed

algorithm.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -510,10 +510,16 @@ static cl_int queue_phi2_kernel(struct __clState *clState, struct _dev_blk_ctx *
510510
CL_SET_ARG(has_roots);
511511

512512
kernel = clState->extra_kernels;
513-
// lyra2_cuda_hash_64 - search1
513+
// lyra2_cuda_hash_64 - search1 2 3
514514
num = 0;
515515
CL_SET_ARG(clState->padbuffer8);
516-
//CL_SET_ARG(clState->buffer3);
516+
CL_SET_ARG(clState->buffer3);
517+
//CL_SET_ARG(clState->buffer2);
518+
num = 0;
519+
CL_NEXTKERNEL_SET_ARG(clState->buffer3);
520+
num = 0;
521+
CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
522+
CL_SET_ARG(clState->buffer3);
517523
// quark_jh512_cpu_hash_64 - search2
518524
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
519525
// phi_filter_cuda - search3
@@ -1303,7 +1309,7 @@ static algorithm_settings_t algos[] = {
13031309
{ "talkcoin-mod", ALGO_NIST, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, NULL, NULL, queue_talkcoin_mod_kernel, gen_hash, append_x11_compiler_options },
13041310

13051311
{ "phi", ALGO_PHI, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 5, 8 * 16 * 4194304, 0, phi_regenhash, NULL, NULL, queue_phi_kernel, gen_hash, append_x11_compiler_options },
1306-
{ "phi2", ALGO_PHI2, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 9, 8 * 16 * 4194304, 0, phi2_regenhash, NULL, NULL, queue_phi2_kernel, gen_hash, append_x11_compiler_options },
1312+
{ "phi2", ALGO_PHI2, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 11, 8 * 16 * 4194304, 0, phi2_regenhash, NULL, NULL, queue_phi2_kernel, gen_hash, append_x11_compiler_options },
13071313

13081314
{ "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, NULL, NULL, queue_fresh_kernel, gen_hash, NULL },
13091315

algorithm/lyra2rev2.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
#include "miner.h"
55
#define LYRA_SCRATCHBUF_SIZE (1536) // matrix size [12][4][4] uint64_t or equivalent
6-
#define LYRA2Z_SCRATCHBUF_SIZE (8 * 8)
6+
#define LYRA2Z_SCRATCHBUF_SIZE (8 * 8 + 8 * 4 * 8)
77
#define LYRA_SECBUF_SIZE (4) // (not used)
88
extern int lyra2rev2_test(unsigned char *pdata, const unsigned char *ptarget,
99
uint32_t nonce);

driver-opencl.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1433,14 +1433,21 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
14331433
}
14341434

14351435
for (i = 0; i < clState->n_extra_kernels; i++) {
1436-
if (gpu->algorithm.type == ALGO_PHI2 && i == 0) {
1436+
if (gpu->algorithm.type == ALGO_PHI2 && i == 1) {
14371437
const size_t off2[] = { 0, *p_global_work_offset };
14381438
const size_t gws[] = { 4, globalThreads[0] * 2 };
14391439
const size_t expand[] = { 4, 5 };
14401440
status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 2, off2, gws, expand, 0, NULL, NULL); // lyra 4w monolithic
1441+
} else if (gpu->algorithm.type == ALGO_PHI2 && (i == 0 || i == 2)) {
1442+
size_t globalThreads2[1];
1443+
size_t localThreads2[1];
1444+
globalThreads2[0] = globalThreads[0] * 2;
1445+
localThreads2[0] = localThreads[0];
1446+
status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset,
1447+
globalThreads2, localThreads2, 0, NULL, NULL);
14411448
} else
14421449
status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset,
1443-
globalThreads, localThreads, 0, NULL, NULL);
1450+
globalThreads, localThreads, 0, NULL, NULL);
14441451
if (unlikely(status != CL_SUCCESS)) {
14451452
applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status);
14461453
return -1;

kernel/lyra2mdz.cl

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,45 @@
4040
* head and tail, one coherent matrix expansion, one incoherent mess.
4141
*/
4242

43+
#define rotr64(x, n) ((n) < 32 ? (amd_bitalign((uint)((x) >> 32), (uint)(x), (uint)(n)) | ((ulong)amd_bitalign((uint)(x), (uint)((x) >> 32), (uint)(n)) << 32)) : (amd_bitalign((uint)(x), (uint)((x) >> 32), (uint)(n) - 32) | ((ulong)amd_bitalign((uint)((x) >> 32), (uint)(x), (uint)(n) - 32) << 32)))
44+
45+
#define Gfunc(a,b,c,d) \
46+
{ \
47+
a += b; \
48+
d ^= a; \
49+
ttr = rotr64(d, 32); \
50+
d = ttr; \
51+
\
52+
c += d; \
53+
b ^= c; \
54+
ttr = rotr64(b, 24); \
55+
b = ttr; \
56+
\
57+
a += b; \
58+
d ^= a; \
59+
ttr = rotr64(d, 16); \
60+
d = ttr; \
61+
\
62+
c += d; \
63+
b ^= c; \
64+
ttr = rotr64(b, 63); \
65+
b = ttr; \
66+
}
67+
68+
#define roundLyra(state) \
69+
{ \
70+
Gfunc(state[0].x, state[2].x, state[4].x, state[6].x); \
71+
Gfunc(state[0].y, state[2].y, state[4].y, state[6].y); \
72+
Gfunc(state[1].x, state[3].x, state[5].x, state[7].x); \
73+
Gfunc(state[1].y, state[3].y, state[5].y, state[7].y); \
74+
\
75+
Gfunc(state[0].x, state[2].y, state[5].x, state[7].y); \
76+
Gfunc(state[0].y, state[3].x, state[5].y, state[6].x); \
77+
Gfunc(state[1].x, state[3].y, state[4].x, state[6].y); \
78+
Gfunc(state[1].y, state[2].x, state[4].y, state[7].x); \
79+
}
80+
81+
4382
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
4483
#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
4584

kernel/phi2.cl

Lines changed: 119 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@ typedef int sph_s32;
6666
#include "cubehash.cl"
6767
#include "fugue.cl"
6868
#include "gost-mod.cl"
69-
#define memshift 3
7069

7170

7271
#define SWAP4(x) as_uint(as_uchar4(x).wzyx)
@@ -92,12 +91,21 @@ typedef union {
9291
} hash_t;
9392

9493
typedef union {
95-
unsigned char h1[32];
96-
unsigned short h2[16];
97-
uint h4[8];
98-
ulong h8[4];
94+
uint h4[8];
95+
ulong h8[4];
96+
uint4 h16[2];
97+
ulong2 hl16[2];
98+
ulong4 h32;
9999
} hash2_t;
100100

101+
typedef union {
102+
uint h4[32];
103+
ulong h8[16];
104+
uint4 h16[8];
105+
ulong2 hl16[8];
106+
ulong4 h32[4];
107+
} lyraState_t;
108+
101109
#define SWAP8_INPUT(x) x
102110
#define SWAP8_USELESS(x) x
103111

@@ -280,15 +288,62 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes, uin
280288
}
281289

282290

291+
/// lyra2 p1
292+
293+
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
294+
__kernel void search1(__global uint* hashes, __global uchar* sharedDataBuf)
295+
{
296+
int gid = get_global_id(0);
297+
298+
__global hash2_t *hash = (__global hash2_t *)(hashes + (8* (gid-get_global_offset(0))));
299+
__global lyraState_t *lyraState = (__global lyraState_t *)(sharedDataBuf + ((8 * 4 * 4) * (gid-get_global_offset(0))));
300+
301+
ulong ttr;
302+
303+
ulong2 state[8];
304+
// state0
305+
state[0] = hash->hl16[0];
306+
state[1] = hash->hl16[1];
307+
// state1
308+
state[2] = state[0];
309+
state[3] = state[1];
310+
// state2
311+
state[4] = (ulong2)(0x6a09e667f3bcc908UL, 0xbb67ae8584caa73bUL);
312+
state[5] = (ulong2)(0x3c6ef372fe94f82bUL, 0xa54ff53a5f1d36f1UL);
313+
// state3 (low,high,..
314+
state[6] = (ulong2)(0x510e527fade682d1UL, 0x9b05688c2b3e6c1fUL);
315+
state[7] = (ulong2)(0x1f83d9abfb41bd6bUL, 0x5be0cd19137e2179UL);
316+
317+
// Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
318+
for (int i = 0; i < 24; ++i)
319+
{
320+
roundLyra(state);
321+
}
322+
323+
// state0
324+
lyraState->hl16[0] = state[0];
325+
lyraState->hl16[1] = state[1];
326+
// state1
327+
lyraState->hl16[2] = state[2];
328+
lyraState->hl16[3] = state[3];
329+
// state2
330+
lyraState->hl16[4] = state[4];
331+
lyraState->hl16[5] = state[5];
332+
// state3
333+
lyraState->hl16[6] = state[6];
334+
lyraState->hl16[7] = state[7];
335+
336+
barrier(CLK_GLOBAL_MEM_FENCE);
337+
}
283338

284-
/// lyra2 algo
339+
/// lyra2 algo p2
285340

286341

287342
__attribute__((reqd_work_group_size(4, 5, 1)))
288-
__kernel void search1(__global uchar* hashes)
343+
__kernel void search2(__global uchar* sharedDataBuf)
289344
{
290345
uint gid = get_global_id(1);
291-
__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (gid - get_global_offset(1))));
346+
__global lyraState_t *lyraState = (__global lyraState_t *)(sharedDataBuf + ((8 * 4 * 4) * (gid-get_global_offset(1))));
292347

293348
__local ulong roundPad[12 * 5];
294349
__local ulong *xchange = roundPad + get_local_id(1) * 4;
@@ -299,12 +354,13 @@ __kernel void search1(__global uchar* hashes)
299354
const int player = get_local_id(0);
300355

301356
ulong state[4];
302-
state[0] = hash->h8[player];
303-
state[1] = state[0];
304-
state[2] = initial_lyra2[0][player];
305-
state[3] = initial_lyra2[1][player];
306357

307-
for (int loop = 0; loop < 24; loop++) round_lyra_4way(state, xchange);
358+
//-------------------------------------
359+
// Load Lyra state
360+
state[0] = (ulong)(lyraState->h8[player]);
361+
state[1] = (ulong)(lyraState->h8[player+4]);
362+
state[2] = (ulong)(lyraState->h8[player+8]);
363+
state[3] = (ulong)(lyraState->h8[player+12]);
308364

309365
__local ulong *dst = notepad + HYPERMATRIX_COUNT;
310366
for (int loop = 0; loop < LYRA_ROUNDS; loop++) { // write columns and rows 'in order'
@@ -374,16 +430,56 @@ __kernel void search1(__global uchar* hashes)
374430

375431
notepad += HYPERMATRIX_COUNT * modify;
376432
for(int loop = 0; loop < 3; loop++) state[loop] ^= notepad[loop * REG_ROW_COUNT];
377-
for(int loop = 0; loop < 12; loop++) round_lyra_4way(state, xchange);
378433

379-
hash->h8[player] = state[0];
434+
//-------------------------------------
435+
// save lyra state
436+
lyraState->h8[player] = state[0];
437+
lyraState->h8[player+4] = state[1];
438+
lyraState->h8[player+8] = state[2];
439+
lyraState->h8[player+12] = state[3];
380440

381441
barrier(CLK_GLOBAL_MEM_FENCE);
382442
}
383443

444+
// lyra2 p3
445+
446+
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
447+
__kernel void search3(__global uint* hashes, __global uchar* sharedDataBuf)
448+
{
449+
int gid = get_global_id(0);
450+
451+
__global hash2_t *hash = (__global hash2_t *)(hashes + (8* (gid-get_global_offset(0))));
452+
__global lyraState_t *lyraState = (__global lyraState_t *)(sharedDataBuf + ((8 * 4 * 4) * (gid-get_global_offset(0))));
453+
454+
ulong ttr;
455+
456+
ulong2 state[8];
457+
// 1. load lyra State
458+
state[0] = lyraState->hl16[0];
459+
state[1] = lyraState->hl16[1];
460+
state[2] = lyraState->hl16[2];
461+
state[3] = lyraState->hl16[3];
462+
state[4] = lyraState->hl16[4];
463+
state[5] = lyraState->hl16[5];
464+
state[6] = lyraState->hl16[6];
465+
state[7] = lyraState->hl16[7];
466+
467+
// 2. rounds
468+
for (int i = 0; i < 12; ++i)
469+
{
470+
roundLyra(state);
471+
}
472+
473+
// 3. store result
474+
hash->hl16[0] = state[0];
475+
hash->hl16[1] = state[1];
476+
477+
barrier(CLK_GLOBAL_MEM_FENCE);
478+
}
479+
384480
// jh 64
385481
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
386-
__kernel void search2(__global hash_t* hashes)
482+
__kernel void search4(__global hash_t* hashes)
387483
{
388484
uint gid = get_global_id(0);
389485
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
@@ -555,7 +651,7 @@ __kernel void search2(__global hash_t* hashes)
555651

556652

557653
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
558-
__kernel void search3(__global hash_t* hashes, __global hash_t* branches, __global uchar* nonceBranches)
654+
__kernel void search5(__global hash_t* hashes, __global hash_t* branches, __global uchar* nonceBranches)
559655
{
560656
// phi_filter_cuda
561657

@@ -579,7 +675,7 @@ __kernel void search3(__global hash_t* hashes, __global hash_t* branches, __glob
579675

580676
//gost streebog 64
581677
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
582-
__kernel void search4(__global hash_t* hashes)
678+
__kernel void search6(__global hash_t* hashes)
583679
{
584680
uint gid = get_global_id(0);
585681
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
@@ -622,7 +718,7 @@ __kernel void search4(__global hash_t* hashes)
622718

623719
// echo 64
624720
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
625-
__kernel void search5(__global hash_t* hashes)
721+
__kernel void search7(__global hash_t* hashes)
626722
{
627723
uint gid = get_global_id(0);
628724
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
@@ -677,7 +773,7 @@ __kernel void search5(__global hash_t* hashes)
677773

678774
// echo 64
679775
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
680-
__kernel void search6(__global hash_t* hashes)
776+
__kernel void search8(__global hash_t* hashes)
681777
{
682778
uint gid = get_global_id(0);
683779
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
@@ -731,7 +827,7 @@ __kernel void search6(__global hash_t* hashes)
731827
}
732828

733829
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
734-
__kernel void search7(__global hash_t* hashes, __global hash_t* branches, __global uchar* nonceBranches)
830+
__kernel void search9(__global hash_t* hashes, __global hash_t* branches, __global uchar* nonceBranches)
735831
{
736832
//phi_merge_cuda
737833
uint gid = get_global_id(0);
@@ -751,7 +847,7 @@ __kernel void search7(__global hash_t* hashes, __global hash_t* branches, __glob
751847

752848
// skein 64
753849
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
754-
__kernel void search8(__global hash_t* hashes)
850+
__kernel void search10(__global hash_t* hashes)
755851
{
756852
uint gid = get_global_id(0);
757853
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
@@ -778,7 +874,7 @@ __kernel void search8(__global hash_t* hashes)
778874
}
779875

780876
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
781-
__kernel void search9(__global hash_t* hashes, __global uint* output, const ulong target)
877+
__kernel void search11(__global hash_t* hashes, __global uint* output, const ulong target)
782878
{
783879
// phi_final_compress_cuda
784880
uint gid = get_global_id(0);

ocl.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -827,7 +827,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
827827
}
828828
}
829829
else if (algorithm->type == ALGO_PHI2) {
830-
buf3size = LYRA2Z_SCRATCHBUF_SIZE * cgpu->thread_concurrency; //matrix
830+
buf3size = 8 * 4 * 8 * cgpu->thread_concurrency; //lyra2 states
831831
buf2size = 1 * cgpu->thread_concurrency;
832832
bufsize = 8 * 8 * cgpu->thread_concurrency;
833833

@@ -894,11 +894,11 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
894894
applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer2), decrease TC or increase LG", status);
895895
return NULL;
896896
}
897-
/*clState->buffer3 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf3size, NULL, &status);
897+
clState->buffer3 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf3size, NULL, &status);
898898
if (status != CL_SUCCESS && !clState->buffer3) {
899899
applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer3), decrease TC or increase LG", status);
900900
return NULL;
901-
}*/
901+
}
902902
}
903903
else {
904904
clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status);

0 commit comments

Comments
 (0)