@@ -66,7 +66,6 @@ typedef int sph_s32;
66
66
#include "cubehash.cl"
67
67
#include "fugue.cl"
68
68
#include "gost-mod.cl"
69
- #define memshift 3
70
69
71
70
72
71
#define SWAP4 (x ) as_uint(as_uchar4(x).wzyx)
@@ -92,12 +91,21 @@ typedef union {
92
91
} hash_t ;
93
92
94
93
typedef union { // hash2_t: one 32-byte hash, overlaid with views of different element widths
95
- unsigned char h1 [32 ];
96
- unsigned short h2 [16 ];
97
- uint h4 [8 ];
98
- ulong h8 [4 ];
94
+ uint h4 [8 ]; // 8 x 32-bit words
95
+ ulong h8 [4 ]; // 4 x 64-bit words
96
+ uint4 h16 [2 ]; // 2 x 16-byte vectors of 32-bit lanes
97
+ ulong2 hl16 [2 ]; // 2 x 16-byte vectors of 64-bit lanes (the view search1/search3 use)
98
+ ulong4 h32 ; // the whole hash as a single 32-byte vector
99
99
} hash2_t ;
100
100
101
+ typedef union { // lyraState_t: 128-byte per-work-item Lyra2 state (16 x 64-bit words), overlaid views
102
+ uint h4 [32 ]; // 32 x 32-bit words
103
+ ulong h8 [16 ]; // 16 x 64-bit words (search2 reads/writes h8[player + 4*k])
104
+ uint4 h16 [8 ]; // 8 x 16-byte vectors of 32-bit lanes
105
+ ulong2 hl16 [8 ]; // 8 x 16-byte vectors of 64-bit lanes (search1/search3 use this view)
106
+ ulong4 h32 [4 ]; // 4 x 32-byte vectors
107
+ } lyraState_t ;
108
+
101
109
#define SWAP8_INPUT (x ) x
102
110
#define SWAP8_USELESS (x ) x
103
111
@@ -280,15 +288,62 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes, uin
280
288
}
281
289
282
290
291
+ /// lyra2 p1: builds a 16 x 64-bit state from each work-item's 32-byte hash, applies 24 roundLyra()
292
+ /// calls to it and stores the whole state in sharedDataBuf. roundLyra() is defined outside this view.
293
+ __attribute__((reqd_work_group_size (WORKSIZE , 1 , 1 )))
294
+ __kernel void search1 (__global uint * hashes , __global uchar * sharedDataBuf )
295
+ {
296
+ int gid = get_global_id (0 );
297
+ // Both buffers are indexed by (gid - global offset), i.e. a zero-based work-item index.
298
+ __global hash2_t * hash = (__global hash2_t * )(hashes + (8 * (gid - get_global_offset (0 )))); // stride: 8 uints = 32 bytes per work-item
299
+ __global lyraState_t * lyraState = (__global lyraState_t * )(sharedDataBuf + ((8 * 4 * 4 ) * (gid - get_global_offset (0 )))); // stride: 8*4*4 = 128 bytes per work-item
300
+
301
+ ulong ttr ; // NOTE(review): never referenced in this kernel as shown — looks removable
302
+
303
+ ulong2 state [8 ];
304
+ // state0: the 32-byte input hash
305
+ state [0 ] = hash -> hl16 [0 ];
306
+ state [1 ] = hash -> hl16 [1 ];
307
+ // state1: a second copy of the same 32 bytes
308
+ state [2 ] = state [0 ];
309
+ state [3 ] = state [1 ];
310
+ // state2: fixed constants (state2 + state3 together equal the eight BLAKE2b / SHA-512 IV words)
311
+ state [4 ] = (ulong2 )(0x6a09e667f3bcc908UL , 0xbb67ae8584caa73bUL );
312
+ state [5 ] = (ulong2 )(0x3c6ef372fe94f82bUL , 0xa54ff53a5f1d36f1UL );
313
+ // state3: remaining four constants; each ulong2 literal is written (low, high), i.e. (.x, .y)
314
+ state [6 ] = (ulong2 )(0x510e527fade682d1UL , 0x9b05688c2b3e6c1fUL );
315
+ state [7 ] = (ulong2 )(0x1f83d9abfb41bd6bUL , 0x5be0cd19137e2179UL );
316
+
317
+ // Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
318
+ for (int i = 0 ; i < 24 ; ++ i )
319
+ {
320
+ roundLyra (state );
321
+ }
322
+ // Write all 8 ulong2 (128 bytes) of the state to this work-item's slot in sharedDataBuf.
323
+ // state0
324
+ lyraState -> hl16 [0 ] = state [0 ];
325
+ lyraState -> hl16 [1 ] = state [1 ];
326
+ // state1
327
+ lyraState -> hl16 [2 ] = state [2 ];
328
+ lyraState -> hl16 [3 ] = state [3 ];
329
+ // state2
330
+ lyraState -> hl16 [4 ] = state [4 ];
331
+ lyraState -> hl16 [5 ] = state [5 ];
332
+ // state3
333
+ lyraState -> hl16 [6 ] = state [6 ];
334
+ lyraState -> hl16 [7 ] = state [7 ];
335
+ // NOTE(review): the barrier is the last statement; nothing after it in this kernel reads the stored data — confirm it is needed.
336
+ barrier (CLK_GLOBAL_MEM_FENCE );
337
+ }
283
338
284
- /// lyra2 algo
339
+ /// lyra2 algo p2
285
340
286
341
287
342
__attribute__((reqd_work_group_size (4 , 5 , 1 )))
288
- __kernel void search1 (__global uchar * hashes )
343
+ __kernel void search2 (__global uchar * sharedDataBuf )
289
344
{
290
345
uint gid = get_global_id (1 );
291
- __global hash_t * hash = (__global hash_t * )(hashes + (4 * sizeof ( ulong ) * (gid - get_global_offset (1 ))));
346
+ __global lyraState_t * lyraState = (__global lyraState_t * )(sharedDataBuf + (( 8 * 4 * 4 ) * (gid - get_global_offset (1 ))));
292
347
293
348
__local ulong roundPad [12 * 5 ];
294
349
__local ulong * xchange = roundPad + get_local_id (1 ) * 4 ;
@@ -299,12 +354,13 @@ __kernel void search1(__global uchar* hashes)
299
354
const int player = get_local_id (0 );
300
355
301
356
ulong state [4 ];
302
- state [0 ] = hash -> h8 [player ];
303
- state [1 ] = state [0 ];
304
- state [2 ] = initial_lyra2 [0 ][player ];
305
- state [3 ] = initial_lyra2 [1 ][player ];
306
357
307
- for (int loop = 0 ; loop < 24 ; loop ++ ) round_lyra_4way (state , xchange );
358
+ //-------------------------------------
359
+ // Load Lyra state
360
+ state [0 ] = (ulong )(lyraState -> h8 [player ]);
361
+ state [1 ] = (ulong )(lyraState -> h8 [player + 4 ]);
362
+ state [2 ] = (ulong )(lyraState -> h8 [player + 8 ]);
363
+ state [3 ] = (ulong )(lyraState -> h8 [player + 12 ]);
308
364
309
365
__local ulong * dst = notepad + HYPERMATRIX_COUNT ;
310
366
for (int loop = 0 ; loop < LYRA_ROUNDS ; loop ++ ) { // write columns and rows 'in order'
@@ -374,16 +430,56 @@ __kernel void search1(__global uchar* hashes)
374
430
375
431
notepad += HYPERMATRIX_COUNT * modify ;
376
432
for (int loop = 0 ; loop < 3 ; loop ++ ) state [loop ] ^= notepad [loop * REG_ROW_COUNT ];
377
- for (int loop = 0 ; loop < 12 ; loop ++ ) round_lyra_4way (state , xchange );
378
433
379
- hash -> h8 [player ] = state [0 ];
434
+ //-------------------------------------
435
+ // save lyra state
436
+ lyraState -> h8 [player ] = state [0 ];
437
+ lyraState -> h8 [player + 4 ] = state [1 ];
438
+ lyraState -> h8 [player + 8 ] = state [2 ];
439
+ lyraState -> h8 [player + 12 ] = state [3 ];
380
440
381
441
barrier (CLK_GLOBAL_MEM_FENCE );
382
442
}
383
443
444
+ // lyra2 p3: reloads each work-item's 128-byte state from sharedDataBuf, applies 12 roundLyra()
445
+ // calls and writes the first 32 bytes of the result to hashes. roundLyra() is defined outside this view.
446
+ __attribute__((reqd_work_group_size (WORKSIZE , 1 , 1 )))
447
+ __kernel void search3 (__global uint * hashes , __global uchar * sharedDataBuf )
448
+ {
449
+ int gid = get_global_id (0 );
450
+ // Both buffers are indexed by (gid - global offset), i.e. a zero-based work-item index.
451
+ __global hash2_t * hash = (__global hash2_t * )(hashes + (8 * (gid - get_global_offset (0 )))); // stride: 8 uints = 32 bytes per work-item
452
+ __global lyraState_t * lyraState = (__global lyraState_t * )(sharedDataBuf + ((8 * 4 * 4 ) * (gid - get_global_offset (0 )))); // stride: 8*4*4 = 128 bytes per work-item
453
+
454
+ ulong ttr ; // NOTE(review): never referenced in this kernel as shown — looks removable
455
+
456
+ ulong2 state [8 ];
457
+ // 1. load lyra State (all 8 ulong2 = 16 x 64-bit words)
458
+ state [0 ] = lyraState -> hl16 [0 ];
459
+ state [1 ] = lyraState -> hl16 [1 ];
460
+ state [2 ] = lyraState -> hl16 [2 ];
461
+ state [3 ] = lyraState -> hl16 [3 ];
462
+ state [4 ] = lyraState -> hl16 [4 ];
463
+ state [5 ] = lyraState -> hl16 [5 ];
464
+ state [6 ] = lyraState -> hl16 [6 ];
465
+ state [7 ] = lyraState -> hl16 [7 ];
466
+
467
+ // 2. rounds: 12 applications of roundLyra to the full state
468
+ for (int i = 0 ; i < 12 ; ++ i )
469
+ {
470
+ roundLyra (state );
471
+ }
472
+
473
+ // 3. store result: only state[0] and state[1] (32 bytes) are written back, over this work-item's hash
474
+ hash -> hl16 [0 ] = state [0 ];
475
+ hash -> hl16 [1 ] = state [1 ];
476
+ // NOTE(review): the barrier is the last statement; nothing after it in this kernel reads the stored data — confirm it is needed.
477
+ barrier (CLK_GLOBAL_MEM_FENCE );
478
+ }
479
+
384
480
// jh 64
385
481
__attribute__((reqd_work_group_size (WORKSIZE , 1 , 1 )))
386
- __kernel void search2 (__global hash_t * hashes )
482
+ __kernel void search4 (__global hash_t * hashes )
387
483
{
388
484
uint gid = get_global_id (0 );
389
485
__global hash_t * hash = & (hashes [gid - get_global_offset (0 )]);
@@ -555,7 +651,7 @@ __kernel void search2(__global hash_t* hashes)
555
651
556
652
557
653
__attribute__((reqd_work_group_size (WORKSIZE , 1 , 1 )))
558
- __kernel void search3 (__global hash_t * hashes , __global hash_t * branches , __global uchar * nonceBranches )
654
+ __kernel void search5 (__global hash_t * hashes , __global hash_t * branches , __global uchar * nonceBranches )
559
655
{
560
656
// phi_filter_cuda
561
657
@@ -579,7 +675,7 @@ __kernel void search3(__global hash_t* hashes, __global hash_t* branches, __glob
579
675
580
676
//gost streebog 64
581
677
__attribute__((reqd_work_group_size (WORKSIZE , 1 , 1 )))
582
- __kernel void search4 (__global hash_t * hashes )
678
+ __kernel void search6 (__global hash_t * hashes )
583
679
{
584
680
uint gid = get_global_id (0 );
585
681
__global hash_t * hash = & (hashes [gid - get_global_offset (0 )]);
@@ -622,7 +718,7 @@ __kernel void search4(__global hash_t* hashes)
622
718
623
719
// echo 64
624
720
__attribute__((reqd_work_group_size (WORKSIZE , 1 , 1 )))
625
- __kernel void search5 (__global hash_t * hashes )
721
+ __kernel void search7 (__global hash_t * hashes )
626
722
{
627
723
uint gid = get_global_id (0 );
628
724
__global hash_t * hash = & (hashes [gid - get_global_offset (0 )]);
@@ -677,7 +773,7 @@ __kernel void search5(__global hash_t* hashes)
677
773
678
774
// echo 64
679
775
__attribute__((reqd_work_group_size (WORKSIZE , 1 , 1 )))
680
- __kernel void search6 (__global hash_t * hashes )
776
+ __kernel void search8 (__global hash_t * hashes )
681
777
{
682
778
uint gid = get_global_id (0 );
683
779
__global hash_t * hash = & (hashes [gid - get_global_offset (0 )]);
@@ -731,7 +827,7 @@ __kernel void search6(__global hash_t* hashes)
731
827
}
732
828
733
829
__attribute__((reqd_work_group_size (WORKSIZE , 1 , 1 )))
734
- __kernel void search7 (__global hash_t * hashes , __global hash_t * branches , __global uchar * nonceBranches )
830
+ __kernel void search9 (__global hash_t * hashes , __global hash_t * branches , __global uchar * nonceBranches )
735
831
{
736
832
//phi_merge_cuda
737
833
uint gid = get_global_id (0 );
@@ -751,7 +847,7 @@ __kernel void search7(__global hash_t* hashes, __global hash_t* branches, __glob
751
847
752
848
// skein 64
753
849
__attribute__((reqd_work_group_size (WORKSIZE , 1 , 1 )))
754
- __kernel void search8 (__global hash_t * hashes )
850
+ __kernel void search10 (__global hash_t * hashes )
755
851
{
756
852
uint gid = get_global_id (0 );
757
853
__global hash_t * hash = & (hashes [gid - get_global_offset (0 )]);
@@ -778,7 +874,7 @@ __kernel void search8(__global hash_t* hashes)
778
874
}
779
875
780
876
__attribute__((reqd_work_group_size (WORKSIZE , 1 , 1 )))
781
- __kernel void search9 (__global hash_t * hashes , __global uint * output , const ulong target )
877
+ __kernel void search11 (__global hash_t * hashes , __global uint * output , const ulong target )
782
878
{
783
879
// phi_final_compress_cuda
784
880
uint gid = get_global_id (0 );
0 commit comments