Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions Constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,23 @@
// Release number
#define RELEASE "2.2"

// Use symmetry
//#define USE_SYMMETRY
// Use symmetry - provides sqrt(2) speedup (~41% faster)
// Enabled for 150-bit support
#define USE_SYMMETRY

// Number of random jumps
// Max 512 for the GPU
#define NB_JUMP 32
// Max 512 for the GPU - increased to 64 for better distribution
// More jumps = more uniform random walk = closer to theoretical bounds
#define NB_JUMP 64

// GPU group size
// 128 is optimal balance between throughput and register pressure
// Higher values may cause register spills on older GPUs
#define GPU_GRP_SIZE 128

// GPU number of run per kernel call
#define NB_RUN 64
// Increased for better GPU utilization and reduced kernel launch overhead
#define NB_RUN 128

// Kangaroo type
#define TAME 0 // Tame kangaroo
Expand Down
5 changes: 5 additions & 0 deletions GPU/GPUEngine.cu
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,11 @@ int _ConvertSMVer2Cores(int major,int minor) {
{ 0x70, 64 },
{ 0x72, 64 },
{ 0x75, 64 },
{ 0x80, 64 }, // Ampere (SM 8.0) - GA100
{ 0x86, 128 }, // Ampere (SM 8.6) - GA102, GA104, GA106, GA107
{ 0x87, 128 }, // Ampere (SM 8.7) - Jetson Orin
{ 0x89, 128 }, // Ada Lovelace (SM 8.9) - RTX 40xx
{ 0x90, 128 }, // Hopper (SM 9.0) - H100
{ -1, -1 } };

int index = 0;
Expand Down
7 changes: 6 additions & 1 deletion HashTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,12 @@
#include <Windows.h>
#endif

#define HASH_SIZE_BIT 18
// Hash table size - dramatically increased for 135-150 bit range support
// For 135-bit range: ~2^67.5 ops, with DP=32 -> ~2^35.5 DPs needed
// For 150-bit range: ~2^75 ops, with DP=40 -> ~2^35 DPs needed
// 2^26 = 64M entries allows for larger ranges with reasonable RAM (~2-4 GB)
// For extreme ranges (135+ bit), use server mode or increase this value
#define HASH_SIZE_BIT 26
#define HASH_SIZE (1<<HASH_SIZE_BIT)
#define HASH_MASK (HASH_SIZE-1)

Expand Down
190 changes: 133 additions & 57 deletions Kangaroo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -748,23 +748,27 @@ void Kangaroo::CreateJumpTable() {
#endif

if(jumpBit > 128) jumpBit = 128;
int maxRetry = 100;
bool ok = false;
double distAvg;
double maxAvg = pow(2.0,(double)jumpBit - 0.95);
double minAvg = pow(2.0,(double)jumpBit - 1.05);
//::printf("Jump Avg distance min: 2^%.2f\n",log2(minAvg));
//::printf("Jump Avg distance max: 2^%.2f\n",log2(maxAvg));

// Kangaroo jumps
// Constant seed for compatibilty of workfiles
rseed(0x600DCAFE);

::printf("Creating optimized power-of-2 jump table for %d-bit range...\n", rangePower);

// Kangaroo jumps - Use power-of-2 based jumps for optimal distribution
// This approach provides better coverage and is closer to theoretical bounds
// See: Pollard's Lambda method optimization papers

Int totalDist;
totalDist.SetInt32(0);

// For USE_SYMMETRY mode, we need to ensure even-length jumps
// to maintain parity consistency across the symmetric search

#ifdef USE_SYMMETRY
// Symmetry mode: Use carefully chosen coprime multipliers for the two halves
// This prevents intra-herd collisions as per van Oorschot-Wiener optimization
Int old;
old.Set(Int::GetFieldCharacteristic());
Int u;
Int v;

// Find coprime odd multipliers for better distribution
Int u, v;
u.SetInt32(1);
u.ShiftL(jumpBit/2);
u.AddOne();
Expand All @@ -781,50 +785,76 @@ void Kangaroo::CreateJumpTable() {
}
Int::SetupField(&old);

::printf("U= %s\n",u.GetBase16().c_str());
::printf("V= %s\n",v.GetBase16().c_str());
#endif
// Power-of-2 based jumps with coprime multipliers
// First half: powers of 2 multiplied by u
for(int i = 0; i < NB_JUMP/2; ++i) {
// Use power-of-2 based distribution
int pow2 = (i * jumpBit) / (NB_JUMP/2);
if(pow2 > jumpBit - 1) pow2 = jumpBit - 1;
jumpDistance[i].SetInt32(1);
jumpDistance[i].ShiftL(pow2);
// Apply small random variation (within 2^3 range) for better mixing
Int variation;
variation.Rand(3);
variation.AddOne();
jumpDistance[i].Mult(&variation);
jumpDistance[i].Mult(&u);
if(jumpDistance[i].IsZero())
jumpDistance[i].SetInt32(1);
totalDist.Add(&jumpDistance[i]);
}

// Second half: powers of 2 multiplied by v
for(int i = NB_JUMP/2; i < NB_JUMP; ++i) {
int pow2 = ((i - NB_JUMP/2) * jumpBit) / (NB_JUMP/2);
if(pow2 > jumpBit - 1) pow2 = jumpBit - 1;
jumpDistance[i].SetInt32(1);
jumpDistance[i].ShiftL(pow2);
// Apply small random variation
Int variation;
variation.Rand(3);
variation.AddOne();
jumpDistance[i].Mult(&variation);
jumpDistance[i].Mult(&v);
if(jumpDistance[i].IsZero())
jumpDistance[i].SetInt32(1);
totalDist.Add(&jumpDistance[i]);
}

// Positive only
// When using symmetry, the sign is switched by the symmetry class switch
while(!ok && maxRetry>0 ) {
Int totalDist;
totalDist.SetInt32(0);
#ifdef USE_SYMMETRY
for(int i = 0; i < NB_JUMP/2; ++i) {
jumpDistance[i].Rand(jumpBit/2);
jumpDistance[i].Mult(&u);
if(jumpDistance[i].IsZero())
jumpDistance[i].SetInt32(1);
totalDist.Add(&jumpDistance[i]);
}
for(int i = NB_JUMP / 2; i < NB_JUMP; ++i) {
jumpDistance[i].Rand(jumpBit/2);
jumpDistance[i].Mult(&v);
if(jumpDistance[i].IsZero())
jumpDistance[i].SetInt32(1);
totalDist.Add(&jumpDistance[i]);
}
#else
for(int i = 0; i < NB_JUMP; ++i) {
jumpDistance[i].Rand(jumpBit);
if(jumpDistance[i].IsZero())
jumpDistance[i].SetInt32(1);
totalDist.Add(&jumpDistance[i]);
// Non-symmetry mode: Pure power-of-2 jumps with slight variation
// Optimal mean jump size is sqrt(N)/2, largest jump ~2*mean
for(int i = 0; i < NB_JUMP; ++i) {
// Distribute powers of 2 across the jump range
// Use formula: 2^(i * jumpBit / NB_JUMP) with small random variation
int pow2 = (i * jumpBit) / NB_JUMP;
if(pow2 > jumpBit) pow2 = jumpBit;

jumpDistance[i].SetInt32(1);
jumpDistance[i].ShiftL(pow2);

// Add small random variation (1-8x multiplier) for better mixing
// while keeping the power-of-2 structure
Int multiplier;
multiplier.Rand(3); // 0-7
multiplier.AddOne(); // 1-8
jumpDistance[i].Mult(&multiplier);

if(jumpDistance[i].IsZero())
jumpDistance[i].SetInt32(1);
totalDist.Add(&jumpDistance[i]);
}
#endif
distAvg = totalDist.ToDouble() / (double)(NB_JUMP);
ok = distAvg>minAvg && distAvg<maxAvg;
maxRetry--;
}

// Compute jump points
for(int i = 0; i < NB_JUMP; ++i) {
Point J = secp->ComputePublicKey(&jumpDistance[i]);
jumpPointx[i].Set(&J.x);
jumpPointy[i].Set(&J.y);
}

::printf("Jump Avg distance: 2^%.2f\n",log2(distAvg));
double distAvg = totalDist.ToDouble() / (double)(NB_JUMP);
::printf("Jump table: %d entries, Avg distance: 2^%.2f\n", NB_JUMP, log2(distAvg));

unsigned long seed = Timer::getSeed32();
rseed(seed);
Expand All @@ -836,6 +866,12 @@ void Kangaroo::CreateJumpTable() {
void Kangaroo::ComputeExpected(double dp,double *op,double *ram,double *overHead) {

// Compute expected number of operation and memory
//
// Using Gaudry-Schost improved formula for interval DLP:
// - Standard Kangaroo: 2.08√N operations
// - Gaudry-Schost (interval): 1.686√N operations (~19% improvement)
//
// Reference: "Computing Discrete Logarithms in an Interval" (ePrint 2010/617)

#ifdef USE_SYMMETRY
double gainS = 1.0 / sqrt(2.0);
Expand All @@ -849,21 +885,32 @@ void Kangaroo::ComputeExpected(double dp,double *op,double *ram,double *overHead
// Range size
double N = pow(2.0,(double)rangePower);

// theta
// theta (DP density = 1/2^dp)
double theta = pow(2.0,dp);

// Z0
double Z0 = (2.0 * (2.0 - sqrt(2.0)) * gainS) * sqrt(M_PI);
// Gaudry-Schost constant for interval DLP
// 1.686 vs the standard 2.08 (van Oorschot-Wiener)
// This constant assumes optimal tame/wild set construction
double GS_CONSTANT = 1.686;

// Average for DP = 0
// Apply symmetry gain
double Z0 = GS_CONSTANT * gainS * sqrt(M_PI);

// Average for DP = 0 (no DP overhead)
double avgDP0 = Z0 * sqrt(N);

// DP Overhead
// DP Overhead formula from van Oorschot-Wiener:
// Expected ops = Z0 * ∛(N * (k*θ + √N))
// This accounts for:
// - k = number of kangaroos running in parallel
// - θ = 2^dp = expected jumps between DPs
// - √N = optimal number of DPs to collect
*op = Z0 * pow(N * (k * theta + sqrt(N)),1.0 / 3.0);

*ram = (double)sizeof(HASH_ENTRY) * (double)HASH_SIZE + // Table
// Memory estimate
*ram = (double)sizeof(HASH_ENTRY) * (double)HASH_SIZE + // Hash table
(double)sizeof(ENTRY *) * (double)(HASH_SIZE * 4) + // Allocation overhead
(double)(sizeof(ENTRY) + sizeof(ENTRY *)) * (*op / theta); // Entries
(double)(sizeof(ENTRY) + sizeof(ENTRY *)) * (*op / theta); // DP entries

*ram /= (1024.0*1024.0);

Expand Down Expand Up @@ -977,23 +1024,52 @@ void Kangaroo::Run(int nbThread,std::vector<int> gpuId,std::vector<int> gridSize

if( !clientMode ) {

// Compute suggested distinguished bits number for less than 5% overhead (see README)
// Compute suggested distinguished bits number
// For large ranges (100+ bits), we need higher DP to avoid hash table overflow
// Balance: lower DP = more storage, higher DP = more overhead after collision
double dpOverHead;
int suggestedDP = (int)((double)rangePower / 2.0 - log2((double)totalRW));
if(suggestedDP<0) suggestedDP=0;

// For 135+ bit ranges, ensure minimum DP to prevent hash table overflow
// With HASH_SIZE_BIT=26 (64M entries), we need DP such that
// expected_DPs = 2^(rangePower/2 - DP) < 2^26
// So DP > rangePower/2 - 26
int minDPForHashSize = (rangePower / 2) - HASH_SIZE_BIT + 2; // +2 for safety margin
if(minDPForHashSize > suggestedDP) {
::printf("Warning: Range is very large (%d-bit). Adjusting DP for hash table capacity.\n", rangePower);
::printf(" Minimum DP for current hash table: %d\n", minDPForHashSize);
suggestedDP = minDPForHashSize;
}

ComputeExpected((double)suggestedDP,&expectedNbOp,&expectedMem,&dpOverHead);
while(dpOverHead>1.05 && suggestedDP>0) {

// For ranges over 120 bits, allow higher overhead (up to 15%) to reduce memory
double maxOverhead = (rangePower > 120) ? 1.15 : 1.05;

while(dpOverHead > maxOverhead && suggestedDP > 0) {
suggestedDP--;
ComputeExpected((double)suggestedDP,&expectedNbOp,&expectedMem,&dpOverHead);
}

if(initDPSize < 0)
initDPSize = suggestedDP;

ComputeExpected((double)initDPSize,&expectedNbOp,&expectedMem);
if(nbLoadedWalk == 0) ::printf("Suggested DP: %d\n",suggestedDP);
ComputeExpected((double)initDPSize,&expectedNbOp,&expectedMem,&dpOverHead);
if(nbLoadedWalk == 0) {
::printf("Suggested DP: %d\n",suggestedDP);
if(rangePower >= 130) {
::printf("\n=== LARGE RANGE NOTICE (%d-bit) ===\n", rangePower);
::printf("For ranges over 130 bits, solving requires massive compute resources.\n");
::printf("Expected operations: 2^%.2f (~10^%.1f)\n", log2(expectedNbOp), log2(expectedNbOp)*0.301);
::printf("With symmetry enabled: ~%.1fx faster than without\n", sqrt(2.0));
::printf("Consider using distributed computing (server mode) for faster results.\n");
::printf("===================================\n\n");
}
}
::printf("Expected operations: 2^%.2f\n",log2(expectedNbOp));
::printf("Expected RAM: %.1fMB\n",expectedMem);
::printf("DP overhead factor: %.2fx\n", dpOverHead);

} else {

Expand Down
18 changes: 11 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -40,25 +40,28 @@ OBJET = $(addprefix $(OBJDIR)/, \
endif

CXX = g++
CUDA = /usr/local/cuda-8.0
CXXCUDA = /usr/bin/g++-4.8
# CUDA path - adjust for your system (common paths: /usr/local/cuda, /usr/local/cuda-11.0, /usr/local/cuda-12.0)
CUDA ?= /usr/local/cuda
CXXCUDA ?= $(CXX)
NVCC = $(CUDA)/bin/nvcc

ifdef gpu

ifdef debug
CXXFLAGS = -DWITHGPU -m64 -mssse3 -Wno-unused-result -Wno-write-strings -g -I. -I$(CUDA)/include
CXXFLAGS = -DWITHGPU -m64 -march=native -mssse3 -Wno-unused-result -Wno-write-strings -g -I. -I$(CUDA)/include
else
CXXFLAGS = -DWITHGPU -m64 -mssse3 -Wno-unused-result -Wno-write-strings -O2 -I. -I$(CUDA)/include
# Added -O3 and -march=native for better CPU performance
CXXFLAGS = -DWITHGPU -m64 -march=native -mssse3 -Wno-unused-result -Wno-write-strings -O3 -I. -I$(CUDA)/include
endif
LFLAGS = -lpthread -L$(CUDA)/lib64 -lcudart

else

ifdef debug
CXXFLAGS = -m64 -mssse3 -Wno-unused-result -Wno-write-strings -g -I. -I$(CUDA)/include
CXXFLAGS = -m64 -march=native -mssse3 -Wno-unused-result -Wno-write-strings -g -I. -I$(CUDA)/include
else
CXXFLAGS = -m64 -mssse3 -Wno-unused-result -Wno-write-strings -O2 -I. -I$(CUDA)/include
# Added -O3 and -march=native for better CPU performance
CXXFLAGS = -m64 -march=native -mssse3 -Wno-unused-result -Wno-write-strings -O3 -I. -I$(CUDA)/include
endif
LFLAGS = -lpthread

Expand All @@ -72,7 +75,8 @@ $(OBJDIR)/GPU/GPUEngine.o: GPU/GPUEngine.cu
$(NVCC) -G -maxrregcount=0 --ptxas-options=-v --compile --compiler-options -fPIC -ccbin $(CXXCUDA) -m64 -g -I$(CUDA)/include -gencode=arch=compute_$(ccap),code=sm_$(ccap) -o $(OBJDIR)/GPU/GPUEngine.o -c GPU/GPUEngine.cu
else
$(OBJDIR)/GPU/GPUEngine.o: GPU/GPUEngine.cu
$(NVCC) -maxrregcount=0 --ptxas-options=-v --compile --compiler-options -fPIC -ccbin $(CXXCUDA) -m64 -O2 -I$(CUDA)/include -gencode=arch=compute_$(ccap),code=sm_$(ccap) -o $(OBJDIR)/GPU/GPUEngine.o -c GPU/GPUEngine.cu
@echo "Compiling GPU kernel for compute capability $(ccap)..."
$(NVCC) -maxrregcount=48 --ptxas-options=-v --compile --compiler-options "-fPIC -O3" -ccbin $(CXXCUDA) -m64 -O3 -I$(CUDA)/include -gencode=arch=compute_$(ccap),code=sm_$(ccap) -o $(OBJDIR)/GPU/GPUEngine.o -c GPU/GPUEngine.cu
endif
endif

Expand Down
Loading