Merge pull request #6 from fancyIX/feature/#4

fancyIX · web-flow · commit 8a079545ffb8 · 2021-11-29T16:57:04.000-08:00
Fix Windows compiler problem
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
@@ -39,7 +39,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 11.1.props" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
@@ -195,14 +195,14 @@
     </Link>
     <CudaCompile>
       <CInterleavedPTX>false</CInterleavedPTX>
-      <MaxRegCount>80</MaxRegCount>
+      <MaxRegCount>128</MaxRegCount>
       <PtxAsOptionV>true</PtxAsOptionV>
       <Keep>true</Keep>
       <CodeGeneration>compute_75,sm_75;compute_61,sm_61;compute_52,sm_52</CodeGeneration>
       <Include>$(NVTOOLSEXT_PATH)\include</Include>
       <Optimization>O3</Optimization>
       <TargetMachinePlatform>64</TargetMachinePlatform>
-      <AdditionalOptions>--Wno-deprecated-gpu-targets %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions>-allow-unsupported-compiler --Wno-deprecated-gpu-targets %(AdditionalOptions)</AdditionalOptions>
     </CudaCompile>
     <CudaLink>
       <Optimization>O3</Optimization>
@@ -584,7 +584,7 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 11.1.targets" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 11.3.targets" />
   </ImportGroup>
   <!-- Copy the required dlls -->
   <Target Name="AfterBuild">
diff --git a/heavyhash/cuda_heavyhash.cu b/heavyhash/cuda_heavyhash.cu
@@ -97,15 +97,15 @@ static void __forceinline__ __device__ keccak_block(uint2 *s)
 __global__
 void heavyhash_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces)
 {
-	__shared__ ulong2 matrix[1024];
+	__shared__ uint64_t matrix[1024 * 2];
 
     uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
     uint32_t nonce = startNonce + thread;
     if (thread < threads)
 	{
 		uint32_t tid = threadIdx.x;
-		ulong2 *cp = (ulong2 *)(&c_matrix[0][0]);
-		for (int i = 0; i < 4; i++) {
+		uint64_t *cp = (uint64_t *)(c_matrix);
+		for (int i = 0; i < 8; i++) {
 			matrix[tid + i * 256] = cp[tid + i * 256];
 		}
 
@@ -140,26 +140,26 @@ void heavyhash_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint3
 
         for (int i = 0; i < 64; ++i) {
             uint32_t sum = 0;
-			for (int k = 0; k < 4; k++) {
-				ulong2 buf0 = matrix[i * 16 + k * 4 + 0];
-				ulong2 buf1 = matrix[i * 16 + k * 4 + 1];
-				ulong2 buf2 = matrix[i * 16 + k * 4 + 2];
-				ulong2 buf3 = matrix[i * 16 + k * 4 + 3];
+			for (int k = 0; k < 8; k++) {
+				uint64_t buf0 = matrix[i * 32 + k * 4 + 0];
+				uint64_t buf1 = matrix[i * 32 + k * 4 + 1];
+				uint64_t buf2 = matrix[i * 32 + k * 4 + 2];
+				uint64_t buf3 = matrix[i * 32 + k * 4 + 3];
 				uint32_t *m0 = (uint32_t *)&buf0;
-				for (int j = 0; j < 4; j++) {
-					sum += m0[j] * vector[(k * 4 + 0) * 4 + j];
+				for (int j = 0; j < 2; j++) {
+					sum += m0[j] * vector[(k * 4 + 0) * 2 + j];
 				}
 				uint32_t *m1 = (uint32_t *)&buf1;
-				for (int j = 0; j < 4; j++) {
-					sum += m1[j] * vector[(k * 4 + 1) * 4 + j];
+				for (int j = 0; j < 2; j++) {
+					sum += m1[j] * vector[(k * 4 + 1) * 2 + j];
 				}
 				uint32_t *m2 = (uint32_t *)&buf2;
-				for (int j = 0; j < 4; j++) {
-					sum += m2[j] * vector[(k * 4 + 2) * 4 + j];
+				for (int j = 0; j < 2; j++) {
+					sum += m2[j] * vector[(k * 4 + 2) * 2 + j];
 				}
 				uint32_t *m3 = (uint32_t *)&buf3;
-				for (int j = 0; j < 4; j++) {
-					sum += m3[j] * vector[(k * 4 + 3) * 4 + j];
+				for (int j = 0; j < 2; j++) {
+					sum += m3[j] * vector[(k * 4 + 3) * 2 + j];
 				}
 			}
             product[i] = (sum >> 10);