Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions cuda/inc/WireCellCuda/cuFftDFT.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#ifndef WIRECELLCUDA_CUFFTDFT
#define WIRECELLCUDA_CUFFTDFT

#include <cuda.h>
#include <cufft.h>

#include "WireCellIface/IDFT.h"

namespace WireCell::Cuda
{
/* Used by the `(pre/post)transformTask` functions to
* move around pointers to the input and output arrays
* both on the CPU and GPU */
template<class T>
struct memArgs
{ const T* in; T* out; };

// The `cuFftDFT` class provieds `IDFT` based on `cuFFT`
class cuFftDFT : public IDFT
{
private:
/*These two functions prepare/clean up the `in` and `out` arrays on both the
* the GPU and CPU */
memArgs<cufftComplex> preTransformTasks(memArgs<complex_t> hostMem, int size) const;
void postTransformTasks( memArgs<complex_t> hmem, memArgs<cufftComplex> dmem, int size ) const;

/* The `gen*` functions handle both the forward and inverse transforms since,
* unlike FFTW3, cuFFT uses the same plan and execution function for both
* the forward and inverse transforms. */
void gen1d(const complex_t* in, complex_t* out, int size, int dir) const;
void gen1b(const complex_t* in, complex_t* out, int nrows, int ncols, int axis, int dir) const;
void gen2d(const complex_t* in, complex_t* out, int nrows, int ncols, int dir) const;

// std::shared_mutex mutex;
public:
// See `IDFT.h` for more information about these.
cuFftDFT() = default;
virtual ~cuFftDFT() = default;

virtual void fwd1d(const complex_t* in, complex_t* out, int size) const;
virtual void inv1d(const complex_t* in, complex_t* out, int size) const;

virtual void fwd1b(const complex_t* in, complex_t* out, int nrows, int ncols, int axis) const;
virtual void inv1b(const complex_t* in, complex_t* out, int nrows, int ncols, int axis) const;

virtual void fwd2d(const complex_t* in, complex_t* out, int nrows, int ncols) const;
virtual void inv2d(const complex_t* in, complex_t* out, int nrows, int ncols) const;

};
}

#endif
128 changes: 128 additions & 0 deletions cuda/src/cuFftDFT.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#include "WireCellCuda/cuFftDFT.h"

#include <cuda.h>
#include <cufft.h>
#include <assert.h>
#include <iostream>

using namespace WireCell;
using complex_t = IDFT::complex_t;

/* Because otherwise a more explicit casting will be needed which
* will reduce performance. */
static_assert(sizeof(cufftComplex) == sizeof(complex_t));

Cuda::memArgs<cufftComplex>
Cuda::cuFftDFT::preTransformTasks(
Cuda::memArgs<complex_t> hmem, int size
) const
{
// Allocate memory for the input and output arrays in the GPU
static Cuda::memArgs<cufftComplex> dmem;

assert(cudaMalloc( &dmem.in, size*sizeof(cufftComplex) ) == cudaSuccess);

if( dmem.in == dmem.out ) dmem.out = const_cast<cufftComplex*>(dmem.in);
else assert(cudaMalloc( &dmem.out, size*sizeof(cufftComplex) ) == cudaSuccess);

// Copy input array in CPU to the GPU
assert(cudaMemcpy(
(void *)dmem.in, (void *)hmem.in,
size*sizeof(cufftComplex),
cudaMemcpyHostToDevice
) == cudaSuccess);

return dmem;
}

void Cuda::cuFftDFT::postTransformTasks(
memArgs<complex_t> hmem, memArgs<cufftComplex> dmem, int size
) const
{
// Copy result back to the CPU
assert(cudaMemcpy(
(void *)hmem.out, (void *)dmem.out,
size*sizeof(complex_t),
cudaMemcpyDeviceToHost
) == cudaSuccess);

// Free memory on GPU
assert(cudaFree((void *)dmem.in) == cudaSuccess);
if( dmem.in != dmem.out )
assert(cudaFree((void *)dmem.out) == cudaSuccess);
}

void Cuda::cuFftDFT::gen1d(const complex_t* in, complex_t* out, int size, int dir) const
{
static cufftHandle plan;
Cuda::memArgs<complex_t> hmem{in, out};
Cuda::memArgs<cufftComplex> dmem = preTransformTasks(hmem, size);

// Create the plan and perform the transform
assert(cufftPlan1d(&plan, size, CUFFT_C2C, 1) == CUFFT_SUCCESS);
assert(cufftExecC2C(plan, const_cast<cufftComplex*>(dmem.in), dmem.out, dir) == CUFFT_SUCCESS);

postTransformTasks(hmem, dmem, size);
}

void Cuda::cuFftDFT::gen1b(const complex_t* in, complex_t* out, int nrows, int ncols, int axis, int dir) const
{
static cufftHandle plan;
Cuda::memArgs<complex_t> hmem{in, out};
Cuda::memArgs<cufftComplex> dmem = preTransformTasks(hmem, nrows*ncols);

// Create the plan and perform the transform
assert(axis == 0 || axis == 1);
if( axis == 0 )
{
assert(cufftPlanMany(
&plan, 1, new int[2] {ncols, nrows},
&nrows, 1, ncols,
&nrows, 1, ncols,
CUFFT_C2C, nrows
) == CUFFT_SUCCESS);
} else
{
assert(cufftPlanMany(
&plan, 1, new int[2] {ncols, nrows},
&ncols, nrows, 1,
&ncols, nrows, 1,
CUFFT_C2C, nrows
) == CUFFT_SUCCESS);
}

assert(cufftExecC2C(plan, const_cast<cufftComplex*>(dmem.in), dmem.out, dir) == CUFFT_SUCCESS);

postTransformTasks(hmem, dmem, nrows*ncols);
}

void Cuda::cuFftDFT::gen2d(const complex_t* in, complex_t* out, int nrows, int ncols, int dir) const
{
static cufftHandle plan;
Cuda::memArgs<complex_t> hmem{in, out};
Cuda::memArgs<cufftComplex> dmem = preTransformTasks(hmem, nrows*ncols);

assert(cufftPlan2d(&plan, nrows, ncols, CUFFT_C2C) == CUFFT_SUCCESS);
assert(cufftExecC2C(plan, const_cast<cufftComplex*>(dmem.in), dmem.out, dir) == CUFFT_SUCCESS);

postTransformTasks(hmem, dmem, nrows*ncols);
}

void Cuda::cuFftDFT::fwd1d(const complex_t* in, complex_t* out, int size) const
{ gen1d(in, out, size, CUFFT_FORWARD); }

void Cuda::cuFftDFT::inv1d(const complex_t* in, complex_t* out, int size) const
{ gen1d(in, out, size, CUFFT_INVERSE); }

void Cuda::cuFftDFT::fwd1b(const complex_t* in, complex_t* out, int nrows, int ncols, int axis) const
{ gen1b(in, out, nrows, ncols, axis, CUFFT_FORWARD); }

void Cuda::cuFftDFT::inv1b(const complex_t* in, complex_t* out, int nrows, int ncols, int axis) const
{ gen1b(in, out, nrows, ncols, axis, CUFFT_INVERSE); }

void Cuda::cuFftDFT::fwd2d(const complex_t* in, complex_t* out, int nrows, int ncols) const
{ gen2d(in, out, nrows, ncols, CUFFT_FORWARD); }

void Cuda::cuFftDFT::inv2d(const complex_t* in, complex_t* out, int nrows, int ncols) const
{ gen2d(in, out, nrows, ncols, CUFFT_INVERSE); }

4 changes: 4 additions & 0 deletions cuda/src/cuFftDFT.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#include "WireCellCuda/cuFftDFT.h"
#include "WireCellUtil/NamedFactory.h"

WIRECELL_FACTORY(cuFftDFT, WireCell::Cuda::cuFftDFT, WireCell::IDFT)
87 changes: 0 additions & 87 deletions cuda/src/simplecudatest.cu

This file was deleted.

29 changes: 29 additions & 0 deletions cuda/test/debug-helper.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#ifndef DEBUG_HELPER
#include <iostream>

#define DEBUG_HELPER

using namespace std;

template<class T>
void printArr(T *p, int n)
{
if(n==0) cout << "[]\n";
cout << "[" << p[0];
for(int i=1; i<n; ++i) cout << ", " << p[i];
cout << "]" << endl;
}

template<class T>
void printArr(T *p, int n, int m)
{
if(n==0) cout << "[]\n";
cout << "[\n " << p[0];
for(int i=1; i<n*m; ++i)
{
if( i%n == 0 ) cout << "\n";
cout << ", " << p[i];
}
cout << "\n]" << endl;
}
#endif
Loading