1111// /
1212
1313#include < cuda_runtime.h>
14- #include < fmt/format.h>
1514
1615#include < unistd.h>
1716#include < vector>
@@ -439,8 +438,10 @@ void TimeFrameGPU<nLayers>::loadTrackSeedsDevice(bounded_vector<CellSeedN>& seed
439438 GPUTimer timer (" loading track seeds" );
440439 GPULog (" gpu-transfer: loading {} track seeds, for {:.2f} MB." , seeds.size (), seeds.size () * sizeof (CellSeedN) / constants::MB);
441440 allocMem (reinterpret_cast <void **>(&mTrackSeedsDevice ), seeds.size () * sizeof (CellSeedN), this ->hasFrameworkAllocator (), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
442- GPUChkErrS (cudaHostRegister (seeds.data (), seeds.size () * sizeof (CellSeedN), cudaHostRegisterPortable));
443441 GPUChkErrS (cudaMemcpy (mTrackSeedsDevice , seeds.data (), seeds.size () * sizeof (CellSeedN), cudaMemcpyHostToDevice));
442+ GPULog (" gpu-transfer: creating {} track seeds LUT, for {:.2f} MB." , seeds.size () + 1 , (seeds.size () + 1 ) * sizeof (int ) / constants::MB);
443+ allocMem (reinterpret_cast <void **>(&mTrackSeedsLUTDevice ), (seeds.size () + 1 ) * sizeof (int ), this ->hasFrameworkAllocator (), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
444+ GPUChkErrS (cudaMemset (mTrackSeedsLUTDevice , 0 , (seeds.size () + 1 ) * sizeof (int )));
444445}
445446
446447template <int nLayers>
@@ -458,14 +459,15 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer)
458459}
459460
// Reserves host and device storage for the output tracks (TrackITSExt).
// NOTE(review): this block is shown as a diff; lines marked '-' are the removed
// version and lines marked '+' are the added version.
// Removed version: sized everything from seeds.size() and pinned the host vector
// with cudaHostRegister.
// Added version: takes the number of seeds and sizes everything from mNTracks,
// which is read back from the device-side seed LUT.
460461template <int nLayers>
461- void TimeFrameGPU<nLayers>::createTrackITSExtDevice(bounded_vector<CellSeedN>& seeds )
462+ void TimeFrameGPU<nLayers>::createTrackITSExtDevice(const size_t nSeeds )
462463{
463464 GPUTimer timer (" reserving tracks" );
464- mTrackITSExt = bounded_vector<TrackITSExt>(seeds.size (), {}, this ->getMemoryPool ().get ());
465- GPULog (" gpu-allocation: reserving {} tracks, for {:.2f} MB." , seeds.size (), seeds.size () * sizeof (o2::its::TrackITSExt) / constants::MB);
466- allocMem (reinterpret_cast <void **>(&mTrackITSExtDevice ), seeds.size () * sizeof (o2::its::TrackITSExt), this ->hasFrameworkAllocator (), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
467- GPUChkErrS (cudaMemset (mTrackITSExtDevice , 0 , seeds.size () * sizeof (o2::its::TrackITSExt)));
468- GPUChkErrS (cudaHostRegister (mTrackITSExt .data (), seeds.size () * sizeof (o2::its::TrackITSExt), cudaHostRegisterPortable));
// Copy the LUT element at index nSeeds (device) into mNTracks (host).
// NOTE(review): exactly sizeof(int) bytes are copied; this assumes mNTracks is an
// int-sized integer - confirm against its declaration, which is not visible here.
// NOTE(review): presumably the LUT has been filled (e.g. by a prefix sum) before this
// call, so that its last entry holds the track count - verify against the caller.
465+ mNTracks = 0 ;
466+ GPUChkErrS (cudaMemcpy (&mNTracks , mTrackSeedsLUTDevice + nSeeds, sizeof (int ), cudaMemcpyDeviceToHost));
467+ GPULog (" gpu-allocation: reserving {} tracks, for {:.2f} MB." , mNTracks , mNTracks * sizeof (o2::its::TrackITSExt) / constants::MB);
// Host-side result vector: mNTracks value-initialised entries, built with the
// resource returned by getMemoryPool().
468+ mTrackITSExt = bounded_vector<TrackITSExt>(mNTracks , {}, this ->getMemoryPool ().get ());
// Device-side buffer of the same element count, requested with the MEMORY_GPU and
// MEMORY_STACK flags and then zero-filled.
469+ allocMem (reinterpret_cast <void **>(&mTrackITSExtDevice ), mNTracks * sizeof (o2::its::TrackITSExt), this ->hasFrameworkAllocator (), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
470+ GPUChkErrS (cudaMemset (mTrackITSExtDevice , 0 , mNTracks * sizeof (o2::its::TrackITSExt)));
469471}
470472
471473template <int nLayers>
@@ -588,13 +590,11 @@ void TimeFrameGPU<nLayers>::downloadNeighboursLUTDevice(bounded_vector<int>& lut
588590}
589591
// Copies the tracks from the device buffer mTrackITSExtDevice into the host vector
// mTrackITSExt.
// NOTE(review): this block is shown as a diff; lines marked '-' are the removed
// version and lines marked '+' are the added version.
// Removed version: took the seed vector, used seeds.size() as the element count and
// called cudaHostUnregister on both mTrackITSExt.data() and seeds.data().
// Added version: takes no arguments and uses mTrackITSExt.size() as the element count.
590592template <int nLayers>
591- void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(bounded_vector<CellSeedN>& seeds )
593+ void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice()
592594{
593595 GPUTimer timer (" downloading tracks" );
594596 GPULog (" gpu-transfer: downloading {} tracks, for {:.2f} MB." , mTrackITSExt .size (), mTrackITSExt .size () * sizeof (o2::its::TrackITSExt) / constants::MB);
595- GPUChkErrS (cudaMemcpy (mTrackITSExt .data (), mTrackITSExtDevice , seeds.size () * sizeof (o2::its::TrackITSExt), cudaMemcpyDeviceToHost));
596- GPUChkErrS (cudaHostUnregister (mTrackITSExt .data ()));
597- GPUChkErrS (cudaHostUnregister (seeds.data ()));
// Device-to-host copy; the byte count is taken from the host vector's own size, so
// the log message above and the transfer use the same element count.
597+ GPUChkErrS (cudaMemcpy (mTrackITSExt .data (), mTrackITSExtDevice , mTrackITSExt .size () * sizeof (o2::its::TrackITSExt), cudaMemcpyDeviceToHost));
598598}
599599
600600template <int nLayers>
@@ -632,21 +632,37 @@ void TimeFrameGPU<nLayers>::unregisterHostMemory(const int maxLayers)
632632 checkedUnregisterArray (mPinnedROFramesClusters , mROFramesClustersDevice );
633633}
634634
// Compile-time table of per-iteration tags built from the names "ITSITER0" to
// "ITSITER3". All lines of this block are marked '+' (newly added) in the diff.
635+ namespace detail
636+ {
// Builds the tag for a single iteration index I.
// The static_assert restricts I to one decimal digit, because the index is encoded
// as the single character '0' + I appended to "ITSITER".
// The character array is passed to qStr2Tag, whose definition is not visible here;
// presumably it packs the string into a 64-bit tag - confirm against its declaration.
637+ template <std::size_t I>
638+ constexpr uint64_t makeIterTag ()
639+ {
640+ static_assert (I < 10 );
641+ constexpr char tag[] = {' I' , ' T' , ' S' , ' I' , ' T' , ' E' , ' R' , char (' 0' + I), ' \0 ' };
642+ return qStr2Tag (tag);
643+ }
// Expands an index sequence into a std::array holding makeIterTag<I>() for every I
// in the pack, in pack order.
644+ template <std::size_t ... I>
645+ constexpr auto makeIterTags (std::index_sequence<I...>)
646+ {
647+ return std::array<uint64_t , sizeof ...(I)>{makeIterTag<I>()...};
648+ }
// The table has exactly 4 entries (indices 0..3), since it is built from
// std::make_index_sequence<4>.
649+ // FIXME: we have to be careful that the MaxIter does not diverge from the 4 here!
650+ constexpr auto kIterTags = makeIterTags(std::make_index_sequence<4 >{});
651+ } // namespace detail
652+
// Pushes the tag belonging to the given iteration onto the external allocator's stack.
// NOTE(review): this block is shown as a diff; lines marked '-' are the removed
// version and lines marked '+' are the added version.
// Removed version: formatted the name "ITSITER<iteration>" at run time with
// fmt::format and converted it with qStr2Tag.
// Added version: looks the tag up in the precomputed table detail::kIterTags.
635653template <int nLayers>
636654void TimeFrameGPU<nLayers>::pushMemoryStack(const int iteration)
637655{
638656 // mark the beginning of memory marked with MEMORY_STACK that can be discarded
639657 // after doing one iteration
640- const auto name = fmt::format (" ITSITER{}" , iteration);
641- (this ->mExternalAllocator )->pushTagOnStack (qStr2Tag (name.c_str ()));
// NOTE(review): iteration is a signed int used directly as an index with no range
// check, and detail::kIterTags is built with 4 entries. A value outside 0..3 would
// index out of range, whereas the removed fmt::format path accepted any integer.
// Confirm that callers never pass such a value.
658+ (this ->mExternalAllocator )->pushTagOnStack (detail::kIterTags [iteration]);
642659}
643660
// Pops the tag belonging to the given iteration off the external allocator's stack.
// NOTE(review): this block is shown as a diff; lines marked '-' are the removed
// version and lines marked '+' are the added version.
// Removed version: formatted the name "ITSITER<iteration>" at run time with
// fmt::format and converted it with qStr2Tag.
// Added version: looks the tag up in the precomputed table detail::kIterTags.
644661template <int nLayers>
645662void TimeFrameGPU<nLayers>::popMemoryStack(const int iteration)
646663{
647664 // pop all memory on the stack from this iteration
648- const auto name = fmt::format (" ITSITER{}" , iteration);
649- (this ->mExternalAllocator )->popTagOffStack (qStr2Tag (name.c_str ()));
// NOTE(review): iteration is a signed int used directly as an index with no range
// check, and detail::kIterTags is built with 4 entries. A value outside 0..3 would
// index out of range - confirm that callers never pass such a value.
665+ (this ->mExternalAllocator )->popTagOffStack (detail::kIterTags [iteration]);
650666}
651667
652668template <int nLayers>
0 commit comments