Skip to content

Commit bf3b8ca

Browse files
authored
Merge e8f778f into sapling-pr-archive-ktf
2 parents b704cf2 + e8f778f commit bf3b8ca

File tree

24 files changed

+761
-345
lines changed

24 files changed

+761
-345
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,8 @@ class TimeFrameGPU final : public TimeFrame<nLayers>
8080
void createNeighboursIndexTablesDevice(const int);
8181
void createNeighboursDevice(const unsigned int layer);
8282
void createNeighboursLUTDevice(const int, const unsigned int);
83-
void createTrackITSExtDevice(bounded_vector<CellSeedN>&);
84-
void downloadTrackITSExtDevice(bounded_vector<CellSeedN>&);
83+
void createTrackITSExtDevice(const size_t);
84+
void downloadTrackITSExtDevice();
8585
void downloadCellsNeighboursDevice(std::vector<bounded_vector<std::pair<int, int>>>&, const int);
8686
void downloadNeighboursLUTDevice(bounded_vector<int>&, const int);
8787
void downloadCellsDevice();
@@ -140,6 +140,8 @@ class TimeFrameGPU final : public TimeFrame<nLayers>
140140
int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; }
141141
CellSeedN** getDeviceArrayCells() { return mCellsDeviceArray; }
142142
CellSeedN* getDeviceTrackSeeds() { return mTrackSeedsDevice; }
143+
int* getDeviceTrackSeedsLUT() { return mTrackSeedsLUTDevice; }
144+
auto getNTrackSeeds() const { return mNTracks; }
143145
o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; }
144146
float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; }
145147
int* getDeviceNeighboursIndexTables(const int layer) { return mNeighboursIndexTablesDevice[layer]; }
@@ -219,6 +221,8 @@ class TimeFrameGPU final : public TimeFrame<nLayers>
219221
CellSeedN** mCellsDeviceArray;
220222
std::array<int*, nLayers - 3> mNeighboursIndexTablesDevice;
221223
CellSeedN* mTrackSeedsDevice{nullptr};
224+
int* mTrackSeedsLUTDevice{nullptr};
225+
unsigned int mNTracks{0};
222226
std::array<o2::track::TrackParCovF*, nLayers - 2> mCellSeedsDevice;
223227
o2::track::TrackParCovF** mCellSeedsDeviceArray;
224228
std::array<float*, nLayers - 2> mCellSeedsChi2Device;

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h

Lines changed: 43 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -207,23 +207,48 @@ void processNeighboursHandler(const int startLayer,
207207
const int nThreads);
208208

209209
template <int nLayers = 7>
210-
void trackSeedHandler(CellSeed<nLayers>* trackSeeds,
211-
const TrackingFrameInfo** foundTrackingFrameInfo,
212-
const Cluster** unsortedClusters,
213-
o2::its::TrackITSExt* tracks,
214-
const std::vector<float>& layerRadiiHost,
215-
const std::vector<float>& minPtsHost,
216-
const unsigned int nSeeds,
217-
const float Bz,
218-
const int startLevel,
219-
const float maxChi2ClusterAttachment,
220-
const float maxChi2NDF,
221-
const int reseedIfShorter,
222-
const bool repeatRefitOut,
223-
const bool shiftRefToCluster,
224-
const o2::base::Propagator* propagator,
225-
const o2::base::PropagatorF::MatCorrType matCorrType,
226-
const int nBlocks,
227-
const int nThreads);
210+
void countTrackSeedHandler(CellSeed<nLayers>* trackSeeds,
211+
const TrackingFrameInfo** foundTrackingFrameInfo,
212+
const Cluster** unsortedClusters,
213+
int* seedLUT,
214+
const std::vector<float>& layerRadiiHost,
215+
const std::vector<float>& minPtsHost,
216+
const unsigned int nSeeds,
217+
const float Bz,
218+
const int startLevel,
219+
const float maxChi2ClusterAttachment,
220+
const float maxChi2NDF,
221+
const int reseedIfShorter,
222+
const bool repeatRefitOut,
223+
const bool shiftRefToCluster,
224+
const o2::base::Propagator* propagator,
225+
const o2::base::PropagatorF::MatCorrType matCorrType,
226+
o2::its::ExternalAllocator* alloc,
227+
const int nBlocks,
228+
const int nThreads);
229+
230+
template <int nLayers = 7>
231+
void computeTrackSeedHandler(CellSeed<nLayers>* trackSeeds,
232+
const TrackingFrameInfo** foundTrackingFrameInfo,
233+
const Cluster** unsortedClusters,
234+
o2::its::TrackITSExt* tracks,
235+
const int* seedLUT,
236+
const std::vector<float>& layerRadiiHost,
237+
const std::vector<float>& minPtsHost,
238+
const unsigned int nSeeds,
239+
const unsigned int nTracks,
240+
const float Bz,
241+
const int startLevel,
242+
const float maxChi2ClusterAttachment,
243+
const float maxChi2NDF,
244+
const int reseedIfShorter,
245+
const bool repeatRefitOut,
246+
const bool shiftRefToCluster,
247+
const o2::base::Propagator* propagator,
248+
const o2::base::PropagatorF::MatCorrType matCorrType,
249+
o2::its::ExternalAllocator* alloc,
250+
const int nBlocks,
251+
const int nThreads);
252+
228253
} // namespace o2::its
229254
#endif // ITSTRACKINGGPU_TRACKINGKERNELS_H_

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
///
1212

1313
#include <cuda_runtime.h>
14-
#include <fmt/format.h>
1514

1615
#include <unistd.h>
1716
#include <vector>
@@ -439,8 +438,10 @@ void TimeFrameGPU<nLayers>::loadTrackSeedsDevice(bounded_vector<CellSeedN>& seed
439438
GPUTimer timer("loading track seeds");
440439
GPULog("gpu-transfer: loading {} track seeds, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(CellSeedN) / constants::MB);
441440
allocMem(reinterpret_cast<void**>(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeedN), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
442-
GPUChkErrS(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeedN), cudaHostRegisterPortable));
443441
GPUChkErrS(cudaMemcpy(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeedN), cudaMemcpyHostToDevice));
442+
GPULog("gpu-transfer: creating {} track seeds LUT, for {:.2f} MB.", seeds.size() + 1, (seeds.size() + 1) * sizeof(int) / constants::MB);
443+
allocMem(reinterpret_cast<void**>(&mTrackSeedsLUTDevice), (seeds.size() + 1) * sizeof(int), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
444+
GPUChkErrS(cudaMemset(mTrackSeedsLUTDevice, 0, (seeds.size() + 1) * sizeof(int)));
444445
}
445446

446447
template <int nLayers>
@@ -458,14 +459,15 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer)
458459
}
459460

460461
template <int nLayers>
461-
void TimeFrameGPU<nLayers>::createTrackITSExtDevice(bounded_vector<CellSeedN>& seeds)
462+
void TimeFrameGPU<nLayers>::createTrackITSExtDevice(const size_t nSeeds)
462463
{
463464
GPUTimer timer("reserving tracks");
464-
mTrackITSExt = bounded_vector<TrackITSExt>(seeds.size(), {}, this->getMemoryPool().get());
465-
GPULog("gpu-allocation: reserving {} tracks, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / constants::MB);
466-
allocMem(reinterpret_cast<void**>(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
467-
GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt)));
468-
GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable));
465+
mNTracks = 0;
466+
GPUChkErrS(cudaMemcpy(&mNTracks, mTrackSeedsLUTDevice + nSeeds, sizeof(int), cudaMemcpyDeviceToHost));
467+
GPULog("gpu-allocation: reserving {} tracks, for {:.2f} MB.", mNTracks, mNTracks * sizeof(o2::its::TrackITSExt) / constants::MB);
468+
mTrackITSExt = bounded_vector<TrackITSExt>(mNTracks, {}, this->getMemoryPool().get());
469+
allocMem(reinterpret_cast<void**>(&mTrackITSExtDevice), mNTracks * sizeof(o2::its::TrackITSExt), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
470+
GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, mNTracks * sizeof(o2::its::TrackITSExt)));
469471
}
470472

471473
template <int nLayers>
@@ -588,13 +590,11 @@ void TimeFrameGPU<nLayers>::downloadNeighboursLUTDevice(bounded_vector<int>& lut
588590
}
589591

590592
template <int nLayers>
591-
void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(bounded_vector<CellSeedN>& seeds)
593+
void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice()
592594
{
593595
GPUTimer timer("downloading tracks");
594596
GPULog("gpu-transfer: downloading {} tracks, for {:.2f} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / constants::MB);
595-
GPUChkErrS(cudaMemcpy(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost));
596-
GPUChkErrS(cudaHostUnregister(mTrackITSExt.data()));
597-
GPUChkErrS(cudaHostUnregister(seeds.data()));
597+
GPUChkErrS(cudaMemcpy(mTrackITSExt.data(), mTrackITSExtDevice, mTrackITSExt.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost));
598598
}
599599

600600
template <int nLayers>
@@ -632,21 +632,37 @@ void TimeFrameGPU<nLayers>::unregisterHostMemory(const int maxLayers)
632632
checkedUnregisterArray(mPinnedROFramesClusters, mROFramesClustersDevice);
633633
}
634634

635+
namespace detail
636+
{
637+
template <std::size_t I>
638+
constexpr uint64_t makeIterTag()
639+
{
640+
static_assert(I < 10);
641+
constexpr char tag[] = {'I', 'T', 'S', 'I', 'T', 'E', 'R', char('0' + I), '\0'};
642+
return qStr2Tag(tag);
643+
}
644+
template <std::size_t... I>
645+
constexpr auto makeIterTags(std::index_sequence<I...>)
646+
{
647+
return std::array<uint64_t, sizeof...(I)>{makeIterTag<I>()...};
648+
}
649+
// FIXME: we have to be careful that the MaxIter does not diverge from the 4 here!
650+
constexpr auto kIterTags = makeIterTags(std::make_index_sequence<4>{});
651+
} // namespace detail
652+
635653
template <int nLayers>
636654
void TimeFrameGPU<nLayers>::pushMemoryStack(const int iteration)
637655
{
638656
// mark the beginning of memory marked with MEMORY_STACK that can be discarded
639657
// after doing one iteration
640-
const auto name = fmt::format("ITSITER{}", iteration);
641-
(this->mExternalAllocator)->pushTagOnStack(qStr2Tag(name.c_str()));
658+
(this->mExternalAllocator)->pushTagOnStack(detail::kIterTags[iteration]);
642659
}
643660

644661
template <int nLayers>
645662
void TimeFrameGPU<nLayers>::popMemoryStack(const int iteration)
646663
{
647664
// pop all memory on the stack from this iteration
648-
const auto name = fmt::format("ITSITER{}", iteration);
649-
(this->mExternalAllocator)->popTagOffStack(qStr2Tag(name.c_str()));
665+
(this->mExternalAllocator)->popTagOffStack(detail::kIterTags[iteration]);
650666
}
651667

652668
template <int nLayers>

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx

Lines changed: 44 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -322,29 +322,52 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
322322
LOGP(debug, "No track seeds found, skipping track finding");
323323
continue;
324324
}
325-
mTimeFrameGPU->createTrackITSExtDevice(trackSeeds);
326325
mTimeFrameGPU->loadTrackSeedsDevice(trackSeeds);
327326

328-
trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(), // CellSeed*
329-
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo**
330-
mTimeFrameGPU->getDeviceArrayUnsortedClusters(), // Cluster**
331-
mTimeFrameGPU->getDeviceTrackITSExt(), // o2::its::TrackITSExt*
332-
this->mTrkParams[iteration].LayerRadii, // const std::vector<float>&
333-
this->mTrkParams[iteration].MinPt, // const std::vector<float>&
334-
trackSeeds.size(), // const size_t nSeeds
335-
this->mBz, // const float Bz
336-
startLevel, // const int startLevel,
337-
this->mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment
338-
this->mTrkParams[0].MaxChi2NDF, // float maxChi2NDF
339-
this->mTrkParams[0].RepeatRefitOut,
340-
this->mTrkParams[0].ReseedIfShorter,
341-
this->mTrkParams[0].ShiftRefToCluster,
342-
mTimeFrameGPU->getDevicePropagator(), // const o2::base::Propagator* propagator
343-
this->mTrkParams[0].CorrType, // o2::base::PropagatorImpl<float>::MatCorrType
344-
conf.nBlocksTracksSeeds[iteration],
345-
conf.nThreadsTracksSeeds[iteration]);
346-
347-
mTimeFrameGPU->downloadTrackITSExtDevice(trackSeeds);
327+
// Since TrackITSExt is an enourmous class it is better to first count how many
328+
// successfull fits we do and only then allocate
329+
countTrackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(),
330+
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
331+
mTimeFrameGPU->getDeviceArrayUnsortedClusters(),
332+
mTimeFrameGPU->getDeviceTrackSeedsLUT(),
333+
this->mTrkParams[iteration].LayerRadii,
334+
this->mTrkParams[iteration].MinPt,
335+
trackSeeds.size(),
336+
this->mBz,
337+
startLevel,
338+
this->mTrkParams[0].MaxChi2ClusterAttachment,
339+
this->mTrkParams[0].MaxChi2NDF,
340+
this->mTrkParams[0].ReseedIfShorter,
341+
this->mTrkParams[0].RepeatRefitOut,
342+
this->mTrkParams[0].ShiftRefToCluster,
343+
mTimeFrameGPU->getDevicePropagator(),
344+
this->mTrkParams[0].CorrType,
345+
mTimeFrameGPU->getFrameworkAllocator(),
346+
conf.nBlocksTracksSeeds[iteration],
347+
conf.nThreadsTracksSeeds[iteration]);
348+
mTimeFrameGPU->createTrackITSExtDevice(trackSeeds.size());
349+
computeTrackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(),
350+
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
351+
mTimeFrameGPU->getDeviceArrayUnsortedClusters(),
352+
mTimeFrameGPU->getDeviceTrackITSExt(),
353+
mTimeFrameGPU->getDeviceTrackSeedsLUT(),
354+
this->mTrkParams[iteration].LayerRadii,
355+
this->mTrkParams[iteration].MinPt,
356+
trackSeeds.size(),
357+
mTimeFrameGPU->getNTrackSeeds(),
358+
this->mBz,
359+
startLevel,
360+
this->mTrkParams[0].MaxChi2ClusterAttachment,
361+
this->mTrkParams[0].MaxChi2NDF,
362+
this->mTrkParams[0].ReseedIfShorter,
363+
this->mTrkParams[0].RepeatRefitOut,
364+
this->mTrkParams[0].ShiftRefToCluster,
365+
mTimeFrameGPU->getDevicePropagator(),
366+
this->mTrkParams[0].CorrType,
367+
mTimeFrameGPU->getFrameworkAllocator(),
368+
conf.nBlocksTracksSeeds[iteration],
369+
conf.nThreadsTracksSeeds[iteration]);
370+
mTimeFrameGPU->downloadTrackITSExtDevice();
348371

349372
auto& tracks = mTimeFrameGPU->getTrackITSExt();
350373

0 commit comments

Comments
 (0)