ktf
diff --git a/‎Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h‎
Lines changed: 6 additions & 2 deletions b/‎Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h‎
Lines changed: 43 additions & 18 deletions b/‎Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h‎
Lines changed: 43 additions & 18 deletions
diff --git a/‎Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu‎
Lines changed: 32 additions & 16 deletions b/‎Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu‎
Lines changed: 32 additions & 16 deletions
diff --git a/‎Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx‎
Lines changed: 44 additions & 21 deletions b/‎Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx‎
Lines changed: 44 additions & 21 deletions
@@ -80,8 +80,8 @@ class TimeFrameGPU final : public TimeFrame<nLayers>
   void createNeighboursIndexTablesDevice(const int);
   void createNeighboursDevice(const unsigned int layer);
   void createNeighboursLUTDevice(const int, const unsigned int);
-  void createTrackITSExtDevice(bounded_vector<CellSeedN>&);
-  void downloadTrackITSExtDevice(bounded_vector<CellSeedN>&);
+  void createTrackITSExtDevice(const size_t); // argument: number of track seeds, used as the index of the LUT entry that holds the track count
+  void downloadTrackITSExtDevice(); // copies mTrackITSExt.size() tracks from mTrackITSExtDevice into the host vector
   void downloadCellsNeighboursDevice(std::vector<bounded_vector<std::pair<int, int>>>&, const int);
   void downloadNeighboursLUTDevice(bounded_vector<int>&, const int);
   void downloadCellsDevice();
@@ -140,6 +140,8 @@ class TimeFrameGPU final : public TimeFrame<nLayers>
   int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; }
   CellSeedN** getDeviceArrayCells() { return mCellsDeviceArray; }
   CellSeedN* getDeviceTrackSeeds() { return mTrackSeedsDevice; }
+  int* getDeviceTrackSeedsLUT() { return mTrackSeedsLUTDevice; } // device LUT allocated with (number of seeds + 1) ints
+  auto getNTrackSeeds() const { return mNTracks; } // NOTE(review): despite the name this returns the track count read back from the LUT, not the number of seeds
   o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; }
   float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; }
   int* getDeviceNeighboursIndexTables(const int layer) { return mNeighboursIndexTablesDevice[layer]; }
@@ -219,6 +221,8 @@ class TimeFrameGPU final : public TimeFrame<nLayers>
   CellSeedN** mCellsDeviceArray;
   std::array<int*, nLayers - 3> mNeighboursIndexTablesDevice;
   CellSeedN* mTrackSeedsDevice{nullptr};
+  int* mTrackSeedsLUTDevice{nullptr}; // device buffer of (number of seeds + 1) ints, zeroed in loadTrackSeedsDevice
+  unsigned int mNTracks{0}; // set in createTrackITSExtDevice from the LUT entry at index nSeeds
   std::array<o2::track::TrackParCovF*, nLayers - 2> mCellSeedsDevice;
   o2::track::TrackParCovF** mCellSeedsDeviceArray;
   std::array<float*, nLayers - 2> mCellSeedsChi2Device;
 
@@ -207,23 +207,48 @@ void processNeighboursHandler(const int startLayer,
                               const int nThreads);
 
 template <int nLayers = 7>
-void trackSeedHandler(CellSeed<nLayers>* trackSeeds,
-                      const TrackingFrameInfo** foundTrackingFrameInfo,
-                      const Cluster** unsortedClusters,
-                      o2::its::TrackITSExt* tracks,
-                      const std::vector<float>& layerRadiiHost,
-                      const std::vector<float>& minPtsHost,
-                      const unsigned int nSeeds,
-                      const float Bz,
-                      const int startLevel,
-                      const float maxChi2ClusterAttachment,
-                      const float maxChi2NDF,
-                      const int reseedIfShorter,
-                      const bool repeatRefitOut,
-                      const bool shiftRefToCluster,
-                      const o2::base::Propagator* propagator,
-                      const o2::base::PropagatorF::MatCorrType matCorrType,
-                      const int nBlocks,
-                      const int nThreads);
+void countTrackSeedHandler(CellSeed<nLayers>* trackSeeds, // first pass: per the caller's comment, counts the successful fits before track storage is allocated
+                           const TrackingFrameInfo** foundTrackingFrameInfo,
+                           const Cluster** unsortedClusters,
+                           int* seedLUT, // writable here; the caller passes the device seeds LUT (nSeeds + 1 ints)
+                           const std::vector<float>& layerRadiiHost,
+                           const std::vector<float>& minPtsHost,
+                           const unsigned int nSeeds,
+                           const float Bz,
+                           const int startLevel,
+                           const float maxChi2ClusterAttachment,
+                           const float maxChi2NDF,
+                           const int reseedIfShorter,
+                           const bool repeatRefitOut,
+                           const bool shiftRefToCluster,
+                           const o2::base::Propagator* propagator,
+                           const o2::base::PropagatorF::MatCorrType matCorrType,
+                           o2::its::ExternalAllocator* alloc, // the caller passes the time frame's framework allocator
+                           const int nBlocks,
+                           const int nThreads);
+
+template <int nLayers = 7>
+void computeTrackSeedHandler(CellSeed<nLayers>* trackSeeds, // second pass: called after the track buffer has been sized from the LUT
+                             const TrackingFrameInfo** foundTrackingFrameInfo,
+                             const Cluster** unsortedClusters,
+                             o2::its::TrackITSExt* tracks, // output buffer; the caller passes the device track array
+                             const int* seedLUT, // read-only here, unlike in countTrackSeedHandler
+                             const std::vector<float>& layerRadiiHost,
+                             const std::vector<float>& minPtsHost,
+                             const unsigned int nSeeds,
+                             const unsigned int nTracks, // the caller passes the track count read back from the LUT
+                             const float Bz,
+                             const int startLevel,
+                             const float maxChi2ClusterAttachment,
+                             const float maxChi2NDF,
+                             const int reseedIfShorter,
+                             const bool repeatRefitOut,
+                             const bool shiftRefToCluster,
+                             const o2::base::Propagator* propagator,
+                             const o2::base::PropagatorF::MatCorrType matCorrType,
+                             o2::its::ExternalAllocator* alloc, // the caller passes the time frame's framework allocator
+                             const int nBlocks,
+                             const int nThreads);
+
 } // namespace o2::its
 #endif // ITSTRACKINGGPU_TRACKINGKERNELS_H_
@@ -11,7 +11,6 @@
 ///
 
 #include <cuda_runtime.h>
-#include <fmt/format.h>
 
 #include <unistd.h>
 #include <vector>
@@ -439,8 +438,10 @@ void TimeFrameGPU<nLayers>::loadTrackSeedsDevice(bounded_vector<CellSeedN>& seed
   GPUTimer timer("loading track seeds");
   GPULog("gpu-transfer: loading {} track seeds, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(CellSeedN) / constants::MB);
   allocMem(reinterpret_cast<void**>(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeedN), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
-  GPUChkErrS(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeedN), cudaHostRegisterPortable));
   GPUChkErrS(cudaMemcpy(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeedN), cudaMemcpyHostToDevice));
+  GPULog("gpu-transfer: creating {} track seeds LUT, for {:.2f} MB.", seeds.size() + 1, (seeds.size() + 1) * sizeof(int) / constants::MB);
+  allocMem(reinterpret_cast<void**>(&mTrackSeedsLUTDevice), (seeds.size() + 1) * sizeof(int), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
+  GPUChkErrS(cudaMemset(mTrackSeedsLUTDevice, 0, (seeds.size() + 1) * sizeof(int)));
 }
 
 template <int nLayers>
@@ -458,14 +459,15 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer)
 }
 
 template <int nLayers>
-void TimeFrameGPU<nLayers>::createTrackITSExtDevice(bounded_vector<CellSeedN>& seeds)
+void TimeFrameGPU<nLayers>::createTrackITSExtDevice(const size_t nSeeds) // sizes the host and device track buffers from the seeds LUT
 {
   GPUTimer timer("reserving tracks");
-  mTrackITSExt = bounded_vector<TrackITSExt>(seeds.size(), {}, this->getMemoryPool().get());
-  GPULog("gpu-allocation: reserving {} tracks, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / constants::MB);
-  allocMem(reinterpret_cast<void**>(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
-  GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt)));
-  GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable));
+  mNTracks = 0;
+  GPUChkErrS(cudaMemcpy(&mNTracks, mTrackSeedsLUTDevice + nSeeds, sizeof(int), cudaMemcpyDeviceToHost)); // the LUT entry at index nSeeds is taken as the number of tracks
+  GPULog("gpu-allocation: reserving {} tracks, for {:.2f} MB.", mNTracks, mNTracks * sizeof(o2::its::TrackITSExt) / constants::MB);
+  mTrackITSExt = bounded_vector<TrackITSExt>(mNTracks, {}, this->getMemoryPool().get()); // host-side vector with one default-constructed entry per track
+  allocMem(reinterpret_cast<void**>(&mTrackITSExtDevice), mNTracks * sizeof(o2::its::TrackITSExt), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK)); // NOTE(review): mNTracks may be 0 here -- confirm allocMem accepts a zero-byte request
+  GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, mNTracks * sizeof(o2::its::TrackITSExt)));
 }
 
 template <int nLayers>
@@ -588,13 +590,11 @@ void TimeFrameGPU<nLayers>::downloadNeighboursLUTDevice(bounded_vector<int>& lut
 }
 
 template <int nLayers>
-void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(bounded_vector<CellSeedN>& seeds)
+void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice() // copies the tracks from the device buffer into mTrackITSExt
 {
   GPUTimer timer("downloading tracks");
   GPULog("gpu-transfer: downloading {} tracks, for {:.2f} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / constants::MB);
-  GPUChkErrS(cudaMemcpy(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost));
-  GPUChkErrS(cudaHostUnregister(mTrackITSExt.data()));
-  GPUChkErrS(cudaHostUnregister(seeds.data()));
+  GPUChkErrS(cudaMemcpy(mTrackITSExt.data(), mTrackITSExtDevice, mTrackITSExt.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost)); // byte count follows the host vector size, no longer the seed count
 }
 
 template <int nLayers>
@@ -632,21 +632,37 @@ void TimeFrameGPU<nLayers>::unregisterHostMemory(const int maxLayers)
   checkedUnregisterArray(mPinnedROFramesClusters, mROFramesClustersDevice);
 }
 
+namespace detail
+{
+template <std::size_t I>
+constexpr uint64_t makeIterTag() // builds the stack tag for the name "ITSITER<I>"
+{
+  static_assert(I < 10); // only a single decimal digit is appended below
+  constexpr char tag[] = {'I', 'T', 'S', 'I', 'T', 'E', 'R', char('0' + I), '\0'};
+  return qStr2Tag(tag); // NOTE(review): qStr2Tag must be usable in a constant expression for kIterTags to compile -- confirm
+}
+template <std::size_t... I>
+constexpr auto makeIterTags(std::index_sequence<I...>) // one tag per index of the sequence, in order
+{
+  return std::array<uint64_t, sizeof...(I)>{makeIterTag<I>()...};
+}
+// FIXME: we have to be careful that the MaxIter does not diverge from the 4 here!
+constexpr auto kIterTags = makeIterTags(std::make_index_sequence<4>{}); // tags for iterations 0..3, indexed by iteration in push/popMemoryStack
+} // namespace detail
+
 template <int nLayers>
 void TimeFrameGPU<nLayers>::pushMemoryStack(const int iteration)
 {
   // mark the beginning of memory marked with MEMORY_STACK that can be discarded
   // after doing one iteration
-  const auto name = fmt::format("ITSITER{}", iteration);
-  (this->mExternalAllocator)->pushTagOnStack(qStr2Tag(name.c_str()));
+  (this->mExternalAllocator)->pushTagOnStack(detail::kIterTags.at(iteration)); // at() throws std::out_of_range for an iteration without a precomputed tag instead of reading past the array
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::popMemoryStack(const int iteration)
 {
   // pop all memory on the stack from this iteration
-  const auto name = fmt::format("ITSITER{}", iteration);
-  (this->mExternalAllocator)->popTagOffStack(qStr2Tag(name.c_str()));
+  (this->mExternalAllocator)->popTagOffStack(detail::kIterTags.at(iteration)); // at() throws std::out_of_range for an iteration without a precomputed tag instead of reading past the array
 }
 
 template <int nLayers>
 
@@ -322,29 +322,52 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
       LOGP(debug, "No track seeds found, skipping track finding");
       continue;
     }
-    mTimeFrameGPU->createTrackITSExtDevice(trackSeeds);
     mTimeFrameGPU->loadTrackSeedsDevice(trackSeeds);
 
-    trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(),             // CellSeed*
-                     mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo**
-                     mTimeFrameGPU->getDeviceArrayUnsortedClusters(),  // Cluster**
-                     mTimeFrameGPU->getDeviceTrackITSExt(),            // o2::its::TrackITSExt*
-                     this->mTrkParams[iteration].LayerRadii,           // const std::vector<float>&
-                     this->mTrkParams[iteration].MinPt,                // const std::vector<float>&
-                     trackSeeds.size(),                                // const size_t nSeeds
-                     this->mBz,                                        // const float Bz
-                     startLevel,                                       // const int startLevel,
-                     this->mTrkParams[0].MaxChi2ClusterAttachment,     // float maxChi2ClusterAttachment
-                     this->mTrkParams[0].MaxChi2NDF,                   // float maxChi2NDF
-                     this->mTrkParams[0].RepeatRefitOut,
-                     this->mTrkParams[0].ReseedIfShorter,
-                     this->mTrkParams[0].ShiftRefToCluster,
-                     mTimeFrameGPU->getDevicePropagator(), // const o2::base::Propagator* propagator
-                     this->mTrkParams[0].CorrType,         // o2::base::PropagatorImpl<float>::MatCorrType
-                     conf.nBlocksTracksSeeds[iteration],
-                     conf.nThreadsTracksSeeds[iteration]);
-
-    mTimeFrameGPU->downloadTrackITSExtDevice(trackSeeds);
+    // Since TrackITSExt is an enormous class it is better to first count how many
+    // successful fits we do and only then allocate
+    countTrackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(),
+                          mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
+                          mTimeFrameGPU->getDeviceArrayUnsortedClusters(),
+                          mTimeFrameGPU->getDeviceTrackSeedsLUT(),
+                          this->mTrkParams[iteration].LayerRadii,
+                          this->mTrkParams[iteration].MinPt,
+                          trackSeeds.size(),
+                          this->mBz,
+                          startLevel,
+                          this->mTrkParams[0].MaxChi2ClusterAttachment,
+                          this->mTrkParams[0].MaxChi2NDF,
+                          this->mTrkParams[0].ReseedIfShorter,
+                          this->mTrkParams[0].RepeatRefitOut,
+                          this->mTrkParams[0].ShiftRefToCluster,
+                          mTimeFrameGPU->getDevicePropagator(),
+                          this->mTrkParams[0].CorrType,
+                          mTimeFrameGPU->getFrameworkAllocator(),
+                          conf.nBlocksTracksSeeds[iteration],
+                          conf.nThreadsTracksSeeds[iteration]);
+    mTimeFrameGPU->createTrackITSExtDevice(trackSeeds.size());
+    computeTrackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(),
+                            mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
+                            mTimeFrameGPU->getDeviceArrayUnsortedClusters(),
+                            mTimeFrameGPU->getDeviceTrackITSExt(),
+                            mTimeFrameGPU->getDeviceTrackSeedsLUT(),
+                            this->mTrkParams[iteration].LayerRadii,
+                            this->mTrkParams[iteration].MinPt,
+                            trackSeeds.size(),
+                            mTimeFrameGPU->getNTrackSeeds(),
+                            this->mBz,
+                            startLevel,
+                            this->mTrkParams[0].MaxChi2ClusterAttachment,
+                            this->mTrkParams[0].MaxChi2NDF,
+                            this->mTrkParams[0].ReseedIfShorter,
+                            this->mTrkParams[0].RepeatRefitOut,
+                            this->mTrkParams[0].ShiftRefToCluster,
+                            mTimeFrameGPU->getDevicePropagator(),
+                            this->mTrkParams[0].CorrType,
+                            mTimeFrameGPU->getFrameworkAllocator(),
+                            conf.nBlocksTracksSeeds[iteration],
+                            conf.nThreadsTracksSeeds[iteration]);
+    mTimeFrameGPU->downloadTrackITSExtDevice();
 
     auto& tracks = mTimeFrameGPU->getTrackITSExt();