From 1221d5939d549aee94f198fe8facaf412c2e20a1 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Sun, 8 Mar 2026 12:45:16 -0400 Subject: [PATCH] Add reduced memory runtime toggle for LST --- RecoTracker/LST/plugins/alpaka/LSTProducer.cc | 6 +- RecoTracker/LSTCore/interface/alpaka/LST.h | 3 +- RecoTracker/LSTCore/src/alpaka/LST.cc | 5 +- .../LSTCore/src/alpaka/LSTEvent.dev.cc | 268 +++++++++++++----- RecoTracker/LSTCore/src/alpaka/LSTEvent.h | 10 +- RecoTracker/LSTCore/src/alpaka/MiniDoublet.h | 251 +++++++++++----- RecoTracker/LSTCore/src/alpaka/Quadruplet.h | 168 ++++++++++- RecoTracker/LSTCore/src/alpaka/Quintuplet.h | 163 ++++++++++- RecoTracker/LSTCore/src/alpaka/Segment.h | 74 +++++ RecoTracker/LSTCore/src/alpaka/Triplet.h | 211 ++++++++++++++ RecoTracker/LSTCore/standalone/bin/lst.cc | 13 +- .../standalone/code/core/AnalysisConfig.h | 3 + 12 files changed, 1006 insertions(+), 169 deletions(-) diff --git a/RecoTracker/LST/plugins/alpaka/LSTProducer.cc b/RecoTracker/LST/plugins/alpaka/LSTProducer.cc index 496c1ae1182b0..981999c7a6ca1 100644 --- a/RecoTracker/LST/plugins/alpaka/LSTProducer.cc +++ b/RecoTracker/LST/plugins/alpaka/LSTProducer.cc @@ -32,6 +32,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { clustSizeCut_(static_cast(config.getParameter("clustSizeCut"))), nopLSDupClean_(config.getParameter("nopLSDupClean")), tcpLSTriplets_(config.getParameter("tcpLSTriplets")), + reduceMem_(config.getParameter("reduceMem")), lstOutputToken_{produces()} {} void produce(edm::StreamID sid, device::Event& iEvent, const device::EventSetup& iSetup) const override { @@ -47,7 +48,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { &lstESDeviceData, &lstInputDC, nopLSDupClean_, - tcpLSTriplets_); + tcpLSTriplets_, + reduceMem_); // Output auto lstTrackCandidates = lst.getTrackCandidates(); @@ -63,6 +65,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { desc.add("ptCutLabel", "0.8"); desc.add("nopLSDupClean", false); desc.add("tcpLSTriplets", false); + desc.add("reduceMem", false); descriptions.addWithDefaultLabel(desc); } @@ -74,6 +77,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const uint16_t clustSizeCut_; const bool nopLSDupClean_; const bool tcpLSTriplets_; + const bool reduceMem_; const device::EDPutToken lstOutputToken_; }; diff --git a/RecoTracker/LSTCore/interface/alpaka/LST.h b/RecoTracker/LSTCore/interface/alpaka/LST.h index b03170df580a5..b5c11fc04bffd 100644 --- a/RecoTracker/LSTCore/interface/alpaka/LST.h +++ b/RecoTracker/LSTCore/interface/alpaka/LST.h @@ -24,7 +24,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { LSTESData const* deviceESData, LSTInputDeviceCollection const* lstInputDC, bool no_pls_dupclean, - bool tc_pls_triplets); + bool tc_pls_triplets, + bool reduceMem = false); std::unique_ptr getTrackCandidates() { return std::move(trackCandidatesBaseDC_); } diff --git a/RecoTracker/LSTCore/src/alpaka/LST.cc b/RecoTracker/LSTCore/src/alpaka/LST.cc index 29d719921bc94..604e42660eca5 100644 --- a/RecoTracker/LSTCore/src/alpaka/LST.cc +++ b/RecoTracker/LSTCore/src/alpaka/LST.cc @@ -17,8 +17,9 @@ void LST::run(Queue& queue, LSTESData const* deviceESData, LSTInputDeviceCollection const* lstInputDC, bool no_pls_dupclean, - bool tc_pls_triplets) { - auto event = LSTEvent(verbose, ptCut, clustSizeCut, queue, deviceESData); + bool tc_pls_triplets, + bool reduceMem) { + auto event = LSTEvent(verbose, ptCut, clustSizeCut, queue, deviceESData, reduceMem); event.addInputToEvent(lstInputDC); event.addHitToEvent(); diff --git a/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc b/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc index 111afe0a286db..5876d7574cdb5 100644 --- a/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc +++ b/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc @@ -188,31 +188,66 @@ void LSTEvent::addPixelSegmentToEventFinalize() { void LSTEvent::createMiniDoublets() { if (!miniDoubletsDC_) { - // Create a view for the element nLowerModules_ inside rangesOccupancy->miniDoubletModuleOccupancy auto rangesOccupancy = rangesDC_->view(); - auto dst_view_miniDoubletModuleOccupancy = - cms::alpakatools::make_device_view(queue_, rangesOccupancy.miniDoubletModuleOccupancy()[nLowerModules_]); - // Create a host buffer for a value to be passed to the device + // Create a host buffer for pixel MD cap auto pixelMaxMDs_buf_h = cms::alpakatools::make_host_buffer(queue_); *pixelMaxMDs_buf_h.data() = 2 * pixelSize_; - alpaka::memcpy(queue_, dst_view_miniDoubletModuleOccupancy, pixelMaxMDs_buf_h); - - auto dst_view_miniDoubletModuleOccupancyPix = - cms::alpakatools::make_device_view(queue_, rangesOccupancy.miniDoubletModuleOccupancy()[pixelModuleIndex_]); - - alpaka::memcpy(queue_, dst_view_miniDoubletModuleOccupancyPix, pixelMaxMDs_buf_h); - - auto const createMDArrayRangesGPU_workDiv = cms::alpakatools::make_workdiv(1, 1024); - - alpaka::exec(queue_, - createMDArrayRangesGPU_workDiv, - CreateMDArrayRangesGPU{}, - modules_.const_view().modules(), - hitsDC_->const_view().ranges(), - rangesDC_->view(), - ptCut_); + if (reduceMem_) { + // Reduced memory: zero occupancy, set pixel caps, then count MDs for exact sizing + auto miniDoubletModuleOccupancy_view = + cms::alpakatools::make_device_view(queue_, rangesOccupancy.miniDoubletModuleOccupancy()); + alpaka::memset(queue_, miniDoubletModuleOccupancy_view, 0u); + + auto dst_view_miniDoubletModuleOccupancy = + cms::alpakatools::make_device_view(queue_, rangesOccupancy.miniDoubletModuleOccupancy()[nLowerModules_]); + alpaka::memcpy(queue_, dst_view_miniDoubletModuleOccupancy, pixelMaxMDs_buf_h); + + auto dst_view_miniDoubletModuleOccupancyPix = + cms::alpakatools::make_device_view(queue_, rangesOccupancy.miniDoubletModuleOccupancy()[pixelModuleIndex_]); + alpaka::memcpy(queue_, dst_view_miniDoubletModuleOccupancyPix, pixelMaxMDs_buf_h); + + constexpr int threadsPerBlockY = 16; + auto const countMiniDoublets_workDiv = + cms::alpakatools::make_workdiv({nLowerModules_ / threadsPerBlockY, 1}, {threadsPerBlockY, 32}); + + alpaka::exec(queue_, + countMiniDoublets_workDiv, + CountMiniDoublets{}, + modules_.const_view().modules(), + lstInputDC_->const_view().hits(), + hitsDC_->const_view().extended(), + hitsDC_->const_view().ranges(), + rangesDC_->view(), + ptCut_, + clustSizeCut_); + + auto const createMDArrayRangesReducedMem_workDiv = cms::alpakatools::make_workdiv(1, 1024); + alpaka::exec(queue_, + createMDArrayRangesReducedMem_workDiv, + CreateMDArrayRangesReducedMem{}, + modules_.const_view().modules(), + rangesDC_->view()); + } else { + // Default: set pixel caps and use matrix-based occupancy estimates + auto dst_view_miniDoubletModuleOccupancy = + cms::alpakatools::make_device_view(queue_, rangesOccupancy.miniDoubletModuleOccupancy()[nLowerModules_]); + alpaka::memcpy(queue_, dst_view_miniDoubletModuleOccupancy, pixelMaxMDs_buf_h); + + auto dst_view_miniDoubletModuleOccupancyPix = + cms::alpakatools::make_device_view(queue_, rangesOccupancy.miniDoubletModuleOccupancy()[pixelModuleIndex_]); + alpaka::memcpy(queue_, dst_view_miniDoubletModuleOccupancyPix, pixelMaxMDs_buf_h); + + auto const createMDArrayRangesGPU_workDiv = cms::alpakatools::make_workdiv(1, 1024); + alpaka::exec(queue_, + createMDArrayRangesGPU_workDiv, + CreateMDArrayRangesGPU{}, + modules_.const_view().modules(), + hitsDC_->const_view().ranges(), + rangesDC_->view(), + ptCut_); + } auto nTotalMDs_buf_h = cms::alpakatools::make_host_buffer(queue_); auto nTotalMDs_buf_d = cms::alpakatools::make_device_view(queue_, rangesOccupancy.nTotalMDs()); @@ -289,14 +324,25 @@ void LSTEvent::createSegmentsWithModuleMap() { if (!segmentsDC_) { auto const countMDConn_wd = cms::alpakatools::make_workdiv({nLowerModules_, 1, 1}, {1, 8, 32}); - alpaka::exec(queue_, - countMDConn_wd, - CountMiniDoubletConnections{}, - modules_.const_view().modules(), - miniDoubletsDC_->view().miniDoublets(), - miniDoubletsDC_->const_view().miniDoubletsOccupancy(), - rangesDC_->const_view(), - ptCut_); + if (reduceMem_) { + alpaka::exec(queue_, + countMDConn_wd, + CountMiniDoubletConnectionsDense{}, + modules_.const_view().modules(), + miniDoubletsDC_->view().miniDoublets(), + miniDoubletsDC_->const_view().miniDoubletsOccupancy(), + rangesDC_->const_view(), + ptCut_); + } else { + alpaka::exec(queue_, + countMDConn_wd, + CountMiniDoubletConnections{}, + modules_.const_view().modules(), + miniDoubletsDC_->view().miniDoublets(), + miniDoubletsDC_->const_view().miniDoubletsOccupancy(), + rangesDC_->const_view(), + ptCut_); + } auto const createSegmentArrayRanges_workDiv = cms::alpakatools::make_workdiv(1, 1024); @@ -376,15 +422,27 @@ void LSTEvent::createTriplets() { if (!tripletsDC_) { auto const countSegConn_wd = cms::alpakatools::make_workdiv({nLowerModules_, 1, 1}, {1, 16, 16}); - alpaka::exec(queue_, - countSegConn_wd, - CountSegmentConnections{}, - modules_.const_view().modules(), - miniDoubletsDC_->const_view().miniDoublets(), - segmentsDC_->view().segments(), - segmentsDC_->const_view().segmentsOccupancy(), - rangesDC_->const_view(), - ptCut_); + if (reduceMem_) { + alpaka::exec(queue_, + countSegConn_wd, + CountSegmentConnectionsDense{}, + modules_.const_view().modules(), + miniDoubletsDC_->const_view().miniDoublets(), + segmentsDC_->view().segments(), + segmentsDC_->const_view().segmentsOccupancy(), + rangesDC_->const_view(), + ptCut_); + } else { + alpaka::exec(queue_, + countSegConn_wd, + CountSegmentConnections{}, + modules_.const_view().modules(), + miniDoubletsDC_->const_view().miniDoublets(), + segmentsDC_->view().segments(), + segmentsDC_->const_view().segmentsOccupancy(), + rangesDC_->const_view(), + ptCut_); + } auto const createTripletArrayRanges_workDiv = cms::alpakatools::make_workdiv(1, 1024); @@ -473,19 +531,35 @@ void LSTEvent::createTriplets() { auto const createTriplets_workDiv = cms::alpakatools::make_workdiv({nonZeroModules, 1, 1}, {1, 16, 16}); - alpaka::exec(queue_, - createTriplets_workDiv, - CreateTriplets{}, - modules_.const_view().modules(), - miniDoubletsDC_->const_view().miniDoublets(), - segmentsDC_->const_view().segments(), - segmentsDC_->const_view().segmentsOccupancy(), - tripletsDC_->view().triplets(), - tripletsDC_->view().tripletsOccupancy(), - rangesDC_->const_view(), - index_gpu_buf.data(), - nonZeroModules, - ptCut_); + if (reduceMem_) { + alpaka::exec(queue_, + createTriplets_workDiv, + CreateTripletsDense{}, + modules_.const_view().modules(), + miniDoubletsDC_->const_view().miniDoublets(), + segmentsDC_->const_view().segments(), + segmentsDC_->const_view().segmentsOccupancy(), + tripletsDC_->view().triplets(), + tripletsDC_->view().tripletsOccupancy(), + rangesDC_->const_view(), + index_gpu_buf.data(), + nonZeroModules, + ptCut_); + } else { + alpaka::exec(queue_, + createTriplets_workDiv, + CreateTriplets{}, + modules_.const_view().modules(), + miniDoubletsDC_->const_view().miniDoublets(), + segmentsDC_->const_view().segments(), + segmentsDC_->const_view().segmentsOccupancy(), + tripletsDC_->view().triplets(), + tripletsDC_->view().tripletsOccupancy(), + rangesDC_->const_view(), + index_gpu_buf.data(), + nonZeroModules, + ptCut_); + } auto const addTripletRangesToEventExplicit_workDiv = cms::alpakatools::make_workdiv(1, 1024); @@ -917,7 +991,8 @@ void LSTEvent::createQuintuplets() { tripletsDC_->view().triplets(), tripletsDC_->const_view().tripletsOccupancy(), rangesDC_->const_view(), - ptCut_); + ptCut_, + reduceMem_); auto const createEligibleModulesListForQuintuplets_workDiv = cms::alpakatools::make_workdiv(1, 1024); @@ -966,19 +1041,37 @@ void LSTEvent::createQuintuplets() { auto const createQuintuplets_workDiv = cms::alpakatools::make_workdiv({std::max((int)nEligibleT5Modules, 1), 1, 1}, {1, 8, 32}); - alpaka::exec(queue_, - createQuintuplets_workDiv, - CreateQuintuplets{}, - modules_.const_view().modules(), - miniDoubletsDC_->const_view().miniDoublets(), - segmentsDC_->const_view().segments(), - tripletsDC_->view().triplets(), - tripletsDC_->const_view().tripletsOccupancy(), - quintupletsDC_->view().quintuplets(), - quintupletsDC_->view().quintupletsOccupancy(), - rangesDC_->const_view(), - nEligibleT5Modules, - ptCut_); + if (reduceMem_) { + // Exact buffer sizing: use direct-iteration kernel (no preAllocatedTripletIndices) + alpaka::exec(queue_, + createQuintuplets_workDiv, + CreateQuintupletsDense{}, + modules_.const_view().modules(), + miniDoubletsDC_->const_view().miniDoublets(), + segmentsDC_->const_view().segments(), + tripletsDC_->view().triplets(), + tripletsDC_->const_view().tripletsOccupancy(), + quintupletsDC_->view().quintuplets(), + quintupletsDC_->view().quintupletsOccupancy(), + rangesDC_->const_view(), + nEligibleT5Modules, + ptCut_); + } else { + // Default: overestimated buffers with match+batch approach + alpaka::exec(queue_, + createQuintuplets_workDiv, + CreateQuintuplets{}, + modules_.const_view().modules(), + miniDoubletsDC_->const_view().miniDoublets(), + segmentsDC_->const_view().segments(), + tripletsDC_->view().triplets(), + tripletsDC_->const_view().tripletsOccupancy(), + quintupletsDC_->view().quintuplets(), + quintupletsDC_->view().quintupletsOccupancy(), + rangesDC_->const_view(), + nEligibleT5Modules, + ptCut_); + } auto const removeDupQuintupletsAfterBuild_workDiv = cms::alpakatools::make_workdiv({max_blocks, 1, 1}, {1, 16, 16}); @@ -1165,7 +1258,8 @@ void LSTEvent::createQuadruplets() { tripletsDC_->view().triplets(), tripletsDC_->const_view().tripletsOccupancy(), rangesDC_->const_view(), - ptCut_); + ptCut_, + reduceMem_); auto const createEligibleModulesListForQuadruplets_workDiv = cms::alpakatools::make_workdiv(1, 1024); @@ -1211,19 +1305,37 @@ void LSTEvent::createQuadruplets() { auto const createQuadruplets_workDiv = cms::alpakatools::make_workdiv({std::max((int)nEligibleT4Modules, 1), 1, 1}, {1, 8, 32}); - alpaka::exec(queue_, - createQuadruplets_workDiv, - CreateQuadruplets{}, - modules_.const_view().modules(), - miniDoubletsDC_->const_view().miniDoublets(), - segmentsDC_->const_view().segments(), - tripletsDC_->view().triplets(), - tripletsDC_->const_view().tripletsOccupancy(), - quadrupletsDC_->view().quadruplets(), - quadrupletsDC_->view().quadrupletsOccupancy(), - rangesDC_->const_view(), - nEligibleT4Modules, - ptCut_); + if (reduceMem_) { + // Exact buffer sizing: use direct-iteration kernel (no preAllocatedTripletIndices) + alpaka::exec(queue_, + createQuadruplets_workDiv, + CreateQuadrupletsDense{}, + modules_.const_view().modules(), + miniDoubletsDC_->const_view().miniDoublets(), + segmentsDC_->const_view().segments(), + tripletsDC_->const_view().triplets(), + tripletsDC_->const_view().tripletsOccupancy(), + quadrupletsDC_->view().quadruplets(), + quadrupletsDC_->view().quadrupletsOccupancy(), + rangesDC_->const_view(), + nEligibleT4Modules, + ptCut_); + } else { + // Default: overestimated buffers with match+batch approach + alpaka::exec(queue_, + createQuadruplets_workDiv, + CreateQuadruplets{}, + modules_.const_view().modules(), + miniDoubletsDC_->const_view().miniDoublets(), + segmentsDC_->const_view().segments(), + tripletsDC_->view().triplets(), + tripletsDC_->const_view().tripletsOccupancy(), + quadrupletsDC_->view().quadruplets(), + quadrupletsDC_->view().quadrupletsOccupancy(), + rangesDC_->const_view(), + nEligibleT4Modules, + ptCut_); + } auto const removeDupQuadrupletsAfterBuild_workDiv = cms::alpakatools::make_workdiv({max_blocks, 1, 1}, {1, 16, 16}); diff --git a/RecoTracker/LSTCore/src/alpaka/LSTEvent.h b/RecoTracker/LSTCore/src/alpaka/LSTEvent.h index ca94566cad404..91406feddc615 100644 --- a/RecoTracker/LSTCore/src/alpaka/LSTEvent.h +++ b/RecoTracker/LSTCore/src/alpaka/LSTEvent.h @@ -43,6 +43,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { Queue& queue_; const float ptCut_; const uint16_t clustSizeCut_; + const bool reduceMem_; std::array n_minidoublets_by_layer_barrel_{}; std::array n_minidoublets_by_layer_endcap_{}; @@ -101,11 +102,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { public: // Constructor used for CMSSW integration. Uses an external queue. - LSTEvent( - bool verbose, const float ptCut, const uint16_t clustSizeCut, Queue& q, const LSTESData* deviceESData) + LSTEvent(bool verbose, + const float ptCut, + const uint16_t clustSizeCut, + Queue& q, + const LSTESData* deviceESData, + bool reduceMem = false) : queue_(q), ptCut_(ptCut), clustSizeCut_(clustSizeCut), + reduceMem_(reduceMem), nModules_(deviceESData->nModules), nLowerModules_(deviceESData->nLowerModules), nPixels_(deviceESData->nPixels), diff --git a/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h b/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h index 806869522e062..9fb32231b585a 100644 --- a/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h +++ b/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h @@ -364,29 +364,29 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } template - ALPAKA_FN_ACC bool runMiniDoubletDefaultAlgoBarrel(TAcc const& acc, - ModulesConst modules, - uint16_t lowerModuleIndex, - uint16_t upperModuleIndex, - unsigned int lowerHitIndex, - unsigned int upperHitIndex, - float& dz, - float& dPhi, - float& dPhiChange, - float& shiftedX, - float& shiftedY, - float& shiftedZ, - float& noShiftedDphi, - float& noShiftedDphiChange, - float xLower, - float yLower, - float zLower, - float rtLower, - float xUpper, - float yUpper, - float zUpper, - float rtUpper, - const float ptCut) { + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runMiniDoubletDefaultAlgoBarrel(TAcc const& acc, + ModulesConst modules, + uint16_t lowerModuleIndex, + uint16_t upperModuleIndex, + unsigned int lowerHitIndex, + unsigned int upperHitIndex, + float& dz, + float& dPhi, + float& dPhiChange, + float& shiftedX, + float& shiftedY, + float& shiftedZ, + float& noShiftedDphi, + float& noShiftedDphiChange, + float xLower, + float yLower, + float zLower, + float rtLower, + float xUpper, + float yUpper, + float zUpper, + float rtUpper, + const float ptCut) { dz = zLower - zUpper; const float dzCut = modules.moduleType()[lowerModuleIndex] == PS ? 2.f : 10.f; const float sign = ((dz > 0) - (dz < 0)) * ((zLower > 0) - (zLower < 0)); @@ -492,29 +492,29 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } template - ALPAKA_FN_ACC bool runMiniDoubletDefaultAlgoEndcap(TAcc const& acc, - ModulesConst modules, - uint16_t lowerModuleIndex, - uint16_t upperModuleIndex, - unsigned int lowerHitIndex, - unsigned int upperHitIndex, - float& drt, - float& dPhi, - float& dPhiChange, - float& shiftedX, - float& shiftedY, - float& shiftedZ, - float& noShiftedDphi, - float& noShiftedDphichange, - float xLower, - float yLower, - float zLower, - float rtLower, - float xUpper, - float yUpper, - float zUpper, - float rtUpper, - const float ptCut) { + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runMiniDoubletDefaultAlgoEndcap(TAcc const& acc, + ModulesConst modules, + uint16_t lowerModuleIndex, + uint16_t upperModuleIndex, + unsigned int lowerHitIndex, + unsigned int upperHitIndex, + float& drt, + float& dPhi, + float& dPhiChange, + float& shiftedX, + float& shiftedY, + float& shiftedZ, + float& noShiftedDphi, + float& noShiftedDphichange, + float xLower, + float yLower, + float zLower, + float rtLower, + float xUpper, + float yUpper, + float zUpper, + float rtUpper, + const float ptCut) { // There are series of cuts that applies to mini-doublet in a "endcap" region // Cut #1 : dz cut. The dz difference can't be larger than 1cm. (max separation is 4mm for modules in the endcap) // Ref to original code: https://github.com/slava77/cms-tkph2-ntuple/blob/184d2325147e6930030d3d1f780136bc2dd29ce6/doubletAnalysis.C#L3093 @@ -604,32 +604,32 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } template - ALPAKA_FN_ACC bool runMiniDoubletDefaultAlgo(TAcc const& acc, - ModulesConst modules, - uint16_t lowerModuleIndex, - uint16_t upperModuleIndex, - unsigned int lowerHitIndex, - unsigned int upperHitIndex, - float& dz, - float& dPhi, - float& dPhiChange, - float& shiftedX, - float& shiftedY, - float& shiftedZ, - float& noShiftedDphi, - float& noShiftedDphiChange, - float xLower, - float yLower, - float zLower, - float rtLower, - float xUpper, - float yUpper, - float zUpper, - float rtUpper, - const float ptCut, - uint16_t clustSizeLower, - uint16_t clustSizeUpper, - const uint16_t clustSizeCut) { + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runMiniDoubletDefaultAlgo(TAcc const& acc, + ModulesConst modules, + uint16_t lowerModuleIndex, + uint16_t upperModuleIndex, + unsigned int lowerHitIndex, + unsigned int upperHitIndex, + float& dz, + float& dPhi, + float& dPhiChange, + float& shiftedX, + float& shiftedY, + float& shiftedZ, + float& noShiftedDphi, + float& noShiftedDphiChange, + float xLower, + float yLower, + float zLower, + float rtLower, + float xUpper, + float yUpper, + float zUpper, + float rtUpper, + const float ptCut, + uint16_t clustSizeLower, + uint16_t clustSizeUpper, + const uint16_t clustSizeCut) { if (clustSizeLower > clustSizeCut or clustSizeUpper > clustSizeCut) { return false; } @@ -818,6 +818,113 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { return -1; } + // Counting kernel: runs full MD algorithm to get exact counts per module. + // Only launched when reduceMem is enabled. + struct CountMiniDoublets { + ALPAKA_FN_ACC void operator()(Acc2D const& acc, + ModulesConst modules, + HitsBaseConst hitsBase, + HitsExtendedConst hitsExtended, + HitsRangesConst hitsRanges, + ObjectRanges ranges, + const float ptCut, + const uint16_t clustSizeCut) const { + for (uint16_t lowerModuleIndex : cms::alpakatools::uniform_elements_y(acc, modules.nLowerModules())) { + uint16_t upperModuleIndex = modules.partnerModuleIndices()[lowerModuleIndex]; + int nLowerHits = hitsRanges.hitRangesnLower()[lowerModuleIndex]; + int nUpperHits = hitsRanges.hitRangesnUpper()[lowerModuleIndex]; + if (hitsRanges.hitRangesLower()[lowerModuleIndex] == -1) + continue; + unsigned int upHitArrayIndex = hitsRanges.hitRangesUpper()[lowerModuleIndex]; + unsigned int loHitArrayIndex = hitsRanges.hitRangesLower()[lowerModuleIndex]; + int limit = nUpperHits * nLowerHits; + + for (int hitIndex : cms::alpakatools::uniform_elements_x(acc, limit)) { + int lowerHitIndex = hitIndex / nUpperHits; + int upperHitIndex = hitIndex % nUpperHits; + if (upperHitIndex >= nUpperHits) + continue; + if (lowerHitIndex >= nLowerHits) + continue; + unsigned int lowerHitArrayIndex = loHitArrayIndex + lowerHitIndex; + float xLower = hitsBase.xs()[lowerHitArrayIndex]; + float yLower = hitsBase.ys()[lowerHitArrayIndex]; + float zLower = hitsBase.zs()[lowerHitArrayIndex]; + float rtLower = hitsExtended.rts()[lowerHitArrayIndex]; + unsigned int upperHitArrayIndex = upHitArrayIndex + upperHitIndex; + float xUpper = hitsBase.xs()[upperHitArrayIndex]; + float yUpper = hitsBase.ys()[upperHitArrayIndex]; + float zUpper = hitsBase.zs()[upperHitArrayIndex]; + float rtUpper = hitsExtended.rts()[upperHitArrayIndex]; + uint16_t clustSizeLower = hitsBase.clustsize()[lowerHitArrayIndex]; + uint16_t clustSizeUpper = hitsBase.clustsize()[upperHitArrayIndex]; + + float dz, dphi, dphichange, shiftedX, shiftedY, shiftedZ, noShiftedDphi, noShiftedDphiChange; + bool success = runMiniDoubletDefaultAlgo(acc, + modules, + lowerModuleIndex, + upperModuleIndex, + lowerHitArrayIndex, + upperHitArrayIndex, + dz, + dphi, + dphichange, + shiftedX, + shiftedY, + shiftedZ, + noShiftedDphi, + noShiftedDphiChange, + xLower, + yLower, + zLower, + rtLower, + xUpper, + yUpper, + zUpper, + rtUpper, + ptCut, + clustSizeLower, + clustSizeUpper, + clustSizeCut); + if (success) { + alpaka::atomicAdd( + acc, &ranges.miniDoubletModuleOccupancy()[lowerModuleIndex], 1, alpaka::hierarchy::Threads{}); + } + } + } + } + }; + + // Reduced-memory version of CreateMDArrayRangesGPU: reads pre-computed exact counts + // from CountMiniDoublets instead of using matrix-based caps. + // Only launched when reduceMem is enabled. + struct CreateMDArrayRangesReducedMem { + ALPAKA_FN_ACC void operator()(Acc1D const& acc, ModulesConst modules, ObjectRanges ranges) const { + // implementation is 1D with a single block + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + + int& nTotalMDs = alpaka::declareSharedVar(acc); + if (cms::alpakatools::once_per_block(acc)) { + nTotalMDs = 0; + } + alpaka::syncBlockThreads(acc); + + for (uint16_t i : cms::alpakatools::uniform_elements(acc, modules.nLowerModules())) { + // Exact count was pre-computed by CountMiniDoublets + int occupancy = ranges.miniDoubletModuleOccupancy()[i]; + unsigned int nTotMDs = alpaka::atomicAdd(acc, &nTotalMDs, occupancy, alpaka::hierarchy::Threads{}); + + ranges.miniDoubletModuleIndices()[i] = nTotMDs; + } + + alpaka::syncBlockThreads(acc); + if (cms::alpakatools::once_per_block(acc)) { + ranges.miniDoubletModuleIndices()[modules.nLowerModules()] = nTotalMDs; + ranges.nTotalMDs() = nTotalMDs; + } + } + }; + struct CreateMDArrayRangesGPU { ALPAKA_FN_ACC void operator()(Acc1D const& acc, ModulesConst modules, diff --git a/RecoTracker/LSTCore/src/alpaka/Quadruplet.h b/RecoTracker/LSTCore/src/alpaka/Quadruplet.h index e4b342937c958..302527dd11e0c 100644 --- a/RecoTracker/LSTCore/src/alpaka/Quadruplet.h +++ b/RecoTracker/LSTCore/src/alpaka/Quadruplet.h @@ -521,6 +521,165 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { return true; }; + // Reduced-memory version: processes dense T4 pairs in a separate kernel. + // Only launched when reduceMem is enabled. + struct CreateQuadrupletsDense { + ALPAKA_FN_ACC void operator()(Acc3D const& acc, + ModulesConst modules, + MiniDoubletsConst mds, + SegmentsConst segments, + TripletsConst triplets, + TripletsOccupancyConst tripletsOccupancy, + Quadruplets quadruplets, + QuadrupletsOccupancy quadrupletsOccupancy, + ObjectRangesConst ranges, + uint16_t nEligibleT4Modules, + const float ptCut) const { + const auto& mdIndices = segments.mdIndices(); + const auto& segIdx = triplets.segmentIndices(); + const auto& lmIdx = triplets.lowerModuleIndices(); + const auto& tripIdx = ranges.tripletModuleIndices(); + + for (int iter : cms::alpakatools::uniform_groups_z(acc, nEligibleT4Modules)) { + const uint16_t lowerModule1 = ranges.indicesOfEligibleT4Modules()[iter]; + + short layer2_adjustment, md_adjustment; + int layer = modules.layers()[lowerModule1]; + if (layer == 1) { + if (modules.subdets()[lowerModule1] != Endcap) + continue; + layer2_adjustment = 1; + md_adjustment = 1; + } else if (layer == 2) { + if (modules.subdets()[lowerModule1] != Endcap) + continue; + layer2_adjustment = 1; + md_adjustment = 0; + } else { + layer2_adjustment = 0; + md_adjustment = 0; + } + const unsigned int nInnerTriplets = tripletsOccupancy.nTriplets()[lowerModule1]; + + for (unsigned int innerTripletArrayIndex : cms::alpakatools::uniform_elements_y(acc, nInnerTriplets)) { + const unsigned int innerTripletIndex = tripIdx[lowerModule1] + innerTripletArrayIndex; + if (triplets.partOfPT5()[innerTripletIndex]) + continue; + if (triplets.partOfT5()[innerTripletIndex]) + continue; + if (triplets.partOfPT3()[innerTripletIndex]) + continue; + const uint16_t lowerModule2 = lmIdx[innerTripletIndex][1]; + const unsigned int nOuterTriplets = tripletsOccupancy.nTriplets()[lowerModule2]; + for (unsigned int outerTripletArrayIndex : cms::alpakatools::uniform_elements_x(acc, nOuterTriplets)) { + unsigned int outerTripletIndex = tripIdx[lowerModule2] + outerTripletArrayIndex; + if (triplets.partOfPT5()[outerTripletIndex]) + continue; + if (triplets.partOfT5()[outerTripletIndex]) + continue; + if (triplets.partOfPT3()[outerTripletIndex]) + continue; + + const unsigned int innerT3LS2Index = segIdx[innerTripletIndex][1]; + const unsigned int outerT3LS1Index = segIdx[outerTripletIndex][0]; + + if (innerT3LS2Index != outerT3LS1Index) + continue; + + // When launched as the sole creation kernel (reduceMem mode), processes ALL pairs. + // When launched alongside CreateQuadruplets, processes only dense pairs. + + const uint16_t lowerModule3 = lmIdx[outerTripletIndex][1]; + const uint16_t lowerModule4 = lmIdx[outerTripletIndex][2]; + + float innerRadius = triplets.radius()[innerTripletIndex]; + float outerRadius = triplets.radius()[outerTripletIndex]; + float rzChiSquared, dBeta, nonAnchorChiSquared, regressionCenterX, regressionCenterY, regressionRadius, + nonAnchorRegressionRadius, chiSquared, promptScore, displacedScore, fakeScore; + + float pt = (innerRadius + outerRadius) * k2Rinv1GeVf; + + bool success = runQuadrupletDefaultAlgo(acc, + modules, + mds, + segments, + triplets, + lowerModule1, + lowerModule2, + lowerModule3, + lowerModule4, + innerTripletIndex, + outerTripletIndex, + regressionCenterX, + regressionCenterY, + regressionRadius, + nonAnchorRegressionRadius, + chiSquared, + ptCut, + rzChiSquared, + nonAnchorChiSquared, + dBeta, + promptScore, + displacedScore, + fakeScore); + if (success) { + int totOccupancyQuadruplets = alpaka::atomicAdd( + acc, &quadrupletsOccupancy.totOccupancyQuadruplets()[lowerModule1], 1u, alpaka::hierarchy::Threads{}); + if (totOccupancyQuadruplets >= ranges.quadrupletModuleOccupancy()[lowerModule1]) { +#ifdef WARNINGS + printf("Quadruplet excess alert! Module index = %d, Occupancy = %d\n", + lowerModule1, + totOccupancyQuadruplets); +#endif + } else { + int quadrupletModuleIndex = alpaka::atomicAdd( + acc, &quadrupletsOccupancy.nQuadruplets()[lowerModule1], 1u, alpaka::hierarchy::Threads{}); + if (ranges.quadrupletModuleIndices()[lowerModule1] == -1) { +#ifdef WARNINGS + printf("Quadruplets : no memory for module at module index = %d\n", lowerModule1); +#endif + } else { + unsigned int quadrupletIndex = ranges.quadrupletModuleIndices()[lowerModule1] + quadrupletModuleIndex; + const unsigned int layer3MDIndex = + mdIndices[segIdx[innerTripletIndex][md_adjustment]][layer2_adjustment]; + float phi = mds.anchorPhi()[layer3MDIndex]; + float eta = mds.anchorEta()[layer3MDIndex]; + + float scores = chiSquared + nonAnchorChiSquared; + addQuadrupletToMemory(triplets, + quadruplets, + innerTripletIndex, + outerTripletIndex, + lowerModule1, + lowerModule2, + lowerModule3, + lowerModule4, + innerRadius, + outerRadius, + pt, + eta, + phi, + scores, + layer, + quadrupletIndex, + rzChiSquared, + dBeta, + promptScore, + displacedScore, + fakeScore, + regressionCenterX, + regressionCenterY, + regressionRadius, + nonAnchorRegressionRadius); + } + } + } + } + } + } + } + }; + struct CreateQuadruplets { ALPAKA_FN_ACC void operator()(Acc3D const& acc, ModulesConst modules, @@ -610,8 +769,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { if (innerT3LS2Index != outerT3LS1Index) continue; - // If densely connected, do not attempt parallel processing to avoid truncation if (nInnerTriplets >= kNTripletThreshold || nOuterTriplets >= kNTripletThreshold) { + // Dense pairs: handle inline const uint16_t lowerModule3 = lmIdx[outerTripletIndex][1]; const uint16_t lowerModule4 = lmIdx[outerTripletIndex][2]; @@ -843,7 +1002,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { Triplets triplets, TripletsOccupancyConst tripletsOcc, ObjectRangesConst ranges, - const float ptCut) const { + const float ptCut, + const bool reduceMem = false) const { // The atomicAdd below with hierarchy::Threads{} requires one block in x, y dimensions. ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[1] == 1) && (alpaka::getWorkDiv(acc)[2] == 1)); @@ -879,8 +1039,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { const unsigned int thirdMDOuter = mdIndices[thirdSegIdx][1]; if ((secondMDInner == thirdMDInner) && (secondMDOuter == thirdMDOuter)) { - // Will only perform runQuadrupletDefaultAlgorithm() checks if densely connected - if (nInnerTriplets < kNTripletThreshold && nOuterTriplets < kNTripletThreshold) { + // When reduceMem or densely connected: run full algo for exact count + if (!reduceMem && nInnerTriplets < kNTripletThreshold && nOuterTriplets < kNTripletThreshold) { alpaka::atomicAdd(acc, &triplets.connectedLSMax()[innerTripletIndex], 1u, alpaka::hierarchy::Threads{}); } else { const uint16_t lowerModule3 = lmIdx[outerTripletIndex][1]; diff --git a/RecoTracker/LSTCore/src/alpaka/Quintuplet.h b/RecoTracker/LSTCore/src/alpaka/Quintuplet.h index 15b0aa1f7f405..739e97d8e2406 100644 --- a/RecoTracker/LSTCore/src/alpaka/Quintuplet.h +++ b/RecoTracker/LSTCore/src/alpaka/Quintuplet.h @@ -1682,6 +1682,157 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { return true; } + // Reduced-memory version: processes dense T5 pairs in a separate kernel. + // Only launched when reduceMem is enabled. + struct CreateQuintupletsDense { + ALPAKA_FN_ACC void operator()(Acc3D const& acc, + ModulesConst modules, + MiniDoubletsConst mds, + SegmentsConst segments, + Triplets triplets, + TripletsOccupancyConst tripletsOccupancy, + Quintuplets quintuplets, + QuintupletsOccupancy quintupletsOccupancy, + ObjectRangesConst ranges, + uint16_t nEligibleT5Modules, + const float ptCut) const { + for (int iter : cms::alpakatools::uniform_groups_z(acc, nEligibleT5Modules)) { + uint16_t lowerModule1 = ranges.indicesOfEligibleT5Modules()[iter]; + + short layer2_adjustment; + int layer = modules.layers()[lowerModule1]; + if (layer == 1) { + layer2_adjustment = 1; + } else if (layer == 2) { + layer2_adjustment = 0; + } else { + continue; + } + + unsigned int nInnerTriplets = tripletsOccupancy.nTriplets()[lowerModule1]; + + for (unsigned int innerTripletArrayIndex : cms::alpakatools::uniform_elements_y(acc, nInnerTriplets)) { + unsigned int innerTripletIndex = ranges.tripletModuleIndices()[lowerModule1] + innerTripletArrayIndex; + uint16_t lowerModule3 = triplets.lowerModuleIndices()[innerTripletIndex][2]; + unsigned int nOuterTriplets = tripletsOccupancy.nTriplets()[lowerModule3]; + for (unsigned int outerTripletArrayIndex : cms::alpakatools::uniform_elements_x(acc, nOuterTriplets)) { + unsigned int outerTripletIndex = ranges.tripletModuleIndices()[lowerModule3] + outerTripletArrayIndex; + + unsigned int secondSegmentIndex = triplets.segmentIndices()[innerTripletIndex][1]; + unsigned int thirdSegmentIndex = triplets.segmentIndices()[outerTripletIndex][0]; + + unsigned int miniDoublet3Index = segments.mdIndices()[secondSegmentIndex][1]; + unsigned int outerInnerInnerMiniDoubletIndex = segments.mdIndices()[thirdSegmentIndex][0]; + + if (miniDoublet3Index != outerInnerInnerMiniDoubletIndex) + continue; + + // When launched as the sole creation kernel (reduceMem mode), processes ALL pairs. + // When launched alongside CreateQuintuplets, processes only dense pairs. + + uint16_t lowerModule2 = triplets.lowerModuleIndices()[innerTripletIndex][1]; + uint16_t lowerModule4 = triplets.lowerModuleIndices()[outerTripletIndex][1]; + uint16_t lowerModule5 = triplets.lowerModuleIndices()[outerTripletIndex][2]; + + float innerRadius, outerRadius, bridgeRadius, regressionCenterX, regressionCenterY, regressionRadius, + rzChiSquared, chiSquared, nonAnchorChiSquared, dBeta1, dBeta2, dnnScore; + + float t5Embed[Params_T5::kEmbed] = {0.f}; + + bool tightCutFlag = false; + + bool success = runQuintupletDefaultAlgo(acc, + modules, + mds, + segments, + triplets, + lowerModule1, + lowerModule2, + lowerModule3, + lowerModule4, + lowerModule5, + innerTripletIndex, + outerTripletIndex, + innerRadius, + outerRadius, + bridgeRadius, + regressionCenterX, + regressionCenterY, + regressionRadius, + rzChiSquared, + chiSquared, + nonAnchorChiSquared, + dBeta1, + dBeta2, + dnnScore, + tightCutFlag, + t5Embed, + ptCut); + if (success) { + int totOccupancyQuintuplets = alpaka::atomicAdd( + acc, &quintupletsOccupancy.totOccupancyQuintuplets()[lowerModule1], 1u, alpaka::hierarchy::Threads{}); + if (totOccupancyQuintuplets >= ranges.quintupletModuleOccupancy()[lowerModule1]) { +#ifdef WARNINGS + printf("Quintuplet excess alert! Module index = %d, Occupancy = %d\n", + lowerModule1, + totOccupancyQuintuplets); +#endif + } else { + int quintupletModuleIndex = alpaka::atomicAdd( + acc, &quintupletsOccupancy.nQuintuplets()[lowerModule1], 1u, alpaka::hierarchy::Threads{}); + if (ranges.quintupletModuleIndices()[lowerModule1] == -1) { +#ifdef WARNINGS + printf("Quintuplets : no memory for module at module index = %d\n", lowerModule1); +#endif + } else { + unsigned int quintupletIndex = ranges.quintupletModuleIndices()[lowerModule1] + quintupletModuleIndex; + float phi = mds.anchorPhi()[segments.mdIndices()[triplets.segmentIndices()[innerTripletIndex][0]] + [layer2_adjustment]]; + float eta = mds.anchorEta()[segments.mdIndices()[triplets.segmentIndices()[innerTripletIndex][0]] + [layer2_adjustment]]; + float pt = (innerRadius + outerRadius) * k2Rinv1GeVf; + float scores = chiSquared + nonAnchorChiSquared; + addQuintupletToMemory(triplets, + quintuplets, + innerTripletIndex, + outerTripletIndex, + lowerModule1, + lowerModule2, + lowerModule3, + lowerModule4, + lowerModule5, + innerRadius, + bridgeRadius, + outerRadius, + regressionCenterX, + regressionCenterY, + regressionRadius, + rzChiSquared, + chiSquared, + nonAnchorChiSquared, + dBeta1, + dBeta2, + pt, + eta, + phi, + scores, + layer, + quintupletIndex, + t5Embed, + tightCutFlag, + dnnScore); + + triplets.partOfT5()[quintuplets.tripletIndices()[quintupletIndex][0]] = true; + triplets.partOfT5()[quintuplets.tripletIndices()[quintupletIndex][1]] = true; + } + } + } + } + } + } + } + }; + struct CreateQuintuplets { ALPAKA_FN_ACC void operator()(Acc3D const& acc, ModulesConst modules, @@ -1753,15 +1904,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { if (miniDoublet3Index != outerInnerInnerMiniDoubletIndex) continue; - // If densely connected, do not attempt parallel processing to avoid truncation if (nInnerTriplets >= kNTripletThreshold || nOuterTriplets >= kNTripletThreshold) { + // Dense pairs: handle inline uint16_t lowerModule2 = triplets.lowerModuleIndices()[innerTripletIndex][1]; uint16_t lowerModule4 = triplets.lowerModuleIndices()[outerTripletIndex][1]; uint16_t lowerModule5 = triplets.lowerModuleIndices()[outerTripletIndex][2]; float innerRadius, outerRadius, bridgeRadius, regressionCenterX, regressionCenterY, regressionRadius, - rzChiSquared, chiSquared, nonAnchorChiSquared, dBeta1, dBeta2, - dnnScore; //required for making distributions + rzChiSquared, chiSquared, nonAnchorChiSquared, dBeta1, dBeta2, dnnScore; float t5Embed[Params_T5::kEmbed] = {0.f}; @@ -2013,7 +2163,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { Triplets triplets, TripletsOccupancyConst tripletsOcc, ObjectRangesConst ranges, - const float ptCut) const { + const float ptCut, + const bool reduceMem = false) const { // The atomicAdd below with hierarchy::Threads{} requires one block in x, y dimensions. ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[1] == 1) && (alpaka::getWorkDiv(acc)[2] == 1)); @@ -2047,8 +2198,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { const unsigned int thirdMDInner = mdIndices[thirdSegIdx][0]; if (secondMDOuter == thirdMDInner) { - // Will only perform runQuintupletDefaultAlgorithm() checks if densely connected - if (nInnerTriplets < kNTripletThreshold && nOuterTriplets < kNTripletThreshold) { + // When reduceMem or densely connected: run full algo for exact count + if (!reduceMem && nInnerTriplets < kNTripletThreshold && nOuterTriplets < kNTripletThreshold) { alpaka::atomicAdd(acc, &triplets.connectedMax()[innerTripletIndex], 1u, alpaka::hierarchy::Threads{}); } else { const uint16_t lowerModule2 = lmIdx[innerTripletIndex][1]; diff --git a/RecoTracker/LSTCore/src/alpaka/Segment.h b/RecoTracker/LSTCore/src/alpaka/Segment.h index 6437625761950..4ed402c9da934 100644 --- a/RecoTracker/LSTCore/src/alpaka/Segment.h +++ b/RecoTracker/LSTCore/src/alpaka/Segment.h @@ -863,6 +863,80 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } }; + // Reduced-memory version of CountMiniDoubletConnections: runs full segment algorithm + // for exact segment counts instead of the delta-phi precut. + // Only launched when reduceMem is enabled. + struct CountMiniDoubletConnectionsDense { + ALPAKA_FN_ACC void operator()(Acc3D const& acc, + ModulesConst modules, + MiniDoublets mds, + MiniDoubletsOccupancyConst mdsOccupancy, + ObjectRangesConst ranges, + const float ptCut) const { + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[1] == 1) && + (alpaka::getWorkDiv(acc)[2] == 1)); + const auto& mdRanges = ranges.mdRanges(); + + for (uint16_t innerLowerModuleIndex : cms::alpakatools::uniform_groups_z(acc, modules.nLowerModules())) { + const unsigned int nInnerMDs = mdsOccupancy.nMDs()[innerLowerModuleIndex]; + if (nInnerMDs == 0) + continue; + + const uint16_t nConnectedModules = modules.nConnectedModules()[innerLowerModuleIndex]; + if (nConnectedModules == 0) + continue; + + for (uint16_t outerLowerModuleArrayIdx : cms::alpakatools::uniform_elements_y(acc, nConnectedModules)) { + const uint16_t outerLowerModuleIndex = modules.moduleMap()[innerLowerModuleIndex][outerLowerModuleArrayIdx]; + const unsigned int nOuterMDs = mdsOccupancy.nMDs()[outerLowerModuleIndex]; + if (nOuterMDs == 0) + continue; + + const unsigned int limit = nInnerMDs * nOuterMDs; + + for (unsigned int hitIndex : cms::alpakatools::uniform_elements_x(acc, limit)) { + const unsigned int innerMDArrayIdx = hitIndex / nOuterMDs; + const unsigned int outerMDArrayIdx = hitIndex % nOuterMDs; + + const unsigned int innerMDIndex = mdRanges[innerLowerModuleIndex][0] + innerMDArrayIdx; + const unsigned int outerMDIndex = mdRanges[outerLowerModuleIndex][0] + outerMDArrayIdx; + + // Run full segment algo for exact count + float dPhi, dPhiMin, dPhiMax, dPhiChange, dPhiChangeMin, dPhiChangeMax; +#ifdef CUT_VALUE_DEBUG + float dAlphaInnerMDSegment, dAlphaOuterMDSegment, dAlphaInnerMDOuterMD, zLo, zHi, rtLo, rtHi; +#endif + if (runSegmentDefaultAlgo(acc, + modules, + mds, + innerLowerModuleIndex, + outerLowerModuleIndex, + innerMDIndex, + outerMDIndex, + dPhi, + dPhiMin, + dPhiMax, + dPhiChange, + dPhiChangeMin, + dPhiChangeMax, +#ifdef CUT_VALUE_DEBUG + dAlphaInnerMDSegment, + dAlphaOuterMDSegment, + dAlphaInnerMDOuterMD, + zLo, + zHi, + rtLo, + rtHi, +#endif + ptCut)) { + alpaka::atomicAdd(acc, &mds.connectedMax()[innerMDIndex], 1u, alpaka::hierarchy::Threads{}); + } + } + } + } + } + }; + struct CreateSegmentArrayRanges { ALPAKA_FN_ACC void operator()(Acc1D const& acc, ModulesConst modules, diff --git a/RecoTracker/LSTCore/src/alpaka/Triplet.h b/RecoTracker/LSTCore/src/alpaka/Triplet.h index 81c166cd1f4f5..858667bc6da27 100644 --- a/RecoTracker/LSTCore/src/alpaka/Triplet.h +++ b/RecoTracker/LSTCore/src/alpaka/Triplet.h @@ -459,6 +459,132 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { return true; } + // Reduced-memory version of CreateTriplets: directly processes all segment pairs + // with full T3 algorithm. Only launched when reduceMem is enabled, replacing CreateTriplets. + struct CreateTripletsDense { + ALPAKA_FN_ACC void operator()(Acc3D const& acc, + ModulesConst modules, + MiniDoubletsConst mds, + SegmentsConst segments, + SegmentsOccupancyConst segmentsOccupancy, + Triplets triplets, + TripletsOccupancy tripletsOccupancy, + ObjectRangesConst ranges, + uint16_t* index_gpu, + uint16_t nonZeroModules, + const float ptCut) const { + for (uint16_t innerLowerModuleArrayIdx : cms::alpakatools::uniform_groups_z(acc, nonZeroModules)) { + uint16_t innerInnerLowerModuleIndex = index_gpu[innerLowerModuleArrayIdx]; + if (innerInnerLowerModuleIndex >= modules.nLowerModules()) + continue; + + uint16_t nConnectedModules = modules.nConnectedModules()[innerInnerLowerModuleIndex]; + if (nConnectedModules == 0) + continue; + + unsigned int nInnerSegments = segmentsOccupancy.nSegments()[innerInnerLowerModuleIndex]; + if (nInnerSegments == 0) + continue; + + for (unsigned int innerSegmentArrayIndex : cms::alpakatools::uniform_elements_y(acc, nInnerSegments)) { + unsigned int innerSegmentIndex = + ranges.segmentRanges()[innerInnerLowerModuleIndex][0] + innerSegmentArrayIndex; + + uint16_t middleLowerModuleIndex = segments.outerLowerModuleIndices()[innerSegmentIndex]; + int middleMDIndiceInner = segments.mdIndices()[innerSegmentIndex][1]; + + unsigned int nOuterSegments = segmentsOccupancy.nSegments()[middleLowerModuleIndex]; + for (unsigned int outerSegmentArrayIndex : cms::alpakatools::uniform_elements_x(acc, nOuterSegments)) { + unsigned int outerSegmentIndex = ranges.segmentRanges()[middleLowerModuleIndex][0] + outerSegmentArrayIndex; + + int middleMDIndiceOuter = segments.mdIndices()[outerSegmentIndex][0]; + if (middleMDIndiceInner != middleMDIndiceOuter) + continue; + + uint16_t outerOuterLowerModuleIndex = segments.outerLowerModuleIndices()[outerSegmentIndex]; + unsigned int firstMDIndex = segments.mdIndices()[innerSegmentIndex][0]; + unsigned int secondMDIndex = segments.mdIndices()[outerSegmentIndex][0]; + unsigned int thirdMDIndex = segments.mdIndices()[outerSegmentIndex][1]; + + if (not passPointingConstraint(acc, + modules, + mds, + segments, + firstMDIndex, + secondMDIndex, + thirdMDIndex, + innerInnerLowerModuleIndex, + middleLowerModuleIndex, + outerOuterLowerModuleIndex, + innerSegmentIndex, + ptCut)) + continue; + + float betaIn, betaInCut, circleRadius, circleCenterX, circleCenterY; + short charge; + float t3Scores[dnn::t3dnn::kOutputFeatures] = {0.f}; + + bool success = runTripletConstraintsAndAlgo(acc, + modules, + mds, + segments, + innerInnerLowerModuleIndex, + middleLowerModuleIndex, + outerOuterLowerModuleIndex, + innerSegmentIndex, + outerSegmentIndex, + betaIn, + betaInCut, + circleRadius, + circleCenterX, + circleCenterY, + ptCut, + t3Scores, + charge); + if (success) { + unsigned int totOccupancyTriplets = + alpaka::atomicAdd(acc, + &tripletsOccupancy.totOccupancyTriplets()[innerInnerLowerModuleIndex], + 1u, + alpaka::hierarchy::Threads{}); + if (static_cast(totOccupancyTriplets) >= + ranges.tripletModuleOccupancy()[innerInnerLowerModuleIndex]) { +#ifdef WARNINGS + printf("Triplet excess alert! Module index = %d, Occupancy = %d\n", + innerInnerLowerModuleIndex, + totOccupancyTriplets); +#endif + } else { + unsigned int tripletModuleIndex = alpaka::atomicAdd( + acc, &tripletsOccupancy.nTriplets()[innerInnerLowerModuleIndex], 1u, alpaka::hierarchy::Threads{}); + unsigned int tripletIndex = + ranges.tripletModuleIndices()[innerInnerLowerModuleIndex] + tripletModuleIndex; + + addTripletToMemory(modules, + mds, + segments, + triplets, + innerSegmentIndex, + outerSegmentIndex, + innerInnerLowerModuleIndex, + middleLowerModuleIndex, + outerOuterLowerModuleIndex, + betaIn, + betaInCut, + circleRadius, + circleCenterX, + circleCenterY, + tripletIndex, + t3Scores, + charge); + } + } + } + } + } + } + }; + struct CreateTriplets { ALPAKA_FN_ACC void operator()(Acc3D const& acc, ModulesConst modules, @@ -704,6 +830,91 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } }; + // Reduced-memory version of CountSegmentConnections: runs full T3 algorithm + // for exact triplet counts instead of just counting pointing-constraint matches. + // Only launched when reduceMem is enabled. + struct CountSegmentConnectionsDense { + ALPAKA_FN_ACC void operator()(Acc3D const& acc, + ModulesConst modules, + MiniDoubletsConst mds, + Segments segments, + SegmentsOccupancyConst segOcc, + ObjectRangesConst ranges, + const float ptCut) const { + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[1] == 1) && + (alpaka::getWorkDiv(acc)[2] == 1)); + const auto& mdIndices = segments.mdIndices(); + const auto& outerLowerModuleIndices = segments.outerLowerModuleIndices(); + const auto& segmentRanges = ranges.segmentRanges(); + + for (uint16_t innerLowerModuleArrayIdx : cms::alpakatools::uniform_groups_z(acc, modules.nLowerModules())) { + const unsigned int nInnerSegments = segOcc.nSegments()[innerLowerModuleArrayIdx]; + if (nInnerSegments == 0) + continue; + + for (unsigned int innerSegmentArrayIndex : cms::alpakatools::uniform_elements_y(acc, nInnerSegments)) { + const unsigned int innerSegmentIndex = segmentRanges[innerLowerModuleArrayIdx][0] + innerSegmentArrayIndex; + const uint16_t middleLowerModuleIndex = outerLowerModuleIndices[innerSegmentIndex]; + const unsigned int mdShared = mdIndices[innerSegmentIndex][1]; + + const unsigned int nOuterSegments = segOcc.nSegments()[middleLowerModuleIndex]; + if (nOuterSegments == 0) + continue; + + for (unsigned int outerSegmentArrayIndex : cms::alpakatools::uniform_elements_x(acc, nOuterSegments)) { + const unsigned int outerSegmentIndex = segmentRanges[middleLowerModuleIndex][0] + outerSegmentArrayIndex; + + if (mdIndices[outerSegmentIndex][0] != mdShared) + continue; + + unsigned int firstMDIndex = mdIndices[innerSegmentIndex][0]; + unsigned int secondMDIndex = mdShared; + unsigned int thirdMDIndex = mdIndices[outerSegmentIndex][1]; + uint16_t outerOuterLowerModuleIndex = outerLowerModuleIndices[outerSegmentIndex]; + + if (not passPointingConstraint(acc, + modules, + mds, + segments, + firstMDIndex, + secondMDIndex, + thirdMDIndex, + innerLowerModuleArrayIdx, + middleLowerModuleIndex, + outerOuterLowerModuleIndex, + innerSegmentIndex, + ptCut)) + continue; + + // Run full triplet algo for exact count + float betaIn, betaInCut, circleRadius, circleCenterX, circleCenterY; + short charge; + float t3Scores[dnn::t3dnn::kOutputFeatures] = {0.f}; + if (runTripletConstraintsAndAlgo(acc, + modules, + mds, + segments, + innerLowerModuleArrayIdx, + middleLowerModuleIndex, + outerOuterLowerModuleIndex, + innerSegmentIndex, + outerSegmentIndex, + betaIn, + betaInCut, + circleRadius, + circleCenterX, + circleCenterY, + ptCut, + t3Scores, + charge)) { + alpaka::atomicAdd(acc, &segments.connectedMax()[innerSegmentIndex], 1u, alpaka::hierarchy::Threads{}); + } + } + } + } + } + }; + struct CreateTripletArrayRanges { ALPAKA_FN_ACC void operator()(Acc1D const& acc, ModulesConst modules, diff --git a/RecoTracker/LSTCore/standalone/bin/lst.cc b/RecoTracker/LSTCore/standalone/bin/lst.cc index bd1d2825309c1..879d2803ebdcf 100644 --- a/RecoTracker/LSTCore/standalone/bin/lst.cc +++ b/RecoTracker/LSTCore/standalone/bin/lst.cc @@ -72,7 +72,8 @@ int main(int argc, char **argv) { "I,job_index", "job_index of split jobs (--nsplit_jobs must be set. index starts from 0. i.e. 0, 1, 2, 3, etc...)", cxxopts::value())("3,tc_pls_triplets", "Allow triplet pLSs in TC collection")( - "2,no_pls_dupclean", "Disable pLS duplicate cleaning (both steps)")("h,help", "Print help")( + "2,no_pls_dupclean", "Disable pLS duplicate cleaning (both steps)")( + "r,reduce-mem", "Enable reduced memory mode (exact buffer sizing via counting kernels)")("h,help", "Print help")( "md", "Write MD branches in output ntuple.")("ls", "Write LS branches in output ntuple.")( "t3", "Write T3 branches in output ntuple.")("t5", "Write T5 branches in output ntuple.")( "pls", "Write pLS branches in output ntuple.")("pt3", "Write pT3 branches in output ntuple.")( @@ -255,6 +256,10 @@ int main(int argc, char **argv) { // --no_pls_dupclean ana.no_pls_dupclean = result["no_pls_dupclean"].as(); + //_______________________________________________________________________________ + // --reduce-mem + ana.reduce_mem = result["reduce-mem"].as(); + //_______________________________________________________________________________ // --md ana.md_branches = result["md"].as() || result["allobj"].as(); @@ -331,6 +336,7 @@ int main(int argc, char **argv) { std::cout << " ana.nmatch_threshold: " << ana.nmatch_threshold << std::endl; std::cout << " ana.tc_pls_triplets: " << ana.tc_pls_triplets << std::endl; std::cout << " ana.no_pls_dupclean: " << ana.no_pls_dupclean << std::endl; + std::cout << " ana.reduce_mem: " << ana.reduce_mem << std::endl; std::cout << "=========================================================" << std::endl; // Create the TChain that holds the TTree's of the baby ntuples @@ -439,7 +445,8 @@ void run_lst() { std::vector events; std::vector event_queues; for (int s = 0; s < ana.streams; s++) { - LSTEvent *event = new LSTEvent(ana.verbose >= 2, ana.ptCut, ana.clustSizeCut, queues[s], &deviceESData); + LSTEvent *event = + new LSTEvent(ana.verbose >= 2, ana.ptCut, ana.clustSizeCut, queues[s], &deviceESData, ana.reduce_mem); events.push_back(event); event_queues.push_back(&queues[s]); } @@ -466,7 +473,7 @@ void run_lst() { #pragma omp for // nowait// private(event) for (int evt = 0; evt < static_cast(out_lstInputHC.size()); evt++) { if (ana.verbose >= 1) - std::cout << "Running Event number = " << evt << " " << omp_get_thread_num() << std::endl; + printf("Running Event number = %d Stream = %d\n", evt, omp_get_thread_num()); events.at(omp_get_thread_num())->initSync(); diff --git a/RecoTracker/LSTCore/standalone/code/core/AnalysisConfig.h b/RecoTracker/LSTCore/standalone/code/core/AnalysisConfig.h index d663568b9c423..f8672d4bcf3f4 100644 --- a/RecoTracker/LSTCore/standalone/code/core/AnalysisConfig.h +++ b/RecoTracker/LSTCore/standalone/code/core/AnalysisConfig.h @@ -132,6 +132,9 @@ class AnalysisConfig { // Boolean to disable pLS duplicate cleaning bool no_pls_dupclean; + // Boolean to enable reduced memory mode (exact buffer sizing via counting kernels) + bool reduce_mem; + // Boolean to enable MD branches bool md_branches;