From 6230e159d2d73f581c899bf4635efd3a2d2ce05e Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Fri, 30 Jan 2026 15:13:01 -0500 Subject: [PATCH] LST T3 Merging --- RecoTracker/LSTCore/interface/alpaka/Common.h | 3 + .../LSTCore/src/alpaka/LSTEvent.dev.cc | 15 ++ .../LSTCore/src/alpaka/TrackCandidate.h | 245 ++++++++++++++++++ .../standalone/code/core/write_lst_ntuple.cc | 18 ++ 4 files changed, 281 insertions(+) diff --git a/RecoTracker/LSTCore/interface/alpaka/Common.h b/RecoTracker/LSTCore/interface/alpaka/Common.h index 5ed0546351490..edae0b97b19a6 100644 --- a/RecoTracker/LSTCore/interface/alpaka/Common.h +++ b/RecoTracker/LSTCore/interface/alpaka/Common.h @@ -83,6 +83,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { HOST_DEVICE_CONSTANT float kWp_displaced[kPtBins][kEtaBins] = { {0.0334f, 0.0504f, 0.0748f, 0.0994f, 0.1128f, 0.1123f, 0.1118f, 0.1525f, 0.1867f, 0.1847f}, {0.0091f, 0.0075f, 0.0350f, 0.0213f, 0.0435f, 0.0676f, 0.1957f, 0.1649f, 0.1080f, 0.1046f}}; + HOST_DEVICE_CONSTANT float kWp_extend_fake[kPtBins][kEtaBins] = { + {0.0165f, 0.0128f, 0.0291f, 0.0415f, 0.0001f, 0.0066f, 0.0117f, 0.0059f, 0.0169f, 0.0167f}, + {0.0165f, 0.0128f, 0.0291f, 0.0415f, 0.0001f, 0.0066f, 0.0117f, 0.0059f, 0.0169f, 0.0167f}}; } // namespace t3dnn namespace t5dnn { diff --git a/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc b/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc index 084c376d53408..f2353d4091ea6 100644 --- a/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc +++ b/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc @@ -659,6 +659,21 @@ void LSTEvent::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets) trackCandidatesBaseDC_->view(), trackCandidatesExtendedDC_->view()); + alpaka::exec(queue_, + wd, + ExtendTrackCandidatesFromT3{}, + modules_.const_view().modules(), + rangesDC_->const_view(), + tripletsDC_->const_view().triplets(), + tripletsDC_->const_view().tripletsOccupancy(), + segmentsDC_->const_view().segments(), + miniDoubletsDC_->const_view().miniDoublets(), + quintupletsDC_->const_view().quintuplets(), + pixelTripletsDC_->const_view(), + quadrupletsDC_->const_view().quadruplets(), + trackCandidatesBaseDC_->view(), + trackCandidatesExtendedDC_->view()); + // Check if either n_max_pixel_track_candidates or n_max_nonpixel_track_candidates was reached auto nTrackCanpT5Host_buf = cms::alpakatools::make_host_buffer(queue_); auto nTrackCanpT3Host_buf = cms::alpakatools::make_host_buffer(queue_); diff --git a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h index 24c614ee52db9..16671a43f2fad 100644 --- a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h +++ b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h @@ -688,6 +688,56 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } } + // Helper function to count shared hits between a T3 and a TC, and find the unmatched T3 layer + ALPAKA_FN_ACC ALPAKA_FN_INLINE void countSharedT3HitsWithTC(TripletsConst triplets, + unsigned int t3Index, + TrackCandidatesBaseConst candsBase, + unsigned int tcIndex, + int& sharedHitCount, + int& unmatchedLayerSlot) { + sharedHitCount = 0; + unmatchedLayerSlot = -1; + + // T3 has 3 layers, 6 hits (2 per layer) + CMS_UNROLL_LOOP + for (int layerIndex = 0; layerIndex < Params_T3::kLayers; ++layerIndex) { + const unsigned int t3Hit0 = triplets.hitIndices()[t3Index][2 * layerIndex + 0]; + const unsigned int t3Hit1 = triplets.hitIndices()[t3Index][2 * layerIndex + 1]; + + bool hit0InTC = false; + bool hit1InTC = false; + + // Check against all TC hits (13 layers * 2 hits per layer) + CMS_UNROLL_LOOP + for (int tcLayerSlot = 0; tcLayerSlot < Params_TC::kLayers; ++tcLayerSlot) { + const unsigned int tcHit0 = candsBase.hitIndices()[tcIndex][tcLayerSlot][0]; + const unsigned int tcHit1 = candsBase.hitIndices()[tcIndex][tcLayerSlot][1]; + + // Skip empty slots + if (tcHit0 == kTCEmptyHitIdx && tcHit1 == kTCEmptyHitIdx) + continue; + + if (t3Hit0 == tcHit0 || t3Hit0 == tcHit1) + hit0InTC = true; + if (t3Hit1 == tcHit0 || t3Hit1 == tcHit1) + hit1InTC = true; + + if (hit0InTC && hit1InTC) + break; + } + + if (hit0InTC) + ++sharedHitCount; + if (hit1InTC) + ++sharedHitCount; + + // Track unmatched layer (both hits not found in TC) + if (!hit0InTC && !hit1InTC) { + unmatchedLayerSlot = layerIndex; + } + } + } + struct ExtendTrackCandidatesFromDupT5 { static constexpr int kPackedScoreShift = 32; // Bit shift for packing the score (upper 32 bits). static constexpr int kPackedIndexShift = 4; // Bit shift for packing the T5 index. @@ -871,6 +921,201 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } }; + struct ExtendTrackCandidatesFromT3 { + // Packing constants - NOTE: we invert fakeScore since lower is better + static constexpr int kPackedScoreShift = 32; + static constexpr int kPackedIndexShift = 4; + static constexpr unsigned int kPackedIndexMask = 0xFFFFFFF; + static constexpr unsigned int kPackedSlotMask = 0xF; + static constexpr int kT3MinSharedHits = 4; + + ALPAKA_FN_ACC void operator()(Acc1D const& acc, + ModulesConst modules, + ObjectRangesConst ranges, + TripletsConst triplets, + TripletsOccupancyConst tripletsOccupancy, + SegmentsConst segments, + MiniDoubletsConst mds, + QuintupletsConst quintuplets, + PixelTripletsConst pixelTriplets, + QuadrupletsConst quadruplets, + TrackCandidatesBase candsBase, + TrackCandidatesExtended candsExtended) const { + // Assert that the number of blocks equals nTC + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0u] == candsBase.nTrackCandidates())); + + // Shared memory: Packed best candidate info per logical OT layer (1..11) + // Format: [Score (32 bits) | T3 Index (28 bits) | Layer Slot (4 bits)] + uint64_t* sharedBestPacked = alpaka::declareSharedVar(acc); + + // In 1D, the Grid index [0] is the Block index (which is our TC index) + const unsigned int trackCandidateIndex = alpaka::getIdx(acc)[0u]; + + // Initialize shared memory once per block + if (cms::alpakatools::once_per_block(acc)) { + for (int logicalLayerBin = 0; logicalLayerBin < lst::kLogicalOTLayers; ++logicalLayerBin) { + sharedBestPacked[logicalLayerBin] = 0; + } + } + alpaka::syncBlockThreads(acc); + + // Only consider merging T3s onto non-pLS TCs (pT5, pT3, T4, T5) + const LSTObjType trackCandidateType = candsBase.trackCandidateType()[trackCandidateIndex]; + if (trackCandidateType == LSTObjType::pLS || trackCandidateType == LSTObjType::T4) + return; + + // Get TC eta/phi based on type + float baseEta, basePhi; + const unsigned int innerObjIdx = candsExtended.objectIndices()[trackCandidateIndex][0]; + const unsigned int outerObjIdx = candsExtended.objectIndices()[trackCandidateIndex][1]; + + if (trackCandidateType == LSTObjType::T5) { + baseEta = __H2F(quintuplets.eta()[innerObjIdx]); + basePhi = __H2F(quintuplets.phi()[innerObjIdx]); + } else if (trackCandidateType == LSTObjType::pT5) { + // pT5: objectIndices[1] is the T5 index + baseEta = __H2F(quintuplets.eta()[outerObjIdx]); + basePhi = __H2F(quintuplets.phi()[outerObjIdx]); + } else if (trackCandidateType == LSTObjType::pT3) { + baseEta = __H2F(pixelTriplets.eta()[innerObjIdx]); + basePhi = __H2F(pixelTriplets.phi()[innerObjIdx]); + } else if (trackCandidateType == LSTObjType::T4) { + // T4: Get eta/phi from first triplet's first MD + const unsigned int innerTripletIdx = quadruplets.tripletIndices()[innerObjIdx][0]; + const unsigned int firstSegmentIdx = triplets.segmentIndices()[innerTripletIdx][0]; + const unsigned int firstMDIdx = segments.mdIndices()[firstSegmentIdx][0]; + baseEta = mds.anchorEta()[firstMDIdx]; + basePhi = mds.anchorPhi()[firstMDIdx]; + } else { + return; // Unknown type + } + + // Linear thread index and block dimension in 1D + const int threadIndexFlat = alpaka::getIdx(acc)[0u]; + const int blockDimFlat = alpaka::getWorkDiv(acc)[0u]; + + // Scan over all lower modules to find matching T3s + for (int lowerModuleIndex = threadIndexFlat; lowerModuleIndex < static_cast(modules.nLowerModules()); + lowerModuleIndex += blockDimFlat) { + const int firstTripletInModule = ranges.tripletModuleIndices()[lowerModuleIndex]; + if (firstTripletInModule == -1) + continue; + + const unsigned int nTripletsInModule = tripletsOccupancy.nTriplets()[lowerModuleIndex]; + + for (unsigned int tripletOffset = 0; tripletOffset < nTripletsInModule; ++tripletOffset) { + const unsigned int t3Index = firstTripletInModule + tripletOffset; + + // Get T3 eta/phi from first MD + const unsigned int firstSegmentIdx = triplets.segmentIndices()[t3Index][0]; + const unsigned int firstMDIdx = segments.mdIndices()[firstSegmentIdx][0]; + const float t3Eta = mds.anchorEta()[firstMDIdx]; + const float t3Phi = mds.anchorPhi()[firstMDIdx]; + + // Quick eta/phi window selection + if (alpaka::math::abs(acc, baseEta - t3Eta) > 0.2f) + continue; + if (alpaka::math::abs(acc, cms::alpakatools::deltaPhi(acc, basePhi, t3Phi)) > 0.2f) + continue; + + // Apply T3 DNN fake score cut (fakeScore must be BELOW threshold) + const float fakeScore = triplets.fakeScore()[t3Index]; + const float absT3Eta = alpaka::math::abs(acc, t3Eta); + const float t3_pt = triplets.radius()[t3Index] * lst::k2Rinv1GeVf * 2; + const uint8_t pt_index = (t3_pt > 5.0f); + const uint8_t bin_index = + (absT3Eta > 2.5f) ? (dnn::kEtaBins - 1) : static_cast(absT3Eta / dnn::kEtaSize); + + if (fakeScore >= dnn::t3dnn::kWp_extend_fake[pt_index][bin_index]) + continue; + + // Count shared hits with TC + int sharedHitCount = 0; + int unmatchedLayerSlot = -1; + countSharedT3HitsWithTC( + triplets, t3Index, candsBase, trackCandidateIndex, sharedHitCount, unmatchedLayerSlot); + + if (sharedHitCount < kT3MinSharedHits) + continue; + if (unmatchedLayerSlot < 0) + continue; // Must have exactly 1 unmatched layer + + // Candidate score: invert fakeScore since lower fake = better quality + // We want higher values to win in atomic compare, so use (1 - fakeScore) + const float candidateScore = 1.0f - fakeScore; + + // Get the logical layer of the unmatched T3 layer + const uint8_t newLogicalLayer = triplets.logicalLayers()[t3Index][unmatchedLayerSlot]; + const int logicalLayerBin = static_cast(newLogicalLayer) - 1; // 0..kLogicalOTLayers-1 + + if (logicalLayerBin < 0 || logicalLayerBin >= lst::kLogicalOTLayers) + continue; + + // Pack the Score, Index, and Slot into 64 bits for atomic update + uint64_t scoreBits = (uint64_t)std::bit_cast(candidateScore); + uint64_t newPacked = (scoreBits << kPackedScoreShift) | + ((uint64_t)(t3Index & kPackedIndexMask) << kPackedIndexShift) | + (unmatchedLayerSlot & kPackedSlotMask); + + // Atomic CAS loop to update best T3 for this layer + uint64_t oldPacked = sharedBestPacked[logicalLayerBin]; + while (true) { + const float oldScore = std::bit_cast((int)(oldPacked >> kPackedScoreShift)); + + // If we aren't strictly better, stop trying + if (candidateScore <= oldScore) { + break; + } + + uint64_t assumedOld = alpaka::atomicCas( + acc, &sharedBestPacked[logicalLayerBin], oldPacked, newPacked, alpaka::hierarchy::Threads{}); + + if (assumedOld == oldPacked) { + break; // Success + } else { + oldPacked = assumedOld; // Failed, update view and retry + } + } + } + } + + // One thread per block finalizes by actually extending the TC + alpaka::syncBlockThreads(acc); + + if (cms::alpakatools::once_per_block(acc)) { + CMS_UNROLL_LOOP + for (int logicalLayerBin = 0; logicalLayerBin < lst::kLogicalOTLayers; ++logicalLayerBin) { + uint64_t bestPacked = sharedBestPacked[logicalLayerBin]; + + // Check if valid update occurred (non-zero score) + int bestScoreInt = (int)(bestPacked >> kPackedScoreShift); + if (bestScoreInt == 0) + continue; + + // Unpack Index (bits 4-31) and Slot (bits 0-3) + const int bestT3Index = (int)((bestPacked >> kPackedIndexShift) & kPackedIndexMask); + const int bestT3LayerSlot = (int)(bestPacked & kPackedSlotMask); + + const uint8_t logicalLayer = static_cast(logicalLayerBin + 1); // 1..11 + const int layerSlot = (logicalLayer - 1) + Params_TC::kPixelLayerSlots; // OT slots for TC + + const uint16_t lowerModuleIndex = triplets.lowerModuleIndices()[bestT3Index][bestT3LayerSlot]; + const unsigned int hitIndex0 = triplets.hitIndices()[bestT3Index][2 * bestT3LayerSlot + 0]; + const unsigned int hitIndex1 = triplets.hitIndices()[bestT3Index][2 * bestT3LayerSlot + 1]; + + addTrackCandidateLayerHits(candsBase, + candsExtended, + trackCandidateIndex, + layerSlot, + logicalLayer, + lowerModuleIndex, + hitIndex0, + hitIndex1); + } + } + } + }; + struct AddpT5asTrackCandidate { ALPAKA_FN_ACC void operator()(Acc1D const& acc, uint16_t nLowerModules, diff --git a/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc b/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc index f709c060b563a..9d53e3454709e 100644 --- a/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc +++ b/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc @@ -122,6 +122,8 @@ void createT5DNNBranches() { //________________________________________________________________________________________________________________________________ void createT3DNNBranches() { // Common branches for T3 properties based on TripletsSoA fields + ana.tx->createBranch>("t3_fakeScore"); + ana.tx->createBranch>>("t3_hitIndices"); ana.tx->createBranch>("t3_betaIn"); ana.tx->createBranch>("t3_centerX"); ana.tx->createBranch>("t3_centerY"); @@ -319,6 +321,7 @@ void createTrackCandidateBranches() { ana.tx->createBranch>("tc_nhitOT"); ana.tx->createBranch>("tc_nhits"); ana.tx->createBranch>("tc_nlayers"); + ana.tx->createBranch>>("tc_hitIndices"); // list of idx of all matched (> 0%) simulated track ana.tx->createBranch>>("tc_simIdxAll"); // list of idx of all matched (> 0%) simulated track @@ -2219,6 +2222,13 @@ void setTrackCandidateBranches(LSTEvent* event, percent_matched, matchfrac); + std::vector tc_hits_u; + std::vector tc_types_u; + if (type != LSTObjType::pLS) + std::tie(tc_hits_u, tc_types_u) = getHitIdxsAndTypesFromTC(event, tc_idx); + std::vector tc_hits_i(tc_hits_u.begin(), tc_hits_u.end()); + ana.tx->pushbackToBranch>("tc_hitIndices", tc_hits_i); + int nPixHits = 0, nOtHits = 0, nLayers = 0; for (int layerSlot = 0; layerSlot < Params_TC::kLayers; ++layerSlot) { if (trackCandidatesExtended.lowerModuleIndices()[tc_idx][layerSlot] == lst::kTCEmptyLowerModule) @@ -2493,10 +2503,18 @@ void fillT3DNNBranches(LSTEvent* event, unsigned int iT3) { auto const& hitsBase = event->getInput(); auto const& hitsExtended = event->getHits(); auto const& modules = event->getModules(); + auto const& triplets = event->getTriplets(); std::vector hitIdx = getHitsFromT3(event, iT3); std::vector hitObjects; + std::vector current_t3_hitIndices; + for (unsigned int h : hitIdx) { + current_t3_hitIndices.push_back(static_cast(h)); + } + ana.tx->pushbackToBranch>("t3_hitIndices", current_t3_hitIndices); + ana.tx->pushbackToBranch("t3_fakeScore", triplets.fakeScore()[iT3]); + for (int i = 0; i < hitIdx.size(); ++i) { unsigned int hit = hitIdx[i]; float x = hitsBase.xs()[hit];