From 6230e159d2d73f581c899bf4635efd3a2d2ce05e Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Fri, 30 Jan 2026 15:13:01 -0500
Subject: [PATCH] LST T3 Merging

---
 RecoTracker/LSTCore/interface/alpaka/Common.h |   3 +
 .../LSTCore/src/alpaka/LSTEvent.dev.cc        |  15 ++
 .../LSTCore/src/alpaka/TrackCandidate.h       | 245 ++++++++++++++++++
 .../standalone/code/core/write_lst_ntuple.cc  |  18 ++
 4 files changed, 281 insertions(+)
diff --git a/RecoTracker/LSTCore/interface/alpaka/Common.h b/RecoTracker/LSTCore/interface/alpaka/Common.h
index 5ed0546351490..edae0b97b19a6 100644
--- a/RecoTracker/LSTCore/interface/alpaka/Common.h
+++ b/RecoTracker/LSTCore/interface/alpaka/Common.h
@@ -83,6 +83,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
       HOST_DEVICE_CONSTANT float kWp_displaced[kPtBins][kEtaBins] = {
           {0.0334f, 0.0504f, 0.0748f, 0.0994f, 0.1128f, 0.1123f, 0.1118f, 0.1525f, 0.1867f, 0.1847f},
           {0.0091f, 0.0075f, 0.0350f, 0.0213f, 0.0435f, 0.0676f, 0.1957f, 0.1649f, 0.1080f, 0.1046f}};
+      HOST_DEVICE_CONSTANT float kWp_extend_fake[kPtBins][kEtaBins] = {
+          {0.0165f, 0.0128f, 0.0291f, 0.0415f, 0.0001f, 0.0066f, 0.0117f, 0.0059f, 0.0169f, 0.0167f},
+          {0.0165f, 0.0128f, 0.0291f, 0.0415f, 0.0001f, 0.0066f, 0.0117f, 0.0059f, 0.0169f, 0.0167f}};
     }  // namespace t3dnn
 
     namespace t5dnn {
diff --git a/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc b/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc
index 084c376d53408..f2353d4091ea6 100644
--- a/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc
+++ b/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc
@@ -659,6 +659,21 @@ void LSTEvent::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets)
                       trackCandidatesBaseDC_->view(),
                       trackCandidatesExtendedDC_->view());
 
+  alpaka::exec<Acc1D>(queue_,
+                      wd,
+                      ExtendTrackCandidatesFromT3{},
+                      modules_.const_view().modules(),
+                      rangesDC_->const_view(),
+                      tripletsDC_->const_view().triplets(),
+                      tripletsDC_->const_view().tripletsOccupancy(),
+                      segmentsDC_->const_view().segments(),
+                      miniDoubletsDC_->const_view().miniDoublets(),
+                      quintupletsDC_->const_view().quintuplets(),
+                      pixelTripletsDC_->const_view(),
+                      quadrupletsDC_->const_view().quadruplets(),
+                      trackCandidatesBaseDC_->view(),
+                      trackCandidatesExtendedDC_->view());
+
   // Check if either n_max_pixel_track_candidates or n_max_nonpixel_track_candidates was reached
   auto nTrackCanpT5Host_buf = cms::alpakatools::make_host_buffer<unsigned int>(queue_);
   auto nTrackCanpT3Host_buf = cms::alpakatools::make_host_buffer<unsigned int>(queue_);
diff --git a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h
index 24c614ee52db9..16671a43f2fad 100644
--- a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h
+++ b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h
@@ -688,6 +688,56 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     }
   }
 
+  // Helper function to count shared hits between a T3 and a TC, and find the unmatched T3 layer
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE void countSharedT3HitsWithTC(TripletsConst triplets,
+                                                              unsigned int t3Index,
+                                                              TrackCandidatesBaseConst candsBase,
+                                                              unsigned int tcIndex,
+                                                              int& sharedHitCount,
+                                                              int& unmatchedLayerSlot) {
+    sharedHitCount = 0;
+    unmatchedLayerSlot = -1;
+
+    // T3 has 3 layers, 6 hits (2 per layer)
+    CMS_UNROLL_LOOP
+    for (int layerIndex = 0; layerIndex < Params_T3::kLayers; ++layerIndex) {
+      const unsigned int t3Hit0 = triplets.hitIndices()[t3Index][2 * layerIndex + 0];
+      const unsigned int t3Hit1 = triplets.hitIndices()[t3Index][2 * layerIndex + 1];
+
+      bool hit0InTC = false;
+      bool hit1InTC = false;
+
+      // Check against all TC hits (13 layers * 2 hits per layer)
+      CMS_UNROLL_LOOP
+      for (int tcLayerSlot = 0; tcLayerSlot < Params_TC::kLayers; ++tcLayerSlot) {
+        const unsigned int tcHit0 = candsBase.hitIndices()[tcIndex][tcLayerSlot][0];
+        const unsigned int tcHit1 = candsBase.hitIndices()[tcIndex][tcLayerSlot][1];
+
+        // Skip empty slots
+        if (tcHit0 == kTCEmptyHitIdx && tcHit1 == kTCEmptyHitIdx)
+          continue;
+
+        if (t3Hit0 == tcHit0 || t3Hit0 == tcHit1)
+          hit0InTC = true;
+        if (t3Hit1 == tcHit0 || t3Hit1 == tcHit1)
+          hit1InTC = true;
+
+        if (hit0InTC && hit1InTC)
+          break;
+      }
+
+      if (hit0InTC)
+        ++sharedHitCount;
+      if (hit1InTC)
+        ++sharedHitCount;
+
+      // Track unmatched layer (both hits not found in TC)
+      if (!hit0InTC && !hit1InTC) {
+        unmatchedLayerSlot = layerIndex;
+      }
+    }
+  }
+
   struct ExtendTrackCandidatesFromDupT5 {
     static constexpr int kPackedScoreShift = 32;                 // Bit shift for packing the score (upper 32 bits).
     static constexpr int kPackedIndexShift = 4;                  // Bit shift for packing the T5 index.
@@ -871,6 +921,201 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     }
   };
 
+  struct ExtendTrackCandidatesFromT3 {
+    // Packing constants - NOTE: we invert fakeScore since lower is better
+    static constexpr int kPackedScoreShift = 32;
+    static constexpr int kPackedIndexShift = 4;
+    static constexpr unsigned int kPackedIndexMask = 0xFFFFFFF;
+    static constexpr unsigned int kPackedSlotMask = 0xF;
+    static constexpr int kT3MinSharedHits = 4;
+
+    ALPAKA_FN_ACC void operator()(Acc1D const& acc,
+                                  ModulesConst modules,
+                                  ObjectRangesConst ranges,
+                                  TripletsConst triplets,
+                                  TripletsOccupancyConst tripletsOccupancy,
+                                  SegmentsConst segments,
+                                  MiniDoubletsConst mds,
+                                  QuintupletsConst quintuplets,
+                                  PixelTripletsConst pixelTriplets,
+                                  QuadrupletsConst quadruplets,
+                                  TrackCandidatesBase candsBase,
+                                  TrackCandidatesExtended candsExtended) const {
+      // Assert that the number of blocks equals nTC
+      ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u] == candsBase.nTrackCandidates()));
+
+      // Shared memory: Packed best candidate info per logical OT layer (1..11)
+      // Format: [Score (32 bits) | T3 Index (28 bits) | Layer Slot (4 bits)]
+      uint64_t* sharedBestPacked = alpaka::declareSharedVar<uint64_t[lst::kLogicalOTLayers], __COUNTER__>(acc);
+
+      // In 1D, the Grid index [0] is the Block index (which is our TC index)
+      const unsigned int trackCandidateIndex = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
+
+      // Initialize shared memory once per block
+      if (cms::alpakatools::once_per_block(acc)) {
+        for (int logicalLayerBin = 0; logicalLayerBin < lst::kLogicalOTLayers; ++logicalLayerBin) {
+          sharedBestPacked[logicalLayerBin] = 0;
+        }
+      }
+      alpaka::syncBlockThreads(acc);
+
+      // Only consider merging T3s onto non-pLS TCs (pT5, pT3, T4, T5)
+      const LSTObjType trackCandidateType = candsBase.trackCandidateType()[trackCandidateIndex];
+      if (trackCandidateType == LSTObjType::pLS || trackCandidateType == LSTObjType::T4)
+        return;
+
+      // Get TC eta/phi based on type
+      float baseEta, basePhi;
+      const unsigned int innerObjIdx = candsExtended.objectIndices()[trackCandidateIndex][0];
+      const unsigned int outerObjIdx = candsExtended.objectIndices()[trackCandidateIndex][1];
+
+      if (trackCandidateType == LSTObjType::T5) {
+        baseEta = __H2F(quintuplets.eta()[innerObjIdx]);
+        basePhi = __H2F(quintuplets.phi()[innerObjIdx]);
+      } else if (trackCandidateType == LSTObjType::pT5) {
+        // pT5: objectIndices[1] is the T5 index
+        baseEta = __H2F(quintuplets.eta()[outerObjIdx]);
+        basePhi = __H2F(quintuplets.phi()[outerObjIdx]);
+      } else if (trackCandidateType == LSTObjType::pT3) {
+        baseEta = __H2F(pixelTriplets.eta()[innerObjIdx]);
+        basePhi = __H2F(pixelTriplets.phi()[innerObjIdx]);
+      } else if (trackCandidateType == LSTObjType::T4) {
+        // T4: Get eta/phi from first triplet's first MD
+        const unsigned int innerTripletIdx = quadruplets.tripletIndices()[innerObjIdx][0];
+        const unsigned int firstSegmentIdx = triplets.segmentIndices()[innerTripletIdx][0];
+        const unsigned int firstMDIdx = segments.mdIndices()[firstSegmentIdx][0];
+        baseEta = mds.anchorEta()[firstMDIdx];
+        basePhi = mds.anchorPhi()[firstMDIdx];
+      } else {
+        return;  // Unknown type
+      }
+
+      // Linear thread index and block dimension in 1D
+      const int threadIndexFlat = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
+      const int blockDimFlat = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
+
+      // Scan over all lower modules to find matching T3s
+      for (int lowerModuleIndex = threadIndexFlat; lowerModuleIndex < static_cast<int>(modules.nLowerModules());
+           lowerModuleIndex += blockDimFlat) {
+        const int firstTripletInModule = ranges.tripletModuleIndices()[lowerModuleIndex];
+        if (firstTripletInModule == -1)
+          continue;
+
+        const unsigned int nTripletsInModule = tripletsOccupancy.nTriplets()[lowerModuleIndex];
+
+        for (unsigned int tripletOffset = 0; tripletOffset < nTripletsInModule; ++tripletOffset) {
+          const unsigned int t3Index = firstTripletInModule + tripletOffset;
+
+          // Get T3 eta/phi from first MD
+          const unsigned int firstSegmentIdx = triplets.segmentIndices()[t3Index][0];
+          const unsigned int firstMDIdx = segments.mdIndices()[firstSegmentIdx][0];
+          const float t3Eta = mds.anchorEta()[firstMDIdx];
+          const float t3Phi = mds.anchorPhi()[firstMDIdx];
+
+          // Quick eta/phi window selection
+          if (alpaka::math::abs(acc, baseEta - t3Eta) > 0.2f)
+            continue;
+          if (alpaka::math::abs(acc, cms::alpakatools::deltaPhi(acc, basePhi, t3Phi)) > 0.2f)
+            continue;
+
+          // Apply T3 DNN fake score cut (fakeScore must be BELOW threshold)
+          const float fakeScore = triplets.fakeScore()[t3Index];
+          const float absT3Eta = alpaka::math::abs(acc, t3Eta);
+          const float t3_pt = triplets.radius()[t3Index] * lst::k2Rinv1GeVf * 2;
+          const uint8_t pt_index = (t3_pt > 5.0f);
+          const uint8_t bin_index =
+              (absT3Eta > 2.5f) ? (dnn::kEtaBins - 1) : static_cast<uint8_t>(absT3Eta / dnn::kEtaSize);
+
+          if (fakeScore >= dnn::t3dnn::kWp_extend_fake[pt_index][bin_index])
+            continue;
+
+          // Count shared hits with TC
+          int sharedHitCount = 0;
+          int unmatchedLayerSlot = -1;
+          countSharedT3HitsWithTC(
+              triplets, t3Index, candsBase, trackCandidateIndex, sharedHitCount, unmatchedLayerSlot);
+
+          if (sharedHitCount < kT3MinSharedHits)
+            continue;
+          if (unmatchedLayerSlot < 0)
+            continue;  // Must have exactly 1 unmatched layer
+
+          // Candidate score: invert fakeScore since lower fake = better quality
+          // We want higher values to win in atomic compare, so use (1 - fakeScore)
+          const float candidateScore = 1.0f - fakeScore;
+
+          // Get the logical layer of the unmatched T3 layer
+          const uint8_t newLogicalLayer = triplets.logicalLayers()[t3Index][unmatchedLayerSlot];
+          const int logicalLayerBin = static_cast<int>(newLogicalLayer) - 1;  // 0..kLogicalOTLayers-1
+
+          if (logicalLayerBin < 0 || logicalLayerBin >= lst::kLogicalOTLayers)
+            continue;
+
+          // Pack the Score, Index, and Slot into 64 bits for atomic update
+          uint64_t scoreBits = (uint64_t)std::bit_cast<int>(candidateScore);
+          uint64_t newPacked = (scoreBits << kPackedScoreShift) |
+                               ((uint64_t)(t3Index & kPackedIndexMask) << kPackedIndexShift) |
+                               (unmatchedLayerSlot & kPackedSlotMask);
+
+          // Atomic CAS loop to update best T3 for this layer
+          uint64_t oldPacked = sharedBestPacked[logicalLayerBin];
+          while (true) {
+            const float oldScore = std::bit_cast<float>((int)(oldPacked >> kPackedScoreShift));
+
+            // If we aren't strictly better, stop trying
+            if (candidateScore <= oldScore) {
+              break;
+            }
+
+            uint64_t assumedOld = alpaka::atomicCas(
+                acc, &sharedBestPacked[logicalLayerBin], oldPacked, newPacked, alpaka::hierarchy::Threads{});
+
+            if (assumedOld == oldPacked) {
+              break;  // Success
+            } else {
+              oldPacked = assumedOld;  // Failed, update view and retry
+            }
+          }
+        }
+      }
+
+      // One thread per block finalizes by actually extending the TC
+      alpaka::syncBlockThreads(acc);
+
+      if (cms::alpakatools::once_per_block(acc)) {
+        CMS_UNROLL_LOOP
+        for (int logicalLayerBin = 0; logicalLayerBin < lst::kLogicalOTLayers; ++logicalLayerBin) {
+          uint64_t bestPacked = sharedBestPacked[logicalLayerBin];
+
+          // Check if valid update occurred (non-zero score)
+          int bestScoreInt = (int)(bestPacked >> kPackedScoreShift);
+          if (bestScoreInt == 0)
+            continue;
+
+          // Unpack Index (bits 4-31) and Slot (bits 0-3)
+          const int bestT3Index = (int)((bestPacked >> kPackedIndexShift) & kPackedIndexMask);
+          const int bestT3LayerSlot = (int)(bestPacked & kPackedSlotMask);
+
+          const uint8_t logicalLayer = static_cast<uint8_t>(logicalLayerBin + 1);  // 1..11
+          const int layerSlot = (logicalLayer - 1) + Params_TC::kPixelLayerSlots;  // OT slots for TC
+
+          const uint16_t lowerModuleIndex = triplets.lowerModuleIndices()[bestT3Index][bestT3LayerSlot];
+          const unsigned int hitIndex0 = triplets.hitIndices()[bestT3Index][2 * bestT3LayerSlot + 0];
+          const unsigned int hitIndex1 = triplets.hitIndices()[bestT3Index][2 * bestT3LayerSlot + 1];
+
+          addTrackCandidateLayerHits(candsBase,
+                                     candsExtended,
+                                     trackCandidateIndex,
+                                     layerSlot,
+                                     logicalLayer,
+                                     lowerModuleIndex,
+                                     hitIndex0,
+                                     hitIndex1);
+        }
+      }
+    }
+  };
+
   struct AddpT5asTrackCandidate {
     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
                                   uint16_t nLowerModules,
diff --git a/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc b/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc
index f709c060b563a..9d53e3454709e 100644
--- a/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc
+++ b/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc
@@ -122,6 +122,8 @@ void createT5DNNBranches() {
 //________________________________________________________________________________________________________________________________
 void createT3DNNBranches() {
   // Common branches for T3 properties based on TripletsSoA fields
+  ana.tx->createBranch<std::vector<float>>("t3_fakeScore");
+  ana.tx->createBranch<std::vector<std::vector<int>>>("t3_hitIndices");
   ana.tx->createBranch<std::vector<float>>("t3_betaIn");
   ana.tx->createBranch<std::vector<float>>("t3_centerX");
   ana.tx->createBranch<std::vector<float>>("t3_centerY");
@@ -319,6 +321,7 @@ void createTrackCandidateBranches() {
   ana.tx->createBranch<std::vector<int>>("tc_nhitOT");
   ana.tx->createBranch<std::vector<int>>("tc_nhits");
   ana.tx->createBranch<std::vector<int>>("tc_nlayers");
+  ana.tx->createBranch<std::vector<std::vector<int>>>("tc_hitIndices");
   // list of idx of all matched (> 0%) simulated track
   ana.tx->createBranch<std::vector<std::vector<int>>>("tc_simIdxAll");
   // list of idx of all matched (> 0%) simulated track
@@ -2219,6 +2222,13 @@ void setTrackCandidateBranches(LSTEvent* event,
                                                                                            percent_matched,
                                                                                            matchfrac);
 
+    std::vector<unsigned int> tc_hits_u;
+    std::vector<unsigned int> tc_types_u;
+    if (type != LSTObjType::pLS)
+      std::tie(tc_hits_u, tc_types_u) = getHitIdxsAndTypesFromTC(event, tc_idx);
+    std::vector<int> tc_hits_i(tc_hits_u.begin(), tc_hits_u.end());
+    ana.tx->pushbackToBranch<std::vector<int>>("tc_hitIndices", tc_hits_i);
+
     int nPixHits = 0, nOtHits = 0, nLayers = 0;
     for (int layerSlot = 0; layerSlot < Params_TC::kLayers; ++layerSlot) {
       if (trackCandidatesExtended.lowerModuleIndices()[tc_idx][layerSlot] == lst::kTCEmptyLowerModule)
@@ -2493,10 +2503,18 @@ void fillT3DNNBranches(LSTEvent* event, unsigned int iT3) {
   auto const& hitsBase = event->getInput<HitsBaseSoA>();
   auto const& hitsExtended = event->getHits<HitsExtendedSoA>();
   auto const& modules = event->getModules<ModulesSoA>();
+  auto const& triplets = event->getTriplets<TripletsSoA>();
 
   std::vector<unsigned int> hitIdx = getHitsFromT3(event, iT3);
   std::vector<lst_math::Hit> hitObjects;
 
+  std::vector<int> current_t3_hitIndices;
+  for (unsigned int h : hitIdx) {
+    current_t3_hitIndices.push_back(static_cast<int>(h));
+  }
+  ana.tx->pushbackToBranch<std::vector<int>>("t3_hitIndices", current_t3_hitIndices);
+  ana.tx->pushbackToBranch<float>("t3_fakeScore", triplets.fakeScore()[iT3]);
+
   for (int i = 0; i < hitIdx.size(); ++i) {
     unsigned int hit = hitIdx[i];
     float x = hitsBase.xs()[hit];