From 27b2ca280aaf10e42458b4c8695424a9b923d09d Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Wed, 11 Mar 2026 18:24:29 -0400
Subject: [PATCH] various cpu improvements

---
 RecoTracker/LSTCore/src/alpaka/Kernels.h      |  95 ++-
 .../LSTCore/src/alpaka/LSTEvent.dev.cc        |  38 +-
 RecoTracker/LSTCore/src/alpaka/LSTEvent.h     |   4 +
 RecoTracker/LSTCore/src/alpaka/MiniDoublet.h  | 745 +++++++++---------
 .../LSTCore/src/alpaka/NeuralNetwork.h        |   3 +-
 .../LSTCore/src/alpaka/PixelQuintuplet.h      |  72 +-
 RecoTracker/LSTCore/src/alpaka/PixelTriplet.h | 479 +++++------
 RecoTracker/LSTCore/src/alpaka/Quintuplet.h   | 112 ++-
 RecoTracker/LSTCore/src/alpaka/Segment.h      | 400 ++++++----
 .../LSTCore/src/alpaka/TrackCandidate.h       |  59 +-
 RecoTracker/LSTCore/src/alpaka/Triplet.h      | 304 ++++---
 11 files changed, 1243 insertions(+), 1068 deletions(-)

diff --git a/RecoTracker/LSTCore/src/alpaka/Kernels.h b/RecoTracker/LSTCore/src/alpaka/Kernels.h
index 49c700f48ecc6..3533b08fdb51e 100644
--- a/RecoTracker/LSTCore/src/alpaka/Kernels.h
+++ b/RecoTracker/LSTCore/src/alpaka/Kernels.h
@@ -242,12 +242,22 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
           unsigned int quintupletModuleIndices_lowmod2 = ranges.quintupletModuleIndices()[lowmod2];
 
+          // Module-level eta pre-check: skip module pairs where Q5s are far apart in eta.
+          // Within a module, Q5 etas are tightly clustered (<0.1), so 0.3 margin is very safe.
+          if (alpaka::math::abs(acc,
+                                __H2F(quintuplets.eta()[quintupletModuleIndices_lowmod1]) -
+                                    __H2F(quintuplets.eta()[quintupletModuleIndices_lowmod2])) > 0.3f)
+            continue;
+
           for (unsigned int ix1 = 0; ix1 < nQuintuplets_lowmod1; ix1 += 1) {
             unsigned int ix = quintupletModuleIndices_lowmod1 + ix1;
             if (quintuplets.isDup()[ix] & 1)
               continue;
 
-            bool isPT5_ix = quintuplets.partOfPT5()[ix];
+            const bool isPT5_ix = quintuplets.partOfPT5()[ix];
+            const float eta1 = __H2F(quintuplets.eta()[ix]);
+            const float phi1 = __H2F(quintuplets.phi()[ix]);
+            const float score_rphisum1 = __H2F(quintuplets.score_rphisum()[ix]);
 
             for (unsigned int jx1 = 0; jx1 < nQuintuplets_lowmod2; jx1++) {
               unsigned int jx = quintupletModuleIndices_lowmod2 + jx1;
@@ -257,31 +267,24 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
               if (quintuplets.isDup()[jx] & 1)
                 continue;
 
-              bool isPT5_jx = quintuplets.partOfPT5()[jx];
+              const bool isPT5_jx = quintuplets.partOfPT5()[jx];
 
               if (isPT5_ix && isPT5_jx)
                 continue;
 
-              float eta1 = __H2F(quintuplets.eta()[ix]);
-              float phi1 = __H2F(quintuplets.phi()[ix]);
-              float score_rphisum1 = __H2F(quintuplets.score_rphisum()[ix]);
-
-              float eta2 = __H2F(quintuplets.eta()[jx]);
-              float phi2 = __H2F(quintuplets.phi()[jx]);
-              float score_rphisum2 = __H2F(quintuplets.score_rphisum()[jx]);
-
-              float dEta = alpaka::math::abs(acc, eta1 - eta2);
-              float dPhi = cms::alpakatools::deltaPhi(acc, phi1, phi2);
-
+              const float eta2 = __H2F(quintuplets.eta()[jx]);
+              const float dEta = alpaka::math::abs(acc, eta1 - eta2);
               if (dEta > 0.1f)
                 continue;
 
+              const float phi2 = __H2F(quintuplets.phi()[jx]);
+              const float dPhi = cms::alpakatools::deltaPhi(acc, phi1, phi2);
               if (alpaka::math::abs(acc, dPhi) > 0.1f)
                 continue;
 
-              float dR2 = dEta * dEta + dPhi * dPhi;
-              int nMatched = checkHitsT5(ix, jx, quintuplets);
-              const int minNHitsForDup_T5 = 5;
+              const float dR2 = dEta * dEta + dPhi * dPhi;
+              const int nMatched = checkHitsT5(ix, jx, quintuplets);
+              constexpr int minNHitsForDup_T5 = 5;
 
               float d2 = 0.f;
               CMS_UNROLL_LOOP
@@ -291,12 +294,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
               }
 
               if (((dR2 < 0.001f || nMatched >= minNHitsForDup_T5) && d2 < 1.0f) || (dR2 < 0.02f && d2 < 0.1f)) {
+                const float score_rphisum2 = __H2F(quintuplets.score_rphisum()[jx]);
                 if (isPT5_jx || score_rphisum1 > score_rphisum2) {
                   rmQuintupletFromMemory(quintuplets, ix, true);
+                  break;  // ix is now dup, no need to compare further
                 } else if (isPT5_ix || score_rphisum1 < score_rphisum2) {
                   rmQuintupletFromMemory(quintuplets, jx, true);
                 } else {
                   rmQuintupletFromMemory(quintuplets, (ix < jx ? ix : jx), true);
+                  if (ix < jx)
+                    break;  // ix was marked dup
                 }
               }
             }
@@ -458,11 +465,21 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     ALPAKA_FN_ACC void operator()(Acc2D const& acc, PixelQuintuplets pixelQuintuplets) const {
       unsigned int nPixelQuintuplets = pixelQuintuplets.nPixelQuintuplets();
       for (unsigned int ix : cms::alpakatools::uniform_elements_y(acc, nPixelQuintuplets)) {
+        float eta1 = __H2F(pixelQuintuplets.eta()[ix]);
+        float phi1 = __H2F(pixelQuintuplets.phi()[ix]);
         float score1 = __H2F(pixelQuintuplets.score()[ix]);
         for (unsigned int jx : cms::alpakatools::uniform_elements_x(acc, nPixelQuintuplets)) {
           if (ix == jx)
             continue;
 
+          float eta2 = __H2F(pixelQuintuplets.eta()[jx]);
+          if (alpaka::math::abs(acc, eta1 - eta2) > 0.1f)
+            continue;
+
+          float phi2 = __H2F(pixelQuintuplets.phi()[jx]);
+          if (alpaka::math::abs(acc, cms::alpakatools::deltaPhi(acc, phi1, phi2)) > 0.1f)
+            continue;
+
           int nMatched = checkHitspT5(ix, jx, pixelQuintuplets);
           float score2 = __H2F(pixelQuintuplets.score()[jx]);
           const int minNHitsForDup_pT5 = 7;
@@ -477,53 +494,64 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     }
   };
 
+  // pLS dedup using eta-sorted indices. On CPU the sorted order enables early termination
+  // via break; on GPU each thread that breaks simply exits its own iteration.
   struct CheckHitspLS {
     ALPAKA_FN_ACC void operator()(Acc2D const& acc,
                                   ModulesConst modules,
                                   SegmentsOccupancyConst segmentsOccupancy,
                                   PixelSeedsConst pixelSeeds,
                                   PixelSegments pixelSegments,
-                                  bool secondpass) const {
+                                  bool secondpass,
+                                  unsigned int const* __restrict__ etaOrder) const {
       int pixelModuleIndex = modules.nLowerModules();
       unsigned int nPixelSegments = segmentsOccupancy.nSegments()[pixelModuleIndex];
 
       if (nPixelSegments > n_max_pixel_segments_per_module)
         nPixelSegments = n_max_pixel_segments_per_module;
 
-      for (unsigned int ix : cms::alpakatools::uniform_elements_y(acc, nPixelSegments)) {
+      for (unsigned int si : cms::alpakatools::uniform_elements_y(acc, nPixelSegments)) {
+        unsigned int ix = etaOrder[si];
         if (secondpass && (!pixelSeeds.isQuad()[ix] || (pixelSegments.isDup()[ix] & 1)))
           continue;
 
-        auto const& phits1 = pixelSegments.pLSHitsIdxs()[ix];
         float eta_pix1 = pixelSeeds.eta()[ix];
         float phi_pix1 = pixelSeeds.phi()[ix];
 
-        for (unsigned int jx : cms::alpakatools::uniform_elements_x(acc, ix + 1, nPixelSegments)) {
+        for (unsigned int sj : cms::alpakatools::uniform_elements_x(acc, si + 1, nPixelSegments)) {
+          unsigned int jx = etaOrder[sj];
           float eta_pix2 = pixelSeeds.eta()[jx];
-          float phi_pix2 = pixelSeeds.phi()[jx];
 
-          if (alpaka::math::abs(acc, eta_pix2 - eta_pix1) > 0.1f)
-            continue;
+          if (eta_pix2 - eta_pix1 > 0.1f)
+            break;
 
           if (secondpass && (!pixelSeeds.isQuad()[jx] || (pixelSegments.isDup()[jx] & 1)))
             continue;
 
-          int8_t quad_diff = pixelSeeds.isQuad()[ix] - pixelSeeds.isQuad()[jx];
-          float score_diff = pixelSegments.score()[ix] - pixelSegments.score()[jx];
-          // Always keep quads over trips. If they are the same, we want the object with better score
+          float phi_pix2 = pixelSeeds.phi()[jx];
+
+          // Use min/max of original indices so hit comparison direction is consistent
+          // (lower index = phits1), since the eta-sorted iteration order doesn't
+          // guarantee ix < jx in original indices.
+          unsigned int lo = (ix < jx) ? ix : jx;
+          unsigned int hi = (ix < jx) ? jx : ix;
+
+          int8_t quad_diff = pixelSeeds.isQuad()[lo] - pixelSeeds.isQuad()[hi];
+          float score_diff = pixelSegments.score()[lo] - pixelSegments.score()[hi];
           int idxToRemove;
           if (quad_diff > 0)
-            idxToRemove = jx;
+            idxToRemove = hi;
           else if (quad_diff < 0)
-            idxToRemove = ix;
+            idxToRemove = lo;
           else if (score_diff < 0)
-            idxToRemove = jx;
+            idxToRemove = hi;
           else if (score_diff > 0)
-            idxToRemove = ix;
+            idxToRemove = lo;
           else
-            idxToRemove = ix;
+            idxToRemove = lo;
 
-          auto const& phits2 = pixelSegments.pLSHitsIdxs()[jx];
+          auto const& phits1 = pixelSegments.pLSHitsIdxs()[lo];
+          auto const& phits2 = pixelSegments.pLSHitsIdxs()[hi];
 
           int npMatched = 0;
           for (int i = 0; i < Params_pLS::kHits; i++) {
@@ -536,7 +564,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
             }
             if (pmatched) {
               npMatched++;
-              // Only one hit is enough
               if (secondpass)
                 break;
             }
@@ -546,7 +573,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
             rmPixelSegmentFromMemory(pixelSegments, idxToRemove, secondpass);
           }
           if (secondpass) {
-            float dEta = alpaka::math::abs(acc, eta_pix1 - eta_pix2);
+            float dEta = alpaka::math::abs(acc, eta_pix2 - eta_pix1);
             float dPhi = cms::alpakatools::deltaPhi(acc, phi_pix1, phi_pix2);
 
             float dR2 = dEta * dEta + dPhi * dPhi;
diff --git a/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc b/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc
index 111afe0a286db..9f35b7d2b54f1 100644
--- a/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc
+++ b/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc
@@ -15,7 +15,9 @@
 #include "Triplet.h"
 #include "Quadruplet.h"
 
+#include <algorithm>
 #include <format>
+#include <numeric>
 
 using Device = ALPAKA_ACCELERATOR_NAMESPACE::Device;
 using Queue = ALPAKA_ACCELERATOR_NAMESPACE::Queue;
@@ -75,6 +77,7 @@ void LSTEvent::resetEventSync() {
   pixelTripletsDC_.reset();
   pixelQuintupletsDC_.reset();
   quadrupletsDC_.reset();
+  etaSortedPLSIndex_.reset();
 
   lstInputHC_.reset();
   hitsHC_.reset();
@@ -569,6 +572,7 @@ void LSTEvent::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets)
   if (!no_pls_dupclean) {
     auto const checkHitspLS_workDiv = cms::alpakatools::make_workdiv<Acc2D>({max_blocks * 4, max_blocks / 4}, {16, 16});
 
+    buildEtaSortedPLSIndex();
     alpaka::exec<Acc2D>(queue_,
                         checkHitspLS_workDiv,
                         CheckHitspLS{},
@@ -576,7 +580,8 @@ void LSTEvent::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets)
                         segmentsDC_->const_view().segmentsOccupancy(),
                         lstInputDC_->const_view().pixelSeeds(),
                         pixelSegmentsDC_->view(),
-                        true);
+                        true,
+                        etaSortedPLSIndex_->data());
   }
 
   // Counting kernel
@@ -1005,10 +1010,38 @@ void LSTEvent::createQuintuplets() {
   }
 }
 
+void LSTEvent::buildEtaSortedPLSIndex() {
+  if (etaSortedPLSIndex_)
+    return;  // already built
+
+  auto nSeg_buf_h = cms::alpakatools::make_host_buffer<unsigned int>(queue_);
+  auto nSeg_view_d = cms::alpakatools::make_device_view(
+      queue_, segmentsDC_->const_view().segmentsOccupancy().nSegments()[pixelModuleIndex_]);
+  alpaka::memcpy(queue_, nSeg_buf_h, nSeg_view_d);
+  alpaka::wait(queue_);
+  unsigned int nPixelSegments = std::min(*nSeg_buf_h.data(), n_max_pixel_segments_per_module);
+
+  auto eta_buf_h = cms::alpakatools::make_host_buffer<float[]>(queue_, nPixelSegments);
+  auto eta_view_d =
+      cms::alpakatools::make_device_view(queue_, lstInputDC_->const_view().pixelSeeds().eta(), nPixelSegments);
+  alpaka::memcpy(queue_, eta_buf_h, eta_view_d, nPixelSegments);
+  alpaka::wait(queue_);
+
+  auto order_buf_h = cms::alpakatools::make_host_buffer<unsigned int[]>(queue_, nPixelSegments);
+  auto* order = order_buf_h.data();
+  std::iota(order, order + nPixelSegments, 0u);
+  auto const* eta = eta_buf_h.data();
+  std::sort(order, order + nPixelSegments, [eta](unsigned int a, unsigned int b) { return eta[a] < eta[b]; });
+
+  etaSortedPLSIndex_.emplace(cms::alpakatools::make_device_buffer<unsigned int[]>(queue_, nPixelSegments));
+  alpaka::memcpy(queue_, *etaSortedPLSIndex_, order_buf_h, nPixelSegments);
+}
+
 void LSTEvent::pixelLineSegmentCleaning(bool no_pls_dupclean) {
   if (!no_pls_dupclean) {
     auto const checkHitspLS_workDiv = cms::alpakatools::make_workdiv<Acc2D>({max_blocks * 4, max_blocks / 4}, {16, 16});
 
+    buildEtaSortedPLSIndex();
     alpaka::exec<Acc2D>(queue_,
                         checkHitspLS_workDiv,
                         CheckHitspLS{},
@@ -1016,7 +1049,8 @@ void LSTEvent::pixelLineSegmentCleaning(bool no_pls_dupclean) {
                         segmentsDC_->const_view().segmentsOccupancy(),
                         lstInputDC_->const_view().pixelSeeds(),
                         pixelSegmentsDC_->view(),
-                        false);
+                        false,
+                        etaSortedPLSIndex_->data());
   }
 }
 
diff --git a/RecoTracker/LSTCore/src/alpaka/LSTEvent.h b/RecoTracker/LSTCore/src/alpaka/LSTEvent.h
index ca94566cad404..0ae3a47378dd1 100644
--- a/RecoTracker/LSTCore/src/alpaka/LSTEvent.h
+++ b/RecoTracker/LSTCore/src/alpaka/LSTEvent.h
@@ -35,6 +35,7 @@
 #include "RecoTracker/LSTCore/interface/alpaka/EndcapGeometryDevDeviceCollection.h"
 
 #include "HeterogeneousCore/AlpakaInterface/interface/host.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
 
 namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
@@ -98,6 +99,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     EndcapGeometryDevDeviceCollection const& endcapGeometry_;
     bool objectsStatistics_ = false;
     double memoryAllocatedMB_ = 0;
+    std::optional<cms::alpakatools::device_buffer<Device, unsigned int[]>> etaSortedPLSIndex_;
+
+    void buildEtaSortedPLSIndex();
 
   public:
     // Constructor used for CMSSW integration. Uses an external queue.
diff --git a/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h b/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h
index 806869522e062..e5b3e700d106e 100644
--- a/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h
+++ b/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h
@@ -13,15 +13,41 @@
 #include "RecoTracker/LSTCore/interface/ObjectRangesSoA.h"
 
 namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
+
+  // Pre-computed module-constant data for MiniDoublet kernels.
+  // Populated once per module to avoid redundant SoA loads in the inner hit-pair loop.
+  struct ModuleMDData {
+    uint16_t lowerModuleIndex;
+    short subdet;
+    short side;
+    short moduleType;
+    short moduleLayerType;
+    unsigned int iL;  // layer - 1
+    bool isTilted;
+    bool isEndcapTwoS;
+    bool isGloballyInner;
+
+    // shiftStripHits constants
+    float slope;      // dxdys
+    float drdz;       // drdzs[lowerModuleIndex]
+    float moduleSep;  // moduleGapSize result
+
+    // dPhiThreshold pre-computed constants
+    float miniPVoff;
+    float miniMuls;
+    float miniTilt2;     // 0 for non-tilted and endcap
+    float sqrtTermBase;  // miniMuls^2 + miniPVoff^2
+    float sqrtTermFlat;  // sqrt(sqrtTermBase), valid for barrel flat
+  };
+
   template <alpaka::concepts::Acc TAcc>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE void addMDToMemory(TAcc const& acc,
                                                     MiniDoublets mds,
                                                     HitsBaseConst hitsBase,
                                                     HitsExtendedConst hitsExtended,
-                                                    ModulesConst modules,
+                                                    ModuleMDData const& mod,
                                                     unsigned int lowerHitIdx,
                                                     unsigned int upperHitIdx,
-                                                    uint16_t lowerModuleIdx,
                                                     float dz,
                                                     float dPhi,
                                                     float dPhiChange,
@@ -34,9 +60,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     //the index into which this MD needs to be written will be computed in the kernel
     //nMDs variable will be incremented in the kernel, no need to worry about that here
 
-    mds.moduleIndices()[idx] = lowerModuleIdx;
+    mds.moduleIndices()[idx] = mod.lowerModuleIndex;
     unsigned int anchorHitIndex, outerHitIndex;
-    if (modules.moduleType()[lowerModuleIdx] == PS and modules.moduleLayerType()[lowerModuleIdx] == Strip) {
+    if (mod.moduleType == PS and mod.moduleLayerType == Strip) {
       mds.anchorHitIndices()[idx] = upperHitIdx;
       mds.outerHitIndices()[idx] = lowerHitIdx;
 
@@ -72,8 +98,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     mds.anchorHighEdgeY()[idx] = hitsExtended.highEdgeYs()[anchorHitIndex];
     mds.anchorLowEdgeX()[idx] = hitsExtended.lowEdgeXs()[anchorHitIndex];
     mds.anchorLowEdgeY()[idx] = hitsExtended.lowEdgeYs()[anchorHitIndex];
-    mds.anchorHighEdgePhi()[idx] = alpaka::math::atan2(acc, mds.anchorHighEdgeY()[idx], mds.anchorHighEdgeX()[idx]);
-    mds.anchorLowEdgePhi()[idx] = alpaka::math::atan2(acc, mds.anchorLowEdgeY()[idx], mds.anchorLowEdgeX()[idx]);
+    // Edge phi values are only meaningful for TwoS endcap modules (where highEdgeX/Y are set in HitLoopKernel).
+    // Downstream readers (Segment.h, Quintuplet.h) only access these when outerLayerEndcapTwoS is true.
+    if (mod.isEndcapTwoS) {
+      mds.anchorHighEdgePhi()[idx] = alpaka::math::atan2(acc, mds.anchorHighEdgeY()[idx], mds.anchorHighEdgeX()[idx]);
+      mds.anchorLowEdgePhi()[idx] = alpaka::math::atan2(acc, mds.anchorLowEdgeY()[idx], mds.anchorLowEdgeX()[idx]);
+    }
 
     mds.outerX()[idx] = hitsBase.xs()[outerHitIndex];
     mds.outerY()[idx] = hitsBase.ys()[outerHitIndex];
@@ -89,6 +119,49 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 #endif
   }
 
+  // Overload for callers (Segment.h) that still pass ModulesConst + moduleIndex.
+  template <alpaka::concepts::Acc TAcc>
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE void addMDToMemory(TAcc const& acc,
+                                                    MiniDoublets mds,
+                                                    HitsBaseConst hitsBase,
+                                                    HitsExtendedConst hitsExtended,
+                                                    ModulesConst modules,
+                                                    unsigned int lowerHitIdx,
+                                                    unsigned int upperHitIdx,
+                                                    uint16_t lowerModuleIdx,
+                                                    float dz,
+                                                    float dPhi,
+                                                    float dPhiChange,
+                                                    float shiftedX,
+                                                    float shiftedY,
+                                                    float shiftedZ,
+                                                    float noShiftedDphi,
+                                                    float noShiftedDPhiChange,
+                                                    unsigned int idx) {
+    ModuleMDData mod;
+    mod.lowerModuleIndex = lowerModuleIdx;
+    mod.moduleType = modules.moduleType()[lowerModuleIdx];
+    mod.moduleLayerType = modules.moduleLayerType()[lowerModuleIdx];
+    mod.subdet = modules.subdets()[lowerModuleIdx];
+    mod.isEndcapTwoS = (mod.subdet == Endcap && mod.moduleType == TwoS);
+    addMDToMemory(acc,
+                  mds,
+                  hitsBase,
+                  hitsExtended,
+                  mod,
+                  lowerHitIdx,
+                  upperHitIdx,
+                  dz,
+                  dPhi,
+                  dPhiChange,
+                  shiftedX,
+                  shiftedY,
+                  shiftedZ,
+                  noShiftedDphi,
+                  noShiftedDPhiChange,
+                  idx);
+  }
+
   ALPAKA_FN_ACC ALPAKA_FN_INLINE bool isTighterTiltedModules(ModulesConst modules, uint16_t moduleIndex) {
     // The "tighter" tilted modules are the subset of tilted modules that have smaller spacing
     // This is the same as what was previously considered as"isNormalTiltedModules"
@@ -132,76 +205,23 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
   }
 
   template <alpaka::concepts::Acc TAcc>
-  ALPAKA_FN_ACC ALPAKA_FN_INLINE float dPhiThreshold(TAcc const& acc,
-                                                     float rt,
-                                                     ModulesConst modules,
-                                                     uint16_t moduleIndex,
-                                                     const float ptCut,
-                                                     float dPhi = 0,
-                                                     float dz = 0) {
-    // =================================================================
-    // Various constants
-    // =================================================================
-    //mean of the horizontal layer position in y; treat this as R below
-
-    // =================================================================
-    // Computing some components that make up the cut threshold
-    // =================================================================
-
-    unsigned int iL = modules.layers()[moduleIndex] - 1;
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE float dPhiThreshold(
+      TAcc const& acc, float rt, ModuleMDData const& mod, const float ptCut, float dPhi = 0, float dz = 0) {
     const float miniSlope = alpaka::math::asin(acc, alpaka::math::min(acc, rt * k2Rinv1GeVf / ptCut, kSinAlphaMax));
-    const float rLayNominal =
-        ((modules.subdets()[moduleIndex] == Barrel) ? kMiniRminMeanBarrel[iL] : kMiniRminMeanEndcap[iL]);
-    const float miniPVoff = 0.1f / rLayNominal;
-    const float miniMuls = ((modules.subdets()[moduleIndex] == Barrel) ? kMiniMulsPtScaleBarrel[iL] * 3.f / ptCut
-                                                                       : kMiniMulsPtScaleEndcap[iL] * 3.f / ptCut);
-    const bool isTilted = modules.subdets()[moduleIndex] == Barrel and modules.sides()[moduleIndex] != Center;
-    //the lower module is sent in irrespective of its layer type. We need to fetch the drdz properly
-
-    float drdz;
-    if (isTilted) {
-      if (modules.moduleType()[moduleIndex] == PS and modules.moduleLayerType()[moduleIndex] == Strip) {
-        drdz = modules.drdzs()[moduleIndex];
-      } else {
-        drdz = modules.drdzs()[modules.partnerModuleIndices()[moduleIndex]];
-      }
+
+    if (mod.subdet == Barrel and mod.side == Center) {
+      return miniSlope + mod.sqrtTermFlat;
+    } else if (mod.subdet == Barrel) {
+      return miniSlope + alpaka::math::sqrt(acc, mod.sqrtTermBase + mod.miniTilt2 * miniSlope * miniSlope);
     } else {
-      drdz = 0;
-    }
-    const float miniTilt2 = ((isTilted) ? (0.5f * 0.5f) * (kPixelPSZpitch * kPixelPSZpitch) * (drdz * drdz) /
-                                              (1.f + drdz * drdz) / moduleGapSize(modules, moduleIndex)
-                                        : 0);
-
-    // Compute luminous region requirement for endcap
-    const float miniLum = alpaka::math::abs(acc, dPhi * kDeltaZLum / dz);  // Balaji's new error
-
-    // =================================================================
-    // Return the threshold value
-    // =================================================================
-    // Following condition is met if the module is central and flatly lying
-    if (modules.subdets()[moduleIndex] == Barrel and modules.sides()[moduleIndex] == Center) {
-      return miniSlope + alpaka::math::sqrt(acc, miniMuls * miniMuls + miniPVoff * miniPVoff);
-    }
-    // Following condition is met if the module is central and tilted
-    else if (modules.subdets()[moduleIndex] == Barrel and
-             modules.sides()[moduleIndex] != Center)  //all types of tilted modules
-    {
-      return miniSlope +
-             alpaka::math::sqrt(acc, miniMuls * miniMuls + miniPVoff * miniPVoff + miniTilt2 * miniSlope * miniSlope);
-    }
-    // If not barrel, it is Endcap
-    else {
-      return miniSlope + alpaka::math::sqrt(acc, miniMuls * miniMuls + miniPVoff * miniPVoff + miniLum * miniLum);
+      const float miniLum = alpaka::math::abs(acc, dPhi * kDeltaZLum / dz);
+      return miniSlope + alpaka::math::sqrt(acc, mod.sqrtTermBase + miniLum * miniLum);
     }
   }
 
   template <alpaka::concepts::Acc TAcc>
   ALPAKA_FN_INLINE ALPAKA_FN_ACC void shiftStripHits(TAcc const& acc,
-                                                     ModulesConst modules,
-                                                     uint16_t lowerModuleIndex,
-                                                     uint16_t upperModuleIndex,
-                                                     unsigned int lowerHitIndex,
-                                                     unsigned int upperHitIndex,
+                                                     ModuleMDData const& mod,
                                                      float* shiftedCoords,
                                                      float xLower,
                                                      float yLower,
@@ -211,52 +231,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                                                      float yUpper,
                                                      float zUpper,
                                                      float rtUpper) {
-    // This is the strip shift scheme that is explained in http://uaf-10.t2.ucsd.edu/~phchang/talks/PhilipChang20190607_SDL_Update.pdf (see backup slides)
-    // The main feature of this shifting is that the strip hits are shifted to be "aligned" in the line of sight from interaction point to the the pixel hit.
-    // (since pixel hit is well defined in 3-d)
-    // The strip hit is shifted along the strip detector to be placed in a guessed position where we think they would have actually crossed
-    // The size of the radial direction shift due to module separation gap is computed in "radial" size, while the shift is done along the actual strip orientation
-    // This means that there may be very very subtle edge effects coming from whether the strip hit is center of the module or the at the edge of the module
-    // But this should be relatively minor effect
-
-    // dependent variables for this if statement
-    // lowerModule
-    // lowerHit
-    // upperHit
-    // endcapGeometry
-    // tiltedGeometry
-
-    // Some variables relevant to the function
-    float xp;       // pixel x (pixel hit x)
-    float yp;       // pixel y (pixel hit y)
-    float zp;       // pixel y (pixel hit y)
-    float rtp;      // pixel y (pixel hit y)
-    float xa;       // "anchor" x (the anchor position on the strip module plane from pixel hit)
-    float ya;       // "anchor" y (the anchor position on the strip module plane from pixel hit)
-    float xo;       // old x (before the strip hit is moved up or down)
-    float yo;       // old y (before the strip hit is moved up or down)
-    float xn;       // new x (after the strip hit is moved up or down)
-    float yn;       // new y (after the strip hit is moved up or down)
-    float abszn;    // new z in absolute value
-    float zn;       // new z with the sign (+/-) accounted
-    float angleA;   // in r-z plane the theta of the pixel hit in polar coordinate is the angleA
-    float angleB;   // this is the angle of tilted module in r-z plane ("drdz"), for endcap this is 90 degrees
-    bool isEndcap;  // If endcap, drdz = infinity
-    float moduleSeparation;
-    float drprime;    // The radial shift size in x-y plane projection
-    float drprime_x;  // x-component of drprime
-    float drprime_y;  // y-component of drprime
-    const float& slope =
-        modules.dxdys()[lowerModuleIndex];  // The slope of the possible strip hits for a given module in x-y plane
-    float absArctanSlope;
-    float angleM;  // the angle M is the angle of rotation of the module in x-y plane if the possible strip hits are along the x-axis, then angleM = 0, and if the possible strip hits are along y-axis angleM = 90 degrees
-    float absdzprime;  // The distance between the two points after shifting
-    const float& drdz_ = modules.drdzs()[lowerModuleIndex];
-    // Assign hit pointers based on their hit type
+    float xp, yp, zp, rtp;
+    float xo, yo;
     bool pHitInverted = false;
-    if (modules.moduleType()[lowerModuleIndex] == PS) {
-      // TODO: This is somewhat of an mystery.... somewhat confused why this is the case
-      if (modules.moduleLayerType()[lowerModuleIndex] == Pixel) {
+    if (mod.moduleType == PS) {
+      if (mod.moduleLayerType == Pixel) {
         xo = xUpper;
         yo = yUpper;
         xp = xLower;
@@ -281,82 +260,64 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
       rtp = rtLower;
     }
 
-    // If it is endcap some of the math gets simplified (and also computers don't like infinities)
-    isEndcap = modules.subdets()[lowerModuleIndex] == Endcap;
+    const bool isEndcap = (mod.subdet == Endcap);
+
+    const float hypot_rz = alpaka::math::sqrt(acc, rtp * rtp + zp * zp);
+    const float sinA = rtp / hypot_rz;
+    const float cosA = alpaka::math::abs(acc, zp) / hypot_rz;
 
-    // NOTE: TODO: Keep in mind that the sin(atan) function can be simplified to something like x / sqrt(1 + x^2) and similar for cos
-    // I am not sure how slow sin, atan, cos, functions are in c++. If x / sqrt(1 + x^2) are faster change this later to reduce arithmetic computation time
-    angleA = alpaka::math::abs(acc, alpaka::math::atan(acc, rtp / zp));
-    angleB =
-        ((isEndcap)
-             ? kPi / 2.f
-             : alpaka::math::atan(
-                   acc,
-                   drdz_));  // The tilt module on the positive z-axis has negative drdz slope in r-z plane and vice versa
+    float sinApB;
+    if (isEndcap) {
+      sinApB = cosA;
+    } else {
+      const float inv_hypot_drdz = 1.f / alpaka::math::sqrt(acc, 1.f + mod.drdz * mod.drdz);
+      sinApB = sinA * inv_hypot_drdz + cosA * mod.drdz * inv_hypot_drdz;
+    }
 
-    moduleSeparation = moduleGapSize(modules, lowerModuleIndex);
+    float moduleSeparation = mod.moduleSep;
 
-    // Sign flips if the pixel is later layer
-    if (modules.isGloballyInner()[lowerModuleIndex] == pHitInverted) {
+    if (mod.isGloballyInner == pHitInverted) {
       moduleSeparation *= -1;
     }
 
-    drprime = (moduleSeparation / alpaka::math::sin(acc, angleA + angleB)) * alpaka::math::sin(acc, angleA);
-
-    // Compute arctan of the slope and take care of the slope = infinity case
-    absArctanSlope =
-        ((slope != kVerticalModuleSlope && edm::isFinite(slope)) ? fabs(alpaka::math::atan(acc, slope)) : kPi / 2.f);
+    float drprime = moduleSeparation * sinA / sinApB;
 
-    // Depending on which quadrant the pixel hit lies, we define the angleM by shifting them slightly differently
-    if (xp > 0 and yp > 0) {
-      angleM = absArctanSlope;
-    } else if (xp > 0 and yp < 0) {
-      angleM = kPi - absArctanSlope;
-    } else if (xp < 0 and yp < 0) {
-      angleM = kPi + absArctanSlope;
-    } else  // if (xp < 0 and yp > 0)
-    {
-      angleM = 2.f * kPi - absArctanSlope;
+    float drprime_x, drprime_y;
+    const float slope = mod.slope;
+    if (slope != kVerticalModuleSlope && edm::isFinite(slope)) {
+      const float inv_hypot_slope = 1.f / alpaka::math::sqrt(acc, 1.f + slope * slope);
+      drprime_x = drprime * ((xp > 0.f) - (xp < 0.f)) * alpaka::math::abs(acc, slope) * inv_hypot_slope;
+      drprime_y = drprime * ((yp > 0.f) - (yp < 0.f)) * inv_hypot_slope;
+    } else {
+      drprime_x = drprime * ((xp > 0.f) - (xp < 0.f));
+      drprime_y = 0.f;
     }
 
-    // Then since the angleM sign is taken care of properly
-    drprime_x = drprime * alpaka::math::sin(acc, angleM);
-    drprime_y = drprime * alpaka::math::cos(acc, angleM);
-
-    // The new anchor position is
-    xa = xp + drprime_x;
-    ya = yp + drprime_y;
+    float xa = xp + drprime_x;
+    float ya = yp + drprime_y;
 
-    // Compute the new strip hit position (if the slope value is in special condition take care of the exceptions)
-    if (slope == kVerticalModuleSlope ||
-        edm::isNotFinite(slope))  // Designated for tilted module when the slope is infinity (module lying along y-axis)
-    {
-      xn = xa;  // New x point is simply where the anchor is
-      yn = yo;  // No shift in y
+    float xn, yn;
+    if (slope == kVerticalModuleSlope || edm::isNotFinite(slope)) {
+      xn = xa;
+      yn = yo;
     } else if (slope == 0) {
-      xn = xo;  // New x point is simply where the anchor is
-      yn = ya;  // No shift in y
+      xn = xo;
+      yn = ya;
     } else {
-      xn = (slope * xa + (1.f / slope) * xo - ya + yo) / (slope + (1.f / slope));  // new xn
-      yn = (xn - xa) * slope + ya;                                                 // new yn
+      xn = (slope * xa + (1.f / slope) * xo - ya + yo) / (slope + (1.f / slope));
+      yn = (xn - xa) * slope + ya;
     }
 
-    // Computing new Z position
-    absdzprime = alpaka::math::abs(
-        acc,
-        moduleSeparation / alpaka::math::sin(acc, angleA + angleB) *
-            alpaka::math::cos(
-                acc,
-                angleA));  // module separation sign is for shifting in radial direction for z-axis direction take care of the sign later
+    float absdzprime = alpaka::math::abs(acc, moduleSeparation * cosA / sinApB);
 
-    // Depending on which one as closer to the interactin point compute the new z wrt to the pixel properly
-    if (modules.moduleLayerType()[lowerModuleIndex] == Pixel) {
+    float abszn;
+    if (mod.moduleLayerType == Pixel) {
       abszn = alpaka::math::abs(acc, zp) + absdzprime;
     } else {
       abszn = alpaka::math::abs(acc, zp) - absdzprime;
     }
 
-    zn = abszn * ((zp > 0) ? 1 : -1);  // Apply the sign of the zn
+    float zn = abszn * ((zp > 0) ? 1 : -1);
 
     shiftedCoords[0] = xn;
     shiftedCoords[1] = yn;
@@ -364,282 +325,288 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
   }
 
   template <alpaka::concepts::Acc TAcc>
-  ALPAKA_FN_ACC bool runMiniDoubletDefaultAlgoBarrel(TAcc const& acc,
-                                                     ModulesConst modules,
-                                                     uint16_t lowerModuleIndex,
-                                                     uint16_t upperModuleIndex,
-                                                     unsigned int lowerHitIndex,
-                                                     unsigned int upperHitIndex,
-                                                     float& dz,
-                                                     float& dPhi,
-                                                     float& dPhiChange,
-                                                     float& shiftedX,
-                                                     float& shiftedY,
-                                                     float& shiftedZ,
-                                                     float& noShiftedDphi,
-                                                     float& noShiftedDphiChange,
-                                                     float xLower,
-                                                     float yLower,
-                                                     float zLower,
-                                                     float rtLower,
-                                                     float xUpper,
-                                                     float yUpper,
-                                                     float zUpper,
-                                                     float rtUpper,
-                                                     const float ptCut) {
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runMiniDoubletDefaultAlgoBarrel(TAcc const& acc,
+                                                                      ModuleMDData const& mod,
+                                                                      float& dz,
+                                                                      float& dPhi,
+                                                                      float& dPhiChange,
+                                                                      float& shiftedX,
+                                                                      float& shiftedY,
+                                                                      float& shiftedZ,
+                                                                      float& noShiftedDphi,
+                                                                      float& noShiftedDphiChange,
+                                                                      float xLower,
+                                                                      float yLower,
+                                                                      float zLower,
+                                                                      float rtLower,
+                                                                      float xUpper,
+                                                                      float yUpper,
+                                                                      float zUpper,
+                                                                      float rtUpper,
+                                                                      const float ptCut) {
     dz = zLower - zUpper;
-    const float dzCut = modules.moduleType()[lowerModuleIndex] == PS ? 2.f : 10.f;
+    const float dzCut = mod.moduleType == PS ? 2.f : 10.f;
     const float sign = ((dz > 0) - (dz < 0)) * ((zLower > 0) - (zLower < 0));
     const float invertedcrossercut = (alpaka::math::abs(acc, dz) > 2) * sign;
 
-    if ((alpaka::math::abs(acc, dz) >= dzCut) || (invertedcrossercut > 0))
+    if ((alpaka::math::abs(acc, dz) >= dzCut) || (invertedcrossercut > 0)) {
       return false;
+    }
 
-    float miniCut = 0;
-
-    miniCut = modules.moduleLayerType()[lowerModuleIndex] == Pixel
-                  ? dPhiThreshold(acc, rtLower, modules, lowerModuleIndex, ptCut)
-                  : dPhiThreshold(acc, rtUpper, modules, lowerModuleIndex, ptCut);
+    float miniCut = mod.moduleLayerType == Pixel ? dPhiThreshold(acc, rtLower, mod, ptCut)
+                                                 : dPhiThreshold(acc, rtUpper, mod, ptCut);
+    const float miniCutSq = miniCut * miniCut;
+    const float tanMiniCutSq = miniCutSq * (1.f + (2.f / 3.f) * miniCutSq);
 
-    // Cut #2: dphi difference
-    // Ref to original code: https://github.com/slava77/cms-tkph2-ntuple/blob/184d2325147e6930030d3d1f780136bc2dd29ce6/doubletAnalysis.C#L3085
-    float xn = 0.f, yn = 0.f;
+    float x1, y1, x2, y2, r1sq, r2sq;
     float shiftedRt2 = 0.f;
-    if (modules.sides()[lowerModuleIndex] != Center)  // If barrel and not center it is tilted
-    {
-      // Shift the hits and calculate new xn, yn position
+
+    if (mod.isTilted) {
       float shiftedCoords[3];
-      shiftStripHits(acc,
-                     modules,
-                     lowerModuleIndex,
-                     upperModuleIndex,
-                     lowerHitIndex,
-                     upperHitIndex,
-                     shiftedCoords,
-                     xLower,
-                     yLower,
-                     zLower,
-                     rtLower,
-                     xUpper,
-                     yUpper,
-                     zUpper,
-                     rtUpper);
-      xn = shiftedCoords[0];
-      yn = shiftedCoords[1];
-
-      // Lower or the upper hit needs to be modified depending on which one was actually shifted
-      if (modules.moduleLayerType()[lowerModuleIndex] == Pixel) {
+      shiftStripHits(acc, mod, shiftedCoords, xLower, yLower, zLower, rtLower, xUpper, yUpper, zUpper, rtUpper);
+      float xn = shiftedCoords[0];
+      float yn = shiftedCoords[1];
+      shiftedRt2 = xn * xn + yn * yn;
+
+      if (mod.moduleLayerType == Pixel) {
         shiftedX = xn;
         shiftedY = yn;
         shiftedZ = zUpper;
-        shiftedRt2 = xn * xn + yn * yn;
-
-        dPhi = cms::alpakatools::deltaPhi(acc, xLower, yLower, shiftedX, shiftedY);  //function from Hit.cc
-        noShiftedDphi = cms::alpakatools::deltaPhi(acc, xLower, yLower, xUpper, yUpper);
+        x1 = xLower;
+        y1 = yLower;
+        x2 = xn;
+        y2 = yn;
+        r1sq = rtLower * rtLower;
+        r2sq = shiftedRt2;
       } else {
         shiftedX = xn;
         shiftedY = yn;
         shiftedZ = zLower;
-        shiftedRt2 = xn * xn + yn * yn;
-        dPhi = cms::alpakatools::deltaPhi(acc, shiftedX, shiftedY, xUpper, yUpper);
-        noShiftedDphi = cms::alpakatools::deltaPhi(acc, xLower, yLower, xUpper, yUpper);
+        x1 = xn;
+        y1 = yn;
+        x2 = xUpper;
+        y2 = yUpper;
+        r1sq = shiftedRt2;
+        r2sq = rtUpper * rtUpper;
       }
     } else {
       shiftedX = 0.f;
       shiftedY = 0.f;
       shiftedZ = 0.f;
-      shiftedRt2 = 0.f;
-      dPhi = cms::alpakatools::deltaPhi(acc, xLower, yLower, xUpper, yUpper);
-      noShiftedDphi = dPhi;
+      x1 = xLower;
+      y1 = yLower;
+      x2 = xUpper;
+      y2 = yUpper;
+      r1sq = rtLower * rtLower;
+      r2sq = rtUpper * rtUpper;
+    }
+
+    const float crossDP = x1 * y2 - x2 * y1;
+    const float dotDP = x1 * x2 + y1 * y2;
+    if (dotDP <= 0.f || crossDP * crossDP >= tanMiniCutSq * dotDP * dotDP) {
+      return false;
     }
 
-    if (alpaka::math::abs(acc, dPhi) >= miniCut)
+    const float rInnerSq = alpaka::math::min(acc, r1sq, r2sq);
+    const float dotDC = dotDP - rInnerSq;
+    if (dotDC <= 0.f || crossDP * crossDP >= tanMiniCutSq * dotDC * dotDC) {
       return false;
+    }
+
+    dPhi = alpaka::math::atan2(acc, crossDP, dotDP);
+    if (r1sq < r2sq) {
+      dPhiChange = alpaka::math::atan2(acc, crossDP, dotDP - r1sq);
+    } else {
+      dPhiChange = alpaka::math::atan2(acc, -crossDP, dotDP - r2sq);
+    }
 
-    // Cut #3: The dphi change going from lower Hit to upper Hit
-    // Ref to original code: https://github.com/slava77/cms-tkph2-ntuple/blob/184d2325147e6930030d3d1f780136bc2dd29ce6/doubletAnalysis.C#L3076
-    if (modules.sides()[lowerModuleIndex] != Center) {
-      // When it is tilted, use the new shifted positions
-      if (modules.moduleLayerType()[lowerModuleIndex] == Pixel) {
-        // dPhi Change should be calculated so that the upper hit has higher rt.
-        // In principle, this kind of check rt_lower < rt_upper should not be necessary because the hit shifting should have taken care of this.
-        // (i.e. the strip hit is shifted to be aligned in the line of sight from interaction point to pixel hit of PS module guaranteeing rt ordering)
-        // But I still placed this check for safety. (TODO: After checking explicitly if not needed remove later?)
-        // setdeltaPhiChange(lowerHit.rt() < upperHitMod.rt() ? lowerHit.deltaPhiChange(upperHitMod) : upperHitMod.deltaPhiChange(lowerHit));
-
-        dPhiChange = (rtLower * rtLower < shiftedRt2) ? deltaPhiChange(acc, xLower, yLower, shiftedX, shiftedY)
-                                                      : deltaPhiChange(acc, shiftedX, shiftedY, xLower, yLower);
-        noShiftedDphiChange = rtLower < rtUpper ? deltaPhiChange(acc, xLower, yLower, xUpper, yUpper)
-                                                : deltaPhiChange(acc, xUpper, yUpper, xLower, yLower);
+#ifdef CUT_VALUE_DEBUG
+    if (mod.isTilted) {
+      const float crossNS = xLower * yUpper - xUpper * yLower;
+      const float dotNS = xLower * xUpper + yLower * yUpper;
+      noShiftedDphi = alpaka::math::atan2(acc, crossNS, dotNS);
+      if (rtLower < rtUpper) {
+        noShiftedDphiChange = alpaka::math::atan2(acc, crossNS, dotNS - rtLower * rtLower);
       } else {
-        // dPhi Change should be calculated so that the upper hit has higher rt.
-        // In principle, this kind of check rt_lower < rt_upper should not be necessary because the hit shifting should have taken care of this.
-        // (i.e. the strip hit is shifted to be aligned in the line of sight from interaction point to pixel hit of PS module guaranteeing rt ordering)
-        // But I still placed this check for safety. (TODO: After checking explicitly if not needed remove later?)
-
-        dPhiChange = (shiftedRt2 < rtUpper * rtUpper) ? deltaPhiChange(acc, shiftedX, shiftedY, xUpper, yUpper)
-                                                      : deltaPhiChange(acc, xUpper, yUpper, shiftedX, shiftedY);
-        noShiftedDphiChange = rtLower < rtUpper ? deltaPhiChange(acc, xLower, yLower, xUpper, yUpper)
-                                                : deltaPhiChange(acc, xUpper, yUpper, xLower, yLower);
+        noShiftedDphiChange = alpaka::math::atan2(acc, -crossNS, dotNS - rtUpper * rtUpper);
       }
     } else {
-      // When it is flat lying module, whichever is the lowerSide will always have rt lower
-      dPhiChange = deltaPhiChange(acc, xLower, yLower, xUpper, yUpper);
+      noShiftedDphi = dPhi;
       noShiftedDphiChange = dPhiChange;
     }
+#endif
 
-    return alpaka::math::abs(acc, dPhiChange) < miniCut;
+    return true;
   }
 
   template <alpaka::concepts::Acc TAcc>
-  ALPAKA_FN_ACC bool runMiniDoubletDefaultAlgoEndcap(TAcc const& acc,
-                                                     ModulesConst modules,
-                                                     uint16_t lowerModuleIndex,
-                                                     uint16_t upperModuleIndex,
-                                                     unsigned int lowerHitIndex,
-                                                     unsigned int upperHitIndex,
-                                                     float& drt,
-                                                     float& dPhi,
-                                                     float& dPhiChange,
-                                                     float& shiftedX,
-                                                     float& shiftedY,
-                                                     float& shiftedZ,
-                                                     float& noShiftedDphi,
-                                                     float& noShiftedDphichange,
-                                                     float xLower,
-                                                     float yLower,
-                                                     float zLower,
-                                                     float rtLower,
-                                                     float xUpper,
-                                                     float yUpper,
-                                                     float zUpper,
-                                                     float rtUpper,
-                                                     const float ptCut) {
-    // There are series of cuts that applies to mini-doublet in a "endcap" region
-    // Cut #1 : dz cut. The dz difference can't be larger than 1cm. (max separation is 4mm for modules in the endcap)
-    // Ref to original code: https://github.com/slava77/cms-tkph2-ntuple/blob/184d2325147e6930030d3d1f780136bc2dd29ce6/doubletAnalysis.C#L3093
-    // For PS module in case when it is tilted a different dz (after the strip hit shift) is calculated later.
-
-    float dz = zLower - zUpper;  // Not const since later it might change depending on the type of module
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runMiniDoubletDefaultAlgoEndcap(TAcc const& acc,
+                                                                      ModuleMDData const& mod,
+                                                                      float& drt,
+                                                                      float& dPhi,
+                                                                      float& dPhiChange,
+                                                                      float& shiftedX,
+                                                                      float& shiftedY,
+                                                                      float& shiftedZ,
+                                                                      float& noShiftedDphi,
+                                                                      float& noShiftedDphichange,
+                                                                      float xLower,
+                                                                      float yLower,
+                                                                      float zLower,
+                                                                      float rtLower,
+                                                                      float xUpper,
+                                                                      float yUpper,
+                                                                      float zUpper,
+                                                                      float rtUpper,
+                                                                      const float ptCut) {
+    float dz = zLower - zUpper;
 
     const float dzCut = 1.f;
 
-    if (alpaka::math::abs(acc, dz) >= dzCut)
+    if (alpaka::math::abs(acc, dz) >= dzCut) {
       return false;
-    // Cut #2 : drt cut. The dz difference can't be larger than 1cm. (max separation is 4mm for modules in the endcap)
-    // Ref to original code: https://github.com/slava77/cms-tkph2-ntuple/blob/184d2325147e6930030d3d1f780136bc2dd29ce6/doubletAnalysis.C#L3100
-    const float drtCut = modules.moduleType()[lowerModuleIndex] == PS ? 2.f : 10.f;
+    }
+    const float drtCut = mod.moduleType == PS ? 2.f : 10.f;
     drt = rtLower - rtUpper;
-    if (alpaka::math::abs(acc, drt) >= drtCut)
+    if (alpaka::math::abs(acc, drt) >= drtCut) {
       return false;
-    // The new scheme shifts strip hits to be "aligned" along the line of sight from interaction point to the pixel hit (if it is PS modules)
+    }
     float xn = 0, yn = 0, zn = 0;
 
     float shiftedCoords[3];
-    shiftStripHits(acc,
-                   modules,
-                   lowerModuleIndex,
-                   upperModuleIndex,
-                   lowerHitIndex,
-                   upperHitIndex,
-                   shiftedCoords,
-                   xLower,
-                   yLower,
-                   zLower,
-                   rtLower,
-                   xUpper,
-                   yUpper,
-                   zUpper,
-                   rtUpper);
+    shiftStripHits(acc, mod, shiftedCoords, xLower, yLower, zLower, rtLower, xUpper, yUpper, zUpper, rtUpper);
 
     xn = shiftedCoords[0];
     yn = shiftedCoords[1];
     zn = shiftedCoords[2];
 
-    if (modules.moduleType()[lowerModuleIndex] == PS) {
-      // Appropriate lower or upper hit is modified after checking which one was actually shifted
-      if (modules.moduleLayerType()[lowerModuleIndex] == Pixel) {
+    // Compute dPhi using cross/dot product with single atan2 (instead of 2 atan2 from deltaPhi).
+    // deltaPhi(x1,y1,x2,y2) = atan2(x1*y2 - x2*y1, x1*x2 + y1*y2).
+    float dpX1, dpY1, dpX2, dpY2;  // the two points for dPhi
+    if (mod.moduleType == PS) {
+      if (mod.moduleLayerType == Pixel) {
         shiftedX = xn;
         shiftedY = yn;
         shiftedZ = zUpper;
-        dPhi = cms::alpakatools::deltaPhi(acc, xLower, yLower, shiftedX, shiftedY);
-        noShiftedDphi = cms::alpakatools::deltaPhi(acc, xLower, yLower, xUpper, yUpper);
+        dpX1 = xLower;
+        dpY1 = yLower;
+        dpX2 = xn;
+        dpY2 = yn;
       } else {
         shiftedX = xn;
         shiftedY = yn;
         shiftedZ = zLower;
-        dPhi = cms::alpakatools::deltaPhi(acc, shiftedX, shiftedY, xUpper, yUpper);
-        noShiftedDphi = cms::alpakatools::deltaPhi(acc, xLower, yLower, xUpper, yUpper);
+        dpX1 = xn;
+        dpY1 = yn;
+        dpX2 = xUpper;
+        dpY2 = yUpper;
       }
     } else {
       shiftedX = xn;
       shiftedY = yn;
       shiftedZ = zUpper;
-      dPhi = cms::alpakatools::deltaPhi(acc, xLower, yLower, xn, yn);
-      noShiftedDphi = cms::alpakatools::deltaPhi(acc, xLower, yLower, xUpper, yUpper);
+      dpX1 = xLower;
+      dpY1 = yLower;
+      dpX2 = xn;
+      dpY2 = yn;
+    }
+
+    const float crossDP = dpX1 * dpY2 - dpX2 * dpY1;
+    const float dotDP = dpX1 * dpX2 + dpY1 * dpY2;
+
+    // Algebraic dPhiChange pre-check: reject pairs guaranteed to fail the dPhiChange cut
+    // without computing atan2.  Uses |tan(dPhi)| = |crossDP/dotDP| as an overestimate of
+    // |dPhi| for the miniLum term (since tan(x) >= x for x in [0, pi/2)), making this
+    // strictly looser than the actual cut -- physics-safe (never rejects a pair that would pass).
+    {
+      float rt = mod.moduleLayerType == Pixel ? rtLower : rtUpper;
+      float miniSlope = alpaka::math::asin(acc, alpaka::math::min(acc, rt * k2Rinv1GeVf / ptCut, kSinAlphaMax));
+      if (dotDP > 0.f) {
+        // Use |tan(dPhi)| = |crossDP|/dotDP as overestimate of |dPhi| for miniLum
+        float absCrossDP = alpaka::math::abs(acc, crossDP);
+        float tanDPhiOverDz = absCrossDP / (dotDP * alpaka::math::abs(acc, dz));
+        float miniLum = tanDPhiOverDz * kDeltaZLum;  // overestimates actual miniLum
+        float miniCut = miniSlope + alpaka::math::sqrt(acc, mod.sqrtTermBase + miniLum * miniLum);
+        // dPhiChange = dPhi * (|zLower| + |dz|) / |dz|, so the dPhiChange cut
+        // |dPhiChange| >= miniCut becomes |dPhi| >= miniCut * |dz| / (|zLower| + |dz|)
+        // Using crossDP/dotDP: |crossDP/dotDP| >= miniCut * |dz| / (|zLower| + |dz|)
+        // => |crossDP| * (|zLower| + |dz|) >= miniCut * dotDP * |dz|
+        float absDz = alpaka::math::abs(acc, dz);
+        float lhs = absCrossDP * (alpaka::math::abs(acc, zLower) + absDz);
+        float rhs = miniCut * dotDP * absDz;
+        if (lhs >= rhs)
+          return false;
+      } else {
+        // dotDP <= 0 means |dPhi| > pi/2 -- guaranteed to fail any reasonable cut
+        return false;
+      }
     }
 
+    dPhi = alpaka::math::atan2(acc, crossDP, dotDP);
+
     // dz needs to change if it is a PS module where the strip hits are shifted in order to properly account for the case when a tilted module falls under "endcap logic"
     // if it was an endcap it will have zero effect
-    if (modules.moduleType()[lowerModuleIndex] == PS) {
-      dz = modules.moduleLayerType()[lowerModuleIndex] == Pixel ? zLower - zn : zUpper - zn;
+    if (mod.moduleType == PS) {
+      dz = mod.moduleLayerType == Pixel ? zLower - zn : zUpper - zn;
     }
 
-    float miniCut = 0;
-    miniCut = modules.moduleLayerType()[lowerModuleIndex] == Pixel
-                  ? dPhiThreshold(acc, rtLower, modules, lowerModuleIndex, ptCut, dPhi, dz)
-                  : dPhiThreshold(acc, rtUpper, modules, lowerModuleIndex, ptCut, dPhi, dz);
+    float miniCut = mod.moduleLayerType == Pixel ? dPhiThreshold(acc, rtLower, mod, ptCut, dPhi, dz)
+                                                 : dPhiThreshold(acc, rtUpper, mod, ptCut, dPhi, dz);
 
-    if (alpaka::math::abs(acc, dPhi) >= miniCut)
+    if (alpaka::math::abs(acc, dPhi) >= miniCut) {
       return false;
+    }
 
     // Cut #4: Another cut on the dphi after some modification
     // Ref to original code: https://github.com/slava77/cms-tkph2-ntuple/blob/184d2325147e6930030d3d1f780136bc2dd29ce6/doubletAnalysis.C#L3119-L3124
 
     float dzFrac = alpaka::math::abs(acc, dz) / alpaka::math::abs(acc, zLower);
     dPhiChange = dPhi / dzFrac * (1.f + dzFrac);
+
+    if (alpaka::math::abs(acc, dPhiChange) >= miniCut) {
+      return false;
+    }
+
+    // Passed all cuts -- compute noShifted values for storage (only ~2% of pairs reach here).
+#ifdef CUT_VALUE_DEBUG
+    const float crossNS = xLower * yUpper - xUpper * yLower;
+    const float dotNS = xLower * xUpper + yLower * yUpper;
+    noShiftedDphi = alpaka::math::atan2(acc, crossNS, dotNS);
     noShiftedDphichange = noShiftedDphi / dzFrac * (1.f + dzFrac);
+#endif
 
-    return alpaka::math::abs(acc, dPhiChange) < miniCut;
+    return true;
   }
 
   template <alpaka::concepts::Acc TAcc>
-  ALPAKA_FN_ACC bool runMiniDoubletDefaultAlgo(TAcc const& acc,
-                                               ModulesConst modules,
-                                               uint16_t lowerModuleIndex,
-                                               uint16_t upperModuleIndex,
-                                               unsigned int lowerHitIndex,
-                                               unsigned int upperHitIndex,
-                                               float& dz,
-                                               float& dPhi,
-                                               float& dPhiChange,
-                                               float& shiftedX,
-                                               float& shiftedY,
-                                               float& shiftedZ,
-                                               float& noShiftedDphi,
-                                               float& noShiftedDphiChange,
-                                               float xLower,
-                                               float yLower,
-                                               float zLower,
-                                               float rtLower,
-                                               float xUpper,
-                                               float yUpper,
-                                               float zUpper,
-                                               float rtUpper,
-                                               const float ptCut,
-                                               uint16_t clustSizeLower,
-                                               uint16_t clustSizeUpper,
-                                               const uint16_t clustSizeCut) {
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runMiniDoubletDefaultAlgo(TAcc const& acc,
+                                                                ModuleMDData const& mod,
+                                                                float& dz,
+                                                                float& dPhi,
+                                                                float& dPhiChange,
+                                                                float& shiftedX,
+                                                                float& shiftedY,
+                                                                float& shiftedZ,
+                                                                float& noShiftedDphi,
+                                                                float& noShiftedDphiChange,
+                                                                float xLower,
+                                                                float yLower,
+                                                                float zLower,
+                                                                float rtLower,
+                                                                float xUpper,
+                                                                float yUpper,
+                                                                float zUpper,
+                                                                float rtUpper,
+                                                                const float ptCut,
+                                                                uint16_t clustSizeLower,
+                                                                uint16_t clustSizeUpper,
+                                                                const uint16_t clustSizeCut) {
     if (clustSizeLower > clustSizeCut or clustSizeUpper > clustSizeCut) {
       return false;
     }
-    if (modules.subdets()[lowerModuleIndex] == Barrel) {
+    if (mod.subdet == Barrel) {
       return runMiniDoubletDefaultAlgoBarrel(acc,
-                                             modules,
-                                             lowerModuleIndex,
-                                             upperModuleIndex,
-                                             lowerHitIndex,
-                                             upperHitIndex,
+                                             mod,
                                              dz,
                                              dPhi,
                                              dPhiChange,
@@ -659,11 +626,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                                              ptCut);
     } else {
       return runMiniDoubletDefaultAlgoEndcap(acc,
-                                             modules,
-                                             lowerModuleIndex,
-                                             upperModuleIndex,
-                                             lowerHitIndex,
-                                             upperHitIndex,
+                                             mod,
                                              dz,
                                              dPhi,
                                              dPhiChange,
@@ -696,7 +659,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                                   const float ptCut,
                                   const uint16_t clustSizeCut) const {
       for (uint16_t lowerModuleIndex : cms::alpakatools::uniform_elements_y(acc, modules.nLowerModules())) {
-        uint16_t upperModuleIndex = modules.partnerModuleIndices()[lowerModuleIndex];
         int nLowerHits = hitsRanges.hitRangesnLower()[lowerModuleIndex];
         int nUpperHits = hitsRanges.hitRangesnUpper()[lowerModuleIndex];
         if (hitsRanges.hitRangesLower()[lowerModuleIndex] == -1)
@@ -705,6 +667,42 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
         unsigned int loHitArrayIndex = hitsRanges.hitRangesLower()[lowerModuleIndex];
         int limit = nUpperHits * nLowerHits;
 
+        // Hoist module-constant data once per module to avoid redundant SoA loads per hit pair.
+        ModuleMDData mod;
+        mod.lowerModuleIndex = lowerModuleIndex;
+        mod.subdet = modules.subdets()[lowerModuleIndex];
+        mod.side = modules.sides()[lowerModuleIndex];
+        mod.moduleType = modules.moduleType()[lowerModuleIndex];
+        mod.moduleLayerType = modules.moduleLayerType()[lowerModuleIndex];
+        mod.iL = modules.layers()[lowerModuleIndex] - 1;
+        mod.isTilted = (mod.subdet == Barrel && mod.side != Center);
+        mod.isEndcapTwoS = (mod.subdet == Endcap && mod.moduleType == TwoS);
+        mod.isGloballyInner = modules.isGloballyInner()[lowerModuleIndex];
+        mod.slope = modules.dxdys()[lowerModuleIndex];
+        mod.drdz = modules.drdzs()[lowerModuleIndex];
+        mod.moduleSep = moduleGapSize(modules, lowerModuleIndex);
+
+        // Pre-compute dPhiThreshold module-constant parts
+        float rLayNominal = (mod.subdet == Barrel) ? kMiniRminMeanBarrel[mod.iL] : kMiniRminMeanEndcap[mod.iL];
+        mod.miniPVoff = 0.1f / rLayNominal;
+        mod.miniMuls = (mod.subdet == Barrel) ? kMiniMulsPtScaleBarrel[mod.iL] * 3.f / ptCut
+                                              : kMiniMulsPtScaleEndcap[mod.iL] * 3.f / ptCut;
+        mod.sqrtTermBase = mod.miniMuls * mod.miniMuls + mod.miniPVoff * mod.miniPVoff;
+        mod.sqrtTermFlat = alpaka::math::sqrt(acc, mod.sqrtTermBase);
+
+        if (mod.isTilted) {
+          float drdzThresh;
+          if (mod.moduleType == PS and mod.moduleLayerType == Strip) {
+            drdzThresh = modules.drdzs()[lowerModuleIndex];
+          } else {
+            drdzThresh = modules.drdzs()[modules.partnerModuleIndices()[lowerModuleIndex]];
+          }
+          mod.miniTilt2 = 0.25f * (kPixelPSZpitch * kPixelPSZpitch) * (drdzThresh * drdzThresh) /
+                          (1.f + drdzThresh * drdzThresh) / mod.moduleSep;
+        } else {
+          mod.miniTilt2 = 0.f;
+        }
+
         for (int hitIndex : cms::alpakatools::uniform_elements_x(acc, limit)) {
           int lowerHitIndex = hitIndex / nUpperHits;
           int upperHitIndex = hitIndex % nUpperHits;
@@ -727,11 +725,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
           float dz, dphi, dphichange, shiftedX, shiftedY, shiftedZ, noShiftedDphi, noShiftedDphiChange;
           bool success = runMiniDoubletDefaultAlgo(acc,
-                                                   modules,
-                                                   lowerModuleIndex,
-                                                   upperModuleIndex,
-                                                   lowerHitArrayIndex,
-                                                   upperHitArrayIndex,
+                                                   mod,
                                                    dz,
                                                    dphi,
                                                    dphichange,
@@ -769,10 +763,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                             mds,
                             hitsBase,
                             hitsExtended,
-                            modules,
+                            mod,
                             lowerHitArrayIndex,
                             upperHitArrayIndex,
-                            lowerModuleIndex,
                             dz,
                             dphi,
                             dphichange,
diff --git a/RecoTracker/LSTCore/src/alpaka/NeuralNetwork.h b/RecoTracker/LSTCore/src/alpaka/NeuralNetwork.h
index d58917c81dee3..8e12327f5bcb6 100644
--- a/RecoTracker/LSTCore/src/alpaka/NeuralNetwork.h
+++ b/RecoTracker/LSTCore/src/alpaka/NeuralNetwork.h
@@ -43,7 +43,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
   template <alpaka::concepts::Acc TAcc>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE float sigmoid_activation(TAcc const& acc, const float x) {
-    return alpaka::math::exp(acc, x) / (alpaka::math::exp(acc, x) + 1.f);
+    const float e = alpaka::math::exp(acc, x);
+    return e / (e + 1.f);
   }
 
   template <int IN_FEATURES, int OUT_FEATURES>
diff --git a/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h b/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h
index e81b3bff105a8..dc525da4fbf99 100644
--- a/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h
+++ b/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h
@@ -493,15 +493,38 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     float pixelRadiusTemp, tripletRadius, rPhiChiSquaredTemp, rzChiSquaredTemp, rPhiChiSquaredInwardsTemp, centerXTemp,
         centerYTemp, pixelRadiusErrorTemp;
 
+    // Construct PixelSeedData for the pixel segment (shared with pT3 path).
+    unsigned int pixelInnerMDIndex = segments.mdIndices()[pixelSegmentIndex][0];
+    unsigned int pixelOuterMDIndex = segments.mdIndices()[pixelSegmentIndex][1];
+    PixelSeedData pix;
+    pix.ptIn = pixelSeeds.ptIn()[pixelSegmentArrayIndex];
+    pix.px = pixelSeeds.px()[pixelSegmentArrayIndex];
+    pix.py = pixelSeeds.py()[pixelSegmentArrayIndex];
+    pix.pz = pixelSeeds.pz()[pixelSegmentArrayIndex];
+    pix.ptErr = pixelSeeds.ptErr()[pixelSegmentArrayIndex];
+    pix.etaErr = pixelSeeds.etaErr()[pixelSegmentArrayIndex];
+    pix.eta = pixelSeeds.eta()[pixelSegmentArrayIndex];
+    pix.phi = pixelSeeds.phi()[pixelSegmentArrayIndex];
+    pix.charge = pixelSeeds.charge()[pixelSegmentArrayIndex];
+    pix.circleCenterX = pixelSegments.circleCenterX()[pixelSegmentArrayIndex];
+    pix.circleCenterY = pixelSegments.circleCenterY()[pixelSegmentArrayIndex];
+    pix.circleRadius = pixelSegments.circleRadius()[pixelSegmentArrayIndex];
+    pix.x_InLo = mds.anchorX()[pixelInnerMDIndex];
+    pix.y_InLo = mds.anchorY()[pixelInnerMDIndex];
+    pix.z_InLo = mds.anchorZ()[pixelInnerMDIndex];
+    pix.rt_InLo = mds.anchorRt()[pixelInnerMDIndex];
+    pix.x_InUp = mds.anchorX()[pixelOuterMDIndex];
+    pix.y_InUp = mds.anchorY()[pixelOuterMDIndex];
+    pix.z_InUp = mds.anchorZ()[pixelOuterMDIndex];
+    pix.rt_InUp = mds.anchorRt()[pixelOuterMDIndex];
+    pix.alpha_InLo = __H2F(segments.dPhiChanges()[pixelSegmentIndex]);
+
     if (not runPixelTripletDefaultAlgo<dnn::pt3dnn::pT5WP>(acc,
                                                            modules,
-                                                           ranges,
                                                            mds,
                                                            segments,
-                                                           pixelSeeds,
-                                                           pixelSegments,
+                                                           pix,
                                                            triplets,
-                                                           pixelSegmentIndex,
                                                            t5InnerT3Index,
                                                            pixelRadiusTemp,
                                                            tripletRadius,
@@ -520,9 +543,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     unsigned int secondSegmentIndex = triplets.segmentIndices()[t5InnerT3Index][1];
     unsigned int thirdSegmentIndex = triplets.segmentIndices()[t5OuterT3Index][0];
     unsigned int fourthSegmentIndex = triplets.segmentIndices()[t5OuterT3Index][1];
-
-    unsigned int pixelInnerMDIndex = segments.mdIndices()[pixelSegmentIndex][0];
-    unsigned int pixelOuterMDIndex = segments.mdIndices()[pixelSegmentIndex][1];
     unsigned int firstMDIndex = segments.mdIndices()[firstSegmentIndex][0];
     unsigned int secondMDIndex = segments.mdIndices()[secondSegmentIndex][0];
     unsigned int thirdMDIndex = segments.mdIndices()[secondSegmentIndex][1];
@@ -538,10 +558,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     uint16_t lowerModuleIndices[Params_T5::kLayers] = {
         lowerModuleIndex1, lowerModuleIndex2, lowerModuleIndex3, lowerModuleIndex4, lowerModuleIndex5};
 
-    float rtPix[Params_pLS::kLayers] = {mds.anchorRt()[pixelInnerMDIndex], mds.anchorRt()[pixelOuterMDIndex]};
-    float xPix[Params_pLS::kLayers] = {mds.anchorX()[pixelInnerMDIndex], mds.anchorX()[pixelOuterMDIndex]};
-    float yPix[Params_pLS::kLayers] = {mds.anchorY()[pixelInnerMDIndex], mds.anchorY()[pixelOuterMDIndex]};
-    float zPix[Params_pLS::kLayers] = {mds.anchorZ()[pixelInnerMDIndex], mds.anchorZ()[pixelOuterMDIndex]};
+    float rtPix[Params_pLS::kLayers] = {pix.rt_InLo, pix.rt_InUp};
+    float xPix[Params_pLS::kLayers] = {pix.x_InLo, pix.x_InUp};
+    float yPix[Params_pLS::kLayers] = {pix.y_InLo, pix.y_InUp};
+    float zPix[Params_pLS::kLayers] = {pix.z_InLo, pix.z_InUp};
     float zs[Params_T5::kLayers] = {mds.anchorZ()[firstMDIndex],
                                     mds.anchorZ()[secondMDIndex],
                                     mds.anchorZ()[thirdMDIndex],
@@ -553,31 +573,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                                      mds.anchorRt()[fourthMDIndex],
                                      mds.anchorRt()[fifthMDIndex]};
 
-    float pixelSegmentPt = pixelSeeds.ptIn()[pixelSegmentArrayIndex];
-    float pixelSegmentPx = pixelSeeds.px()[pixelSegmentArrayIndex];
-    float pixelSegmentPy = pixelSeeds.py()[pixelSegmentArrayIndex];
-    float pixelSegmentPz = pixelSeeds.pz()[pixelSegmentArrayIndex];
-    int pixelSegmentCharge = pixelSeeds.charge()[pixelSegmentArrayIndex];
-
     rzChiSquared = 0;
 
     //get the appropriate centers
-    pixelRadius = pixelSegments.circleRadius()[pixelSegmentArrayIndex];
-
-    rzChiSquared = computePT5RZChiSquared(acc,
-                                          modules,
-                                          lowerModuleIndices,
-                                          rtPix,
-                                          xPix,
-                                          yPix,
-                                          zPix,
-                                          rts,
-                                          zs,
-                                          pixelSegmentPt,
-                                          pixelSegmentPx,
-                                          pixelSegmentPy,
-                                          pixelSegmentPz,
-                                          pixelSegmentCharge);
+    pixelRadius = pix.circleRadius;
+
+    rzChiSquared = computePT5RZChiSquared(
+        acc, modules, lowerModuleIndices, rtPix, xPix, yPix, zPix, rts, zs, pix.ptIn, pix.px, pix.py, pix.pz, pix.charge);
 
     if (pixelRadius < 5.0f * kR1GeVf) {  //only apply r-z chi2 cuts for <5GeV tracks
       if (not passPT5RZChiSquaredCuts(modules,
@@ -603,8 +605,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                                     mds.anchorY()[fifthMDIndex]};
 
     //get the appropriate centers
-    centerX = pixelSegments.circleCenterX()[pixelSegmentArrayIndex];
-    centerY = pixelSegments.circleCenterY()[pixelSegmentArrayIndex];
+    centerX = pix.circleCenterX;
+    centerY = pix.circleCenterY;
 
     float T5CenterX = quintuplets.regressionCenterX()[quintupletIndex];
     float T5CenterY = quintuplets.regressionCenterY()[quintupletIndex];
diff --git a/RecoTracker/LSTCore/src/alpaka/PixelTriplet.h b/RecoTracker/LSTCore/src/alpaka/PixelTriplet.h
index 6dfa2358d568f..5c2fe8c8b787a 100644
--- a/RecoTracker/LSTCore/src/alpaka/PixelTriplet.h
+++ b/RecoTracker/LSTCore/src/alpaka/PixelTriplet.h
@@ -15,37 +15,39 @@
 
 namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
+  // Pre-loaded pixel segment data, invariant across triplets for a given pixel segment.
+  // Hoisted to avoid redundant SoA loads through the pT3 function chain.
+  struct PixelSeedData {
+    float ptIn, px, py, pz, ptErr, etaErr;
+    float eta, phi;
+    int charge;
+    float circleCenterX, circleCenterY, circleRadius;
+    float x_InLo, y_InLo, z_InLo, rt_InLo;  // pixel inner MD (pLSMD0)
+    float x_InUp, y_InUp, z_InUp, rt_InUp;  // pixel outer MD (pLSMD1)
+    float alpha_InLo;                       // pixel segment dPhiChange
+  };
+
   template <alpaka::concepts::Acc TAcc>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const& acc,
                                                                 ModulesConst modules,
-                                                                ObjectRangesConst ranges,
                                                                 MiniDoubletsConst mds,
                                                                 SegmentsConst segments,
-                                                                PixelSeedsConst pixelSeeds,
-                                                                uint16_t pixelModuleIndex,
+                                                                const PixelSeedData& pix,
                                                                 uint16_t segmentInnerModuleIndex,
                                                                 uint16_t segmentOuterModuleIndex,
-                                                                unsigned int pLSIndex,
                                                                 unsigned int segmentIndex,
-                                                                unsigned int pLSMD0Index,
-                                                                unsigned int pLSMD1Index,
                                                                 unsigned int segmentMD0Index,
                                                                 unsigned int segmentMD1Index,
                                                                 const float ptCut);
   template <alpaka::concepts::Acc TAcc>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const& acc,
                                                                 ModulesConst modules,
-                                                                ObjectRangesConst ranges,
                                                                 MiniDoubletsConst mds,
                                                                 SegmentsConst segments,
-                                                                PixelSeedsConst pixelSeeds,
-                                                                uint16_t pixelModuleIndex,
+                                                                const PixelSeedData& pix,
                                                                 uint16_t segmentInnerModuleIndex,
                                                                 uint16_t segmentOuterModuleIndex,
-                                                                unsigned int pLSIndex,
                                                                 unsigned int segmentIndex,
-                                                                unsigned int pLSMD0Index,
-                                                                unsigned int pLSMD1Index,
                                                                 unsigned int segmentMD0Index,
                                                                 unsigned int segmentMD1Index,
                                                                 const float ptCut);
@@ -124,59 +126,43 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
   template <alpaka::concepts::Acc TAcc>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const& acc,
                                                                      ModulesConst modules,
-                                                                     ObjectRangesConst ranges,
                                                                      MiniDoubletsConst mds,
                                                                      SegmentsConst segments,
-                                                                     PixelSeedsConst pixelSeeds,
-                                                                     uint16_t pixelLowerModuleIndex,
+                                                                     const PixelSeedData& pix,
                                                                      uint16_t outerInnerLowerModuleIndex,
                                                                      uint16_t outerOuterLowerModuleIndex,
-                                                                     unsigned int innerSegmentIndex,
                                                                      unsigned int outerSegmentIndex,
                                                                      const float ptCut) {
     short outerInnerLowerModuleSubdet = modules.subdets()[outerInnerLowerModuleIndex];
     short outerOuterLowerModuleSubdet = modules.subdets()[outerOuterLowerModuleIndex];
 
-    unsigned int firstMDIndex = segments.mdIndices()[innerSegmentIndex][0];
-    unsigned int secondMDIndex = segments.mdIndices()[innerSegmentIndex][1];
-
-    unsigned int thirdMDIndex = segments.mdIndices()[outerSegmentIndex][0];
-    unsigned int fourthMDIndex = segments.mdIndices()[outerSegmentIndex][1];
+    unsigned int segmentMD0Index = segments.mdIndices()[outerSegmentIndex][0];
+    unsigned int segmentMD1Index = segments.mdIndices()[outerSegmentIndex][1];
 
     if (outerInnerLowerModuleSubdet == Barrel and
         (outerOuterLowerModuleSubdet == Barrel or outerOuterLowerModuleSubdet == Endcap)) {
       return runTripletDefaultAlgoPPBB(acc,
                                        modules,
-                                       ranges,
                                        mds,
                                        segments,
-                                       pixelSeeds,
-                                       pixelLowerModuleIndex,
+                                       pix,
                                        outerInnerLowerModuleIndex,
                                        outerOuterLowerModuleIndex,
-                                       innerSegmentIndex,
                                        outerSegmentIndex,
-                                       firstMDIndex,
-                                       secondMDIndex,
-                                       thirdMDIndex,
-                                       fourthMDIndex,
+                                       segmentMD0Index,
+                                       segmentMD1Index,
                                        ptCut);
     } else if (outerInnerLowerModuleSubdet == Endcap and outerOuterLowerModuleSubdet == Endcap) {
       return runTripletDefaultAlgoPPEE(acc,
                                        modules,
-                                       ranges,
                                        mds,
                                        segments,
-                                       pixelSeeds,
-                                       pixelLowerModuleIndex,
+                                       pix,
                                        outerInnerLowerModuleIndex,
                                        outerOuterLowerModuleIndex,
-                                       innerSegmentIndex,
                                        outerSegmentIndex,
-                                       firstMDIndex,
-                                       secondMDIndex,
-                                       thirdMDIndex,
-                                       fourthMDIndex,
+                                       segmentMD0Index,
+                                       segmentMD1Index,
                                        ptCut);
     }
     return false;
@@ -537,13 +523,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
   template <typename WP = dnn::pt3dnn::pT3WP, alpaka::concepts::Acc TAcc>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const& acc,
                                                                  ModulesConst modules,
-                                                                 ObjectRangesConst ranges,
                                                                  MiniDoubletsConst mds,
                                                                  SegmentsConst segments,
-                                                                 PixelSeedsConst pixelSeeds,
-                                                                 PixelSegmentsConst pixelSegments,
+                                                                 const PixelSeedData& pix,
                                                                  TripletsConst triplets,
-                                                                 unsigned int pixelSegmentIndex,
                                                                  unsigned int tripletIndex,
                                                                  float& pixelRadius,
                                                                  float& tripletRadius,
@@ -556,63 +539,48 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                                                                  const float ptCut,
                                                                  bool runDNN = true,
                                                                  bool runChiSquaredCuts = true) {
-    //run pT4 compatibility between the pixel segment and inner segment, and between the pixel and outer segment of the triplet
-    uint16_t pixelModuleIndex = segments.innerLowerModuleIndices()[pixelSegmentIndex];
-
     uint16_t lowerModuleIndex = triplets.lowerModuleIndices()[tripletIndex][0];
     uint16_t middleModuleIndex = triplets.lowerModuleIndices()[tripletIndex][1];
     uint16_t upperModuleIndex = triplets.lowerModuleIndices()[tripletIndex][2];
 
-    {
-      // pixel segment vs inner segment of the triplet
-      if (not runPixelTrackletDefaultAlgopT3(acc,
-                                             modules,
-                                             ranges,
-                                             mds,
-                                             segments,
-                                             pixelSeeds,
-                                             pixelModuleIndex,
-                                             lowerModuleIndex,
-                                             middleModuleIndex,
-                                             pixelSegmentIndex,
-                                             triplets.segmentIndices()[tripletIndex][0],
-                                             ptCut))
-        return false;
-
-      //pixel segment vs outer segment of triplet
-      if (not runPixelTrackletDefaultAlgopT3(acc,
-                                             modules,
-                                             ranges,
-                                             mds,
-                                             segments,
-                                             pixelSeeds,
-                                             pixelModuleIndex,
-                                             middleModuleIndex,
-                                             upperModuleIndex,
-                                             pixelSegmentIndex,
-                                             triplets.segmentIndices()[tripletIndex][1],
-                                             ptCut))
-        return false;
-    }
+    // Radius criterion FIRST (cheap check before expensive tracklet computations)
+    pixelRadius = pix.ptIn * kR1GeVf;
+    pixelRadiusError = pix.ptErr * kR1GeVf;
+    tripletRadius = triplets.radius()[tripletIndex];
 
-    //pt matching between the pixel ptin and the triplet circle pt
-    unsigned int pixelSegmentArrayIndex = pixelSegmentIndex - ranges.segmentModuleIndices()[pixelModuleIndex];
-    float pixelSegmentPt = pixelSeeds.ptIn()[pixelSegmentArrayIndex];
-    float pixelSegmentPtError = pixelSeeds.ptErr()[pixelSegmentArrayIndex];
-    float pixelSegmentPx = pixelSeeds.px()[pixelSegmentArrayIndex];
-    float pixelSegmentPy = pixelSeeds.py()[pixelSegmentArrayIndex];
-    float pixelSegmentPz = pixelSeeds.pz()[pixelSegmentArrayIndex];
-    int pixelSegmentCharge = pixelSeeds.charge()[pixelSegmentArrayIndex];
+    if (not passRadiusCriterion(acc,
+                                modules,
+                                pixelRadius,
+                                pixelRadiusError,
+                                tripletRadius,
+                                lowerModuleIndex,
+                                middleModuleIndex,
+                                upperModuleIndex))
+      return false;
 
-    float pixelG = pixelSegments.circleCenterX()[pixelSegmentArrayIndex];
-    float pixelF = pixelSegments.circleCenterY()[pixelSegmentArrayIndex];
-    float pixelRadiusPCA = pixelSegments.circleRadius()[pixelSegmentArrayIndex];
+    // Expensive pT4 compatibility checks (pixel segment vs inner/outer segments of the triplet)
+    if (not runPixelTrackletDefaultAlgopT3(acc,
+                                           modules,
+                                           mds,
+                                           segments,
+                                           pix,
+                                           lowerModuleIndex,
+                                           middleModuleIndex,
+                                           triplets.segmentIndices()[tripletIndex][0],
+                                           ptCut))
+      return false;
 
-    unsigned int pixelInnerMDIndex = segments.mdIndices()[pixelSegmentIndex][0];
-    unsigned int pixelOuterMDIndex = segments.mdIndices()[pixelSegmentIndex][1];
+    if (not runPixelTrackletDefaultAlgopT3(acc,
+                                           modules,
+                                           mds,
+                                           segments,
+                                           pix,
+                                           middleModuleIndex,
+                                           upperModuleIndex,
+                                           triplets.segmentIndices()[tripletIndex][1],
+                                           ptCut))
+      return false;
 
-    pixelRadius = pixelSegmentPt * kR1GeVf;
-    pixelRadiusError = pixelSegmentPtError * kR1GeVf;
     unsigned int tripletInnerSegmentIndex = triplets.segmentIndices()[tripletIndex][0];
     unsigned int tripletOuterSegmentIndex = triplets.segmentIndices()[tripletIndex][1];
 
@@ -625,20 +593,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     float ys[Params_T3::kLayers] = {
         mds.anchorY()[firstMDIndex], mds.anchorY()[secondMDIndex], mds.anchorY()[thirdMDIndex]};
 
-    float g, f;
-    tripletRadius = triplets.radius()[tripletIndex];
-    g = triplets.centerX()[tripletIndex];
-    f = triplets.centerY()[tripletIndex];
-
-    if (not passRadiusCriterion(acc,
-                                modules,
-                                pixelRadius,
-                                pixelRadiusError,
-                                tripletRadius,
-                                lowerModuleIndex,
-                                middleModuleIndex,
-                                upperModuleIndex))
-      return false;
+    float g = triplets.centerX()[tripletIndex];
+    float f = triplets.centerY()[tripletIndex];
 
     uint16_t lowerModuleIndices[Params_T3::kLayers] = {lowerModuleIndex, middleModuleIndex, upperModuleIndex};
 
@@ -647,10 +603,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
           mds.anchorRt()[firstMDIndex], mds.anchorRt()[secondMDIndex], mds.anchorRt()[thirdMDIndex]};
       float zs[Params_T3::kLayers] = {
           mds.anchorZ()[firstMDIndex], mds.anchorZ()[secondMDIndex], mds.anchorZ()[thirdMDIndex]};
-      float rtPix[Params_pLS::kLayers] = {mds.anchorRt()[pixelInnerMDIndex], mds.anchorRt()[pixelOuterMDIndex]};
-      float xPix[Params_pLS::kLayers] = {mds.anchorX()[pixelInnerMDIndex], mds.anchorX()[pixelOuterMDIndex]};
-      float yPix[Params_pLS::kLayers] = {mds.anchorY()[pixelInnerMDIndex], mds.anchorY()[pixelOuterMDIndex]};
-      float zPix[Params_pLS::kLayers] = {mds.anchorZ()[pixelInnerMDIndex], mds.anchorZ()[pixelOuterMDIndex]};
+      float rtPix[Params_pLS::kLayers] = {pix.rt_InLo, pix.rt_InUp};
+      float xPix[Params_pLS::kLayers] = {pix.x_InLo, pix.x_InUp};
+      float yPix[Params_pLS::kLayers] = {pix.y_InLo, pix.y_InUp};
+      float zPix[Params_pLS::kLayers] = {pix.z_InLo, pix.z_InUp};
 
       rzChiSquared = computePT3RZChiSquared(acc,
                                             modules,
@@ -663,18 +619,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                                             xs,
                                             ys,
                                             zs,
-                                            pixelSegmentPt,
-                                            pixelSegmentPx,
-                                            pixelSegmentPy,
-                                            pixelSegmentPz,
-                                            pixelSegmentCharge);
-      if (runChiSquaredCuts && pixelSegmentPt < 5.0f) {
+                                            pix.ptIn,
+                                            pix.px,
+                                            pix.py,
+                                            pix.pz,
+                                            pix.charge);
+      if (runChiSquaredCuts && pix.ptIn < 5.0f) {
         if (!passPT3RZChiSquaredCuts(modules, lowerModuleIndex, middleModuleIndex, upperModuleIndex, rzChiSquared))
           return false;
       }
 
-      rPhiChiSquared =
-          computePT3RPhiChiSquared(acc, modules, lowerModuleIndices, pixelG, pixelF, pixelRadiusPCA, xs, ys);
+      rPhiChiSquared = computePT3RPhiChiSquared(
+          acc, modules, lowerModuleIndices, pix.circleCenterX, pix.circleCenterY, pix.circleRadius, xs, ys);
 
       rPhiChiSquaredInwards = computePT3RPhiChiSquaredInwards(g, f, tripletRadius, xPix, yPix);
     }
@@ -691,8 +647,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                                                   pixelRadius,
                                                   pixelRadiusError,
                                                   rzChiSquared,
-                                                  pixelSeeds.eta()[pixelSegmentArrayIndex],
-                                                  pixelSegmentPt,
+                                                  pix.eta,
+                                                  pix.ptIn,
                                                   module_type_3)) {
       return false;
     }
@@ -719,6 +675,41 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
       for (unsigned int i_pLS : cms::alpakatools::uniform_elements_z(acc, nPixelSegments)) {
         auto iLSModule_max = connectedPixelIndex[i_pLS] + connectedPixelSize[i_pLS];
 
+        if (pixelSegments.isDup()[i_pLS])
+          continue;
+        if (pixelSegments.partOfPT5()[i_pLS])
+          continue;
+
+        // Hoist pixel segment data: loaded once per pixel segment instead of
+        // redundantly through the runPixelTripletDefaultAlgo call chain.
+        uint16_t pixelModuleIndex = modules.nLowerModules();
+        unsigned int pixelSegmentIndex = ranges.segmentModuleIndices()[pixelModuleIndex] + i_pLS;
+        unsigned int pixelMD0 = segments.mdIndices()[pixelSegmentIndex][0];
+        unsigned int pixelMD1 = segments.mdIndices()[pixelSegmentIndex][1];
+
+        PixelSeedData pix;
+        pix.ptIn = pixelSeeds.ptIn()[i_pLS];
+        pix.px = pixelSeeds.px()[i_pLS];
+        pix.py = pixelSeeds.py()[i_pLS];
+        pix.pz = pixelSeeds.pz()[i_pLS];
+        pix.ptErr = pixelSeeds.ptErr()[i_pLS];
+        pix.etaErr = pixelSeeds.etaErr()[i_pLS];
+        pix.eta = pixelSeeds.eta()[i_pLS];
+        pix.phi = pixelSeeds.phi()[i_pLS];
+        pix.charge = pixelSeeds.charge()[i_pLS];
+        pix.circleCenterX = pixelSegments.circleCenterX()[i_pLS];
+        pix.circleCenterY = pixelSegments.circleCenterY()[i_pLS];
+        pix.circleRadius = pixelSegments.circleRadius()[i_pLS];
+        pix.x_InLo = mds.anchorX()[pixelMD0];
+        pix.y_InLo = mds.anchorY()[pixelMD0];
+        pix.z_InLo = mds.anchorZ()[pixelMD0];
+        pix.rt_InLo = mds.anchorRt()[pixelMD0];
+        pix.x_InUp = mds.anchorX()[pixelMD1];
+        pix.y_InUp = mds.anchorY()[pixelMD1];
+        pix.z_InUp = mds.anchorZ()[pixelMD1];
+        pix.rt_InUp = mds.anchorRt()[pixelMD1];
+        pix.alpha_InLo = __H2F(segments.dPhiChanges()[pixelSegmentIndex]);
+
         for (unsigned int iLSModule :
              cms::alpakatools::uniform_elements_y(acc, connectedPixelIndex[i_pLS], iLSModule_max)) {
           uint16_t tripletLowerModuleIndex =
@@ -736,18 +727,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
           if (modules.moduleType()[tripletLowerModuleIndex] == TwoS)
             continue;
 
-          uint16_t pixelModuleIndex = modules.nLowerModules();
           unsigned int nOuterTriplets = tripletsOccupancy.nTriplets()[tripletLowerModuleIndex];
           if (nOuterTriplets == 0)
             continue;
 
-          unsigned int pixelSegmentIndex = ranges.segmentModuleIndices()[pixelModuleIndex] + i_pLS;
-
-          if (pixelSegments.isDup()[i_pLS])
-            continue;
-          if (pixelSegments.partOfPT5()[i_pLS])
-            continue;  //don't make pT3s for those pixels that are part of pT5
-
           short layer2_adjustment;
           if (modules.layers()[tripletLowerModuleIndex] == 1) {
             layer2_adjustment = 1;
@@ -773,13 +756,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                 pixelRadiusError;
             bool success = runPixelTripletDefaultAlgo(acc,
                                                       modules,
-                                                      ranges,
                                                       mds,
                                                       segments,
-                                                      pixelSeeds,
-                                                      pixelSegments,
+                                                      pix,
                                                       triplets,
-                                                      pixelSegmentIndex,
                                                       outerTripletIndex,
                                                       pixelRadius,
                                                       tripletRadius,
@@ -798,9 +778,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
               float eta =
                   mds.anchorEta()[segments
                                       .mdIndices()[triplets.segmentIndices()[outerTripletIndex][0]][layer2_adjustment]];
-              float eta_pix = pixelSeeds.eta()[i_pLS];
-              float phi_pix = pixelSeeds.phi()[i_pLS];
-              float pt = pixelSeeds.ptIn()[i_pLS];
               float score = rPhiChiSquared + rPhiChiSquaredInwards;
               unsigned int totOccupancyPixelTriplets =
                   alpaka::atomicAdd(acc, &pixelTriplets.totOccupancyPixelTriplets(), 1u, alpaka::hierarchy::Threads{});
@@ -825,11 +802,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                                         rPhiChiSquaredInwards,
                                         rzChiSquared,
                                         pixelTripletIndex,
-                                        pt,
+                                        pix.ptIn,
                                         eta,
                                         phi,
-                                        eta_pix,
-                                        phi_pix,
+                                        pix.eta,
+                                        pix.phi,
                                         pixelRadiusError,
                                         score);
                 triplets.partOfPT3()[outerTripletIndex] = true;
@@ -844,17 +821,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
   template <alpaka::concepts::Acc TAcc>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const& acc,
                                                                 ModulesConst modules,
-                                                                ObjectRangesConst ranges,
                                                                 MiniDoubletsConst mds,
                                                                 SegmentsConst segments,
-                                                                PixelSeedsConst pixelSeeds,
-                                                                uint16_t pixelModuleIndex,
+                                                                const PixelSeedData& pix,
                                                                 uint16_t segmentInnerModuleIndex,
                                                                 uint16_t segmentOuterModuleIndex,
-                                                                unsigned int pLSIndex,
                                                                 unsigned int segmentIndex,
-                                                                unsigned int pLSMD0Index,
-                                                                unsigned int pLSMD1Index,
                                                                 unsigned int segmentMD0Index,
                                                                 unsigned int segmentMD1Index,
                                                                 const float ptCut) {
@@ -862,44 +834,43 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
     bool isPS_OutLo = (modules.moduleType()[segmentInnerModuleIndex] == PS);
 
-    float rt_InLo = mds.anchorRt()[pLSMD0Index];
-    float rt_InUp = mds.anchorRt()[pLSMD1Index];
+    const float rt_InLo = pix.rt_InLo;
+    const float rt_InUp = pix.rt_InUp;
     float rt_OutLo = mds.anchorRt()[segmentMD0Index];
     float rt_OutUp = mds.anchorRt()[segmentMD1Index];
 
-    float z_InUp = mds.anchorZ()[pLSMD1Index];
+    const float z_InUp = pix.z_InUp;
     float z_OutLo = mds.anchorZ()[segmentMD0Index];
 
-    float x_InLo = mds.anchorX()[pLSMD0Index];
-    float x_InUp = mds.anchorX()[pLSMD1Index];
+    const float x_InLo = pix.x_InLo;
+    const float x_InUp = pix.x_InUp;
     float x_OutLo = mds.anchorX()[segmentMD0Index];
     float x_OutUp = mds.anchorX()[segmentMD1Index];
 
-    float y_InLo = mds.anchorY()[pLSMD0Index];
-    float y_InUp = mds.anchorY()[pLSMD1Index];
+    const float y_InLo = pix.y_InLo;
+    const float y_InUp = pix.y_InUp;
     float y_OutLo = mds.anchorY()[segmentMD0Index];
     float y_OutUp = mds.anchorY()[segmentMD1Index];
 
     float rt_InOut = rt_InUp;
 
-    unsigned int pixelSegmentArrayIndex = pLSIndex - ranges.segmentModuleIndices()[pixelModuleIndex];
-    float ptIn = pixelSeeds.ptIn()[pixelSegmentArrayIndex];
+    float ptIn = pix.ptIn;
     float ptSLo = ptIn;
-    float px = pixelSeeds.px()[pixelSegmentArrayIndex];
-    float py = pixelSeeds.py()[pixelSegmentArrayIndex];
-    float pz = pixelSeeds.pz()[pixelSegmentArrayIndex];
-    float ptErr = pixelSeeds.ptErr()[pixelSegmentArrayIndex];
-    float etaErr = pixelSeeds.etaErr()[pixelSegmentArrayIndex];
+    float px = pix.px;
+    float py = pix.py;
+    float pz = pix.pz;
+    float ptErr = pix.ptErr;
+    float etaErr = pix.etaErr;
     ptSLo = alpaka::math::max(acc, ptCut, ptSLo - 10.0f * alpaka::math::max(acc, ptErr, 0.005f * ptSLo));
     ptSLo = alpaka::math::min(acc, 10.0f, ptSLo);
 
-    float alpha1GeV_OutLo =
-        alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / ptCut, kSinAlphaMax));
+    // For the typical range rt_OutLo * k2Rinv1GeVf / ptCut < 0.04, asin(x) ~ x
+    // and tan(x)/x ~ 1 + x^2/3.
+    float alpha1GeV_OutLo = alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / ptCut, kSinAlphaMax);
     const float rtRatio_OutLoInOut =
         rt_OutLo / rt_InOut;  // Outer segment beginning rt divided by inner segment beginning rt;
 
-    float dzDrtScale =
-        alpaka::math::tan(acc, alpha1GeV_OutLo) / alpha1GeV_OutLo;  // The track can bend in r-z plane slightly
+    float dzDrtScale = 1.f + alpha1GeV_OutLo * alpha1GeV_OutLo / 3.f;  // The track can bend in r-z plane slightly
     const float zpitch_InLo = 0.05f;
     const float zpitch_InOut = 0.05f;
     float zpitch_OutLo = (isPS_OutLo ? kPixelPSZpitch : kStrip2SZpitch);
@@ -953,21 +924,21 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     float diffX = x_OutLo - x_InLo;
     float diffY = y_OutLo - y_InLo;
 
-    dPhi = cms::alpakatools::deltaPhi(acc, midPointX, midPointY, diffX, diffY);
+    dPhi = fastDeltaPhi(acc, midPointX, midPointY, diffX, diffY);
 
     if (alpaka::math::abs(acc, dPhi) > dPhiCut)
       return false;
 
     //lots of array accesses below this...
 
-    float alpha_InLo = __H2F(segments.dPhiChanges()[pLSIndex]);
+    float alpha_InLo = pix.alpha_InLo;
     float alpha_OutLo = __H2F(segments.dPhiChanges()[segmentIndex]);
 
     bool isEC_lastLayer =
         modules.subdets()[segmentOuterModuleIndex] == Endcap and modules.moduleType()[segmentOuterModuleIndex] == TwoS;
 
     float alpha_OutUp, alpha_OutUp_highEdge, alpha_OutUp_lowEdge;
-    alpha_OutUp = cms::alpakatools::deltaPhi(acc, x_OutUp, y_OutUp, x_OutUp - x_OutLo, y_OutUp - y_OutLo);
+    alpha_OutUp = fastDeltaPhi(acc, x_OutUp, y_OutUp, x_OutUp - x_OutLo, y_OutUp - y_OutLo);
 
     alpha_OutUp_highEdge = alpha_OutUp;
     alpha_OutUp_lowEdge = alpha_OutUp;
@@ -981,42 +952,42 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     float tl_axis_lowEdge_x = tl_axis_x;
     float tl_axis_lowEdge_y = tl_axis_y;
 
-    betaIn = -cms::alpakatools::deltaPhi(acc, px, py, tl_axis_x, tl_axis_y);
+    betaIn = -fastDeltaPhi(acc, px, py, tl_axis_x, tl_axis_y);
     float betaInRHmin = betaIn;
     float betaInRHmax = betaIn;
 
-    betaOut = -alpha_OutUp + cms::alpakatools::deltaPhi(acc, x_OutUp, y_OutUp, tl_axis_x, tl_axis_y);
+    betaOut = -alpha_OutUp + fastDeltaPhi(acc, x_OutUp, y_OutUp, tl_axis_x, tl_axis_y);
 
     float betaOutRHmin = betaOut;
     float betaOutRHmax = betaOut;
 
     if (isEC_lastLayer) {
-      alpha_OutUp_highEdge = cms::alpakatools::deltaPhi(acc,
-                                                        mds.anchorHighEdgeX()[segmentMD1Index],
-                                                        mds.anchorHighEdgeY()[segmentMD1Index],
-                                                        mds.anchorHighEdgeX()[segmentMD1Index] - x_OutLo,
-                                                        mds.anchorHighEdgeY()[segmentMD1Index] - y_OutLo);
-      alpha_OutUp_lowEdge = cms::alpakatools::deltaPhi(acc,
-                                                       mds.anchorLowEdgeX()[segmentMD1Index],
-                                                       mds.anchorLowEdgeY()[segmentMD1Index],
-                                                       mds.anchorLowEdgeX()[segmentMD1Index] - x_OutLo,
-                                                       mds.anchorLowEdgeY()[segmentMD1Index] - y_OutLo);
+      alpha_OutUp_highEdge = fastDeltaPhi(acc,
+                                          mds.anchorHighEdgeX()[segmentMD1Index],
+                                          mds.anchorHighEdgeY()[segmentMD1Index],
+                                          mds.anchorHighEdgeX()[segmentMD1Index] - x_OutLo,
+                                          mds.anchorHighEdgeY()[segmentMD1Index] - y_OutLo);
+      alpha_OutUp_lowEdge = fastDeltaPhi(acc,
+                                         mds.anchorLowEdgeX()[segmentMD1Index],
+                                         mds.anchorLowEdgeY()[segmentMD1Index],
+                                         mds.anchorLowEdgeX()[segmentMD1Index] - x_OutLo,
+                                         mds.anchorLowEdgeY()[segmentMD1Index] - y_OutLo);
 
       tl_axis_highEdge_x = mds.anchorHighEdgeX()[segmentMD1Index] - x_InUp;
       tl_axis_highEdge_y = mds.anchorHighEdgeY()[segmentMD1Index] - y_InUp;
       tl_axis_lowEdge_x = mds.anchorLowEdgeX()[segmentMD1Index] - x_InUp;
       tl_axis_lowEdge_y = mds.anchorLowEdgeY()[segmentMD1Index] - y_InUp;
 
-      betaOutRHmin = -alpha_OutUp_highEdge + cms::alpakatools::deltaPhi(acc,
-                                                                        mds.anchorHighEdgeX()[segmentMD1Index],
-                                                                        mds.anchorHighEdgeY()[segmentMD1Index],
-                                                                        tl_axis_highEdge_x,
-                                                                        tl_axis_highEdge_y);
-      betaOutRHmax = -alpha_OutUp_lowEdge + cms::alpakatools::deltaPhi(acc,
-                                                                       mds.anchorLowEdgeX()[segmentMD1Index],
-                                                                       mds.anchorLowEdgeY()[segmentMD1Index],
-                                                                       tl_axis_lowEdge_x,
-                                                                       tl_axis_lowEdge_y);
+      betaOutRHmin = -alpha_OutUp_highEdge + fastDeltaPhi(acc,
+                                                          mds.anchorHighEdgeX()[segmentMD1Index],
+                                                          mds.anchorHighEdgeY()[segmentMD1Index],
+                                                          tl_axis_highEdge_x,
+                                                          tl_axis_highEdge_y);
+      betaOutRHmax = -alpha_OutUp_lowEdge + fastDeltaPhi(acc,
+                                                         mds.anchorLowEdgeX()[segmentMD1Index],
+                                                         mds.anchorLowEdgeY()[segmentMD1Index],
+                                                         tl_axis_lowEdge_x,
+                                                         tl_axis_lowEdge_y);
     }
 
     //beta computation
@@ -1052,19 +1023,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     float min_ptBeta_ptBetaMax = alpaka::math::min(
         acc, alpaka::math::abs(acc, pt_beta), kPt_betaMax);  //need to confirm the range-out value of 7 GeV
     const float dBetaMuls2 = thetaMuls2 * 16.f / (min_ptBeta_ptBetaMax * min_ptBeta_ptBetaMax);
-    const float alphaInAbsReg =
-        alpaka::math::max(acc,
-                          alpaka::math::abs(acc, alpha_InLo),
-                          alpaka::math::asin(acc, alpaka::math::min(acc, rt_InUp * k2Rinv1GeVf / 3.0f, kSinAlphaMax)));
-    const float alphaOutAbsReg =
-        alpaka::math::max(acc,
-                          alpaka::math::abs(acc, alpha_OutLo),
-                          alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax)));
+    // asin(x) ~ x for small arguments (rt * k2Rinv1GeVf / 3.0 < 0.01)
+    const float alphaInAbsReg = alpaka::math::max(
+        acc, alpaka::math::abs(acc, alpha_InLo), alpaka::math::min(acc, rt_InUp * k2Rinv1GeVf / 3.0f, kSinAlphaMax));
+    const float alphaOutAbsReg = alpaka::math::max(
+        acc, alpaka::math::abs(acc, alpha_OutLo), alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax));
     const float dBetaInLum = lIn < 11 ? 0.0f : alpaka::math::abs(acc, alphaInAbsReg * kDeltaZLum / z_InUp);
     const float dBetaOutLum = lOut < 11 ? 0.0f : alpaka::math::abs(acc, alphaOutAbsReg * kDeltaZLum / z_OutLo);
     const float dBetaLum2 = (dBetaInLum + dBetaOutLum) * (dBetaInLum + dBetaOutLum);
 
-    const float sinDPhi = alpaka::math::sin(acc, dPhi);
+    // dPhi is small (passed dPhiCut), so sin(dPhi) ~ dPhi
+    const float sinDPhi = dPhi;
     const float dBetaRIn2 = 0;  // TODO-RH
 
     float dBetaROut = 0;
@@ -1081,9 +1050,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
     const float dBetaROut2 = dBetaROut * dBetaROut;
 
-    //FIXME: need faster version
-    betaOutCut = alpaka::math::asin(acc, alpaka::math::min(acc, drt_tl_axis * k2Rinv1GeVf / ptCut, kSinAlphaMax)) +
-                 (0.02f / sdOut_d) + alpaka::math::sqrt(acc, dBetaLum2 + dBetaMuls2);
+    betaOutCut = alpaka::math::min(acc, drt_tl_axis * k2Rinv1GeVf / ptCut, kSinAlphaMax) + (0.02f / sdOut_d) +
+                 alpaka::math::sqrt(acc, dBetaLum2 + dBetaMuls2);
 
     //Cut #6: The real beta cut
     if (alpaka::math::abs(acc, betaOut) >= betaOutCut)
@@ -1101,17 +1069,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
   template <alpaka::concepts::Acc TAcc>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const& acc,
                                                                 ModulesConst modules,
-                                                                ObjectRangesConst ranges,
                                                                 MiniDoubletsConst mds,
                                                                 SegmentsConst segments,
-                                                                PixelSeedsConst pixelSeeds,
-                                                                uint16_t pixelModuleIndex,
+                                                                const PixelSeedData& pix,
                                                                 uint16_t segmentInnerModuleIndex,
                                                                 uint16_t segmentOuterModuleIndex,
-                                                                unsigned int pLSIndex,
                                                                 unsigned int segmentIndex,
-                                                                unsigned int pLSMD0Index,
-                                                                unsigned int pLSMD1Index,
                                                                 unsigned int segmentMD0Index,
                                                                 unsigned int segmentMD1Index,
                                                                 const float ptCut) {
@@ -1119,36 +1082,34 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
     bool isPS_OutLo = (modules.moduleType()[segmentInnerModuleIndex] == PS);
 
-    float z_InUp = mds.anchorZ()[pLSMD1Index];
+    const float z_InUp = pix.z_InUp;
     float z_OutLo = mds.anchorZ()[segmentMD0Index];
 
     if (z_InUp * z_OutLo <= 0)
       return false;
 
-    float rt_InLo = mds.anchorRt()[pLSMD0Index];
-    float rt_InUp = mds.anchorRt()[pLSMD1Index];
+    const float rt_InLo = pix.rt_InLo;
+    const float rt_InUp = pix.rt_InUp;
     float rt_OutLo = mds.anchorRt()[segmentMD0Index];
     float rt_OutUp = mds.anchorRt()[segmentMD1Index];
 
-    float x_InLo = mds.anchorX()[pLSMD0Index];
-    float x_InUp = mds.anchorX()[pLSMD1Index];
+    const float x_InLo = pix.x_InLo;
+    const float x_InUp = pix.x_InUp;
     float x_OutLo = mds.anchorX()[segmentMD0Index];
     float x_OutUp = mds.anchorX()[segmentMD1Index];
 
-    float y_InLo = mds.anchorY()[pLSMD0Index];
-    float y_InUp = mds.anchorY()[pLSMD1Index];
+    const float y_InLo = pix.y_InLo;
+    const float y_InUp = pix.y_InUp;
     float y_OutLo = mds.anchorY()[segmentMD0Index];
     float y_OutUp = mds.anchorY()[segmentMD1Index];
 
-    unsigned int pixelSegmentArrayIndex = pLSIndex - ranges.segmentModuleIndices()[pixelModuleIndex];
-
-    float ptIn = pixelSeeds.ptIn()[pixelSegmentArrayIndex];
+    float ptIn = pix.ptIn;
     float ptSLo = ptIn;
-    float px = pixelSeeds.px()[pixelSegmentArrayIndex];
-    float py = pixelSeeds.py()[pixelSegmentArrayIndex];
-    float pz = pixelSeeds.pz()[pixelSegmentArrayIndex];
-    float ptErr = pixelSeeds.ptErr()[pixelSegmentArrayIndex];
-    float etaErr = pixelSeeds.etaErr()[pixelSegmentArrayIndex];
+    float px = pix.px;
+    float py = pix.py;
+    float pz = pix.pz;
+    float ptErr = pix.ptErr;
+    float etaErr = pix.etaErr;
 
     ptSLo = alpaka::math::max(acc, ptCut, ptSLo - 10.0f * alpaka::math::max(acc, ptErr, 0.005f * ptSLo));
     ptSLo = alpaka::math::min(acc, 10.0f, ptSLo);
@@ -1157,8 +1118,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     float zpitch_OutLo = (isPS_OutLo ? kPixelPSZpitch : kStrip2SZpitch);
     float zGeom = zpitch_InLo + zpitch_OutLo;
 
-    const float slope = alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / ptCut, kSinAlphaMax));
-    const float dzDrtScale = alpaka::math::tan(acc, slope) / slope;  //FIXME: need approximate value
+    // asin(x) ~ x and tan(x)/x ~ 1 + x^2/3 for small x
+    const float alpha1GeV_OutLo_EE = alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / ptCut, kSinAlphaMax);
+    const float dzDrtScale = 1.f + alpha1GeV_OutLo_EE * alpha1GeV_OutLo_EE / 3.f;
 
     const float dLum = alpaka::math::copysign(acc, kDeltaZLum, z_InUp);
     bool isOutSgInnerMDPS = modules.moduleType()[segmentInnerModuleIndex] == PS;
@@ -1207,10 +1169,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     if ((rt_OutLo < rtLo_point) || (rt_OutLo > rtHi_point))
       return false;
 
-    const float alpha1GeV_OutLo =
-        alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / ptCut, kSinAlphaMax));
+    // Reuse alpha1GeV_OutLo_EE computed above (same value as asin approximation)
     const float pvOffset = 0.1f / rt_OutLo;
-    dPhiCut = alpha1GeV_OutLo + alpaka::math::sqrt(acc, muls2 + pvOffset * pvOffset);
+    dPhiCut = alpha1GeV_OutLo_EE + alpaka::math::sqrt(acc, muls2 + pvOffset * pvOffset);
 
     float midPointX = 0.5f * (x_InLo + x_OutLo);
     float midPointY = 0.5f * (y_InLo + y_OutLo);
@@ -1218,13 +1179,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     float diffX = x_OutLo - x_InLo;
     float diffY = y_OutLo - y_InLo;
 
-    dPhi = cms::alpakatools::deltaPhi(acc, midPointX, midPointY, diffX, diffY);
+    dPhi = fastDeltaPhi(acc, midPointX, midPointY, diffX, diffY);
 
     // Cut #5: deltaPhiChange
     if (alpaka::math::abs(acc, dPhi) > dPhiCut)
       return false;
 
-    float alpha_InLo = __H2F(segments.dPhiChanges()[pLSIndex]);
+    float alpha_InLo = pix.alpha_InLo;
     float alpha_OutLo = __H2F(segments.dPhiChanges()[segmentIndex]);
 
     bool isEC_lastLayer =
@@ -1232,7 +1193,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
     float alpha_OutUp, alpha_OutUp_highEdge, alpha_OutUp_lowEdge;
 
-    alpha_OutUp = cms::alpakatools::deltaPhi(acc, x_OutUp, y_OutUp, x_OutUp - x_OutLo, y_OutUp - y_OutLo);
+    alpha_OutUp = fastDeltaPhi(acc, x_OutUp, y_OutUp, x_OutUp - x_OutLo, y_OutUp - y_OutLo);
     alpha_OutUp_highEdge = alpha_OutUp;
     alpha_OutUp_lowEdge = alpha_OutUp;
 
@@ -1245,41 +1206,41 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     float tl_axis_lowEdge_x = tl_axis_x;
     float tl_axis_lowEdge_y = tl_axis_y;
 
-    betaIn = -cms::alpakatools::deltaPhi(acc, px, py, tl_axis_x, tl_axis_y);
+    betaIn = -fastDeltaPhi(acc, px, py, tl_axis_x, tl_axis_y);
     float betaInRHmin = betaIn;
     float betaInRHmax = betaIn;
 
-    betaOut = -alpha_OutUp + cms::alpakatools::deltaPhi(acc, x_OutUp, y_OutUp, tl_axis_x, tl_axis_y);
+    betaOut = -alpha_OutUp + fastDeltaPhi(acc, x_OutUp, y_OutUp, tl_axis_x, tl_axis_y);
     float betaOutRHmin = betaOut;
     float betaOutRHmax = betaOut;
 
     if (isEC_lastLayer) {
-      alpha_OutUp_highEdge = cms::alpakatools::deltaPhi(acc,
-                                                        mds.anchorHighEdgeX()[segmentMD1Index],
-                                                        mds.anchorHighEdgeY()[segmentMD1Index],
-                                                        mds.anchorHighEdgeX()[segmentMD1Index] - x_OutLo,
-                                                        mds.anchorHighEdgeY()[segmentMD1Index] - y_OutLo);
-      alpha_OutUp_lowEdge = cms::alpakatools::deltaPhi(acc,
-                                                       mds.anchorLowEdgeX()[segmentMD1Index],
-                                                       mds.anchorLowEdgeY()[segmentMD1Index],
-                                                       mds.anchorLowEdgeX()[segmentMD1Index] - x_OutLo,
-                                                       mds.anchorLowEdgeY()[segmentMD1Index] - y_OutLo);
+      alpha_OutUp_highEdge = fastDeltaPhi(acc,
+                                          mds.anchorHighEdgeX()[segmentMD1Index],
+                                          mds.anchorHighEdgeY()[segmentMD1Index],
+                                          mds.anchorHighEdgeX()[segmentMD1Index] - x_OutLo,
+                                          mds.anchorHighEdgeY()[segmentMD1Index] - y_OutLo);
+      alpha_OutUp_lowEdge = fastDeltaPhi(acc,
+                                         mds.anchorLowEdgeX()[segmentMD1Index],
+                                         mds.anchorLowEdgeY()[segmentMD1Index],
+                                         mds.anchorLowEdgeX()[segmentMD1Index] - x_OutLo,
+                                         mds.anchorLowEdgeY()[segmentMD1Index] - y_OutLo);
 
       tl_axis_highEdge_x = mds.anchorHighEdgeX()[segmentMD1Index] - x_InUp;
       tl_axis_highEdge_y = mds.anchorHighEdgeY()[segmentMD1Index] - y_InUp;
       tl_axis_lowEdge_x = mds.anchorLowEdgeX()[segmentMD1Index] - x_InUp;
       tl_axis_lowEdge_y = mds.anchorLowEdgeY()[segmentMD1Index] - y_InUp;
 
-      betaOutRHmin = -alpha_OutUp_highEdge + cms::alpakatools::deltaPhi(acc,
-                                                                        mds.anchorHighEdgeX()[segmentMD1Index],
-                                                                        mds.anchorHighEdgeY()[segmentMD1Index],
-                                                                        tl_axis_highEdge_x,
-                                                                        tl_axis_highEdge_y);
-      betaOutRHmax = -alpha_OutUp_lowEdge + cms::alpakatools::deltaPhi(acc,
-                                                                       mds.anchorLowEdgeX()[segmentMD1Index],
-                                                                       mds.anchorLowEdgeY()[segmentMD1Index],
-                                                                       tl_axis_lowEdge_x,
-                                                                       tl_axis_lowEdge_y);
+      betaOutRHmin = -alpha_OutUp_highEdge + fastDeltaPhi(acc,
+                                                          mds.anchorHighEdgeX()[segmentMD1Index],
+                                                          mds.anchorHighEdgeY()[segmentMD1Index],
+                                                          tl_axis_highEdge_x,
+                                                          tl_axis_highEdge_y);
+      betaOutRHmax = -alpha_OutUp_lowEdge + fastDeltaPhi(acc,
+                                                         mds.anchorLowEdgeX()[segmentMD1Index],
+                                                         mds.anchorLowEdgeY()[segmentMD1Index],
+                                                         tl_axis_lowEdge_x,
+                                                         tl_axis_lowEdge_y);
     }
 
     //beta computation
@@ -1314,19 +1275,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
         acc, alpaka::math::abs(acc, pt_beta), kPt_betaMax);  //need to confirm the range-out value of 7 GeV
     const float dBetaMuls2 = thetaMuls2 * 16.f / (min_ptBeta_ptBetaMax * min_ptBeta_ptBetaMax);
 
-    const float alphaInAbsReg =
-        alpaka::math::max(acc,
-                          alpaka::math::abs(acc, alpha_InLo),
-                          alpaka::math::asin(acc, alpaka::math::min(acc, rt_InUp * k2Rinv1GeVf / 3.0f, kSinAlphaMax)));
-    const float alphaOutAbsReg =
-        alpaka::math::max(acc,
-                          alpaka::math::abs(acc, alpha_OutLo),
-                          alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax)));
+    // asin(x) ~ x for small arguments
+    const float alphaInAbsReg = alpaka::math::max(
+        acc, alpaka::math::abs(acc, alpha_InLo), alpaka::math::min(acc, rt_InUp * k2Rinv1GeVf / 3.0f, kSinAlphaMax));
+    const float alphaOutAbsReg = alpaka::math::max(
+        acc, alpaka::math::abs(acc, alpha_OutLo), alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax));
     const float dBetaInLum = lIn < 11 ? 0.0f : alpaka::math::abs(acc, alphaInAbsReg * kDeltaZLum / z_InUp);
     const float dBetaOutLum = lOut < 11 ? 0.0f : alpaka::math::abs(acc, alphaOutAbsReg * kDeltaZLum / z_OutLo);
     const float dBetaLum2 = (dBetaInLum + dBetaOutLum) * (dBetaInLum + dBetaOutLum);
 
-    const float sinDPhi = alpaka::math::sin(acc, dPhi);
+    // dPhi is small (passed dPhiCut), so sin(dPhi) ~ dPhi
+    const float sinDPhi = dPhi;
     const float dBetaRIn2 = 0;  // TODO-RH
 
     float dBetaROut = 0;
@@ -1343,10 +1302,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
     const float dBetaROut2 = dBetaROut * dBetaROut;
 
-    betaOutCut =
-        alpaka::math::asin(
-            acc, alpaka::math::min(acc, drt_tl_axis * k2Rinv1GeVf / ptCut, kSinAlphaMax))  //FIXME: need faster version
-        + (0.02f / sdOut_d) + alpaka::math::sqrt(acc, dBetaLum2 + dBetaMuls2);
+    betaOutCut = alpaka::math::min(acc, drt_tl_axis * k2Rinv1GeVf / ptCut, kSinAlphaMax) + (0.02f / sdOut_d) +
+                 alpaka::math::sqrt(acc, dBetaLum2 + dBetaMuls2);
 
     //Cut #6: The real beta cut
     if (alpaka::math::abs(acc, betaOut) >= betaOutCut)
diff --git a/RecoTracker/LSTCore/src/alpaka/Quintuplet.h b/RecoTracker/LSTCore/src/alpaka/Quintuplet.h
index 15b0aa1f7f405..0a9a8af533649 100644
--- a/RecoTracker/LSTCore/src/alpaka/Quintuplet.h
+++ b/RecoTracker/LSTCore/src/alpaka/Quintuplet.h
@@ -19,6 +19,14 @@
 #include "NeuralNetwork.h"
 
 namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
+
+  // Compute angle from vector (x1,y1) to vector (x2,y2) using a single atan2.
+  // Equivalent to cms::alpakatools::deltaPhi(acc, x1, y1, x2, y2) which uses 2 atan2 calls.
+  template <alpaka::concepts::Acc TAcc>
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE float fastDeltaPhi(TAcc const& acc, float x1, float y1, float x2, float y2) {
+    return alpaka::math::atan2(acc, x1 * y2 - x2 * y1, x1 * x2 + y1 * y2);
+  }
+
   ALPAKA_FN_ACC ALPAKA_FN_INLINE void addQuintupletToMemory(TripletsConst triplets,
                                                             Quintuplets quintuplets,
                                                             unsigned int innerTripletIndex,
@@ -790,12 +798,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                                                              float sdOut_dr,
                                                              float dr,
                                                              float lIn) {
+    // All asin arguments below are sd_dr * k2Rinv1GeVf / pt (< 0.02), so asin(x) ~ x.
     if (lIn == 0) {
       betaOut += alpaka::math::copysign(
-          acc,
-          alpaka::math::asin(
-              acc, alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax)),
-          betaOut);
+          acc, alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax), betaOut);
       return;
     }
 
@@ -805,19 +811,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                             8.f * kPt_betaMax)))  //and the pt_beta is well-defined; less strict for endcap-endcap
     {
       const float betaInUpd =
-          betaIn +
-          alpaka::math::copysign(
-              acc,
-              alpaka::math::asin(
-                  acc, alpaka::math::min(acc, sdIn_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax)),
-              betaIn);  //FIXME: need a faster version
+          betaIn + alpaka::math::copysign(
+                       acc,
+                       alpaka::math::min(acc, sdIn_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax),
+                       betaIn);
       const float betaOutUpd =
-          betaOut +
-          alpaka::math::copysign(
-              acc,
-              alpaka::math::asin(
-                  acc, alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax)),
-              betaOut);  //FIXME: need a faster version
+          betaOut + alpaka::math::copysign(
+                        acc,
+                        alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax),
+                        betaOut);
       betaAv = 0.5f * (betaInUpd + betaOutUpd);
 
       //1st update
@@ -825,13 +827,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
           1.f / alpaka::math::abs(acc, dr * k2Rinv1GeVf / alpaka::math::sin(acc, betaAv));  //get a better pt estimate
 
       betaIn += alpaka::math::copysign(
-          acc,
-          alpaka::math::asin(acc, alpaka::math::min(acc, sdIn_dr * k2Rinv1GeVf * pt_beta_inv, kSinAlphaMax)),
-          betaIn);  //FIXME: need a faster version
+          acc, alpaka::math::min(acc, sdIn_dr * k2Rinv1GeVf * pt_beta_inv, kSinAlphaMax), betaIn);
       betaOut += alpaka::math::copysign(
-          acc,
-          alpaka::math::asin(acc, alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf * pt_beta_inv, kSinAlphaMax)),
-          betaOut);  //FIXME: need a faster version
+          acc, alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf * pt_beta_inv, kSinAlphaMax), betaOut);
       //update the av and pt
       betaAv = 0.5f * (betaIn + betaOut);
       //2nd update
@@ -842,20 +840,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
       const float pt_betaIn = dr * k2Rinv1GeVf / alpaka::math::sin(acc, betaIn);
 
       const float betaInUpd =
-          betaIn +
-          alpaka::math::copysign(
-              acc,
-              alpaka::math::asin(
-                  acc, alpaka::math::min(acc, sdIn_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_betaIn), kSinAlphaMax)),
-              betaIn);  //FIXME: need a faster version
+          betaIn + alpaka::math::copysign(
+                       acc,
+                       alpaka::math::min(acc, sdIn_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_betaIn), kSinAlphaMax),
+                       betaIn);
       const float betaOutUpd =
           betaOut +
           alpaka::math::copysign(
               acc,
-              alpaka::math::asin(
-                  acc,
-                  alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_betaIn), kSinAlphaMax)),
-              betaIn);  //FIXME: need a faster version
+              alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_betaIn), kSinAlphaMax),
+              betaIn);
       betaAv = (alpaka::math::abs(acc, betaOut) > 0.2f * alpaka::math::abs(acc, betaIn))
                    ? (0.5f * (betaInUpd + betaOutUpd))
                    : betaInUpd;
@@ -863,15 +857,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
       //1st update
       pt_beta = dr * k2Rinv1GeVf / alpaka::math::sin(acc, betaAv);  //get a better pt estimate
       betaIn += alpaka::math::copysign(
-          acc,
-          alpaka::math::asin(
-              acc, alpaka::math::min(acc, sdIn_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax)),
-          betaIn);  //FIXME: need a faster version
+          acc, alpaka::math::min(acc, sdIn_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax), betaIn);
       betaOut += alpaka::math::copysign(
-          acc,
-          alpaka::math::asin(
-              acc, alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax)),
-          betaIn);  //FIXME: need a faster version
+          acc, alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax), betaIn);
       //update the av and pt
       betaAv = 0.5f * (betaIn + betaOut);
       //2nd update
@@ -913,7 +901,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     float diffX = mds.anchorX()[thirdMDIndex] - mds.anchorX()[firstMDIndex];
     float diffY = mds.anchorY()[thirdMDIndex] - mds.anchorY()[firstMDIndex];
 
-    float dPhi = cms::alpakatools::deltaPhi(acc, midPointX, midPointY, diffX, diffY);
+    float dPhi = fastDeltaPhi(acc, midPointX, midPointY, diffX, diffY);
 
     // First obtaining the raw betaIn and betaOut values without any correction and just purely based on the mini-doublet hit positions
     float alpha_InLo = __H2F(segments.dPhiChanges()[innerSegmentIndex]);
@@ -1021,18 +1009,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
         acc, alpaka::math::abs(acc, pt_beta), kPt_betaMax);  //need to confimm the range-out value of 7 GeV
     const float dBetaMuls2 = thetaMuls2 * 16.f / (min_ptBeta_maxPtBeta * min_ptBeta_maxPtBeta);
 
-    const float alphaInAbsReg =
-        alpaka::math::max(acc,
-                          alpaka::math::abs(acc, alpha_InLo),
-                          alpaka::math::asin(acc, alpaka::math::min(acc, rt_InLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax)));
-    const float alphaOutAbsReg =
-        alpaka::math::max(acc,
-                          alpaka::math::abs(acc, alpha_OutLo),
-                          alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax)));
+    const float alphaInAbsReg = alpaka::math::max(
+        acc, alpaka::math::abs(acc, alpha_InLo), alpaka::math::min(acc, rt_InLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax));
+    const float alphaOutAbsReg = alpaka::math::max(
+        acc, alpaka::math::abs(acc, alpha_OutLo), alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax));
     const float dBetaInLum = lIn < 11 ? 0.0f : alpaka::math::abs(acc, alphaInAbsReg * kDeltaZLum / z_InLo);
     const float dBetaOutLum = lOut < 11 ? 0.0f : alpaka::math::abs(acc, alphaOutAbsReg * kDeltaZLum / z_OutLo);
     const float dBetaLum2 = (dBetaInLum + dBetaOutLum) * (dBetaInLum + dBetaOutLum);
-    const float sinDPhi = alpaka::math::sin(acc, dPhi);
+    const float sinDPhi = dPhi;  // dPhi is small (passed dPhiCut), so sin(dPhi) ~ dPhi
 
     float dBetaROut = 0;
     if (isEC_lastLayer) {
@@ -1090,7 +1074,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     float diffX = mds.anchorX()[thirdMDIndex] - mds.anchorX()[firstMDIndex];
     float diffY = mds.anchorY()[thirdMDIndex] - mds.anchorY()[firstMDIndex];
 
-    float dPhi = cms::alpakatools::deltaPhi(acc, midPointX, midPointY, diffX, diffY);
+    float dPhi = fastDeltaPhi(acc, midPointX, midPointY, diffX, diffY);
 
     float sdIn_alpha = __H2F(segments.dPhiChanges()[innerSegmentIndex]);
     float sdIn_alpha_min = __H2F(segments.dPhiChangeMins()[innerSegmentIndex]);
@@ -1187,18 +1171,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
         acc, alpaka::math::abs(acc, pt_beta), kPt_betaMax);  //need to confirm the range-out value of 7 GeV
     const float dBetaMuls2 = thetaMuls2 * 16.f / (min_ptBeta_maxPtBeta * min_ptBeta_maxPtBeta);
 
-    const float alphaInAbsReg =
-        alpaka::math::max(acc,
-                          alpaka::math::abs(acc, sdIn_alpha),
-                          alpaka::math::asin(acc, alpaka::math::min(acc, rt_InLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax)));
-    const float alphaOutAbsReg =
-        alpaka::math::max(acc,
-                          alpaka::math::abs(acc, sdOut_alpha),
-                          alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax)));
+    const float alphaInAbsReg = alpaka::math::max(
+        acc, alpaka::math::abs(acc, sdIn_alpha), alpaka::math::min(acc, rt_InLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax));
+    const float alphaOutAbsReg = alpaka::math::max(
+        acc, alpaka::math::abs(acc, sdOut_alpha), alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax));
     const float dBetaInLum = lIn < 11 ? 0.0f : alpaka::math::abs(acc, alphaInAbsReg * kDeltaZLum / z_InLo);
     const float dBetaOutLum = lOut < 11 ? 0.0f : alpaka::math::abs(acc, alphaOutAbsReg * kDeltaZLum / z_OutLo);
     const float dBetaLum2 = (dBetaInLum + dBetaOutLum) * (dBetaInLum + dBetaOutLum);
-    const float sinDPhi = alpaka::math::sin(acc, dPhi);
+    const float sinDPhi = dPhi;  // dPhi is small (passed dPhiCut), so sin(dPhi) ~ dPhi
 
     const float dBetaRIn2 = 0;  // TODO-RH
     float dBetaROut = 0;
@@ -1332,14 +1312,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
         acc, alpaka::math::abs(acc, pt_beta), kPt_betaMax);  //need to confirm the range-out value of 7 GeV
     const float dBetaMuls2 = thetaMuls2 * 16.f / (min_ptBeta_maxPtBeta * min_ptBeta_maxPtBeta);
 
-    const float alphaInAbsReg =
-        alpaka::math::max(acc,
-                          alpaka::math::abs(acc, sdIn_alpha),
-                          alpaka::math::asin(acc, alpaka::math::min(acc, rt_InLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax)));
-    const float alphaOutAbsReg =
-        alpaka::math::max(acc,
-                          alpaka::math::abs(acc, sdOut_alpha),
-                          alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax)));
+    const float alphaInAbsReg = alpaka::math::max(
+        acc, alpaka::math::abs(acc, sdIn_alpha), alpaka::math::min(acc, rt_InLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax));
+    const float alphaOutAbsReg = alpaka::math::max(
+        acc, alpaka::math::abs(acc, sdOut_alpha), alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax));
     const float dBetaInLum = lIn < 11 ? 0.0f : alpaka::math::abs(acc, alphaInAbsReg * kDeltaZLum / z_InLo);
     const float dBetaOutLum = lOut < 11 ? 0.0f : alpaka::math::abs(acc, alphaOutAbsReg * kDeltaZLum / z_OutLo);
     const float dBetaLum2 = (dBetaInLum + dBetaOutLum) * (dBetaInLum + dBetaOutLum);
diff --git a/RecoTracker/LSTCore/src/alpaka/Segment.h b/RecoTracker/LSTCore/src/alpaka/Segment.h
index 6437625761950..1f2c41241ed59 100644
--- a/RecoTracker/LSTCore/src/alpaka/Segment.h
+++ b/RecoTracker/LSTCore/src/alpaka/Segment.h
@@ -85,10 +85,26 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     return moduleSeparation;
   }
 
+  // Pre-loaded module data for segment creation, eliminating redundant SoA lookups
+  // in inner loops. Populated once per module (outer/middle loop level).
+  struct ModuleSegData {
+    short subdet;
+    short side;
+    short layer;
+    unsigned int iL;  // layer - 1
+    short moduleType;
+    float drdz;
+    float moduleGapSize;
+    bool isTilted;
+    float segMiniTilt2;  // 0.25 * kPixelPSZpitch^2 * drdz^2 / (1+drdz^2) / gap^2; 0 if not tilted
+    float sdMuls;        // kMiniMulsPtScale[iL] * 3 / ptCut
+  };
+
   template <alpaka::concepts::Acc TAcc>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE void dAlphaThreshold(TAcc const& acc,
                                                       float* dAlphaThresholdValues,
-                                                      ModulesConst modules,
+                                                      ModuleSegData const& innerMod,
+                                                      ModuleSegData const& outerMod,
                                                       MiniDoubletsConst mds,
                                                       float xIn,
                                                       float yIn,
@@ -98,77 +114,52 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                                                       float yOut,
                                                       float zOut,
                                                       float rtOut,
-                                                      uint16_t innerLowerModuleIndex,
-                                                      uint16_t outerLowerModuleIndex,
                                                       unsigned int innerMDIndex,
                                                       unsigned int outerMDIndex,
                                                       const float ptCut) {
-    float sdMuls = (modules.subdets()[innerLowerModuleIndex] == Barrel)
-                       ? kMiniMulsPtScaleBarrel[modules.layers()[innerLowerModuleIndex] - 1] * 3.f / ptCut
-                       : kMiniMulsPtScaleEndcap[modules.layers()[innerLowerModuleIndex] - 1] * 3.f / ptCut;
+    const float sdMuls = innerMod.sdMuls;
 
     //more accurate then outer rt - inner rt
     float segmentDr = alpaka::math::sqrt(acc, (yOut - yIn) * (yOut - yIn) + (xOut - xIn) * (xOut - xIn));
 
-    const float dAlpha_Bfield =
-        alpaka::math::asin(acc, alpaka::math::min(acc, segmentDr * k2Rinv1GeVf / ptCut, kSinAlphaMax));
-
-    bool isInnerTilted =
-        modules.subdets()[innerLowerModuleIndex] == Barrel and modules.sides()[innerLowerModuleIndex] != Center;
-    bool isOuterTilted =
-        modules.subdets()[outerLowerModuleIndex] == Barrel and modules.sides()[outerLowerModuleIndex] != Center;
-
-    float drdzInner = modules.drdzs()[innerLowerModuleIndex];
-    float drdzOuter = modules.drdzs()[outerLowerModuleIndex];
-    float innerModuleGapSize = moduleGapSize_seg(modules, innerLowerModuleIndex);
-    float outerModuleGapSize = moduleGapSize_seg(modules, outerLowerModuleIndex);
-    const float innerminiTilt2 = isInnerTilted
-                                     ? ((0.5f * 0.5f) * (kPixelPSZpitch * kPixelPSZpitch) * (drdzInner * drdzInner) /
-                                        (1.f + drdzInner * drdzInner) / (innerModuleGapSize * innerModuleGapSize))
-                                     : 0;
-
-    const float outerminiTilt2 = isOuterTilted
-                                     ? ((0.5f * 0.5f) * (kPixelPSZpitch * kPixelPSZpitch) * (drdzOuter * drdzOuter) /
-                                        (1.f + drdzOuter * drdzOuter) / (outerModuleGapSize * outerModuleGapSize))
-                                     : 0;
-
-    float miniDelta = innerModuleGapSize;
+    // For segments, segmentDr * k2Rinv1GeVf / ptCut is always small (<0.03),
+    // so asin(x) ~ x to better than 0.01% accuracy.
+    const float dAlpha_Bfield = alpaka::math::min(acc, segmentDr * k2Rinv1GeVf / ptCut, kSinAlphaMax);
 
     float sdLumForInnerMini2;
     float sdLumForOuterMini2;
 
-    if (modules.subdets()[innerLowerModuleIndex] == Barrel) {
-      sdLumForInnerMini2 = innerminiTilt2 * (dAlpha_Bfield * dAlpha_Bfield);
+    if (innerMod.subdet == Barrel) {
+      sdLumForInnerMini2 = innerMod.segMiniTilt2 * (dAlpha_Bfield * dAlpha_Bfield);
     } else {
       sdLumForInnerMini2 = (mds.dphis()[innerMDIndex] * mds.dphis()[innerMDIndex]) * (kDeltaZLum * kDeltaZLum) /
                            (mds.dzs()[innerMDIndex] * mds.dzs()[innerMDIndex]);
     }
 
-    if (modules.subdets()[outerLowerModuleIndex] == Barrel) {
-      sdLumForOuterMini2 = outerminiTilt2 * (dAlpha_Bfield * dAlpha_Bfield);
+    if (outerMod.subdet == Barrel) {
+      sdLumForOuterMini2 = outerMod.segMiniTilt2 * (dAlpha_Bfield * dAlpha_Bfield);
     } else {
       sdLumForOuterMini2 = (mds.dphis()[outerMDIndex] * mds.dphis()[outerMDIndex]) * (kDeltaZLum * kDeltaZLum) /
                            (mds.dzs()[outerMDIndex] * mds.dzs()[outerMDIndex]);
     }
 
     // Unique stuff for the segment dudes alone
+    const float miniDelta = innerMod.moduleGapSize;
     float dAlpha_res_inner =
-        0.02f / miniDelta *
-        (modules.subdets()[innerLowerModuleIndex] == Barrel ? 1.0f : alpaka::math::abs(acc, zIn) / rtIn);
+        0.02f / miniDelta * (innerMod.subdet == Barrel ? 1.0f : alpaka::math::abs(acc, zIn) / rtIn);
     float dAlpha_res_outer =
-        0.02f / miniDelta *
-        (modules.subdets()[outerLowerModuleIndex] == Barrel ? 1.0f : alpaka::math::abs(acc, zOut) / rtOut);
+        0.02f / miniDelta * (outerMod.subdet == Barrel ? 1.0f : alpaka::math::abs(acc, zOut) / rtOut);
 
     float dAlpha_res = dAlpha_res_inner + dAlpha_res_outer;
 
-    if (modules.subdets()[innerLowerModuleIndex] == Barrel and modules.sides()[innerLowerModuleIndex] == Center) {
+    if (innerMod.subdet == Barrel and innerMod.side == Center) {
       dAlphaThresholdValues[0] = dAlpha_Bfield + alpaka::math::sqrt(acc, dAlpha_res * dAlpha_res + sdMuls * sdMuls);
     } else {
       dAlphaThresholdValues[0] =
           dAlpha_Bfield + alpaka::math::sqrt(acc, dAlpha_res * dAlpha_res + sdMuls * sdMuls + sdLumForInnerMini2);
     }
 
-    if (modules.subdets()[outerLowerModuleIndex] == Barrel and modules.sides()[outerLowerModuleIndex] == Center) {
+    if (outerMod.subdet == Barrel and outerMod.side == Center) {
       dAlphaThresholdValues[1] = dAlpha_Bfield + alpaka::math::sqrt(acc, dAlpha_res * dAlpha_res + sdMuls * sdMuls);
     } else {
       dAlphaThresholdValues[1] =
@@ -335,15 +326,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     const float dPhiChange = cms::alpakatools::reducePhiRange(
         acc, cms::alpakatools::phi(acc, xOut - xIn, yOut - yIn) - mds.anchorPhi()[innerMD]);
 
-    return alpaka::math::abs(acc, dPhiChange) < sdCut;
+    if (alpaka::math::abs(acc, dPhiChange) >= sdCut)
+      return false;
+
+    return true;
   }
 
   template <alpaka::concepts::Acc TAcc>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passDeltaPhiCutsEndcap(TAcc const& acc,
-                                                             ModulesConst modules,
                                                              MiniDoubletsConst mds,
-                                                             uint16_t innerLm,
-                                                             uint16_t outerLm,
                                                              unsigned int innerMD,
                                                              unsigned int outerMD,
                                                              const float dPhi,
@@ -360,15 +351,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     const float dzFrac = dz / zIn;
     const float dPhiChange = dPhi / dzFrac * (1.f + dzFrac);
 
-    return alpaka::math::abs(acc, dPhiChange) < sdSlope;
+    if (alpaka::math::abs(acc, dPhiChange) >= sdSlope)
+      return false;
+
+    return true;
   }
 
   template <alpaka::concepts::Acc TAcc>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runSegmentDefaultAlgoBarrel(TAcc const& acc,
-                                                                  ModulesConst modules,
+                                                                  ModuleSegData const& innerMod,
+                                                                  ModuleSegData const& outerMod,
                                                                   MiniDoubletsConst mds,
-                                                                  uint16_t innerLowerModuleIndex,
-                                                                  uint16_t outerLowerModuleIndex,
                                                                   unsigned int innerMDIndex,
                                                                   unsigned int outerMDIndex,
                                                                   float& dPhi,
@@ -401,10 +394,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     zOut = mds.anchorZ()[outerMDIndex];
     rtOut = mds.anchorRt()[outerMDIndex];
 
-    float sdSlope = alpaka::math::asin(acc, alpaka::math::min(acc, rtOut * k2Rinv1GeVf / ptCut, kSinAlphaMax));
-    float dzDrtScale = alpaka::math::tan(acc, sdSlope) / sdSlope;  //FIXME: need appropriate value
+    const float sdSin = alpaka::math::min(acc, rtOut * k2Rinv1GeVf / ptCut, kSinAlphaMax);
+    const float cosSlope = alpaka::math::sqrt(acc, 1.f - sdSin * sdSin);
+    // Use 1/cosSlope >= tan(asin(s))/asin(s) for z window (safe, wider window; defers asin)
+    const float dzDrtScale = 1.f / cosSlope;
 
-    const float zGeom = modules.layers()[innerLowerModuleIndex] <= 2 ? 2.f * kPixelPSZpitch : 2.f * kStrip2SZpitch;
+    const float zGeom = innerMod.layer <= 2 ? 2.f * kPixelPSZpitch : 2.f * kStrip2SZpitch;
 
     //slope-correction only on outer end
     zLo = zIn + (zIn - kDeltaZLum) * (rtOut / rtIn - 1.f) * (zIn > 0.f ? 1.f : dzDrtScale) - zGeom;
@@ -415,26 +410,29 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
     dPhi = cms::alpakatools::reducePhiRange(acc, mds.anchorPhi()[outerMDIndex] - mds.anchorPhi()[innerMDIndex]);
 
-    if (!passDeltaPhiCutsBarrel(acc,
-                                modules,
-                                mds,
-                                innerLowerModuleIndex,
-                                outerLowerModuleIndex,
-                                innerMDIndex,
-                                outerMDIndex,
-                                dPhi,
-                                rtOut,
-                                sdSlope,
-                                ptCut))
+    // dPhiChange cut using cross-product (subsumes old dPhi cut; no asin needed)
+    const float sdMuls = innerMod.sdMuls;
+    const float sdPVoff = 0.1f / rtOut;
+    const float eps = alpaka::math::sqrt(acc, sdMuls * sdMuls + sdPVoff * sdPVoff);
+    const float dx = xOut - xIn;
+    const float dy = yOut - yIn;
+    const float crossDC = dx * yIn - dy * xIn;
+    const float dotDC = dx * xIn + dy * yIn;
+    const float tanSlope = sdSin / cosSlope;
+    const float tanSdCut = tanSlope + eps * (1.f + tanSlope * tanSlope);
+    const float tanSdCutSq = tanSdCut * tanSdCut;
+
+    if (dotDC <= 0.f || crossDC * crossDC >= tanSdCutSq * dotDC * dotDC)
       return false;
 
-    dPhiChange = cms::alpakatools::reducePhiRange(
-        acc, cms::alpakatools::phi(acc, xOut - xIn, yOut - yIn) - mds.anchorPhi()[innerMDIndex]);
+    // Only survivors: compute actual dPhiChange via single atan2
+    dPhiChange = alpaka::math::atan2(acc, -crossDC, dotDC);
 
     float dAlphaThresholdValues[3];
     dAlphaThreshold(acc,
                     dAlphaThresholdValues,
-                    modules,
+                    innerMod,
+                    outerMod,
                     mds,
                     xIn,
                     yIn,
@@ -444,8 +442,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                     yOut,
                     zOut,
                     rtOut,
-                    innerLowerModuleIndex,
-                    outerLowerModuleIndex,
                     innerMDIndex,
                     outerMDIndex,
                     ptCut);
@@ -462,17 +458,21 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
     if (alpaka::math::abs(acc, dAlphaInnerMDSegment) >= dAlphaInnerMDSegmentThreshold)
       return false;
+
     if (alpaka::math::abs(acc, dAlphaOuterMDSegment) >= dAlphaOuterMDSegmentThreshold)
       return false;
-    return alpaka::math::abs(acc, dAlphaInnerMDOuterMD) < dAlphaInnerMDOuterMDThreshold;
+
+    if (alpaka::math::abs(acc, dAlphaInnerMDOuterMD) >= dAlphaInnerMDOuterMDThreshold)
+      return false;
+
+    return true;
   }
 
   template <alpaka::concepts::Acc TAcc>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runSegmentDefaultAlgoEndcap(TAcc const& acc,
-                                                                  ModulesConst modules,
+                                                                  ModuleSegData const& innerMod,
+                                                                  ModuleSegData const& outerMod,
                                                                   MiniDoubletsConst mds,
-                                                                  uint16_t innerLowerModuleIndex,
-                                                                  uint16_t outerLowerModuleIndex,
                                                                   unsigned int innerMDIndex,
                                                                   unsigned int outerMDIndex,
                                                                   float& dPhi,
@@ -505,10 +505,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     zOut = mds.anchorZ()[outerMDIndex];
     rtOut = mds.anchorRt()[outerMDIndex];
 
-    bool outerLayerEndcapTwoS =
-        (modules.subdets()[outerLowerModuleIndex] == Endcap) && (modules.moduleType()[outerLowerModuleIndex] == TwoS);
+    bool outerLayerEndcapTwoS = (outerMod.subdet == Endcap) && (outerMod.moduleType == TwoS);
 
-    float sdSlope = alpaka::math::asin(acc, alpaka::math::min(acc, rtOut * k2Rinv1GeVf / ptCut, kSinAlphaMax));
+    const float sdSinEC = alpaka::math::min(acc, rtOut * k2Rinv1GeVf / ptCut, kSinAlphaMax);
+    const float cosSlope = alpaka::math::sqrt(acc, 1.f - sdSinEC * sdSinEC);
     float disks2SMinRadius = 60.f;
 
     float rtGeom = ((rtIn < disks2SMinRadius && rtOut < disks2SMinRadius)
@@ -522,7 +522,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
     float dz = zOut - zIn;
     float dLum = alpaka::math::copysign(acc, kDeltaZLum, zIn);
-    float drtDzScale = sdSlope / alpaka::math::tan(acc, sdSlope);
+    // Use cosSlope <= asin(s)*cos(asin(s))/s for rt window (safe, wider window; defers asin)
+    const float drtDzScale = cosSlope;
 
     //rt should increase
     rtLo = alpaka::math::max(acc, rtIn * (1.f + dz / (zIn + dLum) * drtDzScale) - rtGeom, rtIn - 0.5f * rtGeom);
@@ -533,19 +534,20 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     if ((rtOut < rtLo) || (rtOut > rtHi))
       return false;
 
+    // Algebraic phi pre-check: |dPhi| > asin(s) iff |tan(dPhi)| > s/sqrt(1-s^2) (exact for |dPhi| < pi/2)
+    // Uses already-loaded x,y; avoids asin + anchorPhi reads for rejected pairs.
+    const float crossPhi = xIn * yOut - xOut * yIn;
+    const float dotPhi = xIn * xOut + yIn * yOut;
+    const float tanSlopeSq = sdSinEC * sdSinEC / (1.f - sdSinEC * sdSinEC);
+    if (dotPhi <= 0.f || crossPhi * crossPhi > tanSlopeSq * dotPhi * dotPhi)
+      return false;
+
+    // Compute asin only for pairs surviving the algebraic phi pre-check
+    const float sdSlope = alpaka::math::asin(acc, sdSinEC);
+
     dPhi = cms::alpakatools::reducePhiRange(acc, mds.anchorPhi()[outerMDIndex] - mds.anchorPhi()[innerMDIndex]);
 
-    if (!passDeltaPhiCutsEndcap(acc,
-                                modules,
-                                mds,
-                                innerLowerModuleIndex,
-                                outerLowerModuleIndex,
-                                innerMDIndex,
-                                outerMDIndex,
-                                dPhi,
-                                rtOut,
-                                sdSlope,
-                                ptCut))
+    if (!passDeltaPhiCutsEndcap(acc, mds, innerMDIndex, outerMDIndex, dPhi, rtOut, sdSlope, ptCut))
       return false;
 
     if (outerLayerEndcapTwoS) {
@@ -569,7 +571,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     float dAlphaThresholdValues[3];
     dAlphaThreshold(acc,
                     dAlphaThresholdValues,
-                    modules,
+                    innerMod,
+                    outerMod,
                     mds,
                     xIn,
                     yIn,
@@ -579,8 +582,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                     yOut,
                     zOut,
                     rtOut,
-                    innerLowerModuleIndex,
-                    outerLowerModuleIndex,
                     innerMDIndex,
                     outerMDIndex,
                     ptCut);
@@ -597,17 +598,21 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
     if (alpaka::math::abs(acc, dAlphaInnerMDSegment) >= dAlphaInnerMDSegmentThreshold)
       return false;
+
     if (alpaka::math::abs(acc, dAlphaOuterMDSegment) >= dAlphaOuterMDSegmentThreshold)
       return false;
-    return alpaka::math::abs(acc, dAlphaInnerMDOuterMD) < dAlphaInnerMDOuterMDThreshold;
+
+    if (alpaka::math::abs(acc, dAlphaInnerMDOuterMD) >= dAlphaInnerMDOuterMDThreshold)
+      return false;
+
+    return true;
   }
 
   template <alpaka::concepts::Acc TAcc>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runSegmentDefaultAlgo(TAcc const& acc,
-                                                            ModulesConst modules,
+                                                            ModuleSegData const& innerMod,
+                                                            ModuleSegData const& outerMod,
                                                             MiniDoubletsConst mds,
-                                                            uint16_t innerLowerModuleIndex,
-                                                            uint16_t outerLowerModuleIndex,
                                                             unsigned int innerMDIndex,
                                                             unsigned int outerMDIndex,
                                                             float& dPhi,
@@ -626,16 +631,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                                                             float& rtHi,
 #endif
                                                             const float ptCut) {
-    if (modules.subdets()[innerLowerModuleIndex] == Barrel and modules.subdets()[outerLowerModuleIndex] == Barrel) {
+    if (innerMod.subdet == Barrel and outerMod.subdet == Barrel) {
 #ifdef CUT_VALUE_DEBUG
       rtLo = -999.f;
       rtHi = -999.f;
 #endif
       return runSegmentDefaultAlgoBarrel(acc,
-                                         modules,
+                                         innerMod,
+                                         outerMod,
                                          mds,
-                                         innerLowerModuleIndex,
-                                         outerLowerModuleIndex,
                                          innerMDIndex,
                                          outerMDIndex,
                                          dPhi,
@@ -658,10 +662,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
       zHi = -999.f;
 #endif
       return runSegmentDefaultAlgoEndcap(acc,
-                                         modules,
+                                         innerMod,
+                                         outerMod,
                                          mds,
-                                         innerLowerModuleIndex,
-                                         outerLowerModuleIndex,
                                          innerMDIndex,
                                          outerMDIndex,
                                          dPhi,
@@ -697,6 +700,24 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
         if (nInnerMDs == 0)
           continue;
 
+        // Pre-load inner module data once per module (outer loop)
+        ModuleSegData innerMod;
+        innerMod.subdet = modules.subdets()[innerLowerModuleIndex];
+        innerMod.side = modules.sides()[innerLowerModuleIndex];
+        innerMod.layer = modules.layers()[innerLowerModuleIndex];
+        innerMod.iL = innerMod.layer - 1;
+        innerMod.moduleType = modules.moduleType()[innerLowerModuleIndex];
+        innerMod.drdz = modules.drdzs()[innerLowerModuleIndex];
+        innerMod.moduleGapSize = moduleGapSize_seg(modules, innerLowerModuleIndex);
+        innerMod.isTilted = (innerMod.subdet == Barrel and innerMod.side != Center);
+        innerMod.segMiniTilt2 =
+            innerMod.isTilted
+                ? (0.25f * (kPixelPSZpitch * kPixelPSZpitch) * (innerMod.drdz * innerMod.drdz) /
+                   (1.f + innerMod.drdz * innerMod.drdz) / (innerMod.moduleGapSize * innerMod.moduleGapSize))
+                : 0.f;
+        innerMod.sdMuls = (innerMod.subdet == Barrel) ? kMiniMulsPtScaleBarrel[innerMod.iL] * 3.f / ptCut
+                                                      : kMiniMulsPtScaleEndcap[innerMod.iL] * 3.f / ptCut;
+
         unsigned int nConnectedModules = modules.nConnectedModules()[innerLowerModuleIndex];
 
         for (uint16_t outerLowerModuleArrayIdx : cms::alpakatools::uniform_elements_y(acc, nConnectedModules)) {
@@ -708,6 +729,25 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
           if (limit == 0)
             continue;
+
+          // Pre-load outer module data once per module pair (middle loop)
+          ModuleSegData outerMod;
+          outerMod.subdet = modules.subdets()[outerLowerModuleIndex];
+          outerMod.side = modules.sides()[outerLowerModuleIndex];
+          outerMod.layer = modules.layers()[outerLowerModuleIndex];
+          outerMod.iL = outerMod.layer - 1;
+          outerMod.moduleType = modules.moduleType()[outerLowerModuleIndex];
+          outerMod.drdz = modules.drdzs()[outerLowerModuleIndex];
+          outerMod.moduleGapSize = moduleGapSize_seg(modules, outerLowerModuleIndex);
+          outerMod.isTilted = (outerMod.subdet == Barrel and outerMod.side != Center);
+          outerMod.segMiniTilt2 =
+              outerMod.isTilted
+                  ? (0.25f * (kPixelPSZpitch * kPixelPSZpitch) * (outerMod.drdz * outerMod.drdz) /
+                     (1.f + outerMod.drdz * outerMod.drdz) / (outerMod.moduleGapSize * outerMod.moduleGapSize))
+                  : 0.f;
+          outerMod.sdMuls = (outerMod.subdet == Barrel) ? kMiniMulsPtScaleBarrel[outerMod.iL] * 3.f / ptCut
+                                                        : kMiniMulsPtScaleEndcap[outerMod.iL] * 3.f / ptCut;
+
           for (unsigned int hitIndex : cms::alpakatools::uniform_elements_x(acc, limit)) {
             unsigned int innerMDArrayIdx = hitIndex / nOuterMDs;
             unsigned int outerMDArrayIdx = hitIndex % nOuterMDs;
@@ -727,29 +767,30 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
             dPhiMax = 0;
             dPhiChangeMin = 0;
             dPhiChangeMax = 0;
-            if (runSegmentDefaultAlgo(acc,
-                                      modules,
-                                      mds,
-                                      innerLowerModuleIndex,
-                                      outerLowerModuleIndex,
-                                      innerMDIndex,
-                                      outerMDIndex,
-                                      dPhi,
-                                      dPhiMin,
-                                      dPhiMax,
-                                      dPhiChange,
-                                      dPhiChangeMin,
-                                      dPhiChangeMax,
+            bool pass = runSegmentDefaultAlgo(acc,
+                                              innerMod,
+                                              outerMod,
+                                              mds,
+                                              innerMDIndex,
+                                              outerMDIndex,
+                                              dPhi,
+                                              dPhiMin,
+                                              dPhiMax,
+                                              dPhiChange,
+                                              dPhiChangeMin,
+                                              dPhiChangeMax,
 #ifdef CUT_VALUE_DEBUG
-                                      dAlphaInnerMDSegment,
-                                      dAlphaOuterMDSegment,
-                                      dAlphaInnerMDOuterMD,
-                                      zLo,
-                                      zHi,
-                                      rtLo,
-                                      rtHi,
+                                              dAlphaInnerMDSegment,
+                                              dAlphaOuterMDSegment,
+                                              dAlphaInnerMDOuterMD,
+                                              zLo,
+                                              zHi,
+                                              rtLo,
+                                              rtHi,
 #endif
-                                      ptCut)) {
+                                              ptCut);
+
+            if (pass) {
               unsigned int totOccupancySegments =
                   alpaka::atomicAdd(acc,
                                     &segmentsOccupancy.totOccupancySegments()[innerLowerModuleIndex],
@@ -799,21 +840,81 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
   template <alpaka::concepts::Acc TAcc>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passDeltaPhiCutsSelector(TAcc const& acc,
-                                                               ModulesConst modules,
+                                                               ModuleSegData const& innerMod,
+                                                               ModuleSegData const& outerMod,
                                                                MiniDoubletsConst mds,
-                                                               uint16_t innerLm,
-                                                               uint16_t outerLm,
                                                                unsigned int innerMD,
                                                                unsigned int outerMD,
                                                                const float ptCut) {
-    const float dPhi = cms::alpakatools::reducePhiRange(acc, mds.anchorPhi()[outerMD] - mds.anchorPhi()[innerMD]);
+    const float rtIn = mds.anchorRt()[innerMD];
     const float rtOut = mds.anchorRt()[outerMD];
-    const float sdSlope = alpaka::math::asin(acc, alpaka::math::min(acc, rtOut * k2Rinv1GeVf / ptCut, kSinAlphaMax));
-
-    if (modules.subdets()[innerLm] == Barrel && modules.subdets()[outerLm] == Barrel)
-      return passDeltaPhiCutsBarrel(acc, modules, mds, innerLm, outerLm, innerMD, outerMD, dPhi, rtOut, sdSlope, ptCut);
-    else
-      return passDeltaPhiCutsEndcap(acc, modules, mds, innerLm, outerLm, innerMD, outerMD, dPhi, rtOut, sdSlope, ptCut);
+    const float sdSin = alpaka::math::min(acc, rtOut * k2Rinv1GeVf / ptCut, kSinAlphaMax);
+    const float cosSlope = alpaka::math::sqrt(acc, 1.f - sdSin * sdSin);
+
+    if (innerMod.subdet == Barrel && outerMod.subdet == Barrel) {
+      // Barrel: z-range pre-filter + cross-product dPhiChange test (no asin needed).
+      const float zIn = mds.anchorZ()[innerMD];
+      const float zOut = mds.anchorZ()[outerMD];
+      const float dzDrtScale = 1.f / cosSlope;
+      const float zGeom = innerMod.layer <= 2 ? 2.f * kPixelPSZpitch : 2.f * kStrip2SZpitch;
+      const float zLo = zIn + (zIn - kDeltaZLum) * (rtOut / rtIn - 1.f) * (zIn > 0.f ? 1.f : dzDrtScale) - zGeom;
+      const float zHi = zIn + (zIn + kDeltaZLum) * (rtOut / rtIn - 1.f) * (zIn < 0.f ? 1.f : dzDrtScale) + zGeom;
+      if (zOut < zLo || zOut > zHi)
+        return false;
+
+      // Cross-product dPhiChange test (equivalent to |tan(dPhiChange)| < tan(sdCut), no asin)
+      const float sdMuls = innerMod.sdMuls;
+      const float sdPVoff = 0.1f / rtOut;
+      const float eps = alpaka::math::sqrt(acc, sdMuls * sdMuls + sdPVoff * sdPVoff);
+      const float xIn = mds.anchorX()[innerMD];
+      const float yIn = mds.anchorY()[innerMD];
+      const float xOut = mds.anchorX()[outerMD];
+      const float yOut = mds.anchorY()[outerMD];
+      const float dx = xOut - xIn;
+      const float dy = yOut - yIn;
+      const float crossDC = dx * yIn - dy * xIn;
+      const float dotDC = dx * xIn + dy * yIn;
+      const float tanSlope = sdSin / cosSlope;
+      const float tanSdCut = tanSlope + eps * (1.f + tanSlope * tanSlope);
+      return dotDC > 0.f && crossDC * crossDC < tanSdCut * tanSdCut * dotDC * dotDC;
+    } else {
+      // Endcap/mixed: z-sign + rt-range pre-filter before asin (same as creation kernel).
+      const float zIn = mds.anchorZ()[innerMD];
+      const float zOut = mds.anchorZ()[outerMD];
+      if (zIn * zOut < 0.f)
+        return false;
+
+      const float dz = zOut - zIn;
+      const float dLum = alpaka::math::copysign(acc, kDeltaZLum, zIn);
+      const float drtDzScale = cosSlope;
+      constexpr float disks2SMinRadius = 60.f;
+      const float rtGeom =
+          ((rtIn < disks2SMinRadius && rtOut < disks2SMinRadius)
+               ? (2.f * kPixelPSZpitch)
+               : ((rtIn < disks2SMinRadius || rtOut < disks2SMinRadius) ? (kPixelPSZpitch + kStrip2SZpitch)
+                                                                        : (2.f * kStrip2SZpitch)));
+      const float rtLo =
+          alpaka::math::max(acc, rtIn * (1.f + dz / (zIn + dLum) * drtDzScale) - rtGeom, rtIn - 0.5f * rtGeom);
+      const float rtHi = rtIn * (zOut - dLum) / (zIn - dLum) + rtGeom;
+      if (rtOut < rtLo || rtOut > rtHi)
+        return false;
+
+      // Algebraic phi pre-check: |dPhi| > asin(s) iff |tan(dPhi)| > s/sqrt(1-s^2) (exact for |dPhi| < pi/2)
+      const float xIn = mds.anchorX()[innerMD];
+      const float yIn = mds.anchorY()[innerMD];
+      const float xOut = mds.anchorX()[outerMD];
+      const float yOut = mds.anchorY()[outerMD];
+      const float crossPhi = xIn * yOut - xOut * yIn;
+      const float dotPhi = xIn * xOut + yIn * yOut;
+      const float tanSlopeSq = sdSin * sdSin / (1.f - sdSin * sdSin);
+      if (dotPhi <= 0.f || crossPhi * crossPhi > tanSlopeSq * dotPhi * dotPhi)
+        return false;
+
+      // Survivors: compute asin + dPhi cuts
+      const float sdSlope = alpaka::math::asin(acc, sdSin);
+      const float dPhi = cms::alpakatools::reducePhiRange(acc, mds.anchorPhi()[outerMD] - mds.anchorPhi()[innerMD]);
+      return passDeltaPhiCutsEndcap(acc, mds, innerMD, outerMD, dPhi, rtOut, sdSlope, ptCut);
+    }
   }
 
   struct CountMiniDoubletConnections {
@@ -837,12 +938,48 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
         if (nConnectedModules == 0)
           continue;
 
+        // Pre-load inner module data once per module (outer loop)
+        ModuleSegData innerMod;
+        innerMod.subdet = modules.subdets()[innerLowerModuleIndex];
+        innerMod.side = modules.sides()[innerLowerModuleIndex];
+        innerMod.layer = modules.layers()[innerLowerModuleIndex];
+        innerMod.iL = innerMod.layer - 1;
+        innerMod.moduleType = modules.moduleType()[innerLowerModuleIndex];
+        innerMod.drdz = modules.drdzs()[innerLowerModuleIndex];
+        innerMod.moduleGapSize = moduleGapSize_seg(modules, innerLowerModuleIndex);
+        innerMod.isTilted = (innerMod.subdet == Barrel and innerMod.side != Center);
+        innerMod.segMiniTilt2 =
+            innerMod.isTilted
+                ? (0.25f * (kPixelPSZpitch * kPixelPSZpitch) * (innerMod.drdz * innerMod.drdz) /
+                   (1.f + innerMod.drdz * innerMod.drdz) / (innerMod.moduleGapSize * innerMod.moduleGapSize))
+                : 0.f;
+        innerMod.sdMuls = (innerMod.subdet == Barrel) ? kMiniMulsPtScaleBarrel[innerMod.iL] * 3.f / ptCut
+                                                      : kMiniMulsPtScaleEndcap[innerMod.iL] * 3.f / ptCut;
+
         for (uint16_t outerLowerModuleArrayIdx : cms::alpakatools::uniform_elements_y(acc, nConnectedModules)) {
           const uint16_t outerLowerModuleIndex = modules.moduleMap()[innerLowerModuleIndex][outerLowerModuleArrayIdx];
           const unsigned int nOuterMDs = mdsOccupancy.nMDs()[outerLowerModuleIndex];
           if (nOuterMDs == 0)
             continue;
 
+          // Pre-load outer module data once per module pair (middle loop)
+          ModuleSegData outerMod;
+          outerMod.subdet = modules.subdets()[outerLowerModuleIndex];
+          outerMod.side = modules.sides()[outerLowerModuleIndex];
+          outerMod.layer = modules.layers()[outerLowerModuleIndex];
+          outerMod.iL = outerMod.layer - 1;
+          outerMod.moduleType = modules.moduleType()[outerLowerModuleIndex];
+          outerMod.drdz = modules.drdzs()[outerLowerModuleIndex];
+          outerMod.moduleGapSize = moduleGapSize_seg(modules, outerLowerModuleIndex);
+          outerMod.isTilted = (outerMod.subdet == Barrel and outerMod.side != Center);
+          outerMod.segMiniTilt2 =
+              outerMod.isTilted
+                  ? (0.25f * (kPixelPSZpitch * kPixelPSZpitch) * (outerMod.drdz * outerMod.drdz) /
+                     (1.f + outerMod.drdz * outerMod.drdz) / (outerMod.moduleGapSize * outerMod.moduleGapSize))
+                  : 0.f;
+          outerMod.sdMuls = (outerMod.subdet == Barrel) ? kMiniMulsPtScaleBarrel[outerMod.iL] * 3.f / ptCut
+                                                        : kMiniMulsPtScaleEndcap[outerMod.iL] * 3.f / ptCut;
+
           const unsigned int limit = nInnerMDs * nOuterMDs;
 
           for (unsigned int hitIndex : cms::alpakatools::uniform_elements_x(acc, limit)) {
@@ -853,8 +990,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
             const unsigned int outerMDIndex = mdRanges[outerLowerModuleIndex][0] + outerMDArrayIdx;
 
             // Increment the connected max if the LS passes the delta phi cuts.
-            if (passDeltaPhiCutsSelector(
-                    acc, modules, mds, innerLowerModuleIndex, outerLowerModuleIndex, innerMDIndex, outerMDIndex, ptCut)) {
+            if (passDeltaPhiCutsSelector(acc, innerMod, outerMod, mds, innerMDIndex, outerMDIndex, ptCut)) {
               alpaka::atomicAdd(acc, &mds.connectedMax()[innerMDIndex], 1u, alpaka::hierarchy::Threads{});
             }
           }
diff --git a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h
index 3768f021a0cfb..ab82be43506d7 100644
--- a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h
+++ b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h
@@ -328,18 +328,38 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
         for (unsigned int trackCandidateIndex : cms::alpakatools::uniform_elements_x(acc, nTrackCandidates)) {
           LSTObjType type = candsBase.trackCandidateType()[trackCandidateIndex];
           unsigned int innerTrackletIdx = candsExtended.objectIndices()[trackCandidateIndex][0];
+
+          // Early spatial filter: compute TC eta and phi based on type and skip if far away.
+          float eta2, phi2;
+          if (type == LSTObjType::T5) {
+            eta2 = __H2F(quintuplets.eta()[innerTrackletIdx]);
+            phi2 = __H2F(quintuplets.phi()[innerTrackletIdx]);
+          } else if (type == LSTObjType::pT3) {
+            eta2 = __H2F(pixelTriplets.eta_pix()[innerTrackletIdx]);
+            phi2 = __H2F(pixelTriplets.phi_pix()[innerTrackletIdx]);
+          } else if (type == LSTObjType::pT5) {
+            eta2 = pixelSeeds.eta()[innerTrackletIdx - prefix];
+            phi2 = pixelSeeds.phi()[innerTrackletIdx - prefix];
+          } else {
+            continue;
+          }
+
+          if (alpaka::math::abs(acc, eta1 - eta2) > 0.2f)
+            continue;
+
+          float dPhi = cms::alpakatools::deltaPhi(acc, phi1, phi2);
+          if (alpaka::math::abs(acc, dPhi) > 0.2f)
+            continue;
+
+          float dEta = eta1 - eta2;
+          float dR2 = dEta * dEta + dPhi * dPhi;
+
           if (type == LSTObjType::T5) {
-            unsigned int quintupletIndex = innerTrackletIdx;  // T5 index
-            float eta2 = __H2F(quintuplets.eta()[quintupletIndex]);
-            float phi2 = __H2F(quintuplets.phi()[quintupletIndex]);
-            float dEta = alpaka::math::abs(acc, eta1 - eta2);
-            float dPhi = cms::alpakatools::deltaPhi(acc, phi1, phi2);
-            float dR2 = dEta * dEta + dPhi * dPhi;
             // Cut on pLS-T5 embed distance.
             if (dR2 < 0.02f) {
               float d2 = 0.f;
               CMS_UNROLL_LOOP for (unsigned k = 0; k < Params_pLS::kEmbed; ++k) {
-                const float diff = plsEmbed[k] - quintuplets.t5Embed()[quintupletIndex][k];
+                const float diff = plsEmbed[k] - quintuplets.t5Embed()[innerTrackletIdx][k];
                 d2 += diff * diff;
               }
               // Compare squared embedding distance to the cut value for the eta bin.
@@ -347,36 +367,19 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                 pixelSegments.isDup()[pixelArrayIndex] = true;
               }
             }
-          }
-          if (type == LSTObjType::pT3) {
+          } else if (type == LSTObjType::pT3) {
             int pLSIndex = pixelTriplets.pixelSegmentIndices()[innerTrackletIdx];
             int npMatched = checkPixelHits(prefix + pixelArrayIndex, pLSIndex, mds, segments, hitsBase);
             if (npMatched > 0)
               pixelSegments.isDup()[pixelArrayIndex] = true;
 
-            int pT3Index = innerTrackletIdx;
-            float eta2 = __H2F(pixelTriplets.eta_pix()[pT3Index]);
-            float phi2 = __H2F(pixelTriplets.phi_pix()[pT3Index]);
-            float dEta = alpaka::math::abs(acc, eta1 - eta2);
-            float dPhi = cms::alpakatools::deltaPhi(acc, phi1, phi2);
-
-            float dR2 = dEta * dEta + dPhi * dPhi;
             if (dR2 < 0.000001f)
               pixelSegments.isDup()[pixelArrayIndex] = true;
-          }
-          if (type == LSTObjType::pT5) {
-            unsigned int pLSIndex = innerTrackletIdx;
-            int npMatched = checkPixelHits(prefix + pixelArrayIndex, pLSIndex, mds, segments, hitsBase);
-            if (npMatched > 0) {
+          } else if (type == LSTObjType::pT5) {
+            int npMatched = checkPixelHits(prefix + pixelArrayIndex, innerTrackletIdx, mds, segments, hitsBase);
+            if (npMatched > 0)
               pixelSegments.isDup()[pixelArrayIndex] = true;
-            }
 
-            float eta2 = pixelSeeds.eta()[pLSIndex - prefix];
-            float phi2 = pixelSeeds.phi()[pLSIndex - prefix];
-            float dEta = alpaka::math::abs(acc, eta1 - eta2);
-            float dPhi = cms::alpakatools::deltaPhi(acc, phi1, phi2);
-
-            float dR2 = dEta * dEta + dPhi * dPhi;
             if (dR2 < 0.000001f)
               pixelSegments.isDup()[pixelArrayIndex] = true;
           }
diff --git a/RecoTracker/LSTCore/src/alpaka/Triplet.h b/RecoTracker/LSTCore/src/alpaka/Triplet.h
index 81c166cd1f4f5..fe0fe442e2c04 100644
--- a/RecoTracker/LSTCore/src/alpaka/Triplet.h
+++ b/RecoTracker/LSTCore/src/alpaka/Triplet.h
@@ -14,6 +14,33 @@
 
 namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
+  // Pre-loaded inner-segment-constant data for passPointingConstraint.
+  // Populated once per inner segment, reused across all outer segments in the inner loop.
+  struct T3InnerHitData {
+    float x1, y1;           // first MD position (cm)
+    float x2, y2;           // second MD position (shared MD, cm)
+    float rt1, rt2;         // anchorRt for first two MDs (cm)
+    float drt_InSeg;        // rt2 - rt1
+    float rt_InSeg;         // sqrt((x2-x1)^2 + (y2-y1)^2)
+    float sdIn_alpha;       // dPhiChange of inner segment
+    float sdIn_alphaRHmin;  // dPhiChangeMin (for endcap path)
+    float sdIn_alphaRHmax;  // dPhiChangeMax (for endcap path)
+    // Precomputed sin/cos of alpha values for algebraic betaIn check (avoids per-candidate atan2).
+    float sin_alpha, cos_alpha;
+    float sin_alphaRHmin, cos_alphaRHmin;  // for endcap EEE path
+    float sin_alphaRHmax, cos_alphaRHmax;  // for endcap EEE path
+    short innerSubdet;                     // subdet of inner-inner module
+    short middleSubdet;                    // subdet of middle module
+  };
+
+  // Pre-loaded hit coordinates for passRZConstraint.
+  // All values in cm, passRZConstraint converts to meters internally.
+  struct T3HitCoords {
+    float x1, y1, z1, rt1;
+    float x2, y2, z2, rt2;
+    float x3, y3, z3, rt3;
+  };
+
   ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(ModulesConst modules,
                                                          MiniDoubletsConst mds,
                                                          SegmentsConst segments,
@@ -72,13 +99,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
   template <alpaka::concepts::Acc TAcc>
   ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const& acc,
                                                        ModulesConst modules,
-                                                       MiniDoubletsConst mds,
                                                        uint16_t innerInnerLowerModuleIndex,
                                                        uint16_t middleLowerModuleIndex,
                                                        uint16_t outerOuterLowerModuleIndex,
-                                                       unsigned int firstMDIndex,
-                                                       unsigned int secondMDIndex,
-                                                       unsigned int thirdMDIndex,
+                                                       T3HitCoords const& h,
                                                        float circleRadius,
                                                        float circleCenterX,
                                                        float circleCenterY,
@@ -89,26 +113,26 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     const short layer3 = modules.lstLayers()[outerOuterLowerModuleIndex];
 
     //all the values are stored in the unit of cm, in the calculation below we need to be cautious if we want to use the meter unit
-    //get r and z
-    const float r1 = mds.anchorRt()[firstMDIndex] / 100;
-    const float r2 = mds.anchorRt()[secondMDIndex] / 100;
-    const float r3 = mds.anchorRt()[thirdMDIndex] / 100;
+    //get r and z (convert from cm to m)
+    const float r1 = h.rt1 / 100;
+    const float r2 = h.rt2 / 100;
+    const float r3 = h.rt3 / 100;
 
-    const float z1 = mds.anchorZ()[firstMDIndex] / 100;
-    const float z2 = mds.anchorZ()[secondMDIndex] / 100;
-    const float z3 = mds.anchorZ()[thirdMDIndex] / 100;
+    const float z1 = h.z1 / 100;
+    const float z2 = h.z2 / 100;
+    const float z3 = h.z3 / 100;
 
     //use linear approximation for regions 9 and 20-24 because it works better (see https://github.com/SegmentLinking/cmssw/pull/92)
     float residual = alpaka::math::abs(acc, z2 - ((z3 - z1) / (r3 - r1) * (r2 - r1) + z1));
 
-    //get the x,y position of each MD
-    const float x1 = mds.anchorX()[firstMDIndex] / 100;
-    const float x2 = mds.anchorX()[secondMDIndex] / 100;
-    const float x3 = mds.anchorX()[thirdMDIndex] / 100;
+    //get the x,y position of each MD (convert from cm to m)
+    const float x1 = h.x1 / 100;
+    const float x2 = h.x2 / 100;
+    const float x3 = h.x3 / 100;
 
-    const float y1 = mds.anchorY()[firstMDIndex] / 100;
-    const float y2 = mds.anchorY()[secondMDIndex] / 100;
-    const float y3 = mds.anchorY()[thirdMDIndex] / 100;
+    const float y1 = h.y1 / 100;
+    const float y2 = h.y2 / 100;
+    const float y3 = h.y3 / 100;
 
     float cross = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1);
     charge = -1 * ((int)copysignf(1.0f, cross));
@@ -186,13 +210,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     else if (y3 > y2 && y2 > y1)
       py = alpaka::math::abs(acc, py);
 
-    float AO = alpaka::math::sqrt(
-        acc, (x_other - x_center) * (x_other - x_center) + (y_other - y_center) * (y_other - y_center));
-    float BO =
-        alpaka::math::sqrt(acc, (x_init - x_center) * (x_init - x_center) + (y_init - y_center) * (y_init - y_center));
+    // All 3 hits lie on the fitted circle, so AO = BO = R = circleRadius/100.
+    // Law of Cosines: cos(dPhi) = (2R^2 - AB^2) / (2R^2) = 1 - AB^2/(2R^2)
+    float R = circleRadius / 100;
     float AB2 = (x_other - x_init) * (x_other - x_init) + (y_other - y_init) * (y_other - y_init);
-    float dPhi = alpaka::math::acos(acc, (AO * AO + BO * BO - AB2) / (2 * AO * BO));  //Law of Cosines
-    float ds = circleRadius / 100 * dPhi;
+    float dPhi = alpaka::math::acos(acc, 1 - AB2 / (2 * R * R));
+    float ds = R * dPhi;
     float pz = dz / ds * pt;
 
     float p = alpaka::math::sqrt(acc, px * px + py * py + pz * pz);
@@ -216,8 +239,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
       float A = paraB * paraB + paraC * paraC;
       float B = 2 * paraA * paraB;
       float C = paraA * paraA - paraC * paraC;
-      float sol1 = (-B + alpaka::math::sqrt(acc, B * B - 4 * A * C)) / (2 * A);
-      float sol2 = (-B - alpaka::math::sqrt(acc, B * B - 4 * A * C)) / (2 * A);
+      float disc = alpaka::math::sqrt(acc, B * B - 4 * A * C);
+      float sol1 = (-B + disc) / (2 * A);
+      float sol2 = (-B - disc) / (2 * A);
       float solz1 = alpaka::math::asin(acc, sol1) / rou * pz / p + z_init;
       float solz2 = alpaka::math::asin(acc, sol2) / rou * pz / p + z_init;
       float diffz1 = (solz1 - z_target) * 100;
@@ -228,8 +252,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                      : ((alpaka::math::abs(acc, diffz1) < alpaka::math::abs(acc, diffz2)) ? diffz1 : diffz2);
     } else {  // for endcap
       float s = (z_target - z_init) * p / pz;
-      float x = x_init + px / a * alpaka::math::sin(acc, rou * s) - py / a * (1 - alpaka::math::cos(acc, rou * s));
-      float y = y_init + py / a * alpaka::math::sin(acc, rou * s) + px / a * (1 - alpaka::math::cos(acc, rou * s));
+      float sinRS = alpaka::math::sin(acc, rou * s);
+      float cosRS = alpaka::math::cos(acc, rou * s);
+      float x = x_init + px / a * sinRS - py / a * (1 - cosRS);
+      float y = y_init + py / a * sinRS + px / a * (1 - cosRS);
       residual = (r_target - alpaka::math::sqrt(acc, x * x + y * y)) * 100;
     }
 
@@ -326,66 +352,45 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
   }
 
   template <alpaka::concepts::Acc TAcc>
-  ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const& acc,
-                                                             ModulesConst modules,
-                                                             MiniDoubletsConst mds,
-                                                             SegmentsConst segments,
-                                                             unsigned int firstMDIndex,
-                                                             unsigned int secondMDIndex,
-                                                             unsigned int thirdMDIndex,
-                                                             uint16_t innerInnerLowerModuleIndex,
-                                                             uint16_t middleLowerModuleIndex,
-                                                             uint16_t outerOuterLowerModuleIndex,
-                                                             unsigned int innerSegmentIndex,
-                                                             const float ptCut) {
-    const float x1 = mds.anchorX()[firstMDIndex];
-    const float x2 = mds.anchorX()[secondMDIndex];
-    const float x3 = mds.anchorX()[thirdMDIndex];
-    const float y1 = mds.anchorY()[firstMDIndex];
-    const float y2 = mds.anchorY()[secondMDIndex];
-    const float y3 = mds.anchorY()[thirdMDIndex];
-
-    const short innerInnerLowerModuleSubdet = modules.subdets()[innerInnerLowerModuleIndex];
-    const short middleLowerModuleSubdet = modules.subdets()[middleLowerModuleIndex];
-    const short outerOuterLowerModuleSubdet = modules.subdets()[outerOuterLowerModuleIndex];
-
-    const float rt_InLo = mds.anchorRt()[firstMDIndex];
-    const float rt_InOut = mds.anchorRt()[secondMDIndex];
-    const float sdIn_alpha = __H2F(segments.dPhiChanges()[innerSegmentIndex]);
-
-    const float drt_InSeg = rt_InOut - rt_InLo;
-    const float drt_tl_axis = alpaka::math::sqrt(acc, (x3 - x1) * (x3 - x1) + (y3 - y1) * (y3 - y1));
-
-    //innerOuterAnchor - innerInnerAnchor
-    const float rt_InSeg = alpaka::math::sqrt(acc, (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1));
-
-    const float betaIn =
-        sdIn_alpha - cms::alpakatools::reducePhiRange(
-                         acc, cms::alpakatools::phi(acc, x3 - x1, y3 - y1) - mds.anchorPhi()[firstMDIndex]);
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(
+      TAcc const& acc, T3InnerHitData const& inner, float x3, float y3, short outerSubdet, const float ptCut) {
+    const float dx = x3 - inner.x1;
+    const float dy = y3 - inner.y1;
+    const float drt_tl_axis = alpaka::math::sqrt(acc, dx * dx + dy * dy);
+
+    // For T3, the argument (-rt_InSeg + drt_tl_axis) * k2Rinv1GeVf / ptCut is always < 0.02,
+    // so asin(x) ~ x to better than 0.01% accuracy.
     const float betaInCut =
-        alpaka::math::asin(acc, alpaka::math::min(acc, (-rt_InSeg + drt_tl_axis) * k2Rinv1GeVf / ptCut, kSinAlphaMax)) +
-        (0.02f / drt_InSeg);
-
-    float betaInRHmin = betaIn;
-
-    if (innerInnerLowerModuleSubdet == Endcap and middleLowerModuleSubdet == Endcap and
-        outerOuterLowerModuleSubdet == Endcap) {
-      float sdIn_alphaRHmin = __H2F(segments.dPhiChangeMins()[innerSegmentIndex]);
-      float sdIn_alphaRHmax = __H2F(segments.dPhiChangeMaxs()[innerSegmentIndex]);
-
-      betaInRHmin = betaIn + sdIn_alphaRHmin - sdIn_alpha;
-      float betaInRHmax = betaIn + sdIn_alphaRHmax - sdIn_alpha;
-      float swapTemp;
-
-      if (alpaka::math::abs(acc, betaInRHmin) > alpaka::math::abs(acc, betaInRHmax)) {
-        swapTemp = betaInRHmin;
-        betaInRHmin = betaInRHmax;
-        betaInRHmax = swapTemp;
+        alpaka::math::min(acc, (-inner.rt_InSeg + drt_tl_axis) * k2Rinv1GeVf / ptCut, kSinAlphaMax) +
+        (0.02f / inner.drt_InSeg);
+
+    // Algebraic betaIn check: exact replacement of |betaIn| < betaInCut without atan2.
+    // betaIn = alpha - deltaPhi(dir_13, hit1), where deltaPhi = atan2(cross, dot).
+    // |alpha - atan2(cross, dot)| < cut  <==>  sin^2(alpha - atan2) < cut^2 when cos > 0.
+    // sin(alpha - atan2) = (sin_alpha * dot - cos_alpha * cross) / sqrt(cross^2 + dot^2).
+    // cross = x1*(y3-y1) - y1*(x3-x1) = x1*y3 - y1*x3 (angle from hit1 to dir_13).
+    const float crossPC = inner.x1 * y3 - inner.y1 * x3;
+    const float dotPC = x3 * inner.x1 + y3 * inner.y1 - inner.rt1 * inner.rt1;
+    const float r2 = crossPC * crossPC + dotPC * dotPC;
+    const float betaInCutSq = betaInCut * betaInCut;
+
+    if (inner.innerSubdet == Endcap and inner.middleSubdet == Endcap and outerSubdet == Endcap) {
+      // EEE: check both alpha variants, pass if the one with smaller |betaIn| is within cut
+      const float sinValMin = inner.sin_alphaRHmin * dotPC - inner.cos_alphaRHmin * crossPC;
+      const float sinValMax = inner.sin_alphaRHmax * dotPC - inner.cos_alphaRHmax * crossPC;
+      const float sqMin = sinValMin * sinValMin;
+      const float sqMax = sinValMax * sinValMax;
+
+      if (sqMin <= sqMax) {
+        return sqMin < betaInCutSq * r2 and (inner.cos_alphaRHmin * dotPC + inner.sin_alphaRHmin * crossPC > 0.f);
+      } else {
+        return sqMax < betaInCutSq * r2 and (inner.cos_alphaRHmax * dotPC + inner.sin_alphaRHmax * crossPC > 0.f);
       }
     }
 
     //Beta cut
-    return alpaka::math::abs(acc, betaInRHmin) < betaInCut;
+    const float sinVal = inner.sin_alpha * dotPC - inner.cos_alpha * crossPC;
+    return sinVal * sinVal < betaInCutSq * r2 and (inner.cos_alpha * dotPC + inner.sin_alpha * crossPC > 0.f);
   }
 
   template <alpaka::concepts::Acc TAcc>
@@ -410,46 +415,50 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     const unsigned int secondMDIndex = segments.mdIndices()[outerSegmentIndex][0];
     const unsigned int thirdMDIndex = segments.mdIndices()[outerSegmentIndex][1];
 
-    const float x1 = mds.anchorX()[firstMDIndex];
-    const float x2 = mds.anchorX()[secondMDIndex];
-    const float x3 = mds.anchorX()[thirdMDIndex];
-    const float y1 = mds.anchorY()[firstMDIndex];
-    const float y2 = mds.anchorY()[secondMDIndex];
-    const float y3 = mds.anchorY()[thirdMDIndex];
+    // Load ALL hit coordinates ONCE - reused by circle fit, passRZConstraint, and betaIn computation
+    T3HitCoords h;
+    h.x1 = mds.anchorX()[firstMDIndex];
+    h.y1 = mds.anchorY()[firstMDIndex];
+    h.z1 = mds.anchorZ()[firstMDIndex];
+    h.rt1 = mds.anchorRt()[firstMDIndex];
+    h.x2 = mds.anchorX()[secondMDIndex];
+    h.y2 = mds.anchorY()[secondMDIndex];
+    h.z2 = mds.anchorZ()[secondMDIndex];
+    h.rt2 = mds.anchorRt()[secondMDIndex];
+    h.x3 = mds.anchorX()[thirdMDIndex];
+    h.y3 = mds.anchorY()[thirdMDIndex];
+    h.z3 = mds.anchorZ()[thirdMDIndex];
+    h.rt3 = mds.anchorRt()[thirdMDIndex];
 
     std::tie(circleRadius, circleCenterX, circleCenterY) =
-        computeRadiusFromThreeAnchorHits(acc, x1, y1, x2, y2, x3, y3);
+        computeRadiusFromThreeAnchorHits(acc, h.x1, h.y1, h.x2, h.y2, h.x3, h.y3);
 
     if (not passRZConstraint(acc,
                              modules,
-                             mds,
                              innerInnerLowerModuleIndex,
                              middleLowerModuleIndex,
                              outerOuterLowerModuleIndex,
-                             firstMDIndex,
-                             secondMDIndex,
-                             thirdMDIndex,
+                             h,
                              circleRadius,
                              circleCenterX,
                              circleCenterY,
                              charge))
       return false;
 
-    const float rt_InLo = mds.anchorRt()[firstMDIndex];
-    const float rt_InOut = mds.anchorRt()[secondMDIndex];
+    // Reuse pre-loaded coordinates for betaIn computation (no SoA reloads)
     const float sdIn_alpha = __H2F(segments.dPhiChanges()[innerSegmentIndex]);
 
-    const float drt_InSeg = rt_InOut - rt_InLo;
-    const float drt_tl_axis = alpaka::math::sqrt(acc, (x3 - x1) * (x3 - x1) + (y3 - y1) * (y3 - y1));
+    const float drt_InSeg = h.rt2 - h.rt1;
+    const float drt_tl_axis = alpaka::math::sqrt(acc, (h.x3 - h.x1) * (h.x3 - h.x1) + (h.y3 - h.y1) * (h.y3 - h.y1));
 
     //innerOuterAnchor - innerInnerAnchor
-    const float rt_InSeg = alpaka::math::sqrt(acc, (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1));
+    const float rt_InSeg = alpaka::math::sqrt(acc, (h.x2 - h.x1) * (h.x2 - h.x1) + (h.y2 - h.y1) * (h.y2 - h.y1));
 
-    betaIn = sdIn_alpha - cms::alpakatools::reducePhiRange(
-                              acc, cms::alpakatools::phi(acc, x3 - x1, y3 - y1) - mds.anchorPhi()[firstMDIndex]);
+    betaIn =
+        sdIn_alpha - cms::alpakatools::reducePhiRange(
+                         acc, cms::alpakatools::phi(acc, h.x3 - h.x1, h.y3 - h.y1) - mds.anchorPhi()[firstMDIndex]);
     betaInCut =
-        alpaka::math::asin(acc, alpaka::math::min(acc, (-rt_InSeg + drt_tl_axis) * k2Rinv1GeVf / ptCut, kSinAlphaMax)) +
-        (0.02f / drt_InSeg);
+        alpaka::math::min(acc, (-rt_InSeg + drt_tl_axis) * k2Rinv1GeVf / ptCut, kSinAlphaMax) + (0.02f / drt_InSeg);
 
     bool inference =
         lst::t3dnn::runInference(acc, mds, firstMDIndex, secondMDIndex, thirdMDIndex, circleRadius, betaIn, t3Scores);
@@ -514,6 +523,32 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
           uint16_t middleLowerModuleIndex = segments.outerLowerModuleIndices()[innerSegmentIndex];
           int middleMDIndiceInner = segments.mdIndices()[innerSegmentIndex][1];
+          unsigned int firstMDIndex = segments.mdIndices()[innerSegmentIndex][0];
+
+          // Pre-load inner-segment-constant data (reused across all outer segments)
+          T3InnerHitData inner;
+          inner.x1 = mds.anchorX()[firstMDIndex];
+          inner.y1 = mds.anchorY()[firstMDIndex];
+          inner.x2 = mds.anchorX()[middleMDIndiceInner];
+          inner.y2 = mds.anchorY()[middleMDIndiceInner];
+          inner.rt1 = mds.anchorRt()[firstMDIndex];
+          inner.rt2 = mds.anchorRt()[middleMDIndiceInner];
+          inner.drt_InSeg = inner.rt2 - inner.rt1;
+          inner.rt_InSeg = alpaka::math::sqrt(
+              acc, (inner.x2 - inner.x1) * (inner.x2 - inner.x1) + (inner.y2 - inner.y1) * (inner.y2 - inner.y1));
+          inner.sdIn_alpha = __H2F(segments.dPhiChanges()[innerSegmentIndex]);
+          inner.sin_alpha = alpaka::math::sin(acc, inner.sdIn_alpha);
+          inner.cos_alpha = alpaka::math::cos(acc, inner.sdIn_alpha);
+          inner.innerSubdet = modules.subdets()[innerInnerLowerModuleIndex];
+          inner.middleSubdet = modules.subdets()[middleLowerModuleIndex];
+          inner.sdIn_alphaRHmin = __H2F(segments.dPhiChangeMins()[innerSegmentIndex]);
+          inner.sdIn_alphaRHmax = __H2F(segments.dPhiChangeMaxs()[innerSegmentIndex]);
+          if (inner.innerSubdet == Endcap and inner.middleSubdet == Endcap) {
+            inner.sin_alphaRHmin = alpaka::math::sin(acc, inner.sdIn_alphaRHmin);
+            inner.cos_alphaRHmin = alpaka::math::cos(acc, inner.sdIn_alphaRHmin);
+            inner.sin_alphaRHmax = alpaka::math::sin(acc, inner.sdIn_alphaRHmax);
+            inner.cos_alphaRHmax = alpaka::math::cos(acc, inner.sdIn_alphaRHmax);
+          }
 
           unsigned int nOuterSegments = segmentsOccupancy.nSegments()[middleLowerModuleIndex];
           for (unsigned int outerSegmentArrayIndex : cms::alpakatools::uniform_elements_x(acc, nOuterSegments)) {
@@ -524,22 +559,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
               continue;
 
             uint16_t outerOuterLowerModuleIndex = segments.outerLowerModuleIndices()[outerSegmentIndex];
-            unsigned int firstMDIndex = segments.mdIndices()[innerSegmentIndex][0];
-            unsigned int secondMDIndex = segments.mdIndices()[outerSegmentIndex][0];
             unsigned int thirdMDIndex = segments.mdIndices()[outerSegmentIndex][1];
+            float x3 = mds.anchorX()[thirdMDIndex];
+            float y3 = mds.anchorY()[thirdMDIndex];
+            short outerSubdet = modules.subdets()[outerOuterLowerModuleIndex];
 
-            if (not passPointingConstraint(acc,
-                                           modules,
-                                           mds,
-                                           segments,
-                                           firstMDIndex,
-                                           secondMDIndex,
-                                           thirdMDIndex,
-                                           innerInnerLowerModuleIndex,
-                                           middleLowerModuleIndex,
-                                           outerOuterLowerModuleIndex,
-                                           innerSegmentIndex,
-                                           ptCut))
+            if (not passPointingConstraint(acc, inner, x3, y3, outerSubdet, ptCut))
               continue;
 
             // Match inner Sg and Outer Sg
@@ -672,29 +697,46 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
           if (nOuterSegments == 0)
             continue;
 
+          const unsigned int firstMDIndex = mdIndices[innerSegmentIndex][0];
+
+          // Pre-load inner-segment-constant data (reused across all outer segments)
+          T3InnerHitData inner;
+          inner.x1 = mds.anchorX()[firstMDIndex];
+          inner.y1 = mds.anchorY()[firstMDIndex];
+          inner.x2 = mds.anchorX()[mdShared];
+          inner.y2 = mds.anchorY()[mdShared];
+          inner.rt1 = mds.anchorRt()[firstMDIndex];
+          inner.rt2 = mds.anchorRt()[mdShared];
+          inner.drt_InSeg = inner.rt2 - inner.rt1;
+          inner.rt_InSeg = alpaka::math::sqrt(
+              acc, (inner.x2 - inner.x1) * (inner.x2 - inner.x1) + (inner.y2 - inner.y1) * (inner.y2 - inner.y1));
+          inner.sdIn_alpha = __H2F(segments.dPhiChanges()[innerSegmentIndex]);
+          inner.sin_alpha = alpaka::math::sin(acc, inner.sdIn_alpha);
+          inner.cos_alpha = alpaka::math::cos(acc, inner.sdIn_alpha);
+          inner.innerSubdet = modules.subdets()[innerLowerModuleArrayIdx];
+          inner.middleSubdet = modules.subdets()[middleLowerModuleIndex];
+          inner.sdIn_alphaRHmin = __H2F(segments.dPhiChangeMins()[innerSegmentIndex]);
+          inner.sdIn_alphaRHmax = __H2F(segments.dPhiChangeMaxs()[innerSegmentIndex]);
+          if (inner.innerSubdet == Endcap and inner.middleSubdet == Endcap) {
+            inner.sin_alphaRHmin = alpaka::math::sin(acc, inner.sdIn_alphaRHmin);
+            inner.cos_alphaRHmin = alpaka::math::cos(acc, inner.sdIn_alphaRHmin);
+            inner.sin_alphaRHmax = alpaka::math::sin(acc, inner.sdIn_alphaRHmax);
+            inner.cos_alphaRHmax = alpaka::math::cos(acc, inner.sdIn_alphaRHmax);
+          }
+
           for (unsigned int outerSegmentArrayIndex : cms::alpakatools::uniform_elements_x(acc, nOuterSegments)) {
             const unsigned int outerSegmentIndex = segmentRanges[middleLowerModuleIndex][0] + outerSegmentArrayIndex;
 
             if (mdIndices[outerSegmentIndex][0] != mdShared)
               continue;
 
-            unsigned int firstMDIndex = mdIndices[innerSegmentIndex][0];
-            unsigned int secondMDIndex = mdShared;
             unsigned int thirdMDIndex = mdIndices[outerSegmentIndex][1];
             uint16_t outerOuterLowerModuleIndex = outerLowerModuleIndices[outerSegmentIndex];
+            float x3 = mds.anchorX()[thirdMDIndex];
+            float y3 = mds.anchorY()[thirdMDIndex];
+            short outerSubdet = modules.subdets()[outerOuterLowerModuleIndex];
 
-            if (not passPointingConstraint(acc,
-                                           modules,
-                                           mds,
-                                           segments,
-                                           firstMDIndex,
-                                           secondMDIndex,
-                                           thirdMDIndex,
-                                           innerLowerModuleArrayIdx,
-                                           middleLowerModuleIndex,
-                                           outerOuterLowerModuleIndex,
-                                           innerSegmentIndex,
-                                           ptCut))
+            if (not passPointingConstraint(acc, inner, x3, y3, outerSubdet, ptCut))
               continue;
 
             alpaka::atomicAdd(acc, &segments.connectedMax()[innerSegmentIndex], 1u, alpaka::hierarchy::Threads{});