Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion RecoTracker/LST/plugins/alpaka/LSTProducer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
clustSizeCut_(static_cast<uint16_t>(config.getParameter<uint32_t>("clustSizeCut"))),
nopLSDupClean_(config.getParameter<bool>("nopLSDupClean")),
tcpLSTriplets_(config.getParameter<bool>("tcpLSTriplets")),
reduceMem_(config.getParameter<bool>("reduceMem")),
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider something like fullPrecomputeMemSlots or a similarly expressive name (like reduceMemByFullPrecompute). With just "reduceMem", it is unclear why it would ever be "false".
Adding a comment in fillDescriptions can be useful as well.

lstOutputToken_{produces()} {}

void produce(edm::StreamID sid, device::Event& iEvent, const device::EventSetup& iSetup) const override {
Expand All @@ -47,7 +48,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
&lstESDeviceData,
&lstInputDC,
nopLSDupClean_,
tcpLSTriplets_);
tcpLSTriplets_,
reduceMem_);

// Output
auto lstTrackCandidates = lst.getTrackCandidates();
Expand All @@ -63,6 +65,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
desc.add<std::string>("ptCutLabel", "0.8");
desc.add<bool>("nopLSDupClean", false);
desc.add<bool>("tcpLSTriplets", false);
desc.add<bool>("reduceMem", false);
descriptions.addWithDefaultLabel(desc);
}

Expand All @@ -74,6 +77,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
const uint16_t clustSizeCut_;
const bool nopLSDupClean_;
const bool tcpLSTriplets_;
const bool reduceMem_;
const device::EDPutToken<lst::TrackCandidatesBaseDeviceCollection> lstOutputToken_;
};

Expand Down
3 changes: 2 additions & 1 deletion RecoTracker/LSTCore/interface/alpaka/LST.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
LSTESData<Device> const* deviceESData,
LSTInputDeviceCollection const* lstInputDC,
bool no_pls_dupclean,
bool tc_pls_triplets);
bool tc_pls_triplets,
bool reduceMem = false);
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
bool reduceMem = false);
bool reduceMem);

better be explicit

std::unique_ptr<TrackCandidatesBaseDeviceCollection> getTrackCandidates() {
return std::move(trackCandidatesBaseDC_);
}
Expand Down
5 changes: 3 additions & 2 deletions RecoTracker/LSTCore/src/alpaka/LST.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ void LST::run(Queue& queue,
LSTESData<Device> const* deviceESData,
LSTInputDeviceCollection const* lstInputDC,
bool no_pls_dupclean,
bool tc_pls_triplets) {
auto event = LSTEvent(verbose, ptCut, clustSizeCut, queue, deviceESData);
bool tc_pls_triplets,
bool reduceMem) {
auto event = LSTEvent(verbose, ptCut, clustSizeCut, queue, deviceESData, reduceMem);

event.addInputToEvent(lstInputDC);
event.addHitToEvent();
Expand Down
268 changes: 190 additions & 78 deletions RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc

Large diffs are not rendered by default.

10 changes: 8 additions & 2 deletions RecoTracker/LSTCore/src/alpaka/LSTEvent.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
Queue& queue_;
const float ptCut_;
const uint16_t clustSizeCut_;
const bool reduceMem_;

std::array<unsigned int, 6> n_minidoublets_by_layer_barrel_{};
std::array<unsigned int, 5> n_minidoublets_by_layer_endcap_{};
Expand Down Expand Up @@ -101,11 +102,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {

public:
// Constructor used for CMSSW integration. Uses an external queue.
LSTEvent(
bool verbose, const float ptCut, const uint16_t clustSizeCut, Queue& q, const LSTESData<Device>* deviceESData)
LSTEvent(bool verbose,
const float ptCut,
const uint16_t clustSizeCut,
Queue& q,
const LSTESData<Device>* deviceESData,
bool reduceMem = false)
: queue_(q),
ptCut_(ptCut),
clustSizeCut_(clustSizeCut),
reduceMem_(reduceMem),
nModules_(deviceESData->nModules),
nLowerModules_(deviceESData->nLowerModules),
nPixels_(deviceESData->nPixels),
Expand Down
251 changes: 179 additions & 72 deletions RecoTracker/LSTCore/src/alpaka/MiniDoublet.h

Large diffs are not rendered by default.

168 changes: 164 additions & 4 deletions RecoTracker/LSTCore/src/alpaka/Quadruplet.h
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,165 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
return true;
};

// Quadruplet (T4) creation kernel introduced together with the reduceMem option.
//
// For each eligible T4 module, every inner triplet stored under that module is paired with every
// outer triplet stored under the inner triplet's second lower module. Pairs are kept only when
// neither triplet is already part of a pT5/T5/pT3 and the inner triplet's second segment is the
// outer triplet's first segment. Surviving pairs are evaluated with runQuadrupletDefaultAlgo() and,
// if accepted and within the per-module capacity, written out via addQuadrupletToMemory().
//
// NOTE(review): despite the "Dense" name, this kernel applies no density (kNTripletThreshold)
// selection; every connected pair is processed.
// NOTE(review): presumably only launched when reduceMem is enabled - the launch site is not
// visible here, confirm against LSTEvent.dev.cc.
// NOTE(review): the atomics below use alpaka::hierarchy::Threads. Another kernel in this file
// documents (and asserts) that this requires one block in the x and y dimensions; no such assert
// is present here - verify the launch configuration.
struct CreateQuadrupletsDense {
ALPAKA_FN_ACC void operator()(Acc3D const& acc,
ModulesConst modules,
MiniDoubletsConst mds,
SegmentsConst segments,
TripletsConst triplets,
TripletsOccupancyConst tripletsOccupancy,
Quadruplets quadruplets,
QuadrupletsOccupancy quadrupletsOccupancy,
ObjectRangesConst ranges,
uint16_t nEligibleT4Modules,
const float ptCut) const {
// Short aliases for the columns that are indexed repeatedly in the loops below.
const auto& mdIndices = segments.mdIndices();
const auto& segIdx = triplets.segmentIndices();
const auto& lmIdx = triplets.lowerModuleIndices();
const auto& tripIdx = ranges.tripletModuleIndices();

// z dimension: loop over the eligible T4 modules.
for (int iter : cms::alpakatools::uniform_groups_z(acc, nEligibleT4Modules)) {
const uint16_t lowerModule1 = ranges.indicesOfEligibleT4Modules()[iter];

// layer2_adjustment / md_adjustment select which mini-doublet of the inner triplet provides the
// eta/phi stored with the quadruplet (see layer3MDIndex below). Modules in layers 1 and 2 are
// processed only when they are in the Endcap subdetector; otherwise they are skipped.
short layer2_adjustment, md_adjustment;
int layer = modules.layers()[lowerModule1];
if (layer == 1) {
if (modules.subdets()[lowerModule1] != Endcap)
continue;
layer2_adjustment = 1;
md_adjustment = 1;
} else if (layer == 2) {
if (modules.subdets()[lowerModule1] != Endcap)
continue;
layer2_adjustment = 1;
md_adjustment = 0;
} else {
layer2_adjustment = 0;
md_adjustment = 0;
}
const unsigned int nInnerTriplets = tripletsOccupancy.nTriplets()[lowerModule1];

// y dimension: loop over the inner triplets stored under lowerModule1.
for (unsigned int innerTripletArrayIndex : cms::alpakatools::uniform_elements_y(acc, nInnerTriplets)) {
const unsigned int innerTripletIndex = tripIdx[lowerModule1] + innerTripletArrayIndex;
// Skip inner triplets that are already flagged as part of a pT5, T5 or pT3.
if (triplets.partOfPT5()[innerTripletIndex])
continue;
if (triplets.partOfT5()[innerTripletIndex])
continue;
if (triplets.partOfPT3()[innerTripletIndex])
continue;
const uint16_t lowerModule2 = lmIdx[innerTripletIndex][1];
const unsigned int nOuterTriplets = tripletsOccupancy.nTriplets()[lowerModule2];
// x dimension: loop over the outer triplets stored under lowerModule2.
for (unsigned int outerTripletArrayIndex : cms::alpakatools::uniform_elements_x(acc, nOuterTriplets)) {
unsigned int outerTripletIndex = tripIdx[lowerModule2] + outerTripletArrayIndex;
// Same exclusion as for the inner triplet.
if (triplets.partOfPT5()[outerTripletIndex])
continue;
if (triplets.partOfT5()[outerTripletIndex])
continue;
if (triplets.partOfPT3()[outerTripletIndex])
continue;

// The two triplets are connected only if the inner triplet's second segment is the
// outer triplet's first segment.
const unsigned int innerT3LS2Index = segIdx[innerTripletIndex][1];
const unsigned int outerT3LS1Index = segIdx[outerTripletIndex][0];

if (innerT3LS2Index != outerT3LS1Index)
continue;

// No density (kNTripletThreshold) filter is applied here: every connected pair reaching this
// point is evaluated with the full algorithm.
// NOTE(review): an earlier comment stated that only dense pairs are processed when this kernel
// runs alongside CreateQuadruplets; nothing in this kernel implements that - confirm the intent.

const uint16_t lowerModule3 = lmIdx[outerTripletIndex][1];
const uint16_t lowerModule4 = lmIdx[outerTripletIndex][2];

float innerRadius = triplets.radius()[innerTripletIndex];
float outerRadius = triplets.radius()[outerTripletIndex];
// Outputs passed by name to runQuadrupletDefaultAlgo() and forwarded to addQuadrupletToMemory().
float rzChiSquared, dBeta, nonAnchorChiSquared, regressionCenterX, regressionCenterY, regressionRadius,
nonAnchorRegressionRadius, chiSquared, promptScore, displacedScore, fakeScore;

// pt value stored with the quadruplet: sum of the two triplet radii times k2Rinv1GeVf.
float pt = (innerRadius + outerRadius) * k2Rinv1GeVf;

bool success = runQuadrupletDefaultAlgo(acc,
modules,
mds,
segments,
triplets,
lowerModule1,
lowerModule2,
lowerModule3,
lowerModule4,
innerTripletIndex,
outerTripletIndex,
regressionCenterX,
regressionCenterY,
regressionRadius,
nonAnchorRegressionRadius,
chiSquared,
ptCut,
rzChiSquared,
nonAnchorChiSquared,
dBeta,
promptScore,
displacedScore,
fakeScore);
if (success) {
// totOccupancyQuadruplets counts every accepted candidate for this module, including those
// that exceed the module's capacity and are therefore dropped.
int totOccupancyQuadruplets = alpaka::atomicAdd(
acc, &quadrupletsOccupancy.totOccupancyQuadruplets()[lowerModule1], 1u, alpaka::hierarchy::Threads{});
if (totOccupancyQuadruplets >= ranges.quadrupletModuleOccupancy()[lowerModule1]) {
// Capacity for this module exhausted: the candidate is dropped (optionally with a warning).
#ifdef WARNINGS
printf("Quadruplet excess alert! Module index = %d, Occupancy = %d\n",
lowerModule1,
totOccupancyQuadruplets);
#endif
} else {
// Reserve the next slot of this module. Note that nQuadruplets is incremented before the
// -1 check below, i.e. also when the module has no memory range assigned.
int quadrupletModuleIndex = alpaka::atomicAdd(
acc, &quadrupletsOccupancy.nQuadruplets()[lowerModule1], 1u, alpaka::hierarchy::Threads{});
if (ranges.quadrupletModuleIndices()[lowerModule1] == -1) {
// A start index of -1 is treated as "no memory for this module": nothing is stored.
#ifdef WARNINGS
printf("Quadruplets : no memory for module at module index = %d\n", lowerModule1);
#endif
} else {
// Global slot = module start offset + per-module slot reserved above.
unsigned int quadrupletIndex = ranges.quadrupletModuleIndices()[lowerModule1] + quadrupletModuleIndex;
// Mini-doublet whose anchor eta/phi are stored with the quadruplet; which segment of the
// inner triplet and which MD inside it are chosen by the layer-dependent adjustments above.
const unsigned int layer3MDIndex =
mdIndices[segIdx[innerTripletIndex][md_adjustment]][layer2_adjustment];
float phi = mds.anchorPhi()[layer3MDIndex];
float eta = mds.anchorEta()[layer3MDIndex];

// Score stored with the quadruplet: sum of the two chi-squared values returned by the algorithm.
float scores = chiSquared + nonAnchorChiSquared;
addQuadrupletToMemory(triplets,
quadruplets,
innerTripletIndex,
outerTripletIndex,
lowerModule1,
lowerModule2,
lowerModule3,
lowerModule4,
innerRadius,
outerRadius,
pt,
eta,
phi,
scores,
layer,
quadrupletIndex,
rzChiSquared,
dBeta,
promptScore,
displacedScore,
fakeScore,
regressionCenterX,
regressionCenterY,
regressionRadius,
nonAnchorRegressionRadius);
}
}
}
}
}
}
}
};

struct CreateQuadruplets {
ALPAKA_FN_ACC void operator()(Acc3D const& acc,
ModulesConst modules,
Expand Down Expand Up @@ -610,8 +769,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
if (innerT3LS2Index != outerT3LS1Index)
continue;

// If densely connected, do not attempt parallel processing to avoid truncation
if (nInnerTriplets >= kNTripletThreshold || nOuterTriplets >= kNTripletThreshold) {
// Dense pairs: handle inline
const uint16_t lowerModule3 = lmIdx[outerTripletIndex][1];
const uint16_t lowerModule4 = lmIdx[outerTripletIndex][2];

Expand Down Expand Up @@ -843,7 +1002,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
Triplets triplets,
TripletsOccupancyConst tripletsOcc,
ObjectRangesConst ranges,
const float ptCut) const {
const float ptCut,
const bool reduceMem = false) const {
// The atomicAdd below with hierarchy::Threads{} requires one block in x, y dimensions.
ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[1] == 1) &&
(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[2] == 1));
Expand Down Expand Up @@ -879,8 +1039,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
const unsigned int thirdMDOuter = mdIndices[thirdSegIdx][1];

if ((secondMDInner == thirdMDInner) && (secondMDOuter == thirdMDOuter)) {
// Will only perform runQuadrupletDefaultAlgorithm() checks if densely connected
if (nInnerTriplets < kNTripletThreshold && nOuterTriplets < kNTripletThreshold) {
// When reduceMem or densely connected: run full algo for exact count
if (!reduceMem && nInnerTriplets < kNTripletThreshold && nOuterTriplets < kNTripletThreshold) {
alpaka::atomicAdd(acc, &triplets.connectedLSMax()[innerTripletIndex], 1u, alpaka::hierarchy::Threads{});
} else {
const uint16_t lowerModule3 = lmIdx[outerTripletIndex][1];
Expand Down
Loading