diff --git a/Common/Utils/include/CommonUtils/IRFrameSelector.h b/Common/Utils/include/CommonUtils/IRFrameSelector.h index 6312ae8314c3a..a4365030b6a12 100644 --- a/Common/Utils/include/CommonUtils/IRFrameSelector.h +++ b/Common/Utils/include/CommonUtils/IRFrameSelector.h @@ -46,6 +46,8 @@ class IRFrameSelector auto getIRFrames() const { return mFrames; } bool isSet() const { return mIsSet; } + void setOwnList(const std::vector& lst, bool toBeSorted); + private: gsl::span mFrames{}; // externally provided span of IRFrames, must be sorted in IRFrame.getMin() o2::dataformats::IRFrame mLastIRFrameChecked{}; // last frame which was checked diff --git a/Common/Utils/src/IRFrameSelector.cxx b/Common/Utils/src/IRFrameSelector.cxx index 8122484659f45..abc0ee1ee6ce3 100644 --- a/Common/Utils/src/IRFrameSelector.cxx +++ b/Common/Utils/src/IRFrameSelector.cxx @@ -167,6 +167,16 @@ size_t IRFrameSelector::loadIRFrames(const std::string& fname) return mOwnList.size(); } +void IRFrameSelector::setOwnList(const std::vector& lst, bool toBeSorted) +{ + clear(); + mOwnList.insert(mOwnList.end(), lst.begin(), lst.end()); + if (toBeSorted) { + std::sort(mOwnList.begin(), mOwnList.end(), [](const auto& a, const auto& b) { return a.getMin() < b.getMin(); }); + } + setSelectedIRFrames(mOwnList, 0, 0, 0, false); +} + void IRFrameSelector::print(bool lst) const { LOGP(info, "Last query stopped at entry {} for IRFrame {}:{}", mLastBoundID, @@ -183,6 +193,8 @@ void IRFrameSelector::clear() { mIsSet = false; mOwnList.clear(); + mLastIRFrameChecked.getMin().clear(); // invalidate + mLastBoundID = -1; mFrames = {}; } diff --git a/Detectors/AOD/include/AODProducerWorkflow/AODProducerHelpers.h b/Detectors/AOD/include/AODProducerWorkflow/AODProducerHelpers.h index 9ef05096b2fd2..5351504443269 100644 --- a/Detectors/AOD/include/AODProducerWorkflow/AODProducerHelpers.h +++ b/Detectors/AOD/include/AODProducerWorkflow/AODProducerHelpers.h @@ -18,8 +18,6 @@ #include #include #include -#include -#include #include namespace o2::aodhelpers @@ -55,7 +53,7 @@ auto createTableCursor(framework::ProcessingContext& pc) framework::Produces c; c.resetCursor(pc.outputs() .make(framework::OutputForTable::ref())); - c.setLabel(o2::aod::MetadataTrait::metadata::tableLabel()); + c.setLabel(aod::label()); return c; } } // namespace o2::aodhelpers diff --git a/Detectors/AOD/include/AODProducerWorkflow/AODProducerWorkflowSpec.h b/Detectors/AOD/include/AODProducerWorkflow/AODProducerWorkflowSpec.h index d9481917f9a05..eaaf2d9eaedd9 100644 --- a/Detectors/AOD/include/AODProducerWorkflow/AODProducerWorkflowSpec.h +++ b/Detectors/AOD/include/AODProducerWorkflow/AODProducerWorkflowSpec.h @@ -21,7 +21,6 @@ #include "DataFormatsTRD/TrackTRD.h" #include "DetectorsBase/GRPGeomHelper.h" #include "DetectorsBase/Propagator.h" -#include "Framework/AnalysisHelpers.h" #include "Framework/DataProcessorSpec.h" #include "Framework/Task.h" #include "ReconstructionDataFormats/GlobalTrackID.h" @@ -36,7 +35,6 @@ #include #include #include -#include #include #include using namespace o2::framework; diff --git a/Detectors/AOD/src/AODProducerWorkflowSpec.cxx b/Detectors/AOD/src/AODProducerWorkflowSpec.cxx index 8a2443b57c7ff..8ee456634c1e1 100644 --- a/Detectors/AOD/src/AODProducerWorkflowSpec.cxx +++ b/Detectors/AOD/src/AODProducerWorkflowSpec.cxx @@ -358,47 +358,30 @@ void AODProducerWorkflowDPL::addToTracksExtraTable(TracksExtraCursorType& tracks template void AODProducerWorkflowDPL::addToTracksQATable(TracksQACursorType& tracksQACursor, TrackQA& trackQAInfoHolder) { - if constexpr (std::is_same_v) { // TODO remove remove once version changes - tracksQACursor( - trackQAInfoHolder.trackID, - truncateFloatFraction(trackQAInfoHolder.tpcTime0, mTPCTime0), - trackQAInfoHolder.tpcdcaR, - trackQAInfoHolder.tpcdcaZ, - trackQAInfoHolder.tpcClusterByteMask, - trackQAInfoHolder.tpcdEdxMax0R, - trackQAInfoHolder.tpcdEdxMax1R, - trackQAInfoHolder.tpcdEdxMax2R, - trackQAInfoHolder.tpcdEdxMax3R, - trackQAInfoHolder.tpcdEdxTot0R, - trackQAInfoHolder.tpcdEdxTot1R, - trackQAInfoHolder.tpcdEdxTot2R, - trackQAInfoHolder.tpcdEdxTot3R, - trackQAInfoHolder.dRefContY, - trackQAInfoHolder.dRefContZ, - trackQAInfoHolder.dRefContSnp, - trackQAInfoHolder.dRefContTgl, - trackQAInfoHolder.dRefContQ2Pt, - trackQAInfoHolder.dRefGloY, - trackQAInfoHolder.dRefGloZ, - trackQAInfoHolder.dRefGloSnp, - trackQAInfoHolder.dRefGloTgl, - trackQAInfoHolder.dRefGloQ2Pt); - } else { - tracksQACursor( - trackQAInfoHolder.trackID, - trackQAInfoHolder.tpcTime0, - trackQAInfoHolder.tpcdcaR, - trackQAInfoHolder.tpcdcaZ, - trackQAInfoHolder.tpcClusterByteMask, - trackQAInfoHolder.tpcdEdxMax0R, - trackQAInfoHolder.tpcdEdxMax1R, - trackQAInfoHolder.tpcdEdxMax2R, - trackQAInfoHolder.tpcdEdxMax3R, - trackQAInfoHolder.tpcdEdxTot0R, - trackQAInfoHolder.tpcdEdxTot1R, - trackQAInfoHolder.tpcdEdxTot2R, - trackQAInfoHolder.tpcdEdxTot3R); - } + tracksQACursor( + trackQAInfoHolder.trackID, + truncateFloatFraction(trackQAInfoHolder.tpcTime0, mTPCTime0), + trackQAInfoHolder.tpcdcaR, + trackQAInfoHolder.tpcdcaZ, + trackQAInfoHolder.tpcClusterByteMask, + trackQAInfoHolder.tpcdEdxMax0R, + trackQAInfoHolder.tpcdEdxMax1R, + trackQAInfoHolder.tpcdEdxMax2R, + trackQAInfoHolder.tpcdEdxMax3R, + trackQAInfoHolder.tpcdEdxTot0R, + trackQAInfoHolder.tpcdEdxTot1R, + trackQAInfoHolder.tpcdEdxTot2R, + trackQAInfoHolder.tpcdEdxTot3R, + trackQAInfoHolder.dRefContY, + trackQAInfoHolder.dRefContZ, + trackQAInfoHolder.dRefContSnp, + trackQAInfoHolder.dRefContTgl, + trackQAInfoHolder.dRefContQ2Pt, + trackQAInfoHolder.dRefGloY, + trackQAInfoHolder.dRefGloZ, + trackQAInfoHolder.dRefGloSnp, + trackQAInfoHolder.dRefGloTgl, + trackQAInfoHolder.dRefGloQ2Pt); } template @@ -2615,95 +2598,93 @@ AODProducerWorkflowDPL::TrackQA AODProducerWorkflowDPL::processBarrelTrackQA(int trackQAHolder.tpcdEdxTot2R = uint8_t(tpcOrig.getdEdx().dEdxTotOROC2 * dEdxNorm); trackQAHolder.tpcdEdxTot3R = uint8_t(tpcOrig.getdEdx().dEdxTotOROC3 * dEdxNorm); - if constexpr (std::is_same_v) { // TODO remove remove once version changes - // Add matching information at a reference point (defined by - // o2::aod::track::trackQARefRadius) in the same frame as the global track - // without material corrections and error propagation - if (auto itsContGID = data.getITSContributorGID(trackIndex); itsContGID.isIndexSet() && itsContGID.getSource() != GIndex::ITSAB) { - const auto& itsOrig = data.getITSTrack(itsContGID); - o2::track::TrackPar gloCopy = trackPar; - o2::track::TrackPar itsCopy = itsOrig; - o2::track::TrackPar tpcCopy = tpcOrig; - if (prop->propagateToX(gloCopy, o2::aod::track::trackQARefRadius, prop->getNominalBz(), o2::base::Propagator::MAX_SIN_PHI, o2::base::Propagator::MAX_STEP, mMatCorr) && - prop->propagateToAlphaX(tpcCopy, gloCopy.getAlpha(), o2::aod::track::trackQARefRadius, false, o2::base::Propagator::MAX_SIN_PHI, o2::base::Propagator::MAX_STEP, 1, mMatCorr) && - prop->propagateToAlphaX(itsCopy, gloCopy.getAlpha(), o2::aod::track::trackQARefRadius, false, o2::base::Propagator::MAX_SIN_PHI, o2::base::Propagator::MAX_STEP, 1, mMatCorr)) { - // All tracks are now at the same radius and in the same frame and we can calculate the deltas wrt. to the global track - // The scale is defined by the global track scaling depending on beta0 - const float beta0 = std::sqrt(std::min(50.f / tpcOrig.getdEdx().dEdxMaxTPC, 1.f)); - const float qpt = gloCopy.getQ2Pt(); - const float x = qpt / beta0; - // scaling is defined as sigmaBins/sqrt(p0^2 + (p1 * q/pt / beta)^2) - auto scaleCont = [&x](int i) -> float { - return o2::aod::track::trackQAScaleBins / std::sqrt(o2::aod::track::trackQAScaleContP0[i] * o2::aod::track::trackQAScaleContP0[i] + (o2::aod::track::trackQAScaleContP1[i] * x) * (o2::aod::track::trackQAScaleContP1[i] * x)); - }; - auto scaleGlo = [&x](int i) -> float { - return o2::aod::track::trackQAScaleBins / std::sqrt(o2::aod::track::trackQAScaleGloP0[i] * o2::aod::track::trackQAScaleGloP0[i] + (o2::aod::track::trackQAScaleGloP1[i] * x) * (o2::aod::track::trackQAScaleGloP1[i] * x)); - }; - - // This allows to safely clamp any float to one byte, using the - // minmal/maximum values as under-/overflow borders and rounding to the nearest integer - auto safeInt8Clamp = [](auto value) -> int8_t { - using ValType = decltype(value); - return static_cast(TMath::Nint(std::clamp(value, static_cast(std::numeric_limits::min()), static_cast(std::numeric_limits::max())))); - }; - - // Calculate deltas for contributors - trackQAHolder.dRefContY = safeInt8Clamp((itsCopy.getY() - tpcCopy.getY()) * scaleCont(0)); - trackQAHolder.dRefContZ = safeInt8Clamp((itsCopy.getZ() - tpcCopy.getZ()) * scaleCont(1)); - trackQAHolder.dRefContSnp = safeInt8Clamp((itsCopy.getSnp() - tpcCopy.getSnp()) * scaleCont(2)); - trackQAHolder.dRefContTgl = safeInt8Clamp((itsCopy.getTgl() - tpcCopy.getTgl()) * scaleCont(3)); - trackQAHolder.dRefContQ2Pt = safeInt8Clamp((itsCopy.getQ2Pt() - tpcCopy.getQ2Pt()) * scaleCont(4)); - // Calculate deltas for global track against averaged contributors - trackQAHolder.dRefGloY = safeInt8Clamp(((itsCopy.getY() + tpcCopy.getY()) * 0.5f - gloCopy.getY()) * scaleGlo(0)); - trackQAHolder.dRefGloZ = safeInt8Clamp(((itsCopy.getZ() + tpcCopy.getZ()) * 0.5f - gloCopy.getZ()) * scaleGlo(1)); - trackQAHolder.dRefGloSnp = safeInt8Clamp(((itsCopy.getSnp() + tpcCopy.getSnp()) * 0.5f - gloCopy.getSnp()) * scaleGlo(2)); - trackQAHolder.dRefGloTgl = safeInt8Clamp(((itsCopy.getTgl() + tpcCopy.getTgl()) * 0.5f - gloCopy.getTgl()) * scaleGlo(3)); - trackQAHolder.dRefGloQ2Pt = safeInt8Clamp(((itsCopy.getQ2Pt() + tpcCopy.getQ2Pt()) * 0.5f - gloCopy.getQ2Pt()) * scaleGlo(4)); - - if (O2_ENUM_TEST_BIT(mStreamerMask, AODProducerStreamerMask::TrackQA)) { - (*mStreamer) << "trackQA" - << "trackITSOrig=" << itsOrig - << "trackTPCOrig=" << tpcOrig - << "trackITSTPCOrig=" << trackPar - << "trackITSProp=" << itsCopy - << "trackTPCProp=" << tpcCopy - << "trackITSTPCProp=" << gloCopy - << "refRadius=" << o2::aod::track::trackQARefRadius - << "scaleBins=" << o2::aod::track::trackQAScaleBins - << "scaleCont0=" << scaleCont(0) - << "scaleCont1=" << scaleCont(1) - << "scaleCont2=" << scaleCont(2) - << "scaleCont3=" << scaleCont(3) - << "scaleCont4=" << scaleCont(4) - << "scaleGlo0=" << scaleGlo(0) - << "scaleGlo1=" << scaleGlo(1) - << "scaleGlo2=" << scaleGlo(2) - << "scaleGlo3=" << scaleGlo(3) - << "scaleGlo4=" << scaleGlo(4) - << "trackQAHolder.tpcTime0=" << trackQAHolder.tpcTime0 - << "trackQAHolder.tpcdcaR=" << trackQAHolder.tpcdcaR - << "trackQAHolder.tpcdcaZ=" << trackQAHolder.tpcdcaZ - << "trackQAHolder.tpcdcaClusterByteMask=" << trackQAHolder.tpcClusterByteMask - << "trackQAHolder.tpcdEdxMax0R=" << trackQAHolder.tpcdEdxMax0R - << "trackQAHolder.tpcdEdxMax1R=" << trackQAHolder.tpcdEdxMax1R - << "trackQAHolder.tpcdEdxMax2R=" << trackQAHolder.tpcdEdxMax2R - << "trackQAHolder.tpcdEdxMax3R=" << trackQAHolder.tpcdEdxMax3R - << "trackQAHolder.tpcdEdxTot0R=" << trackQAHolder.tpcdEdxTot0R - << "trackQAHolder.tpcdEdxTot1R=" << trackQAHolder.tpcdEdxTot1R - << "trackQAHolder.tpcdEdxTot2R=" << trackQAHolder.tpcdEdxTot2R - << "trackQAHolder.tpcdEdxTot3R=" << trackQAHolder.tpcdEdxTot3R - << "trackQAHolder.dRefContY=" << trackQAHolder.dRefContY - << "trackQAHolder.dRefContZ=" << trackQAHolder.dRefContZ - << "trackQAHolder.dRefContSnp=" << trackQAHolder.dRefContSnp - << "trackQAHolder.dRefContTgl=" << trackQAHolder.dRefContTgl - << "trackQAHolder.dRefContQ2Pt=" << trackQAHolder.dRefContQ2Pt - << "trackQAHolder.dRefGloY=" << trackQAHolder.dRefGloY - << "trackQAHolder.dRefGloZ=" << trackQAHolder.dRefGloZ - << "trackQAHolder.dRefGloSnp=" << trackQAHolder.dRefGloSnp - << "trackQAHolder.dRefGloTgl=" << trackQAHolder.dRefGloTgl - << "trackQAHolder.dRefGloQ2Pt=" << trackQAHolder.dRefGloQ2Pt - << "\n"; - } + // Add matching information at a reference point (defined by + // o2::aod::track::trackQARefRadius) in the same frame as the global track + // without material corrections and error propagation + if (auto itsContGID = data.getITSContributorGID(trackIndex); itsContGID.isIndexSet() && itsContGID.getSource() != GIndex::ITSAB) { + const auto& itsOrig = data.getITSTrack(itsContGID); + o2::track::TrackPar gloCopy = trackPar; + o2::track::TrackPar itsCopy = itsOrig; + o2::track::TrackPar tpcCopy = tpcOrig; + if (prop->propagateToX(gloCopy, o2::aod::track::trackQARefRadius, prop->getNominalBz(), o2::base::Propagator::MAX_SIN_PHI, o2::base::Propagator::MAX_STEP, mMatCorr) && + prop->propagateToAlphaX(tpcCopy, gloCopy.getAlpha(), o2::aod::track::trackQARefRadius, false, o2::base::Propagator::MAX_SIN_PHI, o2::base::Propagator::MAX_STEP, 1, mMatCorr) && + prop->propagateToAlphaX(itsCopy, gloCopy.getAlpha(), o2::aod::track::trackQARefRadius, false, o2::base::Propagator::MAX_SIN_PHI, o2::base::Propagator::MAX_STEP, 1, mMatCorr)) { + // All tracks are now at the same radius and in the same frame and we can calculate the deltas wrt. to the global track + // The scale is defined by the global track scaling depending on beta0 + const float beta0 = std::sqrt(std::min(50.f / tpcOrig.getdEdx().dEdxMaxTPC, 1.f)); + const float qpt = gloCopy.getQ2Pt(); + const float x = qpt / beta0; + // scaling is defined as sigmaBins/sqrt(p0^2 + (p1 * q/pt / beta)^2) + auto scaleCont = [&x](int i) -> float { + return o2::aod::track::trackQAScaleBins / std::sqrt(o2::aod::track::trackQAScaleContP0[i] * o2::aod::track::trackQAScaleContP0[i] + (o2::aod::track::trackQAScaleContP1[i] * x) * (o2::aod::track::trackQAScaleContP1[i] * x)); + }; + auto scaleGlo = [&x](int i) -> float { + return o2::aod::track::trackQAScaleBins / std::sqrt(o2::aod::track::trackQAScaleGloP0[i] * o2::aod::track::trackQAScaleGloP0[i] + (o2::aod::track::trackQAScaleGloP1[i] * x) * (o2::aod::track::trackQAScaleGloP1[i] * x)); + }; + + // This allows to safely clamp any float to one byte, using the + // minmal/maximum values as under-/overflow borders and rounding to the nearest integer + auto safeInt8Clamp = [](auto value) -> int8_t { + using ValType = decltype(value); + return static_cast(TMath::Nint(std::clamp(value, static_cast(std::numeric_limits::min()), static_cast(std::numeric_limits::max())))); + }; + + // Calculate deltas for contributors + trackQAHolder.dRefContY = safeInt8Clamp((itsCopy.getY() - tpcCopy.getY()) * scaleCont(0)); + trackQAHolder.dRefContZ = safeInt8Clamp((itsCopy.getZ() - tpcCopy.getZ()) * scaleCont(1)); + trackQAHolder.dRefContSnp = safeInt8Clamp((itsCopy.getSnp() - tpcCopy.getSnp()) * scaleCont(2)); + trackQAHolder.dRefContTgl = safeInt8Clamp((itsCopy.getTgl() - tpcCopy.getTgl()) * scaleCont(3)); + trackQAHolder.dRefContQ2Pt = safeInt8Clamp((itsCopy.getQ2Pt() - tpcCopy.getQ2Pt()) * scaleCont(4)); + // Calculate deltas for global track against averaged contributors + trackQAHolder.dRefGloY = safeInt8Clamp(((itsCopy.getY() + tpcCopy.getY()) * 0.5f - gloCopy.getY()) * scaleGlo(0)); + trackQAHolder.dRefGloZ = safeInt8Clamp(((itsCopy.getZ() + tpcCopy.getZ()) * 0.5f - gloCopy.getZ()) * scaleGlo(1)); + trackQAHolder.dRefGloSnp = safeInt8Clamp(((itsCopy.getSnp() + tpcCopy.getSnp()) * 0.5f - gloCopy.getSnp()) * scaleGlo(2)); + trackQAHolder.dRefGloTgl = safeInt8Clamp(((itsCopy.getTgl() + tpcCopy.getTgl()) * 0.5f - gloCopy.getTgl()) * scaleGlo(3)); + trackQAHolder.dRefGloQ2Pt = safeInt8Clamp(((itsCopy.getQ2Pt() + tpcCopy.getQ2Pt()) * 0.5f - gloCopy.getQ2Pt()) * scaleGlo(4)); + + if (O2_ENUM_TEST_BIT(mStreamerMask, AODProducerStreamerMask::TrackQA)) { + (*mStreamer) << "trackQA" + << "trackITSOrig=" << itsOrig + << "trackTPCOrig=" << tpcOrig + << "trackITSTPCOrig=" << trackPar + << "trackITSProp=" << itsCopy + << "trackTPCProp=" << tpcCopy + << "trackITSTPCProp=" << gloCopy + << "refRadius=" << o2::aod::track::trackQARefRadius + << "scaleBins=" << o2::aod::track::trackQAScaleBins + << "scaleCont0=" << scaleCont(0) + << "scaleCont1=" << scaleCont(1) + << "scaleCont2=" << scaleCont(2) + << "scaleCont3=" << scaleCont(3) + << "scaleCont4=" << scaleCont(4) + << "scaleGlo0=" << scaleGlo(0) + << "scaleGlo1=" << scaleGlo(1) + << "scaleGlo2=" << scaleGlo(2) + << "scaleGlo3=" << scaleGlo(3) + << "scaleGlo4=" << scaleGlo(4) + << "trackQAHolder.tpcTime0=" << trackQAHolder.tpcTime0 + << "trackQAHolder.tpcdcaR=" << trackQAHolder.tpcdcaR + << "trackQAHolder.tpcdcaZ=" << trackQAHolder.tpcdcaZ + << "trackQAHolder.tpcdcaClusterByteMask=" << trackQAHolder.tpcClusterByteMask + << "trackQAHolder.tpcdEdxMax0R=" << trackQAHolder.tpcdEdxMax0R + << "trackQAHolder.tpcdEdxMax1R=" << trackQAHolder.tpcdEdxMax1R + << "trackQAHolder.tpcdEdxMax2R=" << trackQAHolder.tpcdEdxMax2R + << "trackQAHolder.tpcdEdxMax3R=" << trackQAHolder.tpcdEdxMax3R + << "trackQAHolder.tpcdEdxTot0R=" << trackQAHolder.tpcdEdxTot0R + << "trackQAHolder.tpcdEdxTot1R=" << trackQAHolder.tpcdEdxTot1R + << "trackQAHolder.tpcdEdxTot2R=" << trackQAHolder.tpcdEdxTot2R + << "trackQAHolder.tpcdEdxTot3R=" << trackQAHolder.tpcdEdxTot3R + << "trackQAHolder.dRefContY=" << trackQAHolder.dRefContY + << "trackQAHolder.dRefContZ=" << trackQAHolder.dRefContZ + << "trackQAHolder.dRefContSnp=" << trackQAHolder.dRefContSnp + << "trackQAHolder.dRefContTgl=" << trackQAHolder.dRefContTgl + << "trackQAHolder.dRefContQ2Pt=" << trackQAHolder.dRefContQ2Pt + << "trackQAHolder.dRefGloY=" << trackQAHolder.dRefGloY + << "trackQAHolder.dRefGloZ=" << trackQAHolder.dRefGloZ + << "trackQAHolder.dRefGloSnp=" << trackQAHolder.dRefGloSnp + << "trackQAHolder.dRefGloTgl=" << trackQAHolder.dRefGloTgl + << "trackQAHolder.dRefGloQ2Pt=" << trackQAHolder.dRefGloQ2Pt + << "\n"; } } } diff --git a/Detectors/AOD/src/StandaloneAODProducer.cxx b/Detectors/AOD/src/StandaloneAODProducer.cxx index a9f5a09141d8c..7ac59a556ac08 100644 --- a/Detectors/AOD/src/StandaloneAODProducer.cxx +++ b/Detectors/AOD/src/StandaloneAODProducer.cxx @@ -94,7 +94,7 @@ void fillMCollisionTable(o2::steer::MCKinematicsReader const& mcreader) TFile outfile("aod.root", "UPDATE"); { - TableToTree t2t(mccoltable, &outfile, aod::MetadataTrait::metadata::tableLabel()); + TableToTree t2t(mccoltable, &outfile, aod::description_str(aod::signature()).data()); t2t.addAllBranches(); t2t.process(); } @@ -200,12 +200,12 @@ void fillCollisionAndTrackTable() f.Close(); TFile outfile("aod.root", "RECREATE"); { - TableToTree t2t(colltable, &outfile, aod::MetadataTrait::metadata::tableLabel()); + TableToTree t2t(colltable, &outfile, aod::description_str(aod::signature()).data()); t2t.addAllBranches(); t2t.process(); } { - TableToTree t2t(tracktable, &outfile, "Tracks" /* aod::MetadataTrait::metadata::tableLabel() */); + TableToTree t2t(tracktable, &outfile, aod::description_str(aod::signature()).data()); t2t.addAllBranches(); t2t.process(); } diff --git a/Detectors/CTF/README.md b/Detectors/CTF/README.md index e1e65060db523..47ce765de289a 100644 --- a/Detectors/CTF/README.md +++ b/Detectors/CTF/README.md @@ -95,6 +95,14 @@ comma-separated list of detectors to read, Overrides skipDet ``` comma-separated list of detectors to skip + +By default an exception will be thrown if detector is requested but missing in the CTF. To enable injection of the empty output in such case one should use option `--allow-missing-detectors`. + +``` +--ctf-data-subspec arg (=0) +``` +allows to alter the `subSpecification` used to send the CTFDATA from the reader to decoders. Non-0 value must be used in case the data extracted by the CTF-reader should be processed and stored in new CTFs (in order to avoid clash of CTFDATA messages of the reader and writer). + ``` --max-tf arg (=-1) ``` @@ -141,6 +149,8 @@ There is a possibility to read remote root files directly, w/o caching them loca 2) provide proper regex to define remote files, e.g. for the example above: `--remote-regex "^root://.+/eos/aliceo2/.+"`. 3) pass an option `--copy-cmd no-copy`. +## Selective TF reading + ``` --select-ctf-ids ``` @@ -148,24 +158,25 @@ This is a `ctf-reader` device local option allowing selective reading of particu Note that the index corresponds not to the entry of the TF in the CTF tree but to the reader own counter incremented throught all input files (e.g. if the 10 CTF files with 20 TFs each are provided for the input and the selection of TFs `0,2,22,66` is provided, the reader will inject to the DPL the TFs at entries 0 and 2 from the 1st CTF file, entry 5 of the second file, entry 6 of the 3d and will finish the job. -For the ITS and MFT entropy decoding one can request either to decompose clusters to digits and send them instead of clusters (via `o2-ctf-reader-workflow` global options `--its-digits` and `--mft-digits` respectively) -or to apply the noise mask to decoded clusters (or decoded digits). If the masking (e.g. via option `--its-entropy-decoder " --mask-noise "`) is requested, user should provide to the entropy decoder the noise mask file (eventually will be loaded from CCDB) and cluster patterns decoding dictionary (if the clusters were encoded with patterns IDs). -For example, ``` -o2-ctf-reader-workflow --ctf-input --onlyDet ITS,MFT --its-entropy-decoder ' --mask-noise' | ... +--ir-frames-files --skip-skimmed-out-tf ``` -will decode ITS and MFT data, decompose on the fly ITS clusters to digits, mask the noisy pixels with the provided masks, recluster remaining ITS digits and send the new clusters out, together with unchanged MFT clusters. +This option (used for skimming) allow to push to DPL only those TFs which overlap with selected BC-ranges provided via input root file (for various formats see `o2::utils::IRFrameSelector::loadIRFrames` method). + ``` -o2-ctf-reader-workflow --ctf-input --onlyDet ITS,MFT --mft-digits --mft-entropy-decoder ' --mask-noise' | ... +--ir-frames-files ``` -will send decompose clusters to digits and send ben out after masking the noise for the MFT, while ITS clusters will be sent as decoded. - -By default an exception will be thrown if detector is requested but missing in the CTF. To enable injection of the empty output in such case one should use option `--allow-missing-detectors`. +This option allows to push to DPL only those TFs which overlap with the ` ` (separators can be any whitespace, comma or semicolon) records provided via text file (assuming that there are some entries for a given run, otherwise the option is ignored). +Multiple ranges per run and multiple runs can be mentioned in a single input file. The range limits can be indicated either as a UNIX timestamp in `ms` or as an orbit number (in the fill the run belongs to). +In case an option ``` ---ctf-data-subspec arg (=0) +--invert-irframe-selection ``` -allows to alter the `subSpecification` used to send the CTFDATA from the reader to decoders. Non-0 value must be used in case the data extracted by the CTF-reader should be processed and stored in new CTFs (in order to avoid clash of CTFDATA messages of the reader and writer). +is provided, the selections above are inverted: TFs matching some of the provided ranges will be discarded, while the rest will be pushed to the DPL + +At the end of the processing the `ctf-writer` will create a local file `ctf_read_ntf.txt` containing only the number of TFs pushed to the DPL. +In case no TF passed the selections above, this file will contain 0. ## Support for externally provided encoding dictionaries @@ -201,3 +212,18 @@ Additionally, one may throttle on the free SHM by providing an option to the rea Note that by default the reader reads into the memory the CTF data and prepares all output messages but injects them only once the rate-limiter allows that. With the option `--limit-tf-before-reading` set also the preparation of the data to inject will be conditioned by the green light from the rate-limiter. + + +## Modifying ITS/MFT CTF output + +For the ITS and MFT entropy decoding one can request either to decompose clusters to digits and send them instead of clusters (via `o2-ctf-reader-workflow` global options `--its-digits` and `--mft-digits` respectively) +or to apply the noise mask to decoded clusters (or decoded digits). If the masking (e.g. via option `--its-entropy-decoder " --mask-noise "`) is requested, user should provide to the entropy decoder the noise mask file (eventually will be loaded from CCDB) and cluster patterns decoding dictionary (if the clusters were encoded with patterns IDs). +For example, +``` +o2-ctf-reader-workflow --ctf-input --onlyDet ITS,MFT --its-entropy-decoder ' --mask-noise' | ... +``` +will decode ITS and MFT data, decompose on the fly ITS clusters to digits, mask the noisy pixels with the provided masks, recluster remaining ITS digits and send the new clusters out, together with unchanged MFT clusters. +``` +o2-ctf-reader-workflow --ctf-input --onlyDet ITS,MFT --mft-digits --mft-entropy-decoder ' --mask-noise' | ... +``` +will send decompose clusters to digits and send ben out after masking the noise for the MFT, while ITS clusters will be sent as decoded. diff --git a/Detectors/CTF/workflow/include/CTFWorkflow/CTFReaderSpec.h b/Detectors/CTF/workflow/include/CTFWorkflow/CTFReaderSpec.h index 997572e0371b2..b202013a6eea1 100644 --- a/Detectors/CTF/workflow/include/CTFWorkflow/CTFReaderSpec.h +++ b/Detectors/CTF/workflow/include/CTFWorkflow/CTFReaderSpec.h @@ -31,8 +31,10 @@ struct CTFReaderInp { std::string remoteRegex{}; std::string metricChannel{}; std::string fileIRFrames{}; + std::string fileRunTimeSpans{}; std::vector ctfIDs{}; bool skipSkimmedOutTF = false; + bool invertIRFramesSelection = false; bool allowMissingDetectors = false; bool checkTFLimitBeforeReading = false; bool sup0xccdb = false; diff --git a/Detectors/CTF/workflow/src/CTFReaderSpec.cxx b/Detectors/CTF/workflow/src/CTFReaderSpec.cxx index 70bb589e8836a..bcf3b5d975b74 100644 --- a/Detectors/CTF/workflow/src/CTFReaderSpec.cxx +++ b/Detectors/CTF/workflow/src/CTFReaderSpec.cxx @@ -45,6 +45,9 @@ #include "DataFormatsZDC/CTF.h" #include "DataFormatsHMP/CTF.h" #include "DataFormatsCTP/CTF.h" +#include "DataFormatsParameters/AggregatedRunInfo.h" +#include "CCDB/BasicCCDBManager.h" +#include "CommonConstants/LHCConstants.h" #include "Algorithm/RangeTokenizer.h" #include #include @@ -81,6 +84,8 @@ class CTFReaderSpec : public o2::framework::Task void run(o2::framework::ProcessingContext& pc) final; private: + void runTimeRangesToIRFrameSelector(const o2::framework::TimingInfo& timingInfo); + void loadRunTimeSpans(const std::string& flname); void openCTFFile(const std::string& flname); bool processTF(ProcessingContext& pc); void checkTreeEntries(); @@ -91,16 +96,20 @@ class CTFReaderSpec : public o2::framework::Task void tryToFixCTFHeader(CTFHeader& ctfHeader) const; CTFReaderInp mInput{}; o2::utils::IRFrameSelector mIRFrameSelector; // optional IR frames selector + std::map>> mRunTimeRanges; std::unique_ptr mFileFetcher; std::unique_ptr mCTFFile; std::unique_ptr mCTFTree; bool mRunning = false; bool mUseLocalTFCounter = false; + int mConvRunTimeRangesToOrbits = -1; // not defined yet int mCTFCounter = 0; + int mCTFCounterAcc = 0; int mNFailedFiles = 0; int mFilesRead = 0; int mTFLength = 128; int mNWaits = 0; + int mRunNumberPrev = -1; long mTotalWaitTime = 0; long mLastSendTime = 0L; long mCurrTreeEntry = 0L; @@ -129,8 +138,8 @@ void CTFReaderSpec::stopReader() return; } LOGP(info, "CTFReader stops processing, {} files read, {} files failed", mFilesRead - mNFailedFiles, mNFailedFiles); - LOGP(info, "CTF reading total timing: Cpu: {:.3f} Real: {:.3f} s for {} TFs in {} loops, spent {:.2} s in {} data waiting states", - mTimer.CpuTime(), mTimer.RealTime(), mCTFCounter, mFileFetcher->getNLoops(), 1e-6 * mTotalWaitTime, mNWaits); + LOGP(info, "CTF reading total timing: Cpu: {:.3f} Real: {:.3f} s for {} TFs ({} accepted) in {} loops, spent {:.2} s in {} data waiting states", + mTimer.CpuTime(), mTimer.RealTime(), mCTFCounter, mCTFCounterAcc, mFileFetcher->getNLoops(), 1e-6 * mTotalWaitTime, mNWaits); mRunning = false; mFileFetcher->stop(); mFileFetcher.reset(); @@ -164,6 +173,111 @@ void CTFReaderSpec::init(InitContext& ic) mTFLength = hbfu.nHBFPerTF; LOGP(info, "IRFrames will be selected from {}, assumed TF length: {} HBF", mInput.fileIRFrames, mTFLength); } + if (!mInput.fileRunTimeSpans.empty()) { + loadRunTimeSpans(mInput.fileRunTimeSpans); + } +} + +void CTFReaderSpec::runTimeRangesToIRFrameSelector(const o2::framework::TimingInfo& timingInfo) +{ + // convert entries in the runTimeRanges to IRFrameSelector, if needed, convert time to orbit + mIRFrameSelector.clear(); + auto ent = mRunTimeRanges.find(timingInfo.runNumber); + if (ent == mRunTimeRanges.end()) { + LOGP(info, "RunTimeRanges selection was provided but run {} has no entries, all TFs will be processed", timingInfo.runNumber); + return; + } + o2::parameters::AggregatedRunInfo rinfo; + auto& ccdb = o2::ccdb::BasicCCDBManager::instance(); + rinfo = o2::parameters::AggregatedRunInfo::buildAggregatedRunInfo(ccdb, timingInfo.runNumber); + if (rinfo.runNumber != timingInfo.runNumber || rinfo.orbitsPerTF < 1) { + LOGP(fatal, "failed to extract AggregatedRunInfo for run {}", timingInfo.runNumber); + } + mTFLength = rinfo.orbitsPerTF; + std::vector frames; + for (const auto& rng : ent->second) { + long orbMin = 0, orbMax = 0; + if (mConvRunTimeRangesToOrbits > 0) { + orbMin = rinfo.orbitSOR + (rng.first - rinfo.sor) / (o2::constants::lhc::LHCOrbitMUS * 0.001); + orbMax = rinfo.orbitSOR + (rng.second - rinfo.sor) / (o2::constants::lhc::LHCOrbitMUS * 0.001); + } else { + orbMin = rng.first; + orbMax = rng.second; + } + if (orbMin < 0) { + orbMin = 0; + } + if (orbMax < 0) { + orbMax = 0; + } + if (timingInfo.runNumber > 523897) { + orbMin = (orbMin / rinfo.orbitsPerTF) * rinfo.orbitsPerTF; + orbMax = (orbMax / rinfo.orbitsPerTF + 1) * rinfo.orbitsPerTF - 1; + } + LOGP(info, "TFs overlapping with orbits {}:{} will be {}", orbMin, orbMax, mInput.invertIRFramesSelection ? "rejected" : "selected"); + frames.emplace_back(InteractionRecord{0, uint32_t(orbMin)}, InteractionRecord{o2::constants::lhc::LHCMaxBunches, uint32_t(orbMax)}); + } + mIRFrameSelector.setOwnList(frames, true); +} + +void CTFReaderSpec::loadRunTimeSpans(const std::string& flname) +{ + std::ifstream inputFile(flname); + if (!inputFile) { + LOGP(fatal, "Failed to open selected run/timespans file {}", mInput.fileRunTimeSpans); + } + std::string line; + size_t cntl = 0, cntr = 0; + while (std::getline(inputFile, line)) { + cntl++; + for (char& ch : line) { // Replace semicolons and tabs with spaces for uniform processing + if (ch == ';' || ch == '\t' || ch == ',') { + ch = ' '; + } + } + o2::utils::Str::trim(line); + if (line.size() < 1 || line[0] == '#') { + continue; + } + auto tokens = o2::utils::Str::tokenize(line, ' '); + auto logError = [&cntl, &line]() { LOGP(error, "Expected format for selection is tripplet , failed on line#{}: {}", cntl, line); }; + if (tokens.size() >= 3) { + int run = 0; + long rmin, rmax; + try { + run = std::stoi(tokens[0]); + rmin = std::stol(tokens[1]); + rmax = std::stol(tokens[2]); + } catch (...) { + logError(); + continue; + } + + constexpr long ISTimeStamp = 1514761200000L; + int convmn = rmin > ISTimeStamp ? 1 : 0, convmx = rmax > ISTimeStamp ? 1 : 0; // values above ISTimeStamp are timestamps (need to be converted to orbits) + if (rmin > rmax) { + LOGP(fatal, "Provided range limits are not in increasing order, entry is {}", line); + } + if (mConvRunTimeRangesToOrbits == -1) { + if (convmn != convmx) { + LOGP(fatal, "Provided range limits should be both consistent either with orbit number or with unix timestamp in ms, entry is {}", line); + } + mConvRunTimeRangesToOrbits = convmn; // need to convert to orbit if time + LOGP(info, "Interpret selected time-spans input as {}", mConvRunTimeRangesToOrbits == 1 ? "timstamps(ms)" : "orbits"); + } else { + if (mConvRunTimeRangesToOrbits != convmn || mConvRunTimeRangesToOrbits != convmx) { + LOGP(fatal, "Provided range limits should are not consistent with previously determined {} input, entry is {}", mConvRunTimeRangesToOrbits == 1 ? "timestamps" : "orbits", line); + } + } + + mRunTimeRanges[run].emplace_back(rmin, rmax); + cntr++; + } else { + logError(); + } + } + LOGP(info, "Read {} time-spans for {} runs from {}", cntr, mRunTimeRanges.size(), mInput.fileRunTimeSpans); + inputFile.close(); } ///_______________________________________ @@ -256,6 +370,17 @@ void CTFReaderSpec::run(ProcessingContext& pc) pc.services().get().endOfStream(); pc.services().get().readyToQuit(QuitRequest::Me); stopReader(); + const std::string dummy{"ctf_read_ntf.txt"}; + if (mCTFCounterAcc == 0) { + LOGP(warn, "No TF passed selection, writing a 0 to file {}", dummy); + } + try { + std::ofstream outfile; + outfile.open(dummy, std::ios::out | std::ios::trunc); + outfile << mCTFCounterAcc << std::endl; + } catch (...) { + LOGP(error, "Failed to write {}", dummy); + } } } @@ -278,7 +403,7 @@ bool CTFReaderSpec::processTF(ProcessingContext& pc) } if (mUseLocalTFCounter) { - ctfHeader.tfCounter = mCTFCounter; + ctfHeader.tfCounter = mCTFCounterAcc; } LOG(info) << ctfHeader; @@ -289,19 +414,26 @@ bool CTFReaderSpec::processTF(ProcessingContext& pc) timingInfo.tfCounter = ctfHeader.tfCounter; timingInfo.runNumber = ctfHeader.run; + if (mRunTimeRanges.size() && timingInfo.runNumber != mRunNumberPrev) { + runTimeRangesToIRFrameSelector(timingInfo); + } + mRunNumberPrev = timingInfo.runNumber; + if (mIRFrameSelector.isSet()) { o2::InteractionRecord ir0(0, timingInfo.firstTForbit); - // we cannot have GRPECS via DPL CCDB fetcher in the CTFReader, so we use mTFLength extracted from the HBFUtils o2::InteractionRecord ir1(o2::constants::lhc::LHCMaxBunches - 1, timingInfo.firstTForbit < 0xffffffff - (mTFLength - 1) ? timingInfo.firstTForbit + (mTFLength - 1) : 0xffffffff); auto irSpan = mIRFrameSelector.getMatchingFrames({ir0, ir1}); - if (irSpan.size() == 0 && mInput.skipSkimmedOutTF) { - LOGP(info, "Skimming did not define any selection for TF [{}] : [{}]", ir0.asString(), ir1.asString()); + bool acc = true; + if (mInput.skipSkimmedOutTF) { + acc = (irSpan.size() > 0) ? !mInput.invertIRFramesSelection : mInput.invertIRFramesSelection; + LOGP(info, "IRFrame selection contains {} frames for TF [{}] : [{}]: {}use this TF (selection inversion mode is {})", + irSpan.size(), ir0.asString(), ir1.asString(), acc ? "" : "do not ", mInput.invertIRFramesSelection ? "ON" : "OFF"); + } + if (!acc) { return false; - } else { - if (mInput.checkTFLimitBeforeReading) { - limiter.check(pc, mInput.tfRateLimit, mInput.minSHM); - } - LOGP(info, "{} IR-Frames are selected for TF [{}] : [{}]", irSpan.size(), ir0.asString(), ir1.asString()); + } + if (mInput.checkTFLimitBeforeReading) { + limiter.check(pc, mInput.tfRateLimit, mInput.minSHM); } auto outVec = pc.outputs().make>(OutputRef{"selIRFrames"}, irSpan.begin(), irSpan.end()); } else { @@ -329,6 +461,7 @@ bool CTFReaderSpec::processTF(ProcessingContext& pc) processDetector(DetID::CPV, ctfHeader, pc); processDetector(DetID::ZDC, ctfHeader, pc); processDetector(DetID::CTP, ctfHeader, pc); + mCTFCounterAcc++; // send sTF acknowledge message if (!mInput.sup0xccdb) { @@ -466,7 +599,7 @@ DataProcessorSpec getCTFReaderSpec(const CTFReaderInp& inp) outputs.emplace_back(OutputLabel{det.getName()}, det.getDataOrigin(), "CTFDATA", inp.subspec, Lifetime::Timeframe); } } - if (!inp.fileIRFrames.empty()) { + if (!inp.fileIRFrames.empty() || !inp.fileRunTimeSpans.empty()) { outputs.emplace_back(OutputLabel{"selIRFrames"}, "CTF", "SELIRFRAMES", 0, Lifetime::Timeframe); } if (!inp.sup0xccdb) { diff --git a/Detectors/CTF/workflow/src/ctf-reader-workflow.cxx b/Detectors/CTF/workflow/src/ctf-reader-workflow.cxx index 90d259f4e3a5c..1f0ef9a3b871b 100644 --- a/Detectors/CTF/workflow/src/ctf-reader-workflow.cxx +++ b/Detectors/CTF/workflow/src/ctf-reader-workflow.cxx @@ -66,7 +66,9 @@ void customize(std::vector& workflowOptions) options.push_back(ConfigParamSpec{"ctf-data-subspec", VariantType::Int, 0, {"subspec to use for decoded CTF messages (use non-0 if CTF writer will be attached downstream)"}}); options.push_back(ConfigParamSpec{"configKeyValues", VariantType::String, "", {"Semicolon separated key=value strings"}}); options.push_back(ConfigParamSpec{"ir-frames-files", VariantType::String, "", {"If non empty, inject selected IRFrames from this file"}}); + options.push_back(ConfigParamSpec{"run-time-span-file", VariantType::String, "", {"If non empty, inject selected IRFrames from this text file (run, min/max orbit or unix time)"}}); options.push_back(ConfigParamSpec{"skip-skimmed-out-tf", VariantType::Bool, false, {"Do not process TFs with empty IR-Frame coverage"}}); + options.push_back(ConfigParamSpec{"invert-irframe-selection", VariantType::Bool, false, {"Select only frames mentioned in ir-frames-file (skip-skimmed-out-tf applied to TF not selected!)"}}); // options.push_back(ConfigParamSpec{"its-digits", VariantType::Bool, false, {"convert ITS clusters to digits"}}); options.push_back(ConfigParamSpec{"mft-digits", VariantType::Bool, false, {"convert MFT clusters to digits"}}); @@ -125,7 +127,9 @@ WorkflowSpec defineDataProcessing(ConfigContext const& configcontext) ctfInput.sup0xccdb = !configcontext.options().get("send-diststf-0xccdb"); ctfInput.minSHM = std::stoul(configcontext.options().get("timeframes-shm-limit")); ctfInput.fileIRFrames = configcontext.options().get("ir-frames-files"); + ctfInput.fileRunTimeSpans = configcontext.options().get("run-time-span-file"); ctfInput.skipSkimmedOutTF = configcontext.options().get("skip-skimmed-out-tf"); + ctfInput.invertIRFramesSelection = configcontext.options().get("invert-irframe-selection"); int verbosity = configcontext.options().get("ctf-reader-verbosity"); int rateLimitingIPCID = std::stoi(configcontext.options().get("timeframes-rate-limit-ipcid")); @@ -133,6 +137,12 @@ WorkflowSpec defineDataProcessing(ConfigContext const& configcontext) if (rateLimitingIPCID > -1 && !chanFmt.empty()) { ctfInput.metricChannel = fmt::format(fmt::runtime(chanFmt), o2::framework::ChannelSpecHelpers::defaultIPCFolder(), rateLimitingIPCID); } + if (!ctfInput.fileRunTimeSpans.empty()) { + ctfInput.skipSkimmedOutTF = true; + } + if (!ctfInput.fileIRFrames.empty() && !ctfInput.fileRunTimeSpans.empty()) { + LOGP(fatal, "One cannot provide --ir-frames-files and --run-time-span-file options simultaneously"); + } specs.push_back(o2::ctf::getCTFReaderSpec(ctfInput)); diff --git a/Detectors/ITSMFT/ITS/reconstruction/include/ITSReconstruction/FastMultEst.h b/Detectors/ITSMFT/ITS/reconstruction/include/ITSReconstruction/FastMultEst.h index 457381862cc42..9e8299e89b404 100644 --- a/Detectors/ITSMFT/ITS/reconstruction/include/ITSReconstruction/FastMultEst.h +++ b/Detectors/ITSMFT/ITS/reconstruction/include/ITSReconstruction/FastMultEst.h @@ -45,7 +45,7 @@ struct FastMultEst { static uint32_t getCurrentRandomSeed(); int selectROFs(const gsl::span rofs, const gsl::span clus, - const gsl::span trig, std::vector& sel); + const gsl::span trig, std::vector& sel); void fillNClPerLayer(const gsl::span& clusters); float process(const std::array ncl) diff --git a/Detectors/ITSMFT/ITS/reconstruction/src/FastMultEst.cxx b/Detectors/ITSMFT/ITS/reconstruction/src/FastMultEst.cxx index a55fafdf60409..c547996c6f356 100644 --- a/Detectors/ITSMFT/ITS/reconstruction/src/FastMultEst.cxx +++ b/Detectors/ITSMFT/ITS/reconstruction/src/FastMultEst.cxx @@ -125,7 +125,7 @@ float FastMultEst::processNoiseImposed(const std::array ncl) } int FastMultEst::selectROFs(const gsl::span rofs, const gsl::span clus, - const gsl::span trig, std::vector& sel) + const gsl::span trig, std::vector& sel) { int nrof = rofs.size(), nsel = 0; const auto& multEstConf = FastMultEstConfig::Instance(); // parameters for mult estimation and cuts diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index ad8724f315ec8..37f392ebbd3a7 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -51,9 +51,19 @@ class TimeFrameGPU : public TimeFrame void initialise(const int, const TrackingParameters&, const int, IndexTableUtils* utils = nullptr, const TimeFrameGPUParameters* pars = nullptr); void initDevice(IndexTableUtils*, const TrackingParameters& trkParam, const TimeFrameGPUParameters&, const int, const int); void initDeviceSAFitting(); + void loadIndexTableUtils(const int); void loadTrackingFrameInfoDevice(const int); void loadUnsortedClustersDevice(const int); void loadClustersDevice(const int); + void loadClustersIndexTables(const int iteration); + void createUsedClustersDevice(const int); + void loadUsedClustersDevice(); + void loadROframeClustersDevice(const int); + void loadMultiplicityCutMask(const int); + void loadVertices(const int); + + /// + void createTrackletsLUTDevice(const int); void loadTrackletsDevice(); void loadTrackletsLUTDevice(); void loadCellsDevice(); @@ -62,6 +72,7 @@ class TimeFrameGPU : public TimeFrame void loadTrackSeedsChi2Device(); void loadRoadsDevice(); void loadTrackSeedsDevice(std::vector&); + void createTrackletsBuffers(); void createCellsBuffers(const int); void createCellsDevice(); void createCellsLUTDevice(); @@ -93,7 +104,7 @@ class TimeFrameGPU : public TimeFrame std::vector>& getLabelsInChunks() { return mLabelsInChunks; } int getNAllocatedROFs() const { return mNrof; } // Allocated means maximum nROF for each chunk while populated is the number of loaded ones. StaticTrackingParameters* getDeviceTrackingParameters() { return mTrackingParamsDevice; } - Vertex* getDeviceVertices() { return mVerticesDevice; } + Vertex* getDeviceVertices() { return mPrimaryVerticesDevice; } int* getDeviceROFramesPV() { return mROFramesPVDevice; } unsigned char* getDeviceUsedClusters(const int); const o2::base::Propagator* getChainPropagator(); @@ -107,8 +118,12 @@ class TimeFrameGPU : public TimeFrame const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; } const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; } const Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; } - const Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; } - const int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } + const int** getDeviceArrayClustersIndexTables() const { return mClustersIndexTablesDeviceArray; } + std::vector getClusterSizes(); + const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; } + const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; } + Tracklet** getDeviceArrayTracklets() { return mTrackletsDeviceArray; } + int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; } int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; } CellSeed** getDeviceArrayCells() const { return mCellsDeviceArray; } @@ -116,17 +131,19 @@ class TimeFrameGPU : public TimeFrame o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; } float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; } int* getDeviceNeighboursIndexTables(const int layer) { return mNeighboursIndexTablesDevice[layer]; } + uint8_t* getDeviceMultCutMask() { return mMultMaskDevice; } void setDevicePropagator(const o2::base::PropagatorImpl*) override; // Host-specific getters - gsl::span getHostNTracklets(const int chunkId); - gsl::span getHostNCells(const int chunkId); + gsl::span getNTracklets() { return mNTracklets; } + gsl::span getNCells() { return mNCells; } // Host-available device getters + gsl::span getDeviceTrackletsLUTs() { return mTrackletsLUTDevice; } gsl::span getDeviceCellLUTs() { return mCellsLUTDevice; } + gsl::span getDeviceTracklet() { return mTrackletsDevice; } gsl::span getDeviceCells() { return mCellsDevice; } - gsl::span getNCellsDevice() { return mNCells; } private: void allocMemAsync(void**, size_t, Stream*, bool); // Abstract owned and unowned memory allocations @@ -136,31 +153,37 @@ class TimeFrameGPU : public TimeFrame StaticTrackingParameters mStaticTrackingParams; // Host-available device buffer sizes + std::array mNTracklets; std::array mNCells; // Device pointers StaticTrackingParameters* mTrackingParamsDevice; IndexTableUtils* mIndexTableUtilsDevice; - std::array mROFramesClustersDevice; - std::array mUsedClustersDevice; - Vertex* mVerticesDevice; - int* mROFramesPVDevice; // Hybrid pref + uint8_t* mMultMaskDevice; + Vertex* mPrimaryVerticesDevice; + int* mROFramesPVDevice; std::array mClustersDevice; std::array mUnsortedClustersDevice; + std::array mClustersIndexTablesDevice; + std::array mUsedClustersDevice; + std::array mROFramesClustersDevice; const Cluster** mClustersDeviceArray; const Cluster** mUnsortedClustersDeviceArray; + const int** mClustersIndexTablesDeviceArray; + const unsigned char** mUsedClustersDeviceArray; + const int** mROFrameClustersDeviceArray; std::array mTrackletsDevice; - const Tracklet** mTrackletsDeviceArray; - const int** mTrackletsLUTDeviceArray; - std::array mTrackletsLUTDevice; + Tracklet** mTrackletsDeviceArray; + std::array mTrackletsLUTDevice; std::array mCellsLUTDevice; std::array mNeighboursLUTDevice; int** mCellsLUTDeviceArray; int** mNeighboursCellDeviceArray; int** mNeighboursCellLUTDeviceArray; + int** mTrackletsLUTDeviceArray; std::array mCellsDevice; std::array mNeighboursIndexTablesDevice; CellSeed* mTrackSeedsDevice; @@ -186,10 +209,6 @@ class TimeFrameGPU : public TimeFrame std::vector> mNVerticesInChunks; std::vector> mLabelsInChunks; - // Host memory used only in GPU tracking - std::vector mHostNTracklets; - std::vector mHostNCells; - // Temporary buffer for storing output tracks from GPU tracking std::vector mTrackITSExt; }; @@ -215,6 +234,16 @@ inline int TimeFrameGPU::getNClustersInRofSpan(const int rofIdstart, co { return static_cast(mROFramesClusters[layerId][(rofIdstart + rofSpanSize) < mROFramesClusters.size() ? rofIdstart + rofSpanSize : mROFramesClusters.size() - 1] - mROFramesClusters[layerId][rofIdstart]); } + +template +inline std::vector TimeFrameGPU::getClusterSizes() +{ + std::vector sizes(mUnsortedClusters.size()); + std::transform(mUnsortedClusters.begin(), mUnsortedClusters.end(), sizes.begin(), + [](const auto& v) { return static_cast(v.size()); }); + return sizes; +} + } // namespace gpu } // namespace its } // namespace o2 diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 34e6165b9530f..54bdae302e643 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -50,11 +50,74 @@ GPUg() void fitTrackSeedsKernel( #endif } // namespace gpu +template +void countTrackletsInROFsHandler(const IndexTableUtils* utils, + const uint8_t* multMask, + const int startROF, + const int endROF, + const int maxROF, + const int deltaROF, + const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + std::vector nClusters, + const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + int** trackletsLUTs, + gsl::span trackletsLUTsHost, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minR, + std::vector& maxR, + std::vector& resolutions, + std::vector& radii, + std::vector& mulScatAng, + const int nBlocks, + const int nThreads); + +template +void computeTrackletsInROFsHandler(const IndexTableUtils* utils, + const uint8_t* multMask, + const int startROF, + const int endROF, + const int maxROF, + const int deltaROF, + const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + std::vector nClusters, + const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + Tracklet** tracklets, + gsl::span spanTracklets, + gsl::span nTracklets, + int** trackletsLUTs, + gsl::span trackletsLUTsHost, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minR, + std::vector& maxR, + std::vector& resolutions, + std::vector& radii, + std::vector& mulScatAng, + const int nBlocks, + const int nThreads); + void countCellsHandler(const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, - const Tracklet** tracklets, - const int** trackletsLUT, + Tracklet** tracklets, + int** trackletsLUT, const int nTracklets, const int layer, CellSeed* cells, @@ -70,8 +133,8 @@ void countCellsHandler(const Cluster** sortedClusters, void computeCellsHandler(const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, - const Tracklet** tracklets, - const int** trackletsLUT, + Tracklet** tracklets, + int** trackletsLUT, const int nTracklets, const int layer, CellSeed* cells, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h index 66244bf854b5f..a88e51742e84a 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h @@ -31,6 +31,49 @@ struct gpuPair { namespace gpu { +// Poor man implementation of a span-like struct. It is very limited. +template +struct gpuSpan { + using value_type = T; + using ptr = T*; + using ref = T&; + + GPUd() gpuSpan() : _data(nullptr), _size(0) {} + GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {} + GPUd() ref operator[](unsigned int idx) const { return _data[idx]; } + GPUd() unsigned int size() const { return _size; } + GPUd() bool empty() const { return _size == 0; } + GPUd() ref front() const { return _data[0]; } + GPUd() ref back() const { return _data[_size - 1]; } + GPUd() ptr begin() const { return _data; } + GPUd() ptr end() const { return _data + _size; } + + protected: + ptr _data; + unsigned int _size; +}; + +template +struct gpuSpan { + using value_type = T; + using ptr = const T*; + using ref = const T&; + + GPUd() gpuSpan() : _data(nullptr), _size(0) {} + GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {} + GPUd() gpuSpan(const gpuSpan& other) : _data(other._data), _size(other._size) {} + GPUd() ref operator[](unsigned int idx) const { return _data[idx]; } + GPUd() unsigned int size() const { return _size; } + GPUd() bool empty() const { return _size == 0; } + GPUd() ref front() const { return _data[0]; } + GPUd() ref back() const { return _data[_size - 1]; } + GPUd() ptr begin() const { return _data; } + GPUd() ptr end() const { return _data + _size; } + + protected: + ptr _data; + unsigned int _size; +}; enum class Task { Tracker = 0, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 67144ba2c98ea..4bd15c0203d81 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -92,6 +92,19 @@ void TimeFrameGPU::setDevicePropagator(const o2::base::PropagatorImpl +void TimeFrameGPU::loadIndexTableUtils(const int iteration) +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading indextable utils"); + if (!iteration) { + LOGP(debug, "gpu-allocation: allocating IndexTableUtils buffer, for {} MB.", sizeof(IndexTableUtils) / MB); + allocMemAsync(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, getExtAllocator()); + } + LOGP(debug, "gpu-transfer: loading IndexTableUtils object, for {} MB.", sizeof(IndexTableUtils) / MB); + checkGPUError(cudaMemcpyAsync(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + template void TimeFrameGPU::loadUnsortedClustersDevice(const int iteration) { @@ -128,6 +141,65 @@ void TimeFrameGPU::loadClustersDevice(const int iteration) } } +template +void TimeFrameGPU::loadClustersIndexTables(const int iteration) +{ + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, mIndexTables[iLayer].size(), mIndexTables[iLayer].size() * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[iLayer]), mIndexTables[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], mIndexTables[iLayer].data(), mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); + } +} + +template +void TimeFrameGPU::createUsedClustersDevice(const int iteration) +{ + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: creating {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mUsedClusters[iLayer].size() * sizeof(unsigned char) / MB); + allocMemAsync(reinterpret_cast(&mUsedClustersDevice[iLayer]), mUsedClusters[iLayer].size() * sizeof(unsigned char), nullptr, getExtAllocator()); + checkGPUError(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); + } +} + +template +void TimeFrameGPU::loadUsedClustersDevice() +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading used clusters flags"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(unsigned char) / MB); + checkGPUError(cudaMemcpyAsync(mUsedClustersDevice[iLayer], mUsedClusters[iLayer].data(), mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + +template +void TimeFrameGPU::loadROframeClustersDevice(const int iteration) +{ + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading ROframe clusters"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[iLayer]), mROFramesClusters[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], mROFramesClusters[iLayer].data(), mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); + } +} + template void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) { @@ -146,19 +218,76 @@ void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } +template +void TimeFrameGPU::loadMultiplicityCutMask(const int iteration) +{ + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask"); + LOGP(debug, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB); + allocMemAsync(reinterpret_cast(&mMultMaskDevice), mMultiplicityCutMask.size() * sizeof(uint8_t), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mMultMaskDevice, mMultiplicityCutMask.data(), mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); + } +} + +template +void TimeFrameGPU::loadVertices(const int iteration) +{ + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices"); + LOGP(debug, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mROFramesPVDevice), mROFramesPV.size() * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mROFramesPVDevice, mROFramesPV.data(), mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + LOGP(debug, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB); + allocMemAsync(reinterpret_cast(&mPrimaryVerticesDevice), mPrimaryVertices.size() * sizeof(Vertex), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mPrimaryVerticesDevice, mPrimaryVertices.data(), mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); + } +} + +template +void TimeFrameGPU::createTrackletsLUTDevice(const int iteration) +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating tracklets LUTs"); + for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { + if (!iteration) { + LOGP(debug, "gpu-transfer: creating tracklets LUT for {} elements on layer {}, for {} MB.", mClusters[iLayer].size() + 1, iLayer, (mClusters[iLayer].size() + 1) * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), (mClusters[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); + } + checkGPUError(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); + } + if (!iteration) { + allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + +template +void TimeFrameGPU::createTrackletsBuffers() +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); + for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { + mNTracklets[iLayer] = 0; + checkGPUError(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost)); + LOGP(debug, "gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / MB); + allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), nullptr, getExtAllocator()); + } + allocMemAsync(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + template void TimeFrameGPU::loadTrackletsDevice() { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets"); for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} tracklets on layer {}, for {} MB.", mTracklets[iLayer].size(), iLayer, mTracklets[iLayer].size() * sizeof(Tracklet) / MB); - allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mTracklets[iLayer].size() * sizeof(Tracklet), nullptr, getExtAllocator()); checkGPUError(cudaHostRegister(mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mTrackletsDevice[iLayer], mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -167,14 +296,12 @@ void TimeFrameGPU::loadTrackletsLUTDevice() { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", mTrackletsLookupTable[iLayer].size(), iLayer, mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), mTrackletsLookupTable[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); + LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", mTrackletsLookupTable[iLayer].size(), iLayer + 1, mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB); checkGPUError(cudaHostRegister(mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer], mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice)); + checkGPUError(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer + 1], mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice)); } - allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 2) * sizeof(int*), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice)); + checkGPUError(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice)); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -214,9 +341,9 @@ void TimeFrameGPU::createCellsLUTDevice() { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells LUTs"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mTracklets[iLayer].size() + 1, iLayer, (mTracklets[iLayer].size() + 1) * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mCellsLUTDevice[iLayer]), (mTracklets[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); - checkGPUError(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mTracklets[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); + LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mNTracklets[iLayer] + 1, iLayer, (mNTracklets[iLayer] + 1) * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator()); checkGPUError(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); @@ -228,7 +355,7 @@ void TimeFrameGPU::createCellsBuffers(const int layer) { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); mNCells[layer] = 0; - checkGPUError(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mTracklets[layer].size(), sizeof(int), cudaMemcpyDeviceToHost)); + checkGPUError(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost)); LOGP(debug, "gpu-transfer: creating cell buffer for {} elements on layer {}, for {} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / MB); allocMemAsync(reinterpret_cast(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), nullptr, getExtAllocator()); @@ -319,9 +446,9 @@ void TimeFrameGPU::downloadCellsLUTDevice() { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading cell luts"); for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { - LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mTracklets[iLayer + 1].size() + 1)); - mCellsLookupTable[iLayer].resize(mTracklets[iLayer + 1].size() + 1); - checkGPUError(cudaMemcpyAsync(mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mTracklets[iLayer + 1].size() + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mNTracklets[iLayer + 1] + 1)); + mCellsLookupTable[iLayer].resize(mNTracklets[iLayer + 1] + 1); + checkGPUError(cudaMemcpyAsync(mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -362,13 +489,6 @@ void TimeFrameGPU::unregisterRest() LOGP(debug, "unregistering rest of the host memory..."); checkGPUError(cudaHostUnregister(mCellsDevice.data())); checkGPUError(cudaHostUnregister(mTrackletsDevice.data())); - checkGPUError(cudaHostUnregister(mTrackletsLUTDevice.data())); - for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { - if (iLayer < nLayers - 2) { - checkGPUError(cudaHostUnregister(mTrackletsLookupTable[iLayer].data())); - } - checkGPUError(cudaHostUnregister(mTracklets[iLayer].data())); - } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 3c6a307fc4ff6..ae86507e46325 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -31,241 +31,18 @@ void TrackerTraitsGPU::initialiseTimeFrame(const int iteration) mTimeFrameGPU->initialise(iteration, mTrkParams[iteration], nLayers); mTimeFrameGPU->loadClustersDevice(iteration); mTimeFrameGPU->loadUnsortedClustersDevice(iteration); + mTimeFrameGPU->loadClustersIndexTables(iteration); mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration); + mTimeFrameGPU->loadMultiplicityCutMask(iteration); + mTimeFrameGPU->loadVertices(iteration); + mTimeFrameGPU->loadROframeClustersDevice(iteration); + mTimeFrameGPU->createUsedClustersDevice(iteration); + mTimeFrameGPU->loadIndexTableUtils(iteration); } template void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int, int) { - // if (!mTimeFrameGPU->getClusters().size()) { - // return; - // } - // const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f); - // gsl::span diamondSpan(&diamondVert, 1); - // std::vector threads(mTimeFrameGPU->getNChunks()); - - // for (int chunkId{0}; chunkId < mTimeFrameGPU->getNChunks(); ++chunkId) { - // int maxTracklets{static_cast(mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->clustersPerROfCapacity) * - // static_cast(mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxTrackletsPerCluster)}; - // int maxRofPerChunk{mTimeFrameGPU->mNrof / (int)mTimeFrameGPU->getNChunks()}; - // // Define workload - // auto doTrackReconstruction = [&, chunkId, maxRofPerChunk, iteration]() -> void { - // auto offset = chunkId * maxRofPerChunk; - // auto maxROF = offset + maxRofPerChunk; - // while (offset < maxROF) { - // auto rofs = mTimeFrameGPU->loadChunkData(chunkId, offset, maxROF); - // //////////////////// - // /// Tracklet finding - - // for (int iLayer{0}; iLayer < nLayers - 1; ++iLayer) { - // auto nclus = mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, iLayer); - // const float meanDeltaR{mTrkParams[iteration].LayerRadii[iLayer + 1] - mTrkParams[iteration].LayerRadii[iLayer]}; - // gpu::computeLayerTrackletsKernelMultipleRof<<getStream(chunkId).get()>>>( - // iLayer, // const int layerIndex, - // iteration, // const int iteration, - // offset, // const unsigned int startRofId, - // rofs, // const unsigned int rofSize, - // 0, // const unsigned int deltaRof, - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(iLayer), // const Cluster* clustersCurrentLayer, - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(iLayer + 1), // const Cluster* clustersNextLayer, - // mTimeFrameGPU->getDeviceROframesClusters(iLayer), // const int* roFrameClustersCurrentLayer, // Number of clusters on layer 0 per ROF - // mTimeFrameGPU->getDeviceROframesClusters(iLayer + 1), // const int* roFrameClustersNextLayer, // Number of clusters on layer 1 per ROF - // mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(iLayer + 1), // const int* indexTableNextLayer, - // mTimeFrameGPU->getDeviceUsedClusters(iLayer), // const int* usedClustersCurrentLayer, - // mTimeFrameGPU->getDeviceUsedClusters(iLayer + 1), // const int* usedClustersNextLayer, - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer), // Tracklet* tracklets, // output data - // mTimeFrameGPU->getDeviceVertices(), // const Vertex* vertices, - // mTimeFrameGPU->getDeviceROframesPV(), // const int* pvROFrame, - // mTimeFrameGPU->getPhiCut(iLayer), // const float phiCut, - // mTimeFrameGPU->getMinR(iLayer + 1), // const float minR, - // mTimeFrameGPU->getMaxR(iLayer + 1), // const float maxR, - // meanDeltaR, // const float meanDeltaR, - // mTimeFrameGPU->getPositionResolution(iLayer), // const float positionResolution, - // mTimeFrameGPU->getMSangle(iLayer), // const float mSAngle, - // mTimeFrameGPU->getDeviceTrackingParameters(), // const StaticTrackingParameters* trkPars, - // mTimeFrameGPU->getDeviceIndexTableUtils(), // const IndexTableUtils* utils - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->clustersPerROfCapacity, // const int clustersPerROfCapacity, - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxTrackletsPerCluster); // const int maxTrackletsPerCluster - - // // Remove empty tracklets due to striding. - // auto nulltracklet = o2::its::Tracklet{}; - // auto thrustTrackletsBegin = thrust::device_ptr(mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer)); - // auto thrustTrackletsEnd = thrust::device_ptr(mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer) + (int)rofs * maxTracklets); - // auto thrustTrackletsAfterEraseEnd = thrust::remove(THRUST_NAMESPACE::par.on(mTimeFrameGPU->getStream(chunkId).get()), - // thrustTrackletsBegin, - // thrustTrackletsEnd, - // nulltracklet); - // // Sort tracklets by first cluster index. - // thrust::sort(THRUST_NAMESPACE::par.on(mTimeFrameGPU->getStream(chunkId).get()), - // thrustTrackletsBegin, - // thrustTrackletsAfterEraseEnd, - // gpu::trackletSortIndexFunctor()); - - // // Remove duplicates. - // auto thrustTrackletsAfterUniqueEnd = thrust::unique(THRUST_NAMESPACE::par.on(mTimeFrameGPU->getStream(chunkId).get()), thrustTrackletsBegin, thrustTrackletsAfterEraseEnd); - - // discardResult(cudaStreamSynchronize(mTimeFrameGPU->getStream(chunkId).get())); - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer] = thrustTrackletsAfterUniqueEnd - thrustTrackletsBegin; - // // Compute tracklet lookup table. - // gpu::compileTrackletsLookupTableKernel<<getStream(chunkId).get()>>>(mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer), - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer]); - // discardResult(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(), // d_temp_storage - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes - // mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer), // d_in - // mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer), // d_out - // nclus, // num_items - // mTimeFrameGPU->getStream(chunkId).get())); - - // // Create tracklets labels, at the moment on the host - // if (mTimeFrameGPU->hasMCinformation()) { - // std::vector tracklets(mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer]); - // checkGPUError(cudaHostRegister(tracklets.data(), tracklets.size() * sizeof(o2::its::Tracklet), cudaHostRegisterDefault)); - // checkGPUError(cudaMemcpyAsync(tracklets.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer), tracklets.size() * sizeof(o2::its::Tracklet), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get())); - // for (auto& trk : tracklets) { - // MCCompLabel label; - // int currentId{mTimeFrameGPU->mClusters[iLayer][trk.firstClusterIndex].clusterId}; // This is not yet offsetted to the index of the first cluster of the chunk - // int nextId{mTimeFrameGPU->mClusters[iLayer + 1][trk.secondClusterIndex].clusterId}; // This is not yet offsetted to the index of the first cluster of the chunk - // for (auto& lab1 : mTimeFrameGPU->getClusterLabels(iLayer, currentId)) { - // for (auto& lab2 : mTimeFrameGPU->getClusterLabels(iLayer + 1, nextId)) { - // if (lab1 == lab2 && lab1.isValid()) { - // label = lab1; - // break; - // } - // } - // if (label.isValid()) { - // break; - // } - // } - // // TODO: implment label merging. - // // mTimeFrameGPU->getTrackletsLabel(iLayer).emplace_back(label); - // } - // checkGPUError(cudaHostUnregister(tracklets.data())); - // } - // } - - // //////////////// - // /// Cell finding - // for (int iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - // // Compute layer cells. - // gpu::computeLayerCellsKernel<<<10, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer + 1), - // mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer + 1), - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer], - // nullptr, - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer), - // mTimeFrameGPU->getDeviceTrackingParameters()); - - // // Compute number of found Cells - // checkGPUError(cub::DeviceReduce::Sum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(), // d_temp_storage - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer), // d_in - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells() + iLayer, // d_out - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer], // num_items - // mTimeFrameGPU->getStream(chunkId).get())); - // // Compute LUT - // discardResult(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(), // d_temp_storage - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer), // d_in - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer), // d_out - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer], // num_items - // mTimeFrameGPU->getStream(chunkId).get())); - - // gpu::computeLayerCellsKernel<<<10, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer + 1), - // mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer + 1), - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer], - // mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer), - // mTimeFrameGPU->getDeviceTrackingParameters()); - // } - // checkGPUError(cudaMemcpyAsync(mTimeFrameGPU->getHostNCells(chunkId).data(), - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells(), - // (nLayers - 2) * sizeof(int), - // cudaMemcpyDeviceToHost, - // mTimeFrameGPU->getStream(chunkId).get())); - - // // Create cells labels - // // TODO: make it work after fixing the tracklets labels - // if (mTimeFrameGPU->hasMCinformation()) { - // for (int iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - // std::vector cells(mTimeFrameGPU->getHostNCells(chunkId)[iLayer]); - // // Async with not registered memory? - // checkGPUError(cudaMemcpyAsync(cells.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer), mTimeFrameGPU->getHostNCells(chunkId)[iLayer] * sizeof(o2::its::Cell), cudaMemcpyDeviceToHost)); - // for (auto& cell : cells) { - // MCCompLabel currentLab{mTimeFrameGPU->getTrackletsLabel(iLayer)[cell.getFirstTrackletIndex()]}; - // MCCompLabel nextLab{mTimeFrameGPU->getTrackletsLabel(iLayer + 1)[cell.getSecondTrackletIndex()]}; - // mTimeFrameGPU->getCellsLabel(iLayer).emplace_back(currentLab == nextLab ? currentLab : MCCompLabel()); - // } - // } - // } - - // ///////////////////// - // /// Neighbour finding - // for (int iLayer{0}; iLayer < nLayers - 3; ++iLayer) { - // gpu::computeLayerCellNeighboursKernel<<<10, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer + 1), - // iLayer, - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer + 1), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer), - // nullptr, - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells(), - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxNeighboursSize); - - // // Compute Cell Neighbours LUT - // checkGPUError(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(), // d_temp_storage - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer), // d_in - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer), // d_out - // mTimeFrameGPU->getHostNCells(chunkId)[iLayer + 1], // num_items - // mTimeFrameGPU->getStream(chunkId).get())); - - // gpu::computeLayerCellNeighboursKernel<<<10, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer + 1), - // iLayer, - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer + 1), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeighbours(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells(), - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxNeighboursSize); - - // // if (!chunkId) { - // // gpu::printBufferLayerOnThread<<<1, 1, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(iLayer, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeighbours(iLayer), - // // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxNeighboursSize * rofs); - // // } - // } - // // Download cells into vectors - - // for (int iLevel{nLayers - 2}; iLevel >= mTrkParams[iteration].CellMinimumLevel(); --iLevel) { - // const int minimumLevel{iLevel - 1}; - // for (int iLayer{nLayers - 3}; iLayer >= minimumLevel; --iLayer) { - // // gpu::computeLayerRoadsKernel<<<1, 1, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(iLevel, // const int level, - // // iLayer, // const int layerIndex, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceArrayCells(), // const CellSeed** cells, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells(), // const int* nCells, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceArrayNeighboursCell(), // const int** neighbours, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceArrayNeighboursCellLUT(), // const int** neighboursLUT, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceRoads(), // Road* roads, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceRoadsLookupTables(iLayer)); // int* roadsLookupTable - // } - // } - - // // End of tracking for this chunk - // offset += rofs; - // } - // }; - // threads[chunkId] = std::thread(doTrackReconstruction); - // } - // for (auto& thread : threads) { - // thread.join(); - // } - - // mTimeFrameGPU->wipe(nLayers); } template @@ -299,7 +76,7 @@ int TrackerTraitsGPU::getTFNumberOfClusters() const template int TrackerTraitsGPU::getTFNumberOfTracklets() const { - return mTimeFrameGPU->getNumberOfTracklets(); + return std::accumulate(mTimeFrameGPU->getNTracklets().begin(), mTimeFrameGPU->getNTracklets().end(), 0); } template @@ -313,31 +90,94 @@ int TrackerTraitsGPU::getTFNumberOfCells() const template void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int iROFslice, int iVertex) { - TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex); + auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); + // TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex); + mTimeFrameGPU->createTrackletsLUTDevice(iteration); + + const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f); + gsl::span diamondSpan(&diamondVert, 1); + int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0}; + int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()}; + + countTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), + mTimeFrameGPU->getDeviceMultCutMask(), + startROF, + endROF, + mTimeFrameGPU->getNrof(), + mTrkParams[iteration].DeltaROF, + iVertex, + mTimeFrameGPU->getDeviceVertices(), + mTimeFrameGPU->getDeviceROFramesPV(), + mTimeFrameGPU->getPrimaryVerticesNum(), + mTimeFrameGPU->getDeviceArrayClusters(), + mTimeFrameGPU->getClusterSizes(), + mTimeFrameGPU->getDeviceROframeClusters(), + mTimeFrameGPU->getDeviceArrayUsedClusters(), + mTimeFrameGPU->getDeviceArrayClustersIndexTables(), + mTimeFrameGPU->getDeviceArrayTrackletsLUT(), + mTimeFrameGPU->getDeviceTrackletsLUTs(), // Required for the exclusive sums + iteration, + mTrkParams[iteration].NSigmaCut, + mTimeFrameGPU->getPhiCuts(), + mTrkParams[iteration].PVres, + mTimeFrameGPU->getMinRs(), + mTimeFrameGPU->getMaxRs(), + mTimeFrameGPU->getPositionResolutions(), + mTrkParams[iteration].LayerRadii, + mTimeFrameGPU->getMSangles(), + conf.nBlocks, + conf.nThreads); + mTimeFrameGPU->createTrackletsBuffers(); + computeTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), + mTimeFrameGPU->getDeviceMultCutMask(), + startROF, + endROF, + mTimeFrameGPU->getNrof(), + mTrkParams[iteration].DeltaROF, + iVertex, + mTimeFrameGPU->getDeviceVertices(), + mTimeFrameGPU->getDeviceROFramesPV(), + mTimeFrameGPU->getPrimaryVerticesNum(), + mTimeFrameGPU->getDeviceArrayClusters(), + mTimeFrameGPU->getClusterSizes(), + mTimeFrameGPU->getDeviceROframeClusters(), + mTimeFrameGPU->getDeviceArrayUsedClusters(), + mTimeFrameGPU->getDeviceArrayClustersIndexTables(), + mTimeFrameGPU->getDeviceArrayTracklets(), + mTimeFrameGPU->getDeviceTracklet(), + mTimeFrameGPU->getNTracklets(), + mTimeFrameGPU->getDeviceArrayTrackletsLUT(), + mTimeFrameGPU->getDeviceTrackletsLUTs(), + iteration, + mTrkParams[iteration].NSigmaCut, + mTimeFrameGPU->getPhiCuts(), + mTrkParams[iteration].PVres, + mTimeFrameGPU->getMinRs(), + mTimeFrameGPU->getMaxRs(), + mTimeFrameGPU->getPositionResolutions(), + mTrkParams[iteration].LayerRadii, + mTimeFrameGPU->getMSangles(), + conf.nBlocks, + conf.nThreads); } template void TrackerTraitsGPU::computeCellsHybrid(const int iteration) { - mTimeFrameGPU->loadTrackletsDevice(); - mTimeFrameGPU->loadTrackletsLUTDevice(); mTimeFrameGPU->createCellsLUTDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - // #pragma omp parallel for num_threads(nLayers) for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) { - if (mTimeFrameGPU->getTracklets()[iLayer + 1].empty() || - mTimeFrameGPU->getTracklets()[iLayer].empty()) { + if (!mTimeFrameGPU->getNTracklets()[iLayer + 1] || !mTimeFrameGPU->getNTracklets()[iLayer]) { continue; } - - const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getTracklets()[iLayer].size())}; + const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getNTracklets()[iLayer])}; countCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getDeviceArrayUnsortedClusters(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), mTimeFrameGPU->getDeviceArrayTracklets(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getTracklets()[iLayer].size(), + mTimeFrameGPU->getNTracklets()[iLayer], iLayer, nullptr, mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -354,7 +194,7 @@ void TrackerTraitsGPU::computeCellsHybrid(const int iteration) mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), mTimeFrameGPU->getDeviceArrayTracklets(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getTracklets()[iLayer].size(), + mTimeFrameGPU->getNTracklets()[iLayer], iLayer, mTimeFrameGPU->getDeviceCells()[iLayer], mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -378,7 +218,7 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); std::vector>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1); for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { - const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCellsDevice()[iLayer + 1])}; + const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer + 1])}; mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear(); mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].resize(nextLayerCellsNum, 0); @@ -441,7 +281,7 @@ void TrackerTraitsGPU::findRoads(const int iteration) std::vector lastCellId, updatedCellId; std::vector lastCellSeed, updatedCellSeed; - processNeighbours(startLayer, startLevel, mTimeFrame->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId); + processNeighbours(startLayer, startLevel, mTimeFrameGPU->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId); int level = startLevel; for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { @@ -495,8 +335,8 @@ void TrackerTraitsGPU::findRoads(const int iteration) if (track.getClusterIndex(iLayer) == UnusedIndex) { continue; } - nShared += int(mTimeFrame->isClusterUsed(iLayer, track.getClusterIndex(iLayer))); - isFirstShared |= !iLayer && mTimeFrame->isClusterUsed(iLayer, track.getClusterIndex(iLayer)); + nShared += int(mTimeFrameGPU->isClusterUsed(iLayer, track.getClusterIndex(iLayer))); + isFirstShared |= !iLayer && mTimeFrameGPU->isClusterUsed(iLayer, track.getClusterIndex(iLayer)); } if (nShared > mTrkParams[0].ClusterSharing) { @@ -508,8 +348,8 @@ void TrackerTraitsGPU::findRoads(const int iteration) if (track.getClusterIndex(iLayer) == UnusedIndex) { continue; } - mTimeFrame->markUsedCluster(iLayer, track.getClusterIndex(iLayer)); - int currentROF = mTimeFrame->getClusterROF(iLayer, track.getClusterIndex(iLayer)); + mTimeFrameGPU->markUsedCluster(iLayer, track.getClusterIndex(iLayer)); + int currentROF = mTimeFrameGPU->getClusterROF(iLayer, track.getClusterIndex(iLayer)); for (int iR{0}; iR < 3; ++iR) { if (rofs[iR] == INT_MAX) { rofs[iR] = currentROF; @@ -525,9 +365,10 @@ void TrackerTraitsGPU::findRoads(const int iteration) if (rofs[1] != INT_MAX) { track.setNextROFbit(); } - mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track); + mTimeFrameGPU->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track); } } + mTimeFrameGPU->loadUsedClustersDevice(); if (iteration == mTrkParams.size() - 1) { mTimeFrameGPU->unregisterHostMemory(0); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 73dcf3bcb4894..229827611c077 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -32,6 +32,7 @@ #include "ITStracking/IndexTableUtils.h" #include "ITStracking/MathUtils.h" #include "DataFormatsITS/TrackITS.h" +#include "ReconstructionDataFormats/Vertex.h" #include "ITStrackingGPU/TrackerTraitsGPU.h" #include "ITStrackingGPU/TrackingKernels.h" @@ -70,12 +71,39 @@ inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = } namespace o2::its - { using namespace constants::its2; +using Vertex = o2::dataformats::Vertex>; + +GPUd() float Sq(float v) +{ + return v * v; +} namespace gpu { + +GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, + const o2::its::IndexTableUtils& utils, + const float z1, const float z2, float maxdeltaz, float maxdeltaphi) +{ + const float zRangeMin = o2::gpu::CAMath::Min(z1, z2) - maxdeltaz; + const float phiRangeMin = (maxdeltaphi > constants::math::Pi) ? 0.f : currentCluster.phi - maxdeltaphi; + const float zRangeMax = o2::gpu::CAMath::Max(z1, z2) + maxdeltaz; + const float phiRangeMax = (maxdeltaphi > constants::math::Pi) ? constants::math::TwoPi : currentCluster.phi + maxdeltaphi; + + if (zRangeMax < -LayersZCoordinate()[layerIndex + 1] || + zRangeMin > LayersZCoordinate()[layerIndex + 1] || zRangeMin > zRangeMax) { + + return getEmptyBinsRect(); + } + + return int4{o2::gpu::CAMath::Max(0, utils.getZBinIndex(layerIndex + 1, zRangeMin)), + utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)), + o2::gpu::CAMath::Min(ZBins - 1, utils.getZBinIndex(layerIndex + 1, zRangeMax)), + utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))}; +} + GPUd() bool fitTrack(TrackITSExt& track, int start, int end, @@ -127,7 +155,7 @@ GPUd() bool fitTrack(TrackITSExt& track, } nCl++; } - return o2::gpu::GPUCommonMath::Abs(track.getQ2Pt()) < maxQoverPt && track.getChi2() < chi2ndfcut * (nCl * 2 - 5); + return o2::gpu::CAMath::Abs(track.getQ2Pt()) < maxQoverPt && track.getChi2() < chi2ndfcut * (nCl * 2 - 5); } GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, @@ -146,7 +174,7 @@ GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, const float y3 = tf3.positionTrackingFrame[0]; const float z3 = tf3.positionTrackingFrame[1]; - const bool zeroField{o2::gpu::GPUCommonMath::Abs(bz) < o2::constants::math::Almost0}; + const bool zeroField{o2::gpu::CAMath::Abs(bz) < o2::constants::math::Almost0}; const float tgp = zeroField ? o2::gpu::CAMath::ATan2(y3 - y1, x3 - x1) : 1.f; const float crv = zeroField ? 1.f : math_utils::computeCurvature(x3, y3, x2, y2, x1, y1); const float snp = zeroField ? tgp / o2::gpu::CAMath::Sqrt(1.f + tgp * tgp) : crv * (x3 - math_utils::computeCurvatureCentreX(x3, y3, x2, y2, x1, y1)); @@ -164,6 +192,17 @@ GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, 0.f, 0.f, 0.f, 0.f, sg2q2pt}); } +// auto sort_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); }; +// auto equal_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; }; + +struct sort_tracklets { + GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); } +}; + +struct equal_tracklets { + GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; } +}; + template struct pair_to_first : public thrust::unary_function, T1> { GPUhd() int operator()(const gpuPair& a) const @@ -196,6 +235,33 @@ struct is_valid_pair { } }; +GPUd() gpuSpan getPrimaryVertices(const int rof, + const int* roframesPV, + const int nROF, + const uint8_t* mask, + const Vertex* vertices) +{ + const int start_pv_id = roframesPV[rof]; + const int stop_rof = rof >= nROF - 1 ? nROF : rof + 1; + size_t delta = mask[rof] ? roframesPV[stop_rof] - start_pv_id : 0; // return empty span if ROF is excluded + return gpuSpan(&vertices[start_pv_id], delta); +}; + +GPUd() gpuSpan getClustersOnLayer(const int rof, + const int totROFs, + const int layer, + const int** roframesClus, + const Cluster** clusters) +{ + if (rof < 0 || rof >= totROFs) { + return gpuSpan(); + } + const int start_clus_id{roframesClus[layer][rof]}; + const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1; + const unsigned int delta = roframesClus[layer][stop_rof] - start_clus_id; + return gpuSpan(&(clusters[layer][start_clus_id]), delta); +} + template GPUg() void fitTrackSeedsKernel( CellSeed* trackSeeds, @@ -314,8 +380,8 @@ GPUg() void computeLayerCellsKernel( const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, - const Tracklet** tracklets, - const int** trackletsLUT, + Tracklet** tracklets, + int** trackletsLUT, const int nTrackletsCurrent, const int layer, CellSeed* cells, @@ -331,8 +397,8 @@ GPUg() void computeLayerCellsKernel( for (int iCurrentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentTrackletIndex < nTrackletsCurrent; iCurrentTrackletIndex += blockDim.x * gridDim.x) { const Tracklet& currentTracklet = tracklets[layer][iCurrentTrackletIndex]; const int nextLayerClusterIndex{currentTracklet.secondClusterIndex}; - const int nextLayerFirstTrackletIndex{trackletsLUT[layer][nextLayerClusterIndex]}; - const int nextLayerLastTrackletIndex{trackletsLUT[layer][nextLayerClusterIndex + 1]}; + const int nextLayerFirstTrackletIndex{trackletsLUT[layer + 1][nextLayerClusterIndex]}; + const int nextLayerLastTrackletIndex{trackletsLUT[layer + 1][nextLayerClusterIndex + 1]}; if (nextLayerFirstTrackletIndex == nextLayerLastTrackletIndex) { continue; } @@ -342,7 +408,7 @@ GPUg() void computeLayerCellsKernel( break; } const Tracklet& nextTracklet = tracklets[layer + 1][iNextTrackletIndex]; - const float deltaTanLambda{o2::gpu::GPUCommonMath::Abs(currentTracklet.tanLambda - nextTracklet.tanLambda)}; + const float deltaTanLambda{o2::gpu::CAMath::Abs(currentTracklet.tanLambda - nextTracklet.tanLambda)}; if (deltaTanLambda / cellDeltaTanLambdaSigma < nSigmaCut) { const int clusId[3]{ @@ -394,35 +460,124 @@ GPUg() void computeLayerCellsKernel( } } -///////////////////////////////////////// -// Debug Kernels -///////////////////////////////////////// -GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, - const o2::its::IndexTableUtils& utils, - const float z1, const float z2, float maxdeltaz, float maxdeltaphi) +template +GPUg() void computeLayerTrackletsMultiROFKernel( + const IndexTableUtils* utils, + const uint8_t* multMask, + const int layerIndex, + const int startROF, + const int endROF, + const int totalROFs, + const int deltaROF, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const int vertexId, + const Cluster** clusters, // Input data rof0 + const int** ROFClusters, // Number of clusters on layers per ROF + const unsigned char** usedClusters, // Used clusters + const int** indexTables, // Input data rof0-delta getNphiBins()}; + const int zBins{utils->getNzBins()}; + for (unsigned int iROF{blockIdx.x}; iROF < endROF - startROF; iROF += gridDim.x) { + const short rof0 = iROF + startROF; + auto primaryVertices = getPrimaryVertices(rof0, rofPV, totalROFs, multMask, vertices); + const auto startVtx{vertexId >= 0 ? vertexId : 0}; + const auto endVtx{vertexId >= 0 ? o2::gpu::CAMath::Min(vertexId + 1, static_cast(primaryVertices.size())) : static_cast(primaryVertices.size())}; + const short minROF = o2::gpu::CAMath::Max(startROF, static_cast(rof0 - deltaROF)); + const short maxROF = o2::gpu::CAMath::Min(endROF - 1, static_cast(rof0 + deltaROF)); + auto clustersCurrentLayer = getClustersOnLayer(rof0, totalROFs, layerIndex, ROFClusters, clusters); + if (clustersCurrentLayer.empty()) { + continue; + } - if (zRangeMax < -LayersZCoordinate()[layerIndex + 1] || - zRangeMin > LayersZCoordinate()[layerIndex + 1] || zRangeMin > zRangeMax) { + for (int currentClusterIndex = threadIdx.x; currentClusterIndex < clustersCurrentLayer.size(); currentClusterIndex += blockDim.x) { + unsigned int storedTracklets{0}; + auto currentCluster{clustersCurrentLayer[currentClusterIndex]}; + const int currentSortedIndex{ROFClusters[layerIndex][rof0] + currentClusterIndex}; + if (usedClusters[layerIndex][currentCluster.clusterId]) { + continue; + } - return getEmptyBinsRect(); - } + const float inverseR0{1.f / currentCluster.radius}; + for (int iV{startVtx}; iV < endVtx; ++iV) { + auto& primaryVertex{primaryVertices[iV]}; + if (primaryVertex.isFlagSet(2) && iteration != 3) { + continue; + } + const float resolution = o2::gpu::CAMath::Sqrt(Sq(resolutionPV) / primaryVertex.getNContributors() + Sq(positionResolution)); + const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0}; + const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate}; + const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; + const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution + const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * MSAngle))}; + const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * NSigmaCut, phiCut)}; + if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { + continue; + } + int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; - return int4{o2::gpu::GPUCommonMath::Max(0, utils.getZBinIndex(layerIndex + 1, zRangeMin)), - utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)), - o2::gpu::GPUCommonMath::Min(ZBins - 1, utils.getZBinIndex(layerIndex + 1, zRangeMax)), - utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))}; -} + if (phiBinsNum < 0) { + phiBinsNum += phiBins; + } -GPUhd() float Sq(float q) -{ - return q * q; + const int tableSize{phiBins * zBins + 1}; + for (short rof1{minROF}; rof1 <= maxROF; ++rof1) { + auto clustersNextLayer = getClustersOnLayer(rof1, totalROFs, layerIndex + 1, ROFClusters, clusters); + if (clustersNextLayer.empty()) { + continue; + } + for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { + int iPhiBin = (selectedBinsRect.y + iPhiCount) % phiBins; + const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; + const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; + const int firstRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + firstBinIndex]; + const int maxRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + maxBinIndex]; + for (int nextClusterIndex{firstRowClusterIndex}; nextClusterIndex < maxRowClusterIndex; ++nextClusterIndex) { + if (nextClusterIndex >= clustersNextLayer.size()) { + break; + } + const Cluster& nextCluster{clustersNextLayer[nextClusterIndex]}; + if (usedClusters[layerIndex + 1][nextCluster.clusterId]) { + continue; + } + const float deltaPhi{o2::gpu::CAMath::Abs(currentCluster.phi - nextCluster.phi)}; + const float deltaZ{o2::gpu::CAMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)}; + const int nextSortedIndex{ROFClusters[layerIndex + 1][rof1] + nextClusterIndex}; + if (deltaZ / sigmaZ < NSigmaCut && (deltaPhi < phiCut || o2::gpu::CAMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut)) { + if constexpr (initRun) { + trackletsLUT[layerIndex][currentSortedIndex]++; // we need l0 as well for usual exclusive sums. + } else { + const float phi{o2::gpu::CAMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; + const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)}; + new (tracklets[layerIndex] + trackletsLUT[layerIndex][currentSortedIndex] + storedTracklets) Tracklet{currentSortedIndex, nextSortedIndex, tanL, phi, rof0, rof1}; + } + ++storedTracklets; + } + } + } + } + } + } + } } +///////////////////////////////////////// +// Debug Kernels +///////////////////////////////////////// + template GPUd() void pPointer(T* ptr) { @@ -437,7 +592,6 @@ GPUg() void printPointersKernel(std::tuple args) std::apply(print_all, args); } -// Functors to sort tracklets template struct trackletSortEmptyFunctor : public thrust::binary_function { GPUhd() bool operator()(const T& lhs, const T& rhs) const @@ -454,7 +608,6 @@ struct trackletSortIndexFunctor : public thrust::binary_function { } }; -// Print layer buffer GPUg() void printBufferLayerOnThread(const int layer, const int* v, unsigned int size, const int len = 150, const unsigned int tId = 0) { if (blockIdx.x * blockDim.x + threadIdx.x == tId) { @@ -494,52 +647,12 @@ GPUg() void printBufferPointersLayerOnThread(const int layer, void** v, unsigned } } -// Dump vertices GPUg() void printVertices(const Vertex* v, unsigned int size, const unsigned int tId = 0) { if (blockIdx.x * blockDim.x + threadIdx.x == tId) { - printf("vertices: "); + printf("vertices: \n"); for (int i{0}; i < size; ++i) { - printf("x=%f y=%f z=%f\n", v[i].getX(), v[i].getY(), v[i].getZ()); - } - } -} - -// Dump tracklets -GPUg() void printTracklets(const Tracklet* t, - const int offset, - const int startRof, - const int nrof, - const int* roFrameClustersCurrentLayer, // Number of clusters on layer 0 per ROF - const int* roFrameClustersNextLayer, // Number of clusters on layer 1 per ROF - const int maxClustersPerRof = 5e2, - const int maxTrackletsPerCluster = 50, - const unsigned int tId = 0) -{ - if (threadIdx.x == tId) { - auto offsetCurrent{roFrameClustersCurrentLayer[offset]}; - auto offsetNext{roFrameClustersNextLayer[offset]}; - auto offsetChunk{(startRof - offset) * maxClustersPerRof * maxTrackletsPerCluster}; - for (int i{offsetChunk}; i < offsetChunk + nrof * maxClustersPerRof * maxTrackletsPerCluster; ++i) { - if (t[i].firstClusterIndex != -1) { - t[i].dump(offsetCurrent, offsetNext); - } - } - } -} - -GPUg() void printTrackletsNotStrided(const Tracklet* t, - const int offset, - const int* roFrameClustersCurrentLayer, // Number of clusters on layer 0 per ROF - const int* roFrameClustersNextLayer, // Number of clusters on layer 1 per ROF - const int ntracklets, - const unsigned int tId = 0) -{ - if (threadIdx.x == tId) { - auto offsetCurrent{roFrameClustersCurrentLayer[offset]}; - auto offsetNext{roFrameClustersNextLayer[offset]}; - for (int i{0}; i < ntracklets; ++i) { - t[i].dump(offsetCurrent, offsetNext); + printf("\tx=%f y=%f z=%f\n", v[i].getX(), v[i].getY(), v[i].getZ()); } } } @@ -556,102 +669,25 @@ GPUg() void printNeighbours(const gpuPair* neighbours, } } -// Compute the tracklets for a given layer -template -GPUg() void computeLayerTrackletsKernelSingleRof( - const short rof0, - const short maxRofs, - const int layerIndex, - const Cluster* clustersCurrentLayer, // input data rof0 - const Cluster* clustersNextLayer, // input data rof0-delta * trkPars, - const IndexTableUtils* utils, - const unsigned int maxTrackletsPerCluster = 50) +GPUg() void printTrackletsLUTPerROF(const int layerId, + const int** ROFClusters, + int** luts, + const int tId = 0) { - for (int currentClusterIndex = blockIdx.x * blockDim.x + threadIdx.x; currentClusterIndex < currentLayerClustersSize; currentClusterIndex += blockDim.x * gridDim.x) { - unsigned int storedTracklets{0}; - const Cluster& currentCluster{clustersCurrentLayer[currentClusterIndex]}; - const int currentSortedIndex{roFrameClusters[rof0] + currentClusterIndex}; - if (usedClustersLayer[currentSortedIndex]) { - continue; - } - short minRof = (rof0 >= trkPars->DeltaROF) ? rof0 - trkPars->DeltaROF : 0; - short maxRof = (rof0 == static_cast(maxRofs - trkPars->DeltaROF)) ? rof0 : rof0 + trkPars->DeltaROF; - const float inverseR0{1.f / currentCluster.radius}; - for (int iPrimaryVertex{0}; iPrimaryVertex < nVertices; iPrimaryVertex++) { - const auto& primaryVertex{vertices[iPrimaryVertex]}; - if (primaryVertex.getX() == 0.f && primaryVertex.getY() == 0.f && primaryVertex.getZ() == 0.f) { - continue; - } - const float resolution{o2::gpu::GPUCommonMath::Sqrt(Sq(trkPars->PVres) / primaryVertex.getNContributors() + Sq(positionResolution))}; - const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0}; - const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate}; - const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; - const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution - const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * mSAngle))}; - - const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * trkPars->NSigmaCut, phiCut)}; - if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { + if (blockIdx.x * blockDim.x + threadIdx.x == tId) { + for (auto rofId{0}; rofId < 2304; ++rofId) { + int nClus = ROFClusters[layerId][rofId + 1] - ROFClusters[layerId][rofId]; + if (!nClus) { continue; } - int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; - if (phiBinsNum < 0) { - phiBinsNum += trkPars->PhiBins; - } - constexpr int tableSize{256 * 128 + 1}; // hardcoded for the time being + printf("rof: %d (%d) ==> ", rofId, nClus); - for (short rof1{minRof}; rof1 <= maxRof; ++rof1) { - if (!(roFrameClustersNext[rof1 + 1] - roFrameClustersNext[rof1])) { // number of clusters on next layer > 0 - continue; - } - for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { - int iPhiBin = (selectedBinsRect.y + iPhiCount) % trkPars->PhiBins; - const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; - const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; - const int firstRowClusterIndex = indexTable[rof1 * tableSize + firstBinIndex]; - const int maxRowClusterIndex = indexTable[rof1 * tableSize + maxBinIndex]; - for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { - if (iNextCluster >= (roFrameClustersNext[rof1 + 1] - roFrameClustersNext[rof1])) { - break; - } - const Cluster& nextCluster{getPtrFromRuler(rof1, clustersNextLayer, roFrameClustersNext)[iNextCluster]}; - if (usedClustersNextLayer[nextCluster.clusterId]) { - continue; - } - const float deltaPhi{o2::gpu::GPUCommonMath::Abs(currentCluster.phi - nextCluster.phi)}; - const float deltaZ{o2::gpu::GPUCommonMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)}; - - if (deltaZ / sigmaZ < trkPars->NSigmaCut && (deltaPhi < phiCut || o2::gpu::GPUCommonMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut)) { - trackletsLookUpTable[currentSortedIndex]++; // Race-condition safe - const float phi{o2::gpu::GPUCommonMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; - const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)}; - const unsigned int stride{currentClusterIndex * maxTrackletsPerCluster}; - new (tracklets + stride + storedTracklets) Tracklet{currentSortedIndex, roFrameClustersNext[rof1] + iNextCluster, tanL, phi, rof0, rof1}; - ++storedTracklets; - } - } - } + for (int iC{0}; iC < nClus; ++iC) { + int nT = luts[layerId][ROFClusters[layerId][rofId] + iC]; + printf("%d\t", nT); } + printf("\n"); } - // if (storedTracklets > maxTrackletsPerCluster) { - // printf("its-gpu-tracklet finder: found more tracklets per clusters (%d) than maximum set (%d), check the configuration!\n", maxTrackletsPerCluster, storedTracklets); - // } } } @@ -661,124 +697,7 @@ GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, const int nTracklets) { for (int currentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; currentTrackletIndex < nTracklets; currentTrackletIndex += blockDim.x * gridDim.x) { - auto& tracklet{tracklets[currentTrackletIndex]}; - if (tracklet.firstClusterIndex >= 0) { - atomicAdd(trackletsLookUpTable + tracklet.firstClusterIndex, 1); - } - } -} - -template -GPUg() void computeLayerTrackletsKernelMultipleRof( - const int layerIndex, - const int iteration, - const unsigned int startRofId, - const unsigned int rofSize, - const int maxRofs, - const Cluster* clustersCurrentLayer, // input data rof0 - const Cluster* clustersNextLayer, // input data rof0-delta * trkPars, - const IndexTableUtils* utils, - const unsigned int maxClustersPerRof = 5e2, - const unsigned int maxTrackletsPerCluster = 50) -{ - const int phiBins{utils->getNphiBins()}; - const int zBins{utils->getNzBins()}; - for (unsigned int iRof{blockIdx.x}; iRof < rofSize; iRof += gridDim.x) { - auto rof0 = iRof + startRofId; - auto nClustersCurrentLayerRof = o2::gpu::GPUCommonMath::Min(roFrameClustersCurrentLayer[rof0 + 1] - roFrameClustersCurrentLayer[rof0], (int)maxClustersPerRof); - // if (nClustersCurrentLayerRof > maxClustersPerRof) { - // printf("its-gpu-tracklet finder: on layer %d found more clusters per ROF (%d) than maximum set (%d), check the configuration!\n", layerIndex, nClustersCurrentLayerRof, maxClustersPerRof); - // } - auto* clustersCurrentLayerRof = clustersCurrentLayer + (roFrameClustersCurrentLayer[rof0] - roFrameClustersCurrentLayer[startRofId]); - auto nVerticesRof0 = nVertices[rof0 + 1] - nVertices[rof0]; - auto trackletsRof0 = tracklets + maxTrackletsPerCluster * maxClustersPerRof * iRof; - for (int currentClusterIndex = threadIdx.x; currentClusterIndex < nClustersCurrentLayerRof; currentClusterIndex += blockDim.x) { - unsigned int storedTracklets{0}; - const Cluster& currentCluster{clustersCurrentLayerRof[currentClusterIndex]}; - const int currentSortedIndex{roFrameClustersCurrentLayer[rof0] + currentClusterIndex}; - const int currentSortedIndexChunk{currentSortedIndex - roFrameClustersCurrentLayer[startRofId]}; - if (usedClustersLayer[currentSortedIndex]) { - continue; - } - - int minRof = (rof0 >= trkPars->DeltaROF) ? rof0 - trkPars->DeltaROF : 0; - int maxRof = (rof0 == maxRofs - trkPars->DeltaROF) ? rof0 : rof0 + trkPars->DeltaROF; // works with delta = {0, 1} - const float inverseR0{1.f / currentCluster.radius}; - - for (int iPrimaryVertex{0}; iPrimaryVertex < nVerticesRof0; iPrimaryVertex++) { - const auto& primaryVertex{vertices[nVertices[rof0] + iPrimaryVertex]}; - const float resolution{o2::gpu::GPUCommonMath::Sqrt(Sq(trkPars->PVres) / primaryVertex.getNContributors() + Sq(positionResolution))}; - const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0}; - const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate}; - const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; - const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution - const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * mSAngle))}; - - const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * trkPars->NSigmaCut, phiCut)}; - - if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { - continue; - } - int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; - if (phiBinsNum < 0) { - phiBinsNum += trkPars->PhiBins; - } - const int tableSize{phiBins * zBins + 1}; - for (int rof1{minRof}; rof1 <= maxRof; ++rof1) { - auto nClustersNext{roFrameClustersNextLayer[rof1 + 1] - roFrameClustersNextLayer[rof1]}; - if (!nClustersNext) { // number of clusters on next layer > 0 - continue; - } - for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { - int iPhiBin = (selectedBinsRect.y + iPhiCount) % trkPars->PhiBins; - const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; - const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; - const int firstRowClusterIndex = indexTablesNext[(rof1 - startRofId) * tableSize + firstBinIndex]; - const int maxRowClusterIndex = indexTablesNext[(rof1 - startRofId) * tableSize + maxBinIndex]; - for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { - if (iNextCluster >= nClustersNext) { - break; - } - auto nextClusterIndex{roFrameClustersNextLayer[rof1] - roFrameClustersNextLayer[startRofId] + iNextCluster}; - const Cluster& nextCluster{clustersNextLayer[nextClusterIndex]}; - if (usedClustersNextLayer[nextCluster.clusterId]) { - continue; - } - const float deltaPhi{o2::gpu::GPUCommonMath::Abs(currentCluster.phi - nextCluster.phi)}; - const float deltaZ{o2::gpu::GPUCommonMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)}; - - if ((deltaZ / sigmaZ < trkPars->NSigmaCut && (deltaPhi < phiCut || o2::gpu::GPUCommonMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut))) { - const float phi{o2::gpu::GPUCommonMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; - const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)}; - const unsigned int stride{currentClusterIndex * maxTrackletsPerCluster}; - if (storedTracklets < maxTrackletsPerCluster) { - new (trackletsRof0 + stride + storedTracklets) Tracklet{currentSortedIndexChunk, nextClusterIndex, tanL, phi, static_cast(rof0), static_cast(rof1)}; - } - // else { - // printf("its-gpu-tracklet-finder: on rof %d layer: %d: found more tracklets (%d) than maximum allowed per cluster. This is lossy!\n", rof0, layerIndex, storedTracklets); - // } - ++storedTracklets; - } - } - } - } - } - } + atomicAdd(&trackletsLookUpTable[tracklets[currentTrackletIndex].firstClusterIndex], 1); } } @@ -803,12 +722,176 @@ GPUg() void removeDuplicateTrackletsEntriesLUTKernel( } // namespace gpu +template +void countTrackletsInROFsHandler(const IndexTableUtils* utils, + const uint8_t* multMask, + const int startROF, + const int endROF, + const int maxROF, + const int deltaROF, + const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + std::vector nClusters, + const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + int** trackletsLUTs, + gsl::span trackletsLUTsHost, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minRs, + std::vector& maxRs, + std::vector& resolutions, + std::vector& radii, + std::vector& mulScatAng, + const int nBlocks, + const int nThreads) +{ + for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { + gpu::computeLayerTrackletsMultiROFKernel<<>>( + utils, + multMask, + iLayer, + startROF, + endROF, + maxROF, + deltaROF, + vertices, + rofPV, + nVertices, + vertexId, + clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + nullptr, + trackletsLUTs, + iteration, + NSigmaCut, + phiCuts[iLayer], + resolutionPV, + minRs[iLayer + 1], + maxRs[iLayer + 1], + resolutions[iLayer], + radii[iLayer + 1] - radii[iLayer], + mulScatAng[iLayer]); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + gpuCheckError(cudaFree(d_temp_storage)); + } +} + +template +void computeTrackletsInROFsHandler(const IndexTableUtils* utils, + const uint8_t* multMask, + const int startROF, + const int endROF, + const int maxROF, + const int deltaROF, + const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + std::vector nClusters, + const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + Tracklet** tracklets, + gsl::span spanTracklets, + gsl::span nTracklets, + int** trackletsLUTs, + gsl::span trackletsLUTsHost, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minRs, + std::vector& maxRs, + std::vector& resolutions, + std::vector& radii, + std::vector& mulScatAng, + const int nBlocks, + const int nThreads) +{ + for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { + gpu::computeLayerTrackletsMultiROFKernel<<>>(utils, + multMask, + iLayer, + startROF, + endROF, + maxROF, + deltaROF, + vertices, + rofPV, + nVertices, + vertexId, + clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + tracklets, + trackletsLUTs, + iteration, + NSigmaCut, + phiCuts[iLayer], + resolutionPV, + minRs[iLayer + 1], + maxRs[iLayer + 1], + resolutions[iLayer], + radii[iLayer + 1] - radii[iLayer], + mulScatAng[iLayer]); + thrust::device_ptr tracklets_ptr(spanTracklets[iLayer]); + thrust::sort(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets()); + auto unique_end = thrust::unique(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets()); + nTracklets[iLayer] = unique_end - tracklets_ptr; + if (iLayer > 0) { + gpuCheckError(cudaMemset(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int))); + gpu::compileTrackletsLookupTableKernel<<>>(spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + gpuCheckError(cudaFree(d_temp_storage)); + } + } +} + void countCellsHandler( const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, - const Tracklet** tracklets, - const int** trackletsLUT, + Tracklet** tracklets, + int** trackletsLUT, const int nTracklets, const int layer, CellSeed* cells, @@ -850,7 +933,6 @@ void countCellsHandler( cellsLUTsHost, // d_out nTracklets + 1, // num_items 0)); // NOLINT: this is the offset of the sum, not a pointer - // gpu::printBufferLayerOnThread<<<1, 1>>>(layer, cellsLUTsHost, nTracklets + 1); gpuCheckError(cudaFree(d_temp_storage)); } @@ -858,8 +940,8 @@ void computeCellsHandler( const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, - const Tracklet** tracklets, - const int** trackletsLUT, + Tracklet** tracklets, + int** trackletsLUT, const int nTracklets, const int layer, CellSeed* cells, @@ -963,8 +1045,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, const int nThreads) { - gpu::computeLayerCellNeighboursKernel<<>>( + gpu::computeLayerCellNeighboursKernel<<>>( cellsLayersDevice, neighboursLUT, neighboursIndexTable, @@ -1032,4 +1114,65 @@ void trackSeedHandler(CellSeed* trackSeeds, gpuCheckError(cudaPeekAtLastError()); gpuCheckError(cudaDeviceSynchronize()); } -} // namespace o2::its + +template void countTrackletsInROFsHandler<7>(const IndexTableUtils* utils, + const uint8_t* multMask, + const int startROF, + const int endROF, + const int maxROF, + const int deltaROF, + const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + std::vector nClusters, + const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + int** trackletsLUTs, + gsl::span trackletsLUTsHost, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minRs, + std::vector& maxRs, + std::vector& resolutions, + std::vector& radii, + std::vector& mulScatAng, + const int nBlocks, + const int nThreads); + +template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, + const uint8_t* multMask, + const int startROF, + const int endROF, + const int maxROF, + const int deltaROF, + const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + std::vector nClusters, + const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + Tracklet** tracklets, + gsl::span spanTracklets, + gsl::span nTracklets, + int** trackletsLUTs, + gsl::span trackletsLUTsHost, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minRs, + std::vector& maxRs, + std::vector& resolutions, + std::vector& radii, + std::vector& mulScatAng, + const int nBlocks, + const int nThreads); +} // namespace o2::its \ No newline at end of file diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h index 906eb0fa5c21e..fa4f33782d16a 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h @@ -106,12 +106,16 @@ class TimeFrame float getBeamX() const; float getBeamY() const; - + std::vector& getMinRs() { return mMinR; } + std::vector& getMaxRs() { return mMaxR; } float getMinR(int layer) const { return mMinR[layer]; } float getMaxR(int layer) const { return mMaxR[layer]; } float getMSangle(int layer) const { return mMSangles[layer]; } + std::vector& getMSangles() { return mMSangles; } float getPhiCut(int layer) const { return mPhiCuts[layer]; } + std::vector& getPhiCuts() { return mPhiCuts; } float getPositionResolution(int layer) const { return mPositionResolution[layer]; } + std::vector& getPositionResolutions() { return mPositionResolution; } gsl::span getClustersOnLayer(int rofId, int layerId); gsl::span getClustersOnLayer(int rofId, int layerId) const; @@ -209,8 +213,8 @@ class TimeFrame const unsigned long long& getRoadLabel(int i) const; bool isRoadFake(int i) const; - void setMultiplicityCutMask(const std::vector& cutMask) { mMultiplicityCutMask = cutMask; } - void setROFMask(const std::vector& rofMask) { mROFMask = rofMask; } + void setMultiplicityCutMask(const std::vector& cutMask) { mMultiplicityCutMask = cutMask; } + void setROFMask(const std::vector& rofMask) { mROFMask = rofMask; } void swapMasks() { mMultiplicityCutMask.swap(mROFMask); } int hasBogusClusters() const { return std::accumulate(mBogusClusters.begin(), mBogusClusters.end(), 0); } @@ -289,6 +293,7 @@ class TimeFrame std::vector> mTracks; std::vector> mCellsNeighbours; std::vector> mCellsLookupTable; + std::vector mMultiplicityCutMask; const o2::base::PropagatorImpl* mPropagatorDevice = nullptr; // Needed only for GPU protected: @@ -311,8 +316,8 @@ class TimeFrame std::vector mPhiCuts; std::vector mPositionResolution; std::vector mClusterSize; - std::vector mMultiplicityCutMask; - std::vector mROFMask; + + std::vector mROFMask; std::vector> mPValphaX; /// PV x and alpha for track propagation std::vector> mTrackletLabels; std::vector> mCellLabels; @@ -439,33 +444,33 @@ inline gsl::span TimeFrame::getClustersPerROFrange(int rofMin, in return gsl::span(); } int startIdx{mROFramesClusters[layerId][rofMin]}; // First cluster of rofMin - int endIdx{mROFramesClusters[layerId][std::min(rofMin + range, mNrof)]}; + int endIdx{mROFramesClusters[layerId][o2::gpu::CAMath::Min(rofMin + range, mNrof)]}; return {&mClusters[layerId][startIdx], static_cast::size_type>(endIdx - startIdx)}; } inline gsl::span TimeFrame::getROFramesClustersPerROFrange(int rofMin, int range, int layerId) const { - int chkdRange{std::min(range, mNrof - rofMin)}; + int chkdRange{o2::gpu::CAMath::Min(range, mNrof - rofMin)}; return {&mROFramesClusters[layerId][rofMin], static_cast::size_type>(chkdRange)}; } inline gsl::span TimeFrame::getNClustersROFrange(int rofMin, int range, int layerId) const { - int chkdRange{std::min(range, mNrof - rofMin)}; + int chkdRange{o2::gpu::CAMath::Min(range, mNrof - rofMin)}; return {&mNClustersPerROF[layerId][rofMin], static_cast::size_type>(chkdRange)}; } inline int TimeFrame::getTotalClustersPerROFrange(int rofMin, int range, int layerId) const { int startIdx{rofMin}; // First cluster of rofMin - int endIdx{std::min(rofMin + range, mNrof)}; + int endIdx{o2::gpu::CAMath::Min(rofMin + range, mNrof)}; return mROFramesClusters[layerId][endIdx] - mROFramesClusters[layerId][startIdx]; } inline gsl::span TimeFrame::getIndexTablePerROFrange(int rofMin, int range, int layerId) const { const int iTableSize{mIndexTableUtils.getNphiBins() * mIndexTableUtils.getNzBins() + 1}; - int chkdRange{std::min(range, mNrof - rofMin)}; + int chkdRange{o2::gpu::CAMath::Min(range, mNrof - rofMin)}; return {&mIndexTables[layerId][rofMin * iTableSize], static_cast::size_type>(chkdRange * iTableSize)}; } diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index da0abbae9dc1f..409b20ea23235 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -75,9 +75,9 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in for (int rof0{startROF}; rof0 < endROF; ++rof0) { gsl::span primaryVertices = mTrkParams[iteration].UseDiamond ? diamondSpan : tf->getPrimaryVertices(rof0); const int startVtx{iVertex >= 0 ? iVertex : 0}; - const int endVtx{iVertex >= 0 ? std::min(iVertex + 1, static_cast(primaryVertices.size())) : static_cast(primaryVertices.size())}; - int minRof = std::max(startROF, rof0 - mTrkParams[iteration].DeltaROF); - int maxRof = std::min(endROF - 1, rof0 + mTrkParams[iteration].DeltaROF); + const int endVtx{iVertex >= 0 ? o2::gpu::CAMath::Min(iVertex + 1, static_cast(primaryVertices.size())) : static_cast(primaryVertices.size())}; + int minRof = o2::gpu::CAMath::Max(startROF, rof0 - mTrkParams[iteration].DeltaROF); + int maxRof = o2::gpu::CAMath::Min(endROF - 1, rof0 + mTrkParams[iteration].DeltaROF); #pragma omp parallel for num_threads(mNThreads) for (int iLayer = 0; iLayer < mTrkParams[iteration].TrackletsPerRoad(); ++iLayer) { gsl::span layer0 = tf->getClustersOnLayer(rof0, iLayer); @@ -128,7 +128,6 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in if (layer1.empty()) { continue; } - for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { int iPhiBin = (selectedBinsRect.y + iPhiCount) % mTrkParams[iteration].PhiBins; const int firstBinIndex{tf->mIndexTableUtils.getBinIndex(selectedBinsRect.x, iPhiBin)}; @@ -145,9 +144,7 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in } const int firstRowClusterIndex = tf->getIndexTable(rof1, iLayer + 1)[firstBinIndex]; const int maxRowClusterIndex = tf->getIndexTable(rof1, iLayer + 1)[maxBinIndex]; - for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { - if (iNextCluster >= (int)layer1.size()) { break; } @@ -668,7 +665,7 @@ void TrackerTraits::findRoads(const int iteration) if (rofs[1] != INT_MAX) { track.setNextROFbit(); } - mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track); + mTimeFrame->getTracks(o2::gpu::CAMath::Min(rofs[0], rofs[1])).emplace_back(track); } } } diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx index f00d87164d7d6..5b8a9bb1cb0f2 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx @@ -174,7 +174,7 @@ void ITSTrackingInterface::run(framework::ProcessingContext& pc) auto errorLogger = [&](std::string s) { LOG(error) << s; }; FastMultEst multEst; // mult estimator - std::vector processingMask, processUPCMask; + std::vector processingMask, processUPCMask; int cutVertexMult{0}, cutUPCVertex{0}, cutRandomMult = int(trackROFvec.size()) - multEst.selectROFs(trackROFvec, compClusters, physTriggers, processingMask); processUPCMask.resize(processingMask.size(), false); mTimeFrame->setMultiplicityCutMask(processingMask); diff --git a/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx b/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx index 4eaddc8385b8a..e87e2289b49e7 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx @@ -90,7 +90,7 @@ float Vertexer::clustersToVerticesHybrid(std::function logg auto timeVertexingIteration = evaluateTask( &Vertexer::findVerticesHybrid, "Hybrid Vertexer vertex finding", [](std::string) {}, iteration); - printEpilog(logger, true, nTracklets01, nTracklets12, mTimeFrame->getNLinesTotal(), mTimeFrame->getTotVertIteration().size(), timeInit, timeTracklet, timeSelection, timeVertexing); + printEpilog(logger, true, nTracklets01, nTracklets12, mTimeFrame->getNLinesTotal(), mTimeFrame->getTotVertIteration()[iteration], timeInitIteration, timeTrackletIteration, timeSelectionIteration, timeVertexingIteration); timeInit += timeInitIteration; timeTracklet += timeTrackletIteration; timeSelection += timeSelectionIteration; @@ -142,9 +142,9 @@ void Vertexer::printEpilog(std::function logger, const float initT, const float trackletT, const float selecT, const float vertexT) { float total = initT + trackletT + selecT + vertexT; - logger(fmt::format(" - {}Vertexer: found {} | {} tracklets in: {} ms", isHybrid ? "Hybrid" : "", trackletN01, trackletN12, trackletT)); - logger(fmt::format(" - {}Vertexer: selected {} tracklets in: {} ms", isHybrid ? "Hybrid" : "", selectedN, selecT)); - logger(fmt::format(" - {}Vertexer: found {} vertices in: {} ms", isHybrid ? "Hybrid" : "", vertexN, vertexT)); + logger(fmt::format(" - {}Vertexer: found {} | {} tracklets in: {} ms", isHybrid ? "Hybrid " : "", trackletN01, trackletN12, trackletT)); + logger(fmt::format(" - {}Vertexer: selected {} tracklets in: {} ms", isHybrid ? "Hybrid " : "", selectedN, selecT)); + logger(fmt::format(" - {}Vertexer: found {} vertices in: {} ms", isHybrid ? "Hybrid " : "", vertexN, vertexT)); // logger(fmt::format(" - Timeframe {} vertexing completed in: {} ms, using {} thread(s).", mTimeFrameCounter++, total, mTraits->getNThreads())); } diff --git a/Detectors/ITSMFT/ITS/workflow/src/CookedTrackerSpec.cxx b/Detectors/ITSMFT/ITS/workflow/src/CookedTrackerSpec.cxx index 01e649f982896..4a0470adcf07a 100644 --- a/Detectors/ITSMFT/ITS/workflow/src/CookedTrackerSpec.cxx +++ b/Detectors/ITSMFT/ITS/workflow/src/CookedTrackerSpec.cxx @@ -132,7 +132,7 @@ void CookedTrackerDPL::run(ProcessingContext& pc) const auto& multEstConf = FastMultEstConfig::Instance(); // parameters for mult estimation and cuts FastMultEst multEst; // mult estimator - std::vector processingMask; + std::vector processingMask; int cutVertexMult{0}, cutRandomMult = int(rofsinput.size()) - multEst.selectROFs(rofsinput, compClusters, physTriggers, processingMask); // auto processingMask_ephemeral = processingMask; diff --git a/Framework/Core/CMakeLists.txt b/Framework/Core/CMakeLists.txt index d1dde7e78fdd1..84a4a7f8b8c69 100644 --- a/Framework/Core/CMakeLists.txt +++ b/Framework/Core/CMakeLists.txt @@ -238,6 +238,7 @@ add_executable(o2-test-framework-core test/test_Services.cxx test/test_StringHelpers.cxx test/test_StaticFor.cxx + test/test_TableSpawner.cxx test/test_TMessageSerializer.cxx test/test_TableBuilder.cxx test/test_TimeParallelPipelining.cxx diff --git a/Framework/Core/include/Framework/ASoA.h b/Framework/Core/include/Framework/ASoA.h index 34d18476e483d..25e64daefeba7 100644 --- a/Framework/Core/include/Framework/ASoA.h +++ b/Framework/Core/include/Framework/ASoA.h @@ -15,6 +15,7 @@ #include "Framework/Pack.h" #include "Framework/FunctionalHelpers.h" #include "Headers/DataHeader.h" +#include "Headers/DataHeaderHelpers.h" #include "Framework/CompilerBuiltins.h" #include "Framework/Traits.h" #include "Framework/Expressions.h" @@ -26,152 +27,362 @@ #include #include #include +#include #include #include #include #include -#define DECLARE_SOA_METADATA() \ - template \ - struct MetadataTrait { \ - using metadata = std::void_t; \ - }; - -#define DECLARE_SOA_ITERATOR_METADATA() \ - template \ - requires(o2::soa::is_iterator) \ - struct MetadataTrait { \ - using metadata = typename MetadataTrait::metadata; \ - }; - namespace o2::framework { using ListVector = std::vector>; std::string cutString(std::string&& str); +std::string strToUpper(std::string&& str); +} // namespace o2::framework -struct OriginEnc { - static constexpr size_t size = 4; - uint32_t value; - consteval OriginEnc(uint32_t v) : value{v} - { - } - consteval OriginEnc(std::string_view in) : value{0} +namespace o2::soa +{ +void accessingInvalidIndexFor(const char* getter); +void dereferenceWithWrongType(); +void missingFilterDeclaration(int hash, int ai); +void notBoundTable(const char* tableName); +} // namespace o2::soa + +namespace o2::soa +{ +/// Generic identifier for a table type +struct TableRef { + consteval TableRef() + : label_hash{0}, + desc_hash{0}, + origin_hash{0}, + version{0} { - for (auto i = 0U; i < (size < in.size() ? size : in.size()); ++i) { - value |= ((uint8_t)in[i]) << (8 * i); - } } - operator const char*() const + consteval TableRef(uint32_t _label, uint32_t _desc, uint32_t _origin, uint32_t _version) + : label_hash{_label}, + desc_hash{_desc}, + origin_hash{_origin}, + version{_version} { - return (const char*)(&value); } + uint32_t label_hash; + uint32_t desc_hash; + uint32_t origin_hash; + uint32_t version; - consteval operator o2::header::DataOrigin() + constexpr bool operator==(TableRef const& other) const noexcept { - return o2::header::DataOrigin{value}; + return (this->label_hash == other.label_hash) && + (this->desc_hash == other.desc_hash) && + (this->origin_hash == other.origin_hash) && + (this->version == other.version); } - consteval OriginEnc(OriginEnc const& other) = default; - consteval OriginEnc(OriginEnc&& other) noexcept = default; - consteval OriginEnc& operator=(OriginEnc const& other) = default; - consteval OriginEnc& operator=(OriginEnc&& other) noexcept = default; -}; -} // namespace o2::framework - -template <> -struct fmt::formatter { - char presentation = 's'; - constexpr auto parse(format_parse_context& ctx) + constexpr bool descriptionCompatible(TableRef const& other) const noexcept { - auto it = ctx.begin(), end = ctx.end(); - if (it != end && (*it == 's')) { - presentation = *it++; - } - - // Check if reached the end of the range: - if (it != end && *it != '}') { - throw format_error("invalid pick format"); - } - - // Return an iterator past the end of the parsed range: - return it; + return this->desc_hash == other.desc_hash; } - template - auto format(o2::framework::OriginEnc const& origin, FormatContext& ctx) + constexpr bool descriptionCompatible(uint32_t _desc_hash) const noexcept { - return fmt::format_to(ctx.out(), "{}", (std::string_view)origin); + return this->desc_hash == _desc_hash; } + + constexpr TableRef(TableRef const&) = default; + constexpr TableRef& operator=(TableRef const&) = default; + constexpr TableRef(TableRef&&) = default; + constexpr TableRef& operator=(TableRef&&) = default; }; -namespace o2::aod +/// Helpers to manipulate TableRef arrays +template ar1, std::array ar2> +consteval auto merge() { -DECLARE_SOA_METADATA(); + constexpr const int duplicates = std::ranges::count_if(ar2.begin(), ar2.end(), [&](TableRef const& a) { return std::any_of(ar1.begin(), ar1.end(), [&](TableRef const& e) { return e == a; }); }); + std::array out; + + auto pos = std::copy(ar1.begin(), ar1.end(), out.begin()); + std::copy_if(ar2.begin(), ar2.end(), pos, [&](TableRef const& a) { return std::none_of(ar1.begin(), ar1.end(), [&](TableRef const& e) { return e == a; }); }); + return out; } -namespace o2::soa +template ar1, std::array ar2, typename L> +consteval auto merge_if(L l) { -/// special case for the template with origin -template class Ref> -struct is_specialization_origin : std::false_type { -}; + constexpr const int to_remove = std::ranges::count_if(ar1.begin(), ar1.end(), [&](TableRef const& a) { return !l(a); }); + constexpr const int duplicates = std::ranges::count_if(ar2.begin(), ar2.end(), [&](TableRef const& a) { return std::any_of(ar1.begin(), ar1.end(), [&](TableRef const& e) { return e == a; }) || !l(a); }); + std::array out; -template