Skip to content

Commit

Permalink
Add metrics for cpu and walltime profilers
Browse files Browse the repository at this point in the history
  • Loading branch information
gleocadie committed Nov 12, 2024
1 parent 51de3f6 commit 43d067b
Show file tree
Hide file tree
Showing 18 changed files with 331 additions and 66 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// Unless explicitly stated otherwise all files in this repository are licensed under the Apache 2 License.
// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2022 Datadog, Inc.

#pragma once

#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <list>
#include <string>
#include <utility>

#include "MetricBase.h"

// Reasons for which a requested sample is dropped instead of being collected.
//
// The numeric value of each item is used as an index into the per-reason
// counters, so the items must stay contiguous and start at 0.
// When adding an item, insert it before GuardItem and add the matching case
// to to_string().
enum class DiscardReason
{
    InSegvHandler = 0,     // index 0
    InsideWrappedFunction, // index 1
    ExternalSignal,        // index 2
    UnknownThread,         // index 3
    WrongManagedThread,    // index 4
    UnsufficientSpace,     // index 5
    EmptyBacktrace,        // index 6

    // Sentinel: its value is the number of real reasons above.
    // It must remain the last item and must never be used as a reason.
    GuardItem
};

// Turn "enumerator not handled in switch" into a hard error for the function
// below, so that adding a DiscardReason item without its case fails the build.
// Each compiler gets its own pragma (the MSVC `#pragma warning` form is unknown
// to GCC), and the previous diagnostic state is saved here and restored after
// the function instead of being reset to the compiler default.
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic error "-Wswitch"
#elif defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
#elif defined(_MSC_VER)
#pragma warning(push)
#pragma warning(error : 4062)
#endif
// Returns the suffix associated with a discard reason. The suffix is meant to
// be appended to a metric name, which is why every regular value starts with
// an underscore.
//
// type: the discard reason to translate.
// Returns a pointer to a string literal (never null). GuardItem and any value
// outside the enum yield "unknown_discard_type".
static const char* to_string(DiscardReason type)
{
    // No `default:` label on purpose: it would silence the missing-case
    // diagnostic enabled above.
    switch (type)
    {
        case DiscardReason::InSegvHandler:
            return "_in_sigsegv_handler";
        case DiscardReason::InsideWrappedFunction:
            return "_inside_wrapped_function";
        case DiscardReason::ExternalSignal:
            return "_external_signal";
        case DiscardReason::UnknownThread:
            return "_unknown_thread";
        case DiscardReason::WrongManagedThread:
            return "_wrong_managed_thread";
        case DiscardReason::UnsufficientSpace:
            return "_unsufficient_space";
        case DiscardReason::EmptyBacktrace:
            return "_empty_backtrace";
        case DiscardReason::GuardItem:
            // Not a real reason: leave the switch and use the fallback below.
            break;
    }
    return "unknown_discard_type";
}
#if defined(__clang__)
#pragma clang diagnostic pop
#elif defined(__GNUC__)
#pragma GCC diagnostic pop
#elif defined(_MSC_VER)
#pragma warning(pop)
#endif

class DiscardMetrics : public MetricBase
{
private:
static constexpr std::size_t array_size = static_cast<std::size_t>(DiscardReason::GuardItem);

public:
DiscardMetrics(std::string name) :
MetricBase(std::move(name)), _metrics{0}
{
}

template <DiscardReason TType>
void Incr()
{
static_assert(TType != DiscardReason::GuardItem, "You must not use DiscardReason::GuardItem");
constexpr auto offset = static_cast<int>(TType);
static_assert(offset <= array_size, "");
_metrics[offset]++;
}

std::list<MetricBase::Metric> GetMetrics() override
{
std::list<MetricBase::Metric> result;
for (std::size_t idx = 0; idx < _metrics.size(); idx++)
{
auto& metric = _metrics[idx];
result.emplace_back(
std::make_pair(_name + to_string(static_cast<DiscardReason>(idx)),
metric.exchange(0)));
}
return result;
}

private:
std::array<std::atomic<std::uint64_t>, array_size> _metrics;
};
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <unordered_map>

#include "CallstackProvider.h"
#include "DiscardMetrics.hpp"
#include "IConfiguration.h"
#include "Log.h"
#include "ManagedThreadInfo.h"
Expand All @@ -29,7 +30,8 @@ LinuxStackFramesCollector* LinuxStackFramesCollector::s_pInstanceCurrentlyStackW
LinuxStackFramesCollector::LinuxStackFramesCollector(
ProfilerSignalManager* signalManager,
IConfiguration const* const configuration,
CallstackProvider* callstackProvider) :
CallstackProvider* callstackProvider,
MetricsRegistry& metricsRegistry) :
StackFramesCollectorBase(configuration, callstackProvider),
_lastStackWalkErrorCode{0},
_stackWalkFinished{false},
Expand All @@ -42,6 +44,11 @@ LinuxStackFramesCollector::LinuxStackFramesCollector(
{
_signalManager->RegisterHandler(LinuxStackFramesCollector::CollectStackSampleSignalHandler);
}

// For now have one metric for both walltime and cpu (naive)
_samplingRequest = metricsRegistry.GetOrRegister<CounterMetric>("dotnet_walltime_cpu_sampling_requests");
_discardMetrics = metricsRegistry.GetOrRegister<DiscardMetrics>("dotnet_walltime_cpu_sample_discard");

}

LinuxStackFramesCollector::~LinuxStackFramesCollector()
Expand Down Expand Up @@ -127,6 +134,7 @@ StackSnapshotResultBuffer* LinuxStackFramesCollector::CollectStackSampleImplemen

_stackWalkFinished = false;

_samplingRequest->Incr();
errorCode = _signalManager->SendSignal(threadId);

if (errorCode == -1)
Expand Down Expand Up @@ -190,6 +198,7 @@ std::int32_t LinuxStackFramesCollector::CollectCallStackCurrentThread(void* ctx)
{
if (dd_inside_wrapped_functions != nullptr && dd_inside_wrapped_functions() != 0)
{
_discardMetrics->Incr<DiscardReason::InsideWrappedFunction>();
return E_ABORT;
}

Expand Down Expand Up @@ -275,6 +284,7 @@ std::int32_t LinuxStackFramesCollector::CollectStackWithBacktrace2(void* ctx)

if (count == 0)
{
_discardMetrics->Incr<DiscardReason::EmptyBacktrace>();
return E_FAIL;
}

Expand All @@ -283,23 +293,6 @@ std::int32_t LinuxStackFramesCollector::CollectStackWithBacktrace2(void* ctx)
return S_OK;
}

// Tells whether a signal received on thread `threadId` and sent by process
// `processId` matches the collection currently in progress.
bool LinuxStackFramesCollector::CanCollect(int32_t threadId, pid_t processId) const
{
    // Take a single snapshot of the member so that the null check and the
    // thread id comparison look at the same pointer value.
    auto* const threadInfo = _pCurrentCollectionThreadInfo;
    if (threadInfo == nullptr)
    {
        return false;
    }

    if (threadInfo->GetOsThreadId() != threadId)
    {
        return false;
    }

    // on OSX, processId can be equal to 0. https://sourcegraph.com/github.com/dotnet/runtime/-/blob/src/coreclr/pal/src/exception/signal.cpp?L818:5&subtree=true
    // Since the profiler does not run on OSX, we leave it like this.
    return processId == _processId;
}

// Forwards the "interrupted" notification to the thread info of the current
// collection, if there is one; otherwise does nothing.
void LinuxStackFramesCollector::MarkAsInterrupted()
{
    // Single read of the member: the null check and the call use the same value.
    auto* const threadInfo = _pCurrentCollectionThreadInfo;
    if (threadInfo == nullptr)
    {
        return;
    }

    threadInfo->MarkAsInterrupted();
}

bool IsInSigSegvHandler(void* context)
{
Expand All @@ -311,44 +304,81 @@ bool IsInSigSegvHandler(void* context)
return sigismember(&(ctx->uc_sigmask), SIGSEGV) == 1;
}

bool LinuxStackFramesCollector::CollectStackSampleSignalHandler(int signal, siginfo_t* info, void* context)
// Decides whether the signal being handled belongs to the collection currently in progress.
// The checks run in the order written below; the first one that fails increments its
// DiscardReason counter and the function returns false without evaluating the others.
//   threadId: OS id of the thread running the signal handler (the handler passes OpSysTools::GetThreadId()).
//   info:     siginfo_t received by the handler; only si_pid is read here.
//   context:  context pointer received by the handler; only forwarded to IsInSigSegvHandler.
// Returns true when none of the checks rejected the signal.
bool LinuxStackFramesCollector::CanCollect(int32_t threadId, siginfo_t* info, void* context) const
{
// This is a workaround to prevent libunwind from unwinding 2 signal frames and potentially crashing.
// Current crash occurs in libcoreclr.so, while reading the Elf header.
if (IsInSigSegvHandler(context))
{
_discardMetrics->Incr<DiscardReason::InSegvHandler>();
return false;
}

// Read the member once: the null check and the thread id comparison use the same pointer value.
auto* currentThreadInfo = _pCurrentCollectionThreadInfo;
if (currentThreadInfo == nullptr)
{
// No current collection thread info is set.
_discardMetrics->Incr<DiscardReason::UnknownThread>();
return false;
}

if (currentThreadInfo->GetOsThreadId() != threadId)
{
// The handler is running on a thread other than the one recorded for the current collection.
_discardMetrics->Incr<DiscardReason::WrongManagedThread>();
return false;
}

// on OSX, processId can be equal to 0. https://sourcegraph.com/github.com/dotnet/runtime/-/blob/src/coreclr/pal/src/exception/signal.cpp?L818:5&subtree=true
// Since the profiler does not run on OSX, we leave it like this.
if (info->si_pid != _processId)
{
// The pid reported by the signal differs from _processId (presumably this process' own pid - confirm).
_discardMetrics->Incr<DiscardReason::ExternalSignal>();
return false;
}

return true;
}

// Forwards the "interrupted" notification to the thread info of the current
// collection, if there is one; otherwise does nothing.
void LinuxStackFramesCollector::MarkAsInterrupted()
{
    // Single read of the member: the null check and the call use the same value.
    auto* const threadInfo = _pCurrentCollectionThreadInfo;
    if (threadInfo == nullptr)
    {
        return;
    }

    threadInfo->MarkAsInterrupted();
}

bool LinuxStackFramesCollector::CollectStackSampleSignalHandler(int signal, siginfo_t* info, void* context)
{
// Libunwind can overwrite the value of errno - save it beforehand and restore it at the end
auto oldErrno = errno;

bool success = false;

LinuxStackFramesCollector* pCollectorInstance = s_pInstanceCurrentlyStackWalking;
LinuxStackFramesCollector* pCollector = s_pInstanceCurrentlyStackWalking;

if (pCollectorInstance != nullptr)
if (pCollector != nullptr)
{
std::unique_lock<std::mutex> stackWalkInProgressLock(s_stackWalkInProgressMutex);
std::unique_lock<std::mutex> lock(s_stackWalkInProgressMutex);

pCollectorInstance = s_pInstanceCurrentlyStackWalking;
pCollector = s_pInstanceCurrentlyStackWalking;

// sampling in progress
if (pCollectorInstance != nullptr)
if (pCollector != nullptr)
{
pCollectorInstance->MarkAsInterrupted();

// There can be a race:
// The sampling thread has sent the signal and is waiting, but another SIGUSR1 signal was sent
// by another thread and is handled before the one sent by the sampling thread.
if (pCollectorInstance->CanCollect(OpSysTools::GetThreadId(), info->si_pid))
if (pCollector->CanCollect(OpSysTools::GetThreadId(), info, context))
{
pCollector->MarkAsInterrupted();

// In case it's the thread we want to sample, just get its callstack
auto resultErrorCode = pCollectorInstance->CollectCallStackCurrentThread(context);
auto errorCode = pCollector->CollectCallStackCurrentThread(context);

// release the lock
stackWalkInProgressLock.unlock();
pCollectorInstance->NotifyStackWalkCompleted(resultErrorCode);
lock.unlock();
pCollector->NotifyStackWalkCompleted(errorCode);
success = true;
}
}
Expand Down Expand Up @@ -382,4 +412,4 @@ void LinuxStackFramesCollector::ErrorStatistics::Log()
ss.str());
_stats.clear();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include "corprof.h"
// end

#include "CounterMetric.h"
#include "MetricsRegistry.h"
#include "StackFramesCollectorBase.h"

#include <atomic>
Expand All @@ -22,12 +24,18 @@ class ProfilerSignalManager;
class ProfilerSignalManager;
class IConfiguration;
class CallstackProvider;
class DiscardMetrics;

class LinuxStackFramesCollector : public StackFramesCollectorBase
{
public:
explicit LinuxStackFramesCollector(ProfilerSignalManager* signalManager, IConfiguration const* configuration, CallstackProvider* callstackProvider);
explicit LinuxStackFramesCollector(
ProfilerSignalManager* signalManager,
IConfiguration const* configuration,
CallstackProvider* callstackProvider,
MetricsRegistry& metricsRegistry);
~LinuxStackFramesCollector() override;

LinuxStackFramesCollector(LinuxStackFramesCollector const&) = delete;
LinuxStackFramesCollector& operator=(LinuxStackFramesCollector const&) = delete;

Expand Down Expand Up @@ -58,7 +66,7 @@ class LinuxStackFramesCollector : public StackFramesCollectorBase
void NotifyStackWalkCompleted(std::int32_t resultErrorCode);
void UpdateErrorStats(std::int32_t errorCode);
static bool ShouldLogStats();
bool CanCollect(int32_t threadId, pid_t processId) const;
bool CanCollect(int32_t threadId, siginfo_t* info, void* ucontext) const;
std::int32_t CollectStackManually(void* ctx);
std::int32_t CollectStackWithBacktrace2(void* ctx);
void MarkAsInterrupted();
Expand All @@ -77,7 +85,6 @@ class LinuxStackFramesCollector : public StackFramesCollectorBase
private:
static bool CollectStackSampleSignalHandler(int sig, siginfo_t* info, void* ucontext);

static char const* ErrorCodeToString(int32_t errorCode);
static std::mutex s_stackWalkInProgressMutex;

static LinuxStackFramesCollector* s_pInstanceCurrentlyStackWalking;
Expand All @@ -86,4 +93,7 @@ class LinuxStackFramesCollector : public StackFramesCollectorBase

ErrorStatistics _errorStatistics;
bool _useBacktrace2;
std::shared_ptr<CounterMetric> _samplingRequest;

std::shared_ptr<DiscardMetrics> _discardMetrics;
};
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,11 @@ std::pair<DWORD, std::string> GetLastErrorMessage()
std::unique_ptr<StackFramesCollectorBase> CreateNewStackFramesCollectorInstance(
ICorProfilerInfo4* pCorProfilerInfo,
IConfiguration const* const pConfiguration,
CallstackProvider* callstackProvider)
CallstackProvider* callstackProvider,
MetricsRegistry& metricsRegistry)
{
return std::make_unique<LinuxStackFramesCollector>(ProfilerSignalManager::Get(SIGUSR1), pConfiguration, callstackProvider);
return std::make_unique<LinuxStackFramesCollector>(
ProfilerSignalManager::Get(SIGUSR1), pConfiguration, callstackProvider, metricsRegistry);
}

// https://linux.die.net/man/5/proc
Expand Down
Loading

0 comments on commit 43d067b

Please sign in to comment.