From 7d4b093903db46e86a5c3e109d3a80980c948b94 Mon Sep 17 00:00:00 2001 From: Shraiysh Date: Mon, 14 Oct 2024 01:21:57 -0700 Subject: [PATCH] PR #17259: Adding Strictness level to PGLE accuracy checker. Imported from GitHub PR https://github.com/openxla/xla/pull/17259 Values for the new flag: `xla_gpu_pgle_accuracy_checker: {OFF, WARN, ERROR}` Copybara import of the project: -- 86ccf83eb4ba4310e68cff7c92c2020fa7b1f3c7 by Shraiysh Vaishay : Add xla_gpu_pgle_accuracy_checker to set strictness levels xla_gpu_pgle_accuracy_checker can take the values {OFF, WARN, ERROR} and this flag decides what will be done when there are missing instructions in PGLE profile: either do nothing (OFF), warn about it (WARN) or halt compilation (ERROR) Merging this change closes #17259 FUTURE_COPYBARA_INTEGRATE_REVIEW=https://github.com/openxla/xla/pull/17259 from shraiysh:pgle_strictness_levels 86ccf83eb4ba4310e68cff7c92c2020fa7b1f3c7 PiperOrigin-RevId: 685614179 --- xla/debug_options_flags.cc | 28 ++++++++++---- xla/service/gpu/gpu_compiler.cc | 3 +- xla/service/gpu/gpu_hlo_schedule.cc | 38 +++++++++++++------ xla/service/gpu/gpu_hlo_schedule_test.cc | 28 +++++++++++++- .../gpu/gpu_latency_hiding_scheduler_test.cc | 14 +++---- .../transforms/pgle_accuracy_checker_test.cc | 4 ++ .../profile_guided_latency_estimator.cc | 19 ++++++---- xla/xla.proto | 9 ++++- 8 files changed, 105 insertions(+), 38 deletions(-) diff --git a/xla/debug_options_flags.cc b/xla/debug_options_flags.cc index 6035e2dce8da9d..108d45dbf26d5e 100644 --- a/xla/debug_options_flags.cc +++ b/xla/debug_options_flags.cc @@ -288,7 +288,8 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_gpu_cudnn_gemm_max_plans(5); - opts.set_xla_gpu_enable_pgle_accuracy_checker(false); + opts.set_xla_gpu_pgle_accuracy_checker( + DebugOptions::PGLE_STRICTNESS_LEVEL_WARN); opts.set_xla_gpu_executable_warn_stuck_timeout_seconds(10); opts.set_xla_gpu_executable_terminate_timeout_seconds(30); @@ -701,6 +702,18 @@ void MakeDebugOptionsFlags(std::vector* flag_list, return true; }; + // Custom "sub-parser" lambda for xla_gpu_pgle_accuracy_checker. + auto setter_for_xla_gpu_pgle_accuracy_checker = + [debug_options](const std::string& value) { + DebugOptions::PGLEStrictnessLevel strictness_level; + if (!DebugOptions::PGLEStrictnessLevel_Parse(value, + &strictness_level)) { + return false; + } + debug_options->set_xla_gpu_pgle_accuracy_checker(strictness_level); + return true; + }; + // Don't use an initializer list for initializing the vector; this would // create a temporary copy, and exceeds the stack space when compiling with // certain configurations. @@ -1975,12 +1988,13 @@ void MakeDebugOptionsFlags(std::vector* flag_list, "a training. The location of the marker (if any) is determined " "by the option value of type DebugOptions::StepMarkerLocation.")); flag_list->push_back(tsl::Flag( - "xla_gpu_enable_pgle_accuracy_checker", - bool_setter_for(&DebugOptions::set_xla_gpu_enable_pgle_accuracy_checker), - debug_options->xla_gpu_enable_pgle_accuracy_checker(), - "Enables strict PGLE checking. If an FDO profile is specified and " - "latency hiding scheduler encounters missing instructions in the profile " - "compilation will halt.")); + "xla_gpu_pgle_accuracy_checker", setter_for_xla_gpu_pgle_accuracy_checker, + DebugOptions::PGLEStrictnessLevel_Name( + debug_options->xla_gpu_pgle_accuracy_checker()), + "If an FDO profile is specified and latency hiding scheduler encounters " + "missing instructions in the profile, then the compilation will halt " + "(ERROR), or a warning will be emitted (WARN), or the checker is " + "disabled (OFF)")); flag_list->push_back(tsl::Flag( "xla_gpu_executable_warn_stuck_timeout", diff --git a/xla/service/gpu/gpu_compiler.cc b/xla/service/gpu/gpu_compiler.cc index d10d676b0fe40f..af3a19f1c62a94 100755 --- a/xla/service/gpu/gpu_compiler.cc +++ b/xla/service/gpu/gpu_compiler.cc @@ -2623,7 +2623,8 @@ absl::Status GpuCompiler::RunPostSchedulingPipelines( pipeline.AddPass(); } - if (module->config().debug_options().xla_gpu_enable_pgle_accuracy_checker()) { + if (module->config().debug_options().xla_gpu_pgle_accuracy_checker() == + DebugOptions::PGLE_STRICTNESS_LEVEL_ERROR) { AddHloVerifier( &main_pipeline, module->config().debug_options().xla_experimental_ignore_channel_id(), diff --git a/xla/service/gpu/gpu_hlo_schedule.cc b/xla/service/gpu/gpu_hlo_schedule.cc index f4ef235c376349..d89635fe871e97 100644 --- a/xla/service/gpu/gpu_hlo_schedule.cc +++ b/xla/service/gpu/gpu_hlo_schedule.cc @@ -449,15 +449,32 @@ absl::StatusOr ScheduleGpuModule( VLOG(1) << "Fingerprint before LHS for module " << module->name() << "(" << module->unique_id() << ") = " << fingerprint; + const DebugOptions& options = module->config().debug_options(); const bool enable_latency_hiding_scheduler = - module->config() - .debug_options() - .xla_gpu_enable_latency_hiding_scheduler(); + options.xla_gpu_enable_latency_hiding_scheduler(); if (!enable_latency_hiding_scheduler) { return ScheduleMetadata{memory_limit}; } + if (options.xla_gpu_pgle_profile_file_or_directory_path().empty() && + module->config().fdo_profile().empty() && + options.xla_gpu_pgle_accuracy_checker() == + DebugOptions::PGLE_STRICTNESS_LEVEL_ERROR) { + return absl::InvalidArgumentError( + "xla_gpu_pgle_accuracy_checker is set to ERROR, but no profile " + "path specified in xla_gpu_pgle_profile_file_or_directory_path"); + } + + if (options.xla_gpu_pgle_profile_file_or_directory_path().empty() && + module->config().fdo_profile().empty() && + options.xla_gpu_pgle_accuracy_checker(), + DebugOptions::PGLE_STRICTNESS_LEVEL_WARN) { + LOG(WARNING) + << "xla_gpu_pgle_accuracy_checker is set to WARN, but no profile path " + "specified in xla_gpu_pgle_profile_file_or_directory_path"; + } + SchedulerConfig config = GetSchedulerConfig( memory_limit, module->config() @@ -481,9 +498,7 @@ absl::StatusOr ScheduleGpuModule( ReadPGLEProfile(module, fingerprint); const bool enable_analytical_latency_estimator = - module->config() - .debug_options() - .xla_gpu_enable_analytical_latency_estimator(); + options.xla_gpu_enable_analytical_latency_estimator(); HloPassPipeline pipeline("latency-hiding-scheduler"); if (profile.has_value()) { auto aggregator = std::make_unique(); @@ -492,9 +507,10 @@ absl::StatusOr ScheduleGpuModule( std::move(aggregator)); LOG(INFO) << "Found profile, using profile guided latency estimator"; VLOG(1) << "Profile:\n" << profile->DebugString(); - if (module->config() - .debug_options() - .xla_gpu_enable_pgle_accuracy_checker()) { + if (options.xla_gpu_pgle_accuracy_checker() == + DebugOptions::PGLE_STRICTNESS_LEVEL_WARN || + options.xla_gpu_pgle_accuracy_checker() == + DebugOptions::PGLE_STRICTNESS_LEVEL_ERROR) { pipeline.AddPass(*pg_latency_estimator); } latency_estimator = std::move(pg_latency_estimator); @@ -511,9 +527,7 @@ absl::StatusOr ScheduleGpuModule( } auto async_tracker = [&]() -> std::unique_ptr { - return module->config() - .debug_options() - .xla_gpu_lhs_enable_gpu_async_tracker() + return options.xla_gpu_lhs_enable_gpu_async_tracker() ? std::make_unique(config) : std::make_unique(config); }(); diff --git a/xla/service/gpu/gpu_hlo_schedule_test.cc b/xla/service/gpu/gpu_hlo_schedule_test.cc index 0a86aa71cc037d..3b2406dd61a49c 100644 --- a/xla/service/gpu/gpu_hlo_schedule_test.cc +++ b/xla/service/gpu/gpu_hlo_schedule_test.cc @@ -566,7 +566,8 @@ TEST_F(GpuHloScheduleTest, ProfileGuidedCostModelFailsWithIncompleteProfile) { HloModuleConfig config(module->config()); DebugOptions dboptions(config.debug_options()); - dboptions.set_xla_gpu_enable_pgle_accuracy_checker(true); + dboptions.set_xla_gpu_pgle_accuracy_checker( + DebugOptions::PGLE_STRICTNESS_LEVEL_ERROR); config.set_debug_options(dboptions); module->set_config(config); @@ -1696,5 +1697,30 @@ TEST_F(GpuHloScheduleTest, CopyStartDoneScheduled) { )")); } +TEST_F(GpuHloScheduleTest, InvalidPGLEOptions) { + const char* hlo = R"( + HloModule test + ENTRY add { + a = s32[] parameter(0) + b = s32[] parameter(1) + ROOT add = add(a,b) + } + )"; + + HloModuleConfig config; + DebugOptions options; + options.set_xla_gpu_pgle_accuracy_checker( + DebugOptions::PGLE_STRICTNESS_LEVEL_ERROR); + options.set_xla_gpu_enable_latency_hiding_scheduler(true); + config.set_debug_options(options); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo, config)); + + GTEST_FLAG_SET(death_test_style, "threadsafe"); + EXPECT_DEATH(BuildHloOrdering(module.get()), + "xla_gpu_pgle_accuracy_checker is set to ERROR, but no profile " + "path specified in xla_gpu_pgle_profile_file_or_directory_path"); +} + } // namespace gpu } // namespace xla diff --git a/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc b/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc index 7d947c23f881a0..42e05cf9db71cd 100644 --- a/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc +++ b/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc @@ -55,17 +55,17 @@ int GetIndexByName(absl::Span instruction_sequence, class GpuLatencyHidingSchedulerBaseTest : public HloTestBase { protected: absl::StatusOr ScheduleModule( - HloModule* module, int64_t num_parallel_resources = 1) { + HloModule* module, int64_t num_parallel_resources = 1, + DebugOptions::PGLEStrictnessLevel strictness = + DebugOptions::PGLE_STRICTNESS_LEVEL_ERROR) { auto& test_backend = backend(); const auto& gpu_device_info = test_backend.default_stream_executor()->GetDeviceDescription(); - HloModuleConfig config(module->config()); - DebugOptions dboptions(config.debug_options()); - dboptions.set_xla_gpu_enable_pgle_accuracy_checker(true); - dboptions.set_xla_gpu_experimental_parallel_collective_overlap_limit( + DebugOptions& options = module->mutable_config().mutable_debug_options(); + options.set_xla_gpu_experimental_parallel_collective_overlap_limit( num_parallel_resources); - config.set_debug_options(dboptions); - module->set_config(config); + options.set_xla_gpu_pgle_accuracy_checker(strictness); + TF_RETURN_IF_ERROR( ScheduleGpuModule(module, /*pointer_size=*/8, gpu_device_info) .status()); diff --git a/xla/service/gpu/transforms/pgle_accuracy_checker_test.cc b/xla/service/gpu/transforms/pgle_accuracy_checker_test.cc index 3f2d1ab6426fd0..d1994a1f6f0687 100644 --- a/xla/service/gpu/transforms/pgle_accuracy_checker_test.cc +++ b/xla/service/gpu/transforms/pgle_accuracy_checker_test.cc @@ -148,6 +148,10 @@ TEST_F(PGLEAccuracyCheckerTest, TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, ParseAndReturnVerifiedModule(kHloString)); *module->mutable_config().mutable_fdo_profile() = kProfileString; + module->mutable_config() + .mutable_debug_options() + .set_xla_gpu_pgle_accuracy_checker( + DebugOptions::PGLE_STRICTNESS_LEVEL_ERROR); auto pgle_estimator = GetProfileGuidedLatencyEstimator(profile); PGLEAccuracyChecker pgle_accuracy_checker(*pgle_estimator); diff --git a/xla/service/profile_guided_latency_estimator.cc b/xla/service/profile_guided_latency_estimator.cc index d8e20f2445c4a4..d66f5334933184 100644 --- a/xla/service/profile_guided_latency_estimator.cc +++ b/xla/service/profile_guided_latency_estimator.cc @@ -188,16 +188,19 @@ absl::Status ProfileGuidedLatencyEstimator::CheckAccuracy( ProfileStatisticsAggregator::Statistics stats = aggregator_->GetStats(); size_t missing_instructions_count = stats.missing_instructions.size(); if (missing_instructions_count > 0) { - LOG(ERROR) << "Found " << stats.found_instructions_count - << " instructions from the profile."; - LOG(ERROR) << "Missing " << missing_instructions_count - << " instructions from the profile."; + LOG(WARNING) << "Found " << stats.found_instructions_count + << " instructions from the profile."; + LOG(WARNING) << "Missing " << missing_instructions_count + << " instructions from the profile."; for (const HloInstruction* instr : stats.missing_instructions) { - LOG(ERROR) << " " << instr->name(); + LOG(WARNING) << " " << instr->name(); + } + if (module.config().debug_options().xla_gpu_pgle_accuracy_checker() == + DebugOptions::PGLE_STRICTNESS_LEVEL_ERROR) { + return absl::InvalidArgumentError( + absl::StrCat("Found ", missing_instructions_count, + " missing instructions. Discarding the profile.")); } - return absl::InvalidArgumentError( - absl::StrCat("Found ", missing_instructions_count, - " missing instructions. Discarding the profile.")); } return absl::OkStatus(); } diff --git a/xla/xla.proto b/xla/xla.proto index 87b79dcfde593e..5335a7db89dac9 100644 --- a/xla/xla.proto +++ b/xla/xla.proto @@ -992,8 +992,13 @@ message DebugOptions { // Enables strict PGLE checking. If an FDO profile is specified and latency // hiding scheduler encounters missing instructions in the profile - // compilation will halt. - bool xla_gpu_enable_pgle_accuracy_checker = 326; + // compilation will halt or warn depending on the value of this option. + enum PGLEStrictnessLevel { + PGLE_STRICTNESS_LEVEL_OFF = 0; + PGLE_STRICTNESS_LEVEL_WARN = 1; + PGLE_STRICTNESS_LEVEL_ERROR = 2; + } + PGLEStrictnessLevel xla_gpu_pgle_accuracy_checker = 326; // Timeouts for RendezvousSingle stuck warning and termination. int32 xla_gpu_executable_warn_stuck_timeout_seconds = 327;