From 08f191a590f89b72ed51d191c5f788fdc39703a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Mon, 7 Jun 2021 19:10:55 -0500 Subject: [PATCH] Fix profiling array context for kernel callables (#372) * Fix profiling array context for kernel callables * Point requirements to executor-prog-renames for loopy * Point req.txt back to main for loopy * Explain disabled footprint gathering in profile * Wrap long comment line in fp gather explanation --- mirgecom/profiling.py | 75 ++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/mirgecom/profiling.py b/mirgecom/profiling.py index 24b4aa513..29802791d 100644 --- a/mirgecom/profiling.py +++ b/mirgecom/profiling.py @@ -68,7 +68,7 @@ class ProfileEvent: """Holds a profile event that has not been collected by the profiler yet.""" cl_event: cl._cl.Event - program: lp.kernel.LoopKernel + translation_unit: lp.TranslationUnit args_tuple: tuple @@ -148,12 +148,14 @@ def _wait_and_transfer_profile_events(self) -> None: # Then, collect all events and store them for t in self.profile_events: - program = t.program - if hasattr(program, "name"): - name = program.name + t_unit = t.translation_unit + if isinstance(t_unit, lp.TranslationUnit): + name = t_unit.default_entrypoint.name else: - name = program.function_name - r = self._get_kernel_stats(program, t.args_tuple) + # It's actually a cl.Kernel + name = t_unit.function_name + + r = self._get_kernel_stats(t_unit, t.args_tuple) time = t.cl_event.profile.end - t.cl_event.profile.start new = SingleCallKernelProfile(time, r.flops, r.bytes_accessed, @@ -270,11 +272,11 @@ def tabulate_profiling_data(self) -> pytools.Table: return tbl - def _get_kernel_stats(self, program: lp.kernel.LoopKernel, args_tuple: tuple) \ + def _get_kernel_stats(self, t_unit: lp.TranslationUnit, args_tuple: tuple) \ -> SingleCallKernelProfile: - return self.kernel_stats[program][args_tuple] + return self.kernel_stats[t_unit][args_tuple] - def _cache_kernel_stats(self, program: lp.kernel.LoopKernel, kwargs: dict) \ + def _cache_kernel_stats(self, t_unit: lp.TranslationUnit, kwargs: dict) \ -> tuple: """Generate the kernel stats for a program with its args.""" args_tuple = tuple( @@ -283,21 +285,22 @@ def _cache_kernel_stats(self, program: lp.kernel.LoopKernel, kwargs: dict) \ # Are kernel stats already in the cache? try: - self.kernel_stats[program][args_tuple] + self.kernel_stats[t_unit][args_tuple] return args_tuple except KeyError: # If not, calculate and cache the stats - executor = program.target.get_kernel_executor(program, self.queue) - info = executor.kernel_info(executor.arg_to_dtype_set(kwargs)) + ep_name = t_unit.default_entrypoint.name + executor = t_unit.target.get_kernel_executor(t_unit, self.queue, + entrypoint=ep_name) + info = executor.translation_unit_info( + ep_name, executor.arg_to_dtype_set(kwargs)) - kernel = executor.get_typed_and_scheduled_kernel( - executor.arg_to_dtype_set(kwargs)) + typed_t_unit = executor.get_typed_and_scheduled_translation_unit( + ep_name, executor.arg_to_dtype_set(kwargs)) + kernel = typed_t_unit[ep_name] idi = info.implemented_data_info - types = {k: v for k, v in kwargs.items() - if hasattr(v, "dtype") and not v.dtype == object} - param_dict = kwargs.copy() param_dict.update({k: None for k in kernel.arg_dict.keys() if k not in param_dict}) @@ -314,7 +317,7 @@ def _cache_kernel_stats(self, program: lp.kernel.LoopKernel, kwargs: dict) \ wrapper.generate_integer_arg_finding_from_offsets(gen, kernel, idi) wrapper.generate_integer_arg_finding_from_strides(gen, kernel, idi) - param_names = program.all_params() + param_names = kernel.all_params() gen("return {%s}" % ", ".join( f"{repr(name)}: {name}" for name in param_names)) @@ -322,39 +325,45 @@ def _cache_kernel_stats(self, program: lp.kernel.LoopKernel, kwargs: dict) \ domain_params = gen.get_picklable_function()(**param_dict) # Get flops/memory statistics - kernel = lp.add_and_infer_dtypes(kernel, types) - op_map = lp.get_op_map(kernel, subgroup_size="guess") - bytes_accessed = lp.get_mem_access_map(kernel, subgroup_size="guess") \ - .to_bytes().eval_and_sum(domain_params) + op_map = lp.get_op_map(typed_t_unit, subgroup_size="guess") + bytes_accessed = lp.get_mem_access_map( + typed_t_unit, subgroup_size="guess") \ + .to_bytes().eval_and_sum(domain_params) flops = op_map.filter_by(dtype=[np.float32, np.float64]).eval_and_sum( domain_params) - try: - footprint = lp.gather_access_footprint_bytes(kernel) - footprint_bytes = sum(footprint[k].eval_with_dict(domain_params) - for k in footprint) - - except lp.symbolic.UnableToDetermineAccessRange: + # Footprint gathering is not yet available in loopy with + # kernel callables: + # https://github.com/inducer/loopy/issues/399 + if 0: + try: + footprint = lp.gather_access_footprint_bytes(typed_t_unit) + footprint_bytes = sum(footprint[k].eval_with_dict(domain_params) + for k in footprint) + + except lp.symbolic.UnableToDetermineAccessRange: + footprint_bytes = None + else: footprint_bytes = None res = SingleCallKernelProfile( time=0, flops=flops, bytes_accessed=bytes_accessed, footprint_bytes=footprint_bytes) - self.kernel_stats.setdefault(program, {})[args_tuple] = res + self.kernel_stats.setdefault(t_unit, {})[args_tuple] = res if self.logmgr: - if f"{program.name}_time" not in self.logmgr.quantity_data: - self.logmgr.add_quantity(KernelProfile(self, program.name)) + if f"{ep_name}_time" not in self.logmgr.quantity_data: + self.logmgr.add_quantity(KernelProfile(self, ep_name)) return args_tuple def call_loopy(self, program, **kwargs) -> dict: """Execute the loopy kernel and profile it.""" program = self.transform_loopy_program(program) - assert program.options.return_dict - assert program.options.no_numpy + assert program.default_entrypoint.options.return_dict + assert program.default_entrypoint.options.no_numpy evt, result = program(self.queue, **kwargs, allocator=self.allocator)