From 267ea8c8e66420c17618164c3a35e6a6e93e014a Mon Sep 17 00:00:00 2001
From: bcoutinho
Date: Wed, 12 Feb 2025 00:08:05 -0800
Subject: [PATCH] Update gpu user annotation breakdown (#217)

Summary:
## Summary

Fixes https://github.com/facebookresearch/HolisticTraceAnalysis/issues/180. Some additional improvements on top of the earlier change https://github.com/facebookresearch/HolisticTraceAnalysis/issues/209:

1. Add a `use_gpu_annotation=` option that lets the feature aggregate either CPU user annotations or GPU user annotations (see the usage sketch below).
2. Simplify the visualization and the API by removing options that are no longer needed. For example, `kernel_type` does not make sense here since user annotation names are user provided.
3. Add unit tests.
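
A minimal usage sketch of the updated API (the trace directory path is hypothetical; the calls and parameters are the ones added in this patch):

```python
from hta.trace_analysis import TraceAnalysis

analyzer = TraceAnalysis(trace_dir="~/traces/my_job")  # hypothetical trace directory

# Default: attribute time to gpu_user_annotation events (time on the GPU).
gpu_df = analyzer.get_gpu_user_annotation_breakdown(visualize=False)

# Attribute time to user_annotation events instead (time on the CPU).
cpu_df = analyzer.get_gpu_user_annotation_breakdown(
    use_gpu_annotation=False, visualize=False
)

# Both calls return None when the trace has no matching annotation events.
```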

## Before submitting

- [y] Was this discussed/approved via a Github issue? (no need for typos, doc improvements)
  - [ ] N/A
- [y] Did you write any new necessary tests?
  - [ ] N/A
- [ ] Did you make sure to update the docs?
  - [y] N/A
- [ ] Did you update the [changelog](https://github.com/facebookresearch/HolisticTraceAnalysis/blob/main/CHANGELOG.md)?
  - [y] N/A

Test Plan:
## Run feature

![Screenshot 2025-02-10 at 4 23 44 PM](https://github.com/user-attachments/assets/8ed6bb84-529d-4d8b-8091-87a8bf9726fd)
![Screenshot 2025-02-10 at 4 27 23 PM](https://github.com/user-attachments/assets/24110454-e57b-42cf-ab50-041e7de40b1f)

## Unit test

Pull Request resolved: https://github.com/facebookresearch/HolisticTraceAnalysis/pull/217

Reviewed By: fengxizhou

Differential Revision: D69430645

Pulled By: briancoutinho

fbshipit-source-id: c386d66431917fefcd3717335080ca169afec399
---
 CHANGELOG.md                        |   1 +
 hta/analyzers/breakdown_analysis.py | 225 +++++++++++-----------------
 hta/trace_analysis.py               |  33 ++--
 tests/test_trace_analysis.py        |  58 ++++++-
 4 files changed, 160 insertions(+), 157 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 841dc22..a1d7d74 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@ Versioning](https://semver.org/spec/v2.0.0.html).
 - Add nccl collective fields to parser config
 - Queue length analysis: Add feature to compute time blocked on a stream hitting max queue length.
 - Add `kernel_backend` to parser config for Triton / torch.compile() support.
+- Add analysis features for GPU user annotation attribution at the trace and kernel level.
 
 #### Changed
 - Change test data path in unittests from relative path to real path to support running test within IDEs.

diff --git a/hta/analyzers/breakdown_analysis.py b/hta/analyzers/breakdown_analysis.py
index c69ee78..8ecde68 100644
--- a/hta/analyzers/breakdown_analysis.py
+++ b/hta/analyzers/breakdown_analysis.py
@@ -346,17 +346,41 @@ def get_gpu_kernels_with_user_annotations(
     def get_gpu_user_annotation_breakdown(
         cls,
         t: "Trace",
+        use_gpu_annotation: bool = True,
         visualize: bool = True,
         duration_ratio: float = 0.8,
-        num_kernels: int = 10,
-        include_memory_kernels: bool = False,
-        image_renderer="notebook",
-    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        num_kernels: int = 1000,
+        image_renderer: Optional[str] = None,
+    ) -> Optional[pd.DataFrame]:
         """
-        GPU kernel breakdown implementation. See `get_gpu_kernel_breakdown` in `trace_analysis.py` for details.
+        Summarizes the time spent by each GPU user annotation. Outputs the following graphs:
+
+        1. Pie charts showing the most time consuming user annotations for each rank.
+        2. Bar graphs showing the average duration of the most time consuming user annotations for each rank.
+
+        Args:
+            use_gpu_annotation (boolean): Use the time on the GPU for each user annotation; if False, use the time on the CPU instead. Default = True.
+            visualize (boolean): Set to True to display the graphs. Default = True.
+            duration_ratio (float): Floating point value between 0 and 1 specifying the ratio of time taken
+                by the top user annotations. Default = 0.8.
+            num_kernels (int): Maximum number of user annotations to show. Default = 1000. The rest are grouped into "others".
+            image_renderer (str): Set to ``notebook`` when using jupyter and ``jupyterlab`` when using jupyter-lab.
+                To see all available options execute: ``import plotly; plotly.io.renderers`` in a python shell.
+
+        Returns:
+            Optional[pd.DataFrame]
+                Returns a dataframe that shows the min, max, mean, standard deviation, and total time taken by each
+                user annotation on each rank. This dataframe will be summarized based on the values of ``duration_ratio``
+                and ``num_kernels``. If both ``duration_ratio`` and ``num_kernels`` are specified,
+                ``num_kernels`` takes precedence.
+                If user annotations are not present on the CPU or GPU (per the ``use_gpu_annotation`` flag), returns None.
         """
-        sym_table = t.symbol_table.get_sym_table()
-        idx = sym_table.index("gpu_user_annotation")
+        annotation = "gpu_user_annotation" if use_gpu_annotation else "user_annotation"
+        image_renderer = image_renderer or ""
+
+        if (idx := t.symbol_table.sym_index.get(annotation, None)) is None:
+            logger.warning(f"Trace does not contain any {annotation}")
+            return None
 
         all_kernel_df = pd.DataFrame(
             {
@@ -366,81 +390,33 @@ def get_gpu_user_annotation_breakdown(
                 "min": pd.Series(dtype="int"),
                 "std": pd.Series(dtype="float"),
                 "mean": pd.Series(dtype="int"),
-                "kernel_type": pd.Series(dtype="str"),
                 "rank": pd.Series(dtype="int"),
             }
         )
 
-        kernel_type_df = pd.DataFrame(
-            {
-                "kernel_type": pd.Series(dtype="str"),
-                "sum": pd.Series(dtype="int"),
-            }
-        )
-        kernel_type_to_analysis: List[str] = [
-            KernelType.COMPUTATION.name,
-            KernelType.COMMUNICATION.name,
-        ]
-        if include_memory_kernels:
-            kernel_type_to_analysis.append(KernelType.MEMORY.name)
+        kernel_per_rank: Dict[int, pd.DataFrame] = {}
 
-        kernel_per_rank: Dict[str, Dict] = defaultdict(dict)
         for rank, trace_df in t.traces.items():
             gpu_user_annotation_kernels = trace_df[trace_df["cat"].eq(idx)].copy()
-            gpu_user_annotation_kernels["kernel_type"] = gpu_user_annotation_kernels[
-                ["name"]
-            ].apply(lambda x: get_kernel_type(sym_table[x["name"]]), axis=1)
-            gpu_user_annotation_kernels["name"] = gpu_user_annotation_kernels[
-                "name"
-            ].apply(lambda x: sym_table[x])
+            t.symbol_table.add_symbols_to_trace_df(gpu_user_annotation_kernels, "name")
+            logger.info(
+                f"rank = {rank}, num {annotation}s = {len(gpu_user_annotation_kernels)}"
+            )
 
-            # Create kernel type dataframe
-            kernel_type_df = pd.concat(
-                [
-                    kernel_type_df,
-                    cls._get_gpu_kernel_type_time(
-                        gpu_user_annotation_kernels, kernel_type_to_analysis
-                    ),
-                ],
-                ignore_index=True,
+            gpu_kernel_time = cls._aggr_gpu_kernel_time(
+                gpu_user_annotation_kernels,
+                duration_ratio=duration_ratio,
+                num_kernels=num_kernels,
             )
+            gpu_kernel_time["rank"] = int(rank)
+            kernel_per_rank[rank] = gpu_kernel_time
 
             # Create all kernel info dataframe
-            for kernel_type in kernel_type_to_analysis:
-                gpu_kernel_time = gpu_user_annotation_kernels[
-                    gpu_user_annotation_kernels["kernel_type"] == kernel_type
-                ]
-
-                if kernel_type not in kernel_per_rank:
-                    kernel_per_rank[kernel_type] = {}
-
-                gpu_kernel_time = cls._aggr_gpu_kernel_time(
-                    gpu_kernel_time,
-                    duration_ratio=duration_ratio,
-                    num_kernels=num_kernels,
-                )
-
-                kernel_per_rank[kernel_type][rank] = gpu_kernel_time
-
-                gpu_kernel_time["kernel_type"] = kernel_type
gpu_kernel_time["rank"] = int(rank) - all_kernel_df = pd.concat( - [all_kernel_df, gpu_kernel_time], ignore_index=True - ) - - kernel_type_df = kernel_type_df.groupby(by=["kernel_type"])["sum"].agg(["sum"]) - kernel_type_df.reset_index(inplace=True) - kernel_type_df.sort_values( - by=["sum"], ignore_index=True, inplace=True, ascending=False - ) - kernel_type_df["percentage"] = ( - kernel_type_df["sum"] / kernel_type_df["sum"].sum() - ) * 100 - kernel_type_df = kernel_type_df.round({"percentage": 1}) + all_kernel_df = pd.concat( + [all_kernel_df, gpu_kernel_time], ignore_index=True + ) - all_kernel_df.sort_values( - by=["kernel_type", "name", "rank"], ignore_index=True, inplace=True - ) + all_kernel_df.sort_values(by=["rank", "name"], ignore_index=True, inplace=True) all_kernel_df.rename( columns={ "sum": "sum (us)", @@ -453,80 +429,61 @@ def get_gpu_user_annotation_breakdown( ) if visualize: # pragma: no cover - non_zero_kernel_df = kernel_type_df[(kernel_type_df["percentage"] > 0)] - - fig = px.pie( - non_zero_kernel_df, - values="percentage", - names="kernel_type", - height=500, - title="Kernel Type Percentage Across All Ranks", + specs = [] + for count, rank in enumerate(kernel_per_rank): + if count % 2 == 0: + specs.append([{"type": "domain"}, {"type": "domain"}]) + fig = make_subplots( + rows=int((len(kernel_per_rank) + 1) / 2), + cols=2, + specs=specs, ) + for rank in kernel_per_rank: + fig.add_trace( + go.Pie( + labels=kernel_per_rank[rank]["name"], + values=kernel_per_rank[rank]["sum"], + title=f"Rank {rank}", + automargin=False, + ), + int(rank / 2) + 1, + int(rank % 2) + 1, + ) + image_size_multiplier = 1 + (len(t.traces.keys())) / 2 fig.update_layout( + title_text="User annotation distribution on each rank", margin=dict(l=50, r=50, b=50, t=50), showlegend=True, - legend=dict(yanchor="bottom", y=-0.4, xanchor="left", x=0), + height=400 * image_size_multiplier, + legend=dict(yanchor="bottom", y=-0.1, xanchor="left", x=0), ) fig.show(renderer=image_renderer) - for kernel in kernel_per_rank: - specs = [] - for count, rank in enumerate(kernel_per_rank[kernel]): - if count % 2 == 0: - specs.append([{"type": "domain"}, {"type": "domain"}]) - fig = make_subplots( - rows=int((len(kernel_per_rank[kernel]) + 1) / 2), - cols=2, - specs=specs, + kernel_name = all_kernel_df["name"].unique() + for name in kernel_name: + if name == "others": + continue + kernel_name_df = all_kernel_df[all_kernel_df["name"].eq(name)] + fig = px.bar( + kernel_name_df, + x="rank", + y="mean (us)", + title=name, + labels={ + "rank": "Rank", + "mean (us)": "Mean Duration (us)", + }, + error_y=kernel_name_df["max (us)"] - kernel_name_df["mean (us)"], + error_y_minus=kernel_name_df["mean (us)"] + - kernel_name_df["min (us)"], ) - for rank in kernel_per_rank[kernel]: - fig.add_trace( - go.Pie( - labels=kernel_per_rank[kernel][rank]["name"], - values=kernel_per_rank[kernel][rank]["sum"], - title=f"Rank {rank}", - automargin=False, - ), - int(rank / 2) + 1, - int(rank % 2) + 1, - ) - image_size_multiplier = 1 + (len(t.traces.keys())) / 2 fig.update_layout( - title_text=f'Kernel type "{kernel}" - kernel distribution on each rank', - margin=dict(l=50, r=50, b=50, t=50), - showlegend=True, - height=400 * image_size_multiplier, - legend=dict(yanchor="bottom", y=-0.1, xanchor="left", x=0), + title_text=f"User annotation = {name}", + xaxis=dict(tickmode="linear", tick0=0, dtick=1), ) fig.show(renderer=image_renderer) - kernel_df = all_kernel_df[all_kernel_df["kernel_type"].eq(kernel)] - - kernel_name = 
kernel_df["name"].unique() - for name in kernel_name: - if name != "others": - kernel_name_df = kernel_df[kernel_df["name"].eq(name)] - fig = px.bar( - kernel_name_df, - x="rank", - y="mean (us)", - title=name, - labels={ - "rank": "Rank", - "mean (us)": "Mean Duration (us)", - }, - error_y=kernel_name_df["max (us)"] - - kernel_name_df["mean (us)"], - error_y_minus=kernel_name_df["mean (us)"] - - kernel_name_df["min (us)"], - ) - fig.update_layout( - title_text=f'Kernel type "{kernel}" - {name}', - xaxis=dict(tickmode="linear", tick0=0, dtick=1), - ) - fig.show(renderer=image_renderer) - - return kernel_type_df, all_kernel_df + return all_kernel_df @classmethod def _get_gpu_kernel_type_time( @@ -611,7 +568,7 @@ def _aggr_gpu_kernel_time( gpu_kernel_time = gpu_kernel_time.sort_values( by=["sum"], ascending=False, ignore_index=True ) - gpu_kernel_time["std"].fillna(0, inplace=True) + gpu_kernel_time.fillna({"std": 0}, inplace=True) # if there are more than num_kernels kernels, starting to aggregate kernels if gpu_kernel_time.shape[0] > num_kernels: @@ -628,7 +585,7 @@ def _aggr_gpu_kernel_time( ["sum", "max", "min", "mean", "std"] ) gpu_kernel_time.reset_index(inplace=True) - gpu_kernel_time["std"].fillna(0, inplace=True) + gpu_kernel_time.fillna({"std": 0}, inplace=True) return gpu_kernel_time diff --git a/hta/trace_analysis.py b/hta/trace_analysis.py index 6824163..e908759 100644 --- a/hta/trace_analysis.py +++ b/hta/trace_analysis.py @@ -119,7 +119,7 @@ def get_gpu_kernel_breakdown( duration_ratio: float = 0.8, num_kernels: int = 10, include_memory_kernels: bool = True, - image_renderer: str = "notebook", + image_renderer: str = "", ) -> Tuple[pd.DataFrame, pd.DataFrame]: r""" Summarizes the time spent by each kernel and by kernel type. Outputs the following graphs: @@ -187,43 +187,42 @@ def get_gpu_kernels_with_user_annotations( def get_gpu_user_annotation_breakdown( self, + use_gpu_annotation: bool = True, visualize: bool = True, duration_ratio: float = 0.8, - num_kernels: int = 10, - include_memory_kernels: bool = True, - image_renderer: str = "notebook", - ) -> Tuple[pd.DataFrame, pd.DataFrame]: + num_kernels: int = 1000, + image_renderer: Optional[str] = None, + ) -> Optional[pd.DataFrame]: r""" - Summarizes the time spent by each kernel and by kernel type. Outputs the following graphs: + Summarizes the time spent by each GPU user annotation. Outputs the following graphs: - 1. Pie chart indicating the percentage of time taken by each kernel type. - 2. Pie charts showing the most time consuming kernels for each rank for each kernel type. - 3. Bar graphs showing the average duration for the most time consuming kernels for each rank and each kernel type. + 1. Pie charts showing the most time consuming user annotations for each rank. + 2. Bar graphs showing the average duration for the most time user annotations for each rank. Args: + use_gpu_annotation (boolean): Use time on GPU for each user annotation, if false use the time on CPU instead. Default = True, visualize (boolean): Set to True to display the graphs. Default = True. duration_ratio (float): Floating point value between 0 and 1 specifying the ratio of time taken - by top COMM/COMP/MEMORY kernels. Default = 0.8. - num_kernels (int): Maximum number of COMM/COMP/MEMORY kernels to show. Default = 10. - include_memory_kernels (bool): Whether to include MEMORY kernels in the analysis. Default = True. + by top user annotations. Default = 0.8. + num_kernels (int): Maximum number of user annotations to show. Default = 1000. 

diff --git a/hta/trace_analysis.py b/hta/trace_analysis.py
index 6824163..e908759 100644
--- a/hta/trace_analysis.py
+++ b/hta/trace_analysis.py
@@ -119,7 +119,7 @@ def get_gpu_kernel_breakdown(
         duration_ratio: float = 0.8,
         num_kernels: int = 10,
         include_memory_kernels: bool = True,
-        image_renderer: str = "notebook",
+        image_renderer: str = "",
     ) -> Tuple[pd.DataFrame, pd.DataFrame]:
         r"""
         Summarizes the time spent by each kernel and by kernel type. Outputs the following graphs:
@@ -187,43 +187,42 @@ def get_gpu_kernels_with_user_annotations(
 
     def get_gpu_user_annotation_breakdown(
         self,
+        use_gpu_annotation: bool = True,
         visualize: bool = True,
         duration_ratio: float = 0.8,
-        num_kernels: int = 10,
-        include_memory_kernels: bool = True,
-        image_renderer: str = "notebook",
-    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        num_kernels: int = 1000,
+        image_renderer: Optional[str] = None,
+    ) -> Optional[pd.DataFrame]:
         r"""
-        Summarizes the time spent by each kernel and by kernel type. Outputs the following graphs:
+        Summarizes the time spent by each GPU user annotation. Outputs the following graphs:
 
-        1. Pie chart indicating the percentage of time taken by each kernel type.
-        2. Pie charts showing the most time consuming kernels for each rank for each kernel type.
-        3. Bar graphs showing the average duration for the most time consuming kernels for each rank and each kernel type.
+        1. Pie charts showing the most time consuming user annotations for each rank.
+        2. Bar graphs showing the average duration of the most time consuming user annotations for each rank.
 
         Args:
+            use_gpu_annotation (boolean): Use the time on the GPU for each user annotation; if False, use the time on the CPU instead. Default = True.
             visualize (boolean): Set to True to display the graphs. Default = True.
             duration_ratio (float): Floating point value between 0 and 1 specifying the ratio of time taken
-                by top COMM/COMP/MEMORY kernels. Default = 0.8.
-            num_kernels (int): Maximum number of COMM/COMP/MEMORY kernels to show. Default = 10.
-            include_memory_kernels (bool): Whether to include MEMORY kernels in the analysis. Default = True.
+                by the top user annotations. Default = 0.8.
+            num_kernels (int): Maximum number of user annotations to show. Default = 1000. The rest are grouped into "others".
             image_renderer (str): Set to ``notebook`` when using jupyter and ``jupyterlab`` when using jupyter-lab.
                 To see all available options execute: ``import plotly; plotly.io.renderers`` in a python shell.
 
         Returns:
-            Tuple[pd.DataFrame, pd.DataFrame]
-                Returns two dataframes. The first dataframe shows the percentage of time spent by kernel type.
-                The second dataframe shows the min, max, mean, standard deviation, total time taken by each
-                kernel on each rank. This dataframe will be summarized based on values of ``duration_ratio``
+            Optional[pd.DataFrame]
+                Returns a dataframe that shows the min, max, mean, standard deviation, and total time taken by each
+                user annotation on each rank. This dataframe will be summarized based on the values of ``duration_ratio``
                 and ``num_kernels``. If both ``duration_ratio`` and ``num_kernels`` are specified,
                 ``num_kernels`` takes precedence.
+                If user annotations are not present on the CPU or GPU (per the ``use_gpu_annotation`` flag), returns None.
         """
 
         return BreakdownAnalysis.get_gpu_user_annotation_breakdown(
             self.t,
+            use_gpu_annotation,
             visualize,
             duration_ratio,
             num_kernels,
-            include_memory_kernels,
             image_renderer,
         )
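
For context on what the new tests below verify: the breakdown ultimately reduces each rank's annotation events to per-name duration statistics. A simplified, self-contained sketch with made-up numbers (not the exact `_aggr_gpu_kernel_time` implementation, which additionally folds the tail into an "others" row according to `duration_ratio` and `num_kernels`):

```python
import pandas as pd

# Made-up user annotation durations (us) for one rank.
events = pd.DataFrame(
    {
        "name": ["fwd", "fwd", "bwd", "opt", "opt"],
        "dur": [100, 120, 300, 40, 60],
    }
)

# Per-annotation stats, matching the columns of the returned dataframe.
stats = (
    events.groupby("name")["dur"]
    .agg(["sum", "max", "min", "mean", "std"])
    .reset_index()
    .sort_values("sum", ascending=False, ignore_index=True)
)
print(stats)  # bwd: sum=300; fwd: sum=220, mean=110; opt: sum=100, mean=50
```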
diff --git a/tests/test_trace_analysis.py b/tests/test_trace_analysis.py
index a6f31bd..734f60e 100644
--- a/tests/test_trace_analysis.py
+++ b/tests/test_trace_analysis.py
@@ -12,6 +12,7 @@
 from unittest.mock import patch
 
 import hta
+import pandas as pd
 from hta.common.trace import PHASE_COUNTER
 from hta.trace_analysis import TimeSeriesTypes, TraceAnalysis
 
@@ -285,12 +286,6 @@ def test_get_gpu_kernel_breakdown(self):
         self.assertEqual(kernel_breakdown.iloc[151]["kernel_type"], "MEMORY")
         self.assertEqual(kernel_breakdown.iloc[151]["sum (us)"], 1064)
 
-        # Negative test as this trace does not have gpu user annotations
-        gpu_kernels_df = (
-            self.vision_transformer_t.get_gpu_kernels_with_user_annotations(rank=0)
-        )
-        self.assertIsNone(gpu_kernels_df)
-
     def test_get_mtia_kernel_breakdown(self):
         (
             kernel_type_breakdown,
@@ -306,6 +301,51 @@ def test_get_mtia_kernel_breakdown(self):
         self.assertEqual(kernel_breakdown.iloc[11]["kernel_type"], "MEMORY")
         self.assertEqual(kernel_breakdown.iloc[11]["sum (us)"], 400892.0)
 
+    def __test_gpu_user_annotation_common(
+        self, use_gpu_annotation: bool, expected_rows: int
+    ) -> None:
+        analyzer = self.ns_resolution_t
+        gpu_user_anno_df = analyzer.get_gpu_user_annotation_breakdown(
+            visualize=False, num_kernels=1000, use_gpu_annotation=use_gpu_annotation
+        )
+
+        self.assertEqual(len(gpu_user_anno_df), expected_rows)
+
+        annotation = "gpu_user_annotation" if use_gpu_annotation else "user_annotation"
+        idx = analyzer.t.symbol_table.sym_index[annotation]
+        trace_df = analyzer.t.get_trace(0)
+        analyzer.t.symbol_table.add_symbols_to_trace_df(trace_df, "name")
+        ref_sum_df = (
+            trace_df[trace_df.cat == idx][["name", "dur"]]
+            .groupby("name")["dur"]
+            .sum()
+            .reset_index()
+        )
+        ref_mean_df = (
+            trace_df[trace_df.cat == idx][["name", "dur"]]
+            .groupby("name")["dur"]
+            .mean()
+            .reset_index()
+        )
+        pd.testing.assert_frame_equal(
+            gpu_user_anno_df[["name", "sum (us)"]],
+            ref_sum_df.rename(columns={"dur": "sum (us)"}),
+            check_dtype=False,
+        )
+        pd.testing.assert_frame_equal(
+            gpu_user_anno_df[["name", "mean (us)"]],
+            ref_mean_df.rename(columns={"dur": "mean (us)"}),
+            check_dtype=False,
+        )
+
+    def test_gpu_user_annotation_breakdown(self):
+        self.__test_gpu_user_annotation_common(use_gpu_annotation=True, expected_rows=3)
+
+    def test_cpu_user_annotation_breakdown(self):
+        self.__test_gpu_user_annotation_common(
+            use_gpu_annotation=False, expected_rows=12
+        )
+
     def test_get_gpu_kernels_with_user_annotations(self):
         gpu_kernels_df = self.ns_resolution_t.get_gpu_kernels_with_user_annotations(
             rank=0,
@@ -338,6 +378,12 @@ def test_get_gpu_kernels_with_user_annotations(self):
             row1["s_name"].item(), "at::native::::multi_tensor_apply_kernel"
         )
 
+        # Negative test as this trace does not have gpu user annotations
+        gpu_kernels_df = (
+            self.vision_transformer_t.get_gpu_kernels_with_user_annotations(rank=0)
+        )
+        self.assertIsNone(gpu_kernels_df)
+
     def test_get_queue_length_stats(self):
         qd_summary = self.vision_transformer_t.get_queue_length_summary(ranks=[0])
         streams = qd_summary.index.to_list()