diff --git a/README.md b/README.md index 7ebad3e..508623f 100644 --- a/README.md +++ b/README.md @@ -137,6 +137,48 @@ When enabled, two histogram charts are displayed alongside the results table: Both charts display separate lines for reads, writes, and all operations combined, making it easy to identify patterns in your code's I/O behavior. +### Heatmap Visualization + +Use the `--heatmap` flag to visualize I/O operations as a time-series heatmap (available for `strace` and `fs_usage` measurement modes): + +**Example - Visualizing I/O patterns over time:** +```python +%%iops --heatmap +import tempfile +import os +import time + +test_dir = tempfile.mkdtemp() + +try: + # Phase 1: Small writes over time + for i in range(5): + with open(os.path.join(test_dir, f'small_{i}.txt'), 'w') as f: + f.write('x' * 1024) # 1 KB + time.sleep(0.05) + + # Phase 2: Large writes over time + for i in range(5): + with open(os.path.join(test_dir, f'large_{i}.txt'), 'w') as f: + f.write('z' * (100 * 1024)) # 100 KB + time.sleep(0.05) + +finally: + import shutil + if os.path.exists(test_dir): + shutil.rmtree(test_dir) +``` + +When enabled, two heatmap heatmaps are displayed alongside the results table: +1. **Operation Count Over Time**: Shows when I/O operations of different sizes occurred (X-axis: time, Y-axis: operation size in log scale, Color: operation count) +2. **Total Bytes Over Time**: Shows data transfer patterns over time (X-axis: time, Y-axis: operation size in log scale, Color: total bytes) + +The heatmap visualization helps identify: +- Temporal patterns in I/O behavior +- When different I/O sizes occur during execution +- Bursts or gaps in I/O activity +- Correlation between application phases and I/O patterns + ## Platform Support - **Linux/Windows**: Uses `psutil` for per-process I/O tracking @@ -147,8 +189,8 @@ Both charts display separate lines for reads, writes, and all operations combine - Python 3.10+ - IPython/Jupyter - psutil -- matplotlib (for histogram visualization) -- numpy (for histogram visualization) +- matplotlib (for histogram and heatmap visualization) +- numpy (for histogram and heatmap visualization) ## Dev Guide - Getting Started diff --git a/docs/conf.py b/docs/conf.py index 52f8c23..b864e9f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,7 +24,12 @@ # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -extensions = ["sphinx.ext.mathjax", "sphinx.ext.napoleon", "sphinx.ext.viewcode"] +extensions = [ + "sphinx.ext.mathjax", + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", + "nbsphinx", # For executing and rendering Jupyter notebooks +] extensions.append("autoapi.extension") extensions.append("nbsphinx") @@ -61,4 +66,12 @@ autoapi_add_toc_tree_entry = False autoapi_member_order = "bysource" +# -- nbsphinx configuration ------------------------------------------------- +# Execute notebooks before conversion +nbsphinx_execute = "always" +# Allow errors in notebook execution (for documentation purposes) +nbsphinx_allow_errors = True +# Timeout for notebook execution (in seconds) +nbsphinx_timeout = 600 + html_theme = "sphinx_rtd_theme" diff --git a/docs/index.rst b/docs/index.rst index e9b2a5a..b0bd79c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -85,4 +85,3 @@ Notes: into documentation for ReadTheDocs works as expected. For more information, see the Python Project Template documentation on `Sphinx and Python Notebooks `_. - diff --git a/docs/notebooks.rst b/docs/notebooks.rst index 0a70941..ea692c1 100644 --- a/docs/notebooks.rst +++ b/docs/notebooks.rst @@ -9,6 +9,7 @@ These Jupyter notebooks demonstrate the key features of iops-profiler with pract notebooks/basic_usage notebooks/histogram_visualization + notebooks/heatmap_visualization Running the Notebooks --------------------- @@ -72,6 +73,24 @@ Explore the histogram feature for visualizing I/O patterns: **Note:** Histogram mode is available on Linux and macOS, but not Windows. +Time-Series Heatmap Visualization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:doc:`notebooks/heatmap_visualization` + +Explore the heatmap feature for visualizing I/O patterns over time: + +- Enabling heatmap mode with ``--heatmap`` +- Understanding when operations of different sizes occur +- Analyzing temporal I/O patterns and bursts +- Identifying phases of different I/O behavior +- Comparing heatmap with histogram views +- Real-world temporal analysis examples + +**Recommended for:** Users who want to understand how I/O behavior changes during program execution. + +**Note:** Heatmap mode is available on Linux (with strace) and macOS (with fs_usage), but not Windows. + Additional Resources -------------------- diff --git a/docs/notebooks/heatmap_visualization.ipynb b/docs/notebooks/heatmap_visualization.ipynb new file mode 100644 index 0000000..c549d28 --- /dev/null +++ b/docs/notebooks/heatmap_visualization.ipynb @@ -0,0 +1,239 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Heatmap Visualization Example\n", + "\n", + "This notebook demonstrates the heatmap feature of iops-profiler, which provides a time-series heatmap view of I/O operations.\n", + "\n", + "The heatmap shows:\n", + "- **X-axis**: Time (runtime in seconds)\n", + "- **Y-axis**: Operation size (bytes, log scale)\n", + "- **Color**: Either the number of operations or total bytes transferred\n", + "\n", + "This visualization helps identify I/O patterns over time, showing when different sizes of operations occur during program execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the iops-profiler extension\n", + "%load_ext iops_profiler" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 1: Simple I/O with Varying Sizes\n", + "\n", + "Let's create a workload that performs I/O operations of different sizes over time:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%iops --heatmap\n", + "import tempfile\n", + "import os\n", + "import time\n", + "\n", + "# Create a temporary directory for our test files\n", + "test_dir = tempfile.mkdtemp()\n", + "\n", + "try:\n", + " # Phase 1: Small writes (1 KB each)\n", + " for i in range(5):\n", + " with open(os.path.join(test_dir, f'small_{i}.txt'), 'w') as f:\n", + " f.write('x' * 1024) # 1 KB\n", + " time.sleep(0.05) # Small delay to spread operations over time\n", + " \n", + " # Phase 2: Medium writes (10 KB each)\n", + " for i in range(5):\n", + " with open(os.path.join(test_dir, f'medium_{i}.txt'), 'w') as f:\n", + " f.write('y' * (10 * 1024)) # 10 KB\n", + " time.sleep(0.05)\n", + " \n", + " # Phase 3: Large writes (100 KB each)\n", + " for i in range(5):\n", + " with open(os.path.join(test_dir, f'large_{i}.txt'), 'w') as f:\n", + " f.write('z' * (100 * 1024)) # 100 KB\n", + " time.sleep(0.05)\n", + " \n", + " # Phase 4: Mixed reads - read back all files in order\n", + " for size_category in ['small', 'medium', 'large']:\n", + " for i in range(5):\n", + " filepath = os.path.join(test_dir, f'{size_category}_{i}.txt')\n", + " if os.path.exists(filepath):\n", + " with open(filepath, 'r') as f:\n", + " _ = f.read()\n", + " time.sleep(0.03)\n", + "\n", + "finally:\n", + " # Cleanup\n", + " import shutil\n", + " if os.path.exists(test_dir):\n", + " shutil.rmtree(test_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The heatmap above shows:\n", + "\n", + "1. **Left plot (Operation Count)**: How many I/O operations occurred in each time/size bin\n", + "2. **Right plot (Total Bytes)**: How much data was transferred in each time/size bin\n", + "\n", + "You can see distinct phases:\n", + "- Early time: Small operations (1 KB writes)\n", + "- Middle time: Medium operations (10 KB writes)\n", + "- Later time: Large operations (100 KB writes)\n", + "- Final time: Mixed reads of all sizes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 2: Bursty I/O Pattern\n", + "\n", + "Let's create a workload with bursts of activity at different scales:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%iops --heatmap\n", + "import tempfile\n", + "import os\n", + "import time\n", + "\n", + "test_dir = tempfile.mkdtemp()\n", + "\n", + "try:\n", + " # Burst 1: Many small operations\n", + " for i in range(20):\n", + " with open(os.path.join(test_dir, f'burst1_{i}.txt'), 'w') as f:\n", + " f.write('a' * 512) # 512 bytes\n", + " \n", + " # Pause\n", + " time.sleep(0.2)\n", + " \n", + " # Burst 2: Few large operations\n", + " for i in range(3):\n", + " with open(os.path.join(test_dir, f'burst2_{i}.txt'), 'w') as f:\n", + " f.write('b' * (200 * 1024)) # 200 KB\n", + " \n", + " # Pause\n", + " time.sleep(0.2)\n", + " \n", + " # Burst 3: Medium-sized operations\n", + " for i in range(10):\n", + " with open(os.path.join(test_dir, f'burst3_{i}.txt'), 'w') as f:\n", + " f.write('c' * (5 * 1024)) # 5 KB\n", + "\n", + "finally:\n", + " import shutil\n", + " if os.path.exists(test_dir):\n", + " shutil.rmtree(test_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This heatmap shows a bursty pattern with clear gaps between activity phases. The color intensity helps identify which phases had the most activity or transferred the most data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Comparing with Histogram View\n", + "\n", + "For comparison, here's the same workload with the histogram view:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%iops --histogram\n", + "import tempfile\n", + "import os\n", + "import time\n", + "\n", + "test_dir = tempfile.mkdtemp()\n", + "\n", + "try:\n", + " # Same workload as Example 1\n", + " for i in range(5):\n", + " with open(os.path.join(test_dir, f'small_{i}.txt'), 'w') as f:\n", + " f.write('x' * 1024)\n", + " time.sleep(0.05)\n", + " \n", + " for i in range(5):\n", + " with open(os.path.join(test_dir, f'medium_{i}.txt'), 'w') as f:\n", + " f.write('y' * (10 * 1024))\n", + " time.sleep(0.05)\n", + " \n", + " for i in range(5):\n", + " with open(os.path.join(test_dir, f'large_{i}.txt'), 'w') as f:\n", + " f.write('z' * (100 * 1024))\n", + " time.sleep(0.05)\n", + "\n", + "finally:\n", + " import shutil\n", + " if os.path.exists(test_dir):\n", + " shutil.rmtree(test_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The histogram shows the distribution of operation sizes but doesn't reveal when they occurred. The heatmap adds the temporal dimension, making it easier to:\n", + "\n", + "- Identify when different I/O patterns occur during execution\n", + "- Spot bursts or gaps in I/O activity\n", + "- Correlate I/O behavior with application phases\n", + "- Debug performance issues related to I/O timing" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/iops_heatmap.png b/iops_heatmap.png new file mode 100644 index 0000000..b49a5e8 Binary files /dev/null and b/iops_heatmap.png differ diff --git a/iops_spectrogram.png b/iops_spectrogram.png new file mode 100644 index 0000000..1cfece7 Binary files /dev/null and b/iops_spectrogram.png differ diff --git a/src/iops_profiler/collector.py b/src/iops_profiler/collector.py index 47d8348..e3f7c0d 100644 --- a/src/iops_profiler/collector.py +++ b/src/iops_profiler/collector.py @@ -60,29 +60,54 @@ def __init__(self, shell): # Compile regex patterns for better performance # Pattern matches: PID syscall(args) = result self._strace_pattern = re.compile(r"^\s*(\d+)\s+(\w+)\([^)]+\)\s*=\s*(-?\d+)") + # Pattern matches strace with timestamp: TIMESTAMP PID syscall(args) = result + self._strace_timestamp_pattern = re.compile(r"^\s*(\d+\.\d+)\s+(\d+)\s+(\w+)\([^)]+\)\s*=\s*(-?\d+)") # Pattern matches: B=0x[hex] in fs_usage output self._fs_usage_byte_pattern = re.compile(FS_USAGE_BYTE_PATTERN) + # Pattern matches fs_usage timestamp at start of line (HH:MM:SS or HH:MM:SS.ffffff format) + self._fs_usage_timestamp_pattern = re.compile(r"^\s*(\d{2}:\d{2}:\d{2}(?:\.\d+)?)") # Set of syscall names for I/O operations (lowercase) self._io_syscalls = set(STRACE_IO_SYSCALLS) @staticmethod - def parse_fs_usage_line_static(line, byte_pattern=None, collect_ops=False): + def parse_fs_usage_line_static(line, byte_pattern=None, collect_ops=False, timestamp_pattern=None): """Parse a single fs_usage output line for I/O operations (static version) Args: line: The line to parse byte_pattern: Compiled regex pattern for extracting byte count (optional) collect_ops: If True, return full operation info for histogram collection + timestamp_pattern: Compiled regex pattern for extracting timestamp (optional) Returns: If collect_ops is False: (op_type, bytes_transferred) - If collect_ops is True: {'type': op_type, 'bytes': bytes_transferred} + If collect_ops is True: {'type': op_type, 'bytes': bytes_transferred, 'timestamp': timestamp} """ parts = line.split() if len(parts) < 2: return None if collect_ops else (None, 0) - syscall = parts[1].lower() + # Check if line starts with timestamp (HH:MM:SS.ffffff or HH:MM:SS format) + # Timestamp detection: if first token contains colons and matches time pattern + timestamp = None + syscall_index = 0 + if parts[0] and ':' in parts[0]: + # Try to match timestamp pattern if provided + if timestamp_pattern: + ts_match = timestamp_pattern.match(line) + if ts_match: + timestamp = ts_match.group(1) + syscall_index = 1 + else: + # Fallback: check if first token looks like a timestamp (HH:MM:SS format) + # Pattern: digits:digits:digits (optionally followed by .digits) + # Compile pattern for reuse within this conditional + simple_ts_pattern = re.compile(r"^\d{1,2}:\d{2}:\d{2}(?:\.\d+)?$") + if simple_ts_pattern.match(parts[0]): + timestamp = parts[0] if collect_ops else None + syscall_index = 1 + + syscall = parts[syscall_index].lower() is_read = "read" in syscall is_write = "write" in syscall @@ -100,7 +125,10 @@ def parse_fs_usage_line_static(line, byte_pattern=None, collect_ops=False): op_type = "read" if is_read else "write" if collect_ops: - return {"type": op_type, "bytes": bytes_transferred} + result = {"type": op_type, "bytes": bytes_transferred} + if timestamp: + result["timestamp"] = timestamp + return result return op_type, bytes_transferred def parse_fs_usage_line(self, line, collect_ops=False): @@ -108,10 +136,14 @@ def parse_fs_usage_line(self, line, collect_ops=False): This is a convenience wrapper that uses the instance's compiled byte pattern. """ - return self.parse_fs_usage_line_static(line, self._fs_usage_byte_pattern, collect_ops) + return self.parse_fs_usage_line_static( + line, self._fs_usage_byte_pattern, collect_ops, self._fs_usage_timestamp_pattern + ) @staticmethod - def parse_strace_line_static(line, strace_pattern, io_syscalls, collect_ops=False): + def parse_strace_line_static( + line, strace_pattern, io_syscalls, collect_ops=False, strace_timestamp_pattern=None + ): """Parse a single strace output line for I/O operations (static version) Example strace lines: @@ -119,25 +151,40 @@ def parse_strace_line_static(line, strace_pattern, io_syscalls, collect_ops=Fals 3385 read(3, "data", 4096) = 133 3385 pread64(3, "...", 1024, 0) = 1024 + Example strace lines with timestamps: + 1234567890.123456 3385 write(3, "Hello World...", 1100) = 1100 + 1234567890.123457 3385 read(3, "data", 4096) = 133 + Note: Lines with or <... resumed> are not matched as they don't contain complete result information in a single line. Args: line: The line to parse - strace_pattern: Compiled regex pattern for strace output + strace_pattern: Compiled regex pattern for strace output (without timestamp) io_syscalls: Set of I/O syscall names to track collect_ops: If True, return full operation info for histogram collection + strace_timestamp_pattern: Compiled regex pattern for strace with timestamp (optional) Returns: If collect_ops is False: (op_type, bytes_transferred) - If collect_ops is True: {'type': op_type, 'bytes': bytes_transferred} + If collect_ops is True: {'type': op_type, 'bytes': bytes_transferred, 'timestamp': timestamp} """ - # Match patterns like: PID syscall(fd, ..., size) = result - match = strace_pattern.match(line) + timestamp = None + match = None + + # Try to match with timestamp pattern first if provided + if strace_timestamp_pattern: + match = strace_timestamp_pattern.match(line) + if match: + timestamp, pid, syscall, result = match.groups() + + # If no timestamp match, try regular pattern if not match: - return None if collect_ops else (None, 0) + match = strace_pattern.match(line) + if not match: + return None if collect_ops else (None, 0) + pid, syscall, result = match.groups() - pid, syscall, result = match.groups() syscall = syscall.lower() # Check if it's one of the I/O syscalls we're tracking @@ -160,7 +207,10 @@ def parse_strace_line_static(line, strace_pattern, io_syscalls, collect_ops=Fals op_type = "read" if is_read else "write" if collect_ops: - return {"type": op_type, "bytes": bytes_transferred} + result = {"type": op_type, "bytes": bytes_transferred} + if timestamp: + result["timestamp"] = timestamp + return result return op_type, bytes_transferred def parse_strace_line(self, line, collect_ops=False): @@ -168,7 +218,9 @@ def parse_strace_line(self, line, collect_ops=False): This is a convenience wrapper that uses the instance's strace pattern and syscalls. """ - return self.parse_strace_line_static(line, self._strace_pattern, self._io_syscalls, collect_ops) + return self.parse_strace_line_static( + line, self._strace_pattern, self._io_syscalls, collect_ops, self._strace_timestamp_pattern + ) @staticmethod def _create_helper_script(pid, output_file, control_file): @@ -385,6 +437,7 @@ def measure_linux_strace(self, code, collect_ops=False): strace_cmd = [ "strace", "-f", # Follow forks + "-ttt", # Add absolute timestamps as Unix epoch time with microsecond precision "-e", f"trace={syscalls_to_trace}", "-o", diff --git a/src/iops_profiler/display.py b/src/iops_profiler/display.py index 984db2e..15010ae 100644 --- a/src/iops_profiler/display.py +++ b/src/iops_profiler/display.py @@ -17,6 +17,12 @@ from IPython.display import HTML, display +# Constants for heatmap generation +HEATMAP_SIZE_BIN_EXPANSION = 0.01 # 1% expansion for size bins (0.99 to 1.01) +HEATMAP_SINGLE_VALUE_EXPANSION = 0.1 # 10% expansion for single value (0.9 to 1.1) +HEATMAP_SIZE_BINS = 30 # Number of bins for operation size (log scale) +HEATMAP_TIME_BINS = 50 # Number of bins for time + def is_notebook_environment(): """Detect if running in a graphical notebook environment vs plain IPython. @@ -56,6 +62,181 @@ def format_bytes(bytes_val): return f"{bytes_val:.2f} TB" +def format_time_axis(max_time_seconds): + """Determine appropriate time unit and formatting for axis display. + + Args: + max_time_seconds: Maximum time value in seconds + + Returns: + tuple: (time_unit_name, time_divisor, decimal_places) + """ + if max_time_seconds >= 1.0: + # Use seconds + return "s", 1.0, 2 + elif max_time_seconds >= 0.001: + # Use milliseconds + return "ms", 0.001, 1 + elif max_time_seconds >= 0.000001: + # Use microseconds + return "μs", 0.000001, 1 + else: + # Use nanoseconds + return "ns", 0.000000001, 0 + + +def generate_heatmap(operations, elapsed_time): + """Generate time-series heatmaps for I/O operations over time + + Args: + operations: List of dicts with 'type', 'bytes', and 'timestamp' keys + elapsed_time: Total elapsed time of the profiled code + """ + if not plt or not np: + print("⚠️ matplotlib or numpy not available. Cannot generate heatmaps.") + return + + if not operations: + print("⚠️ No operations captured for heatmap generation.") + return + + # Filter operations with timestamps and non-zero bytes + ops_with_time = [op for op in operations if "timestamp" in op and op["bytes"] > 0] + + if not ops_with_time: + print("⚠️ No operations with timestamps for heatmap generation.") + return + + # Convert timestamps to relative time (seconds from start) + # Handle both Unix timestamp format (strace) and HH:MM:SS.ffffff format (fs_usage) + start_timestamp = None + relative_times = [] + + for op in ops_with_time: + ts_str = op["timestamp"] + # Check if it's Unix timestamp format (decimal number) + if ":" not in ts_str: + # Unix timestamp from strace + timestamp = float(ts_str) + if start_timestamp is None: + start_timestamp = timestamp + relative_times.append(timestamp - start_timestamp) + else: + # HH:MM:SS.ffffff format from fs_usage + # Parse the time format + parts = ts_str.split(":") + if len(parts) == 3: + hours = float(parts[0]) + minutes = float(parts[1]) + seconds = float(parts[2]) + timestamp = hours * 3600 + minutes * 60 + seconds + if start_timestamp is None: + start_timestamp = timestamp + relative_times.append(timestamp - start_timestamp) + + # Handle case where no valid timestamps were extracted + if not relative_times: + print("⚠️ Could not parse timestamps for heatmap generation.") + return + + # Extract byte sizes + byte_sizes = [op["bytes"] for op in ops_with_time] + + # Create log-scale bins for byte sizes + min_bytes = max(1, min(byte_sizes)) + max_bytes = max(byte_sizes) + + if min_bytes == max_bytes: + # Single value case: expand range to create meaningful bins + expansion_factor = 1 - HEATMAP_SINGLE_VALUE_EXPANSION + size_bins = np.array([min_bytes * expansion_factor, min_bytes / expansion_factor]) + else: + # Create bins in log space - using fewer bins for heatmap + expansion_factor = 1 - HEATMAP_SIZE_BIN_EXPANSION + size_bins = np.logspace( + np.log10(min_bytes * expansion_factor), + np.log10(max_bytes / expansion_factor), + HEATMAP_SIZE_BINS, + ) + + # Create time bins + max_time = max(relative_times) + if max_time == 0: + max_time = elapsed_time + time_bins = np.linspace(0, max_time, HEATMAP_TIME_BINS) + + # Determine appropriate time unit and formatting for axis + time_unit, time_divisor, decimal_places = format_time_axis(max_time) + + # Create 2D histograms for operation counts + all_count_hist, time_edges, size_edges = np.histogram2d( + relative_times, byte_sizes, bins=[time_bins, size_bins] + ) + + # Create 2D histograms for total bytes + all_bytes_hist, _, _ = np.histogram2d( + relative_times, byte_sizes, bins=[time_bins, size_bins], weights=byte_sizes + ) + + # Create figure with 2 subplots (operation count and total bytes) + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5)) + + # Plot 1: Operation count heatmap + # Use pcolormesh for heatmap visualization + time_centers = (time_edges[:-1] + time_edges[1:]) / 2 + size_centers = (size_edges[:-1] + size_edges[1:]) / 2 + # Convert time to appropriate unit for display + time_mesh, size_mesh = np.meshgrid(time_centers / time_divisor, size_centers) + + im1 = ax1.pcolormesh(time_mesh, size_mesh, all_count_hist.T, cmap="viridis", shading="auto") + ax1.set_yscale("log") + ax1.set_xlabel(f"Time ({time_unit})") + ax1.set_ylabel("Operation Size (bytes, log scale)") + ax1.set_title("I/O Operation Count Over Time") + plt.colorbar(im1, ax=ax1, label="Number of Operations") + ax1.grid(True, alpha=0.3) + + # Format x-axis tick labels with limited decimal places + ax1.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{x:.{decimal_places}f}')) + + # Plot 2: Total bytes spectrogram (with auto-scaling) + max_bytes_in_bin = np.max(all_bytes_hist) if all_bytes_hist.size > 0 else 0 + if max_bytes_in_bin < 1024: + unit, divisor = "B", 1 + elif max_bytes_in_bin < 1024**2: + unit, divisor = "KB", 1024 + elif max_bytes_in_bin < 1024**3: + unit, divisor = "MB", 1024**2 + elif max_bytes_in_bin < 1024**4: + unit, divisor = "GB", 1024**3 + else: + unit, divisor = "TB", 1024**4 + + im2 = ax2.pcolormesh(time_mesh, size_mesh, (all_bytes_hist / divisor).T, cmap="plasma", shading="auto") + ax2.set_yscale("log") + ax2.set_xlabel(f"Time ({time_unit})") + ax2.set_ylabel("Operation Size (bytes, log scale)") + ax2.set_title("I/O Total Bytes Over Time") + plt.colorbar(im2, ax=ax2, label=f"Total Bytes ({unit})") + ax2.grid(True, alpha=0.3) + + # Format x-axis tick labels with limited decimal places + ax2.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{x:.{decimal_places}f}')) + + plt.tight_layout() + + # Check if running in plain IPython vs notebook environment + if is_notebook_environment(): + # In notebook, show the plot inline + plt.show() + else: + # In plain IPython, save to file + output_file = "iops_heatmap.png" + plt.savefig(output_file, dpi=100, bbox_inches="tight") + plt.close(fig) + print(f"📊 Heatmap saved to: {output_file}") + + def generate_histograms(operations): """Generate histograms for I/O operations using numpy diff --git a/src/iops_profiler/magic.py b/src/iops_profiler/magic.py index 1970c3d..b99f122 100644 --- a/src/iops_profiler/magic.py +++ b/src/iops_profiler/magic.py @@ -30,19 +30,20 @@ def __init__(self, shell): # Initialize the collector with shell context self.collector = Collector(shell) - def _profile_code(self, code, show_histogram=False): + def _profile_code(self, code, show_histogram=False, show_heatmap=False): """ Internal method to profile code with I/O measurements. Args: code: The code string to profile show_histogram: Whether to generate histograms + show_heatmap: Whether to generate time-series heatmap Returns: Dictionary with profiling results """ # Determine if we should collect individual operations - collect_ops = show_histogram + collect_ops = show_histogram or show_heatmap # Determine measurement method based on platform if self.platform == "darwin": # macOS @@ -55,12 +56,16 @@ def _profile_code(self, code, show_histogram=False): results = self.collector.measure_systemwide_fallback(code) if show_histogram: print("⚠️ Histograms not available for system-wide measurement mode.") + if show_heatmap: + print("⚠️ Heatmaps not available for system-wide measurement mode.") else: print(f"⚠️ Could not start fs_usage: {e}") print("Falling back to system-wide measurement.\n") results = self.collector.measure_systemwide_fallback(code) if show_histogram: print("⚠️ Histograms not available for system-wide measurement mode.") + if show_heatmap: + print("⚠️ Heatmaps not available for system-wide measurement mode.") elif self.platform in ("linux", "linux2"): # Use strace on Linux (no elevated privileges required) @@ -72,11 +77,15 @@ def _profile_code(self, code, show_histogram=False): results = self.collector.measure_linux_windows(code) if show_histogram: print("⚠️ Histograms not available for psutil measurement mode.") + if show_heatmap: + print("⚠️ Heatmaps not available for psutil measurement mode.") elif self.platform == "win32": results = self.collector.measure_linux_windows(code) if show_histogram: print("⚠️ Histograms not available for psutil measurement mode on Windows.") + if show_heatmap: + print("⚠️ Heatmaps not available for psutil measurement mode on Windows.") else: print(f"⚠️ Platform '{self.platform}' not fully supported.") @@ -84,6 +93,8 @@ def _profile_code(self, code, show_histogram=False): results = self.collector.measure_systemwide_fallback(code) if show_histogram: print("⚠️ Histograms not available for system-wide measurement mode.") + if show_heatmap: + print("⚠️ Heatmaps not available for system-wide measurement mode.") return results @@ -95,6 +106,7 @@ def iops(self, line, cell=None): Line magic usage (single line): %iops open('test.txt', 'w').write('data') %iops --histogram open('test.txt', 'w').write('data') + %iops --heatmap open('test.txt', 'w').write('data') Cell magic usage (multiple lines): %%iops @@ -106,10 +118,16 @@ def iops(self, line, cell=None): # Your code here (with histograms) with open('test.txt', 'w') as f: f.write('data') + + %%iops --heatmap + # Your code here (with time-series heatmap) + with open('test.txt', 'w') as f: + f.write('data') """ try: # Parse command line arguments show_histogram = False + show_heatmap = False code = None # Determine what code to execute @@ -119,20 +137,24 @@ def iops(self, line, cell=None): if line_stripped == "--histogram" or line_stripped.startswith("--histogram "): show_histogram = True code = line_stripped[len("--histogram") :].strip() + elif line_stripped == "--heatmap" or line_stripped.startswith("--heatmap "): + show_heatmap = True + code = line_stripped[len("--heatmap") :].strip() else: code = line_stripped if not code: print("❌ Error: No code provided to profile in line magic mode.") - print(" Usage: %iops [--histogram] ") + print(" Usage: %iops [--histogram|--heatmap] ") return else: # Cell magic mode - code is in the cell parameter show_histogram = "--histogram" in line + show_heatmap = "--heatmap" in line code = cell # Profile the code - results = self._profile_code(code, show_histogram) + results = self._profile_code(code, show_histogram, show_heatmap) # Display results table display.display_results(results) @@ -141,6 +163,10 @@ def iops(self, line, cell=None): if show_histogram and "operations" in results: display.generate_histograms(results["operations"]) + # Display heatmap if requested and available + if show_heatmap and "operations" in results: + display.generate_heatmap(results["operations"], results["elapsed_time"]) + except Exception as e: print(f"❌ Error during IOPS profiling: {e}") print("\nYour code was not executed. Please fix the profiling issue and try again.") diff --git a/tests/test_heatmap.py b/tests/test_heatmap.py new file mode 100644 index 0000000..41dfc19 --- /dev/null +++ b/tests/test_heatmap.py @@ -0,0 +1,345 @@ +""" +Tests for heatmap generation and timestamp parsing in iops_profiler. + +This module focuses on testing heatmap generation and timestamp extraction +from strace and fs_usage output. +""" + +from unittest.mock import MagicMock, patch + +import pytest + +from iops_profiler import display +from iops_profiler.collector import Collector +from iops_profiler.magic import IOPSProfiler + + +def create_test_profiler(): + """Helper function to create a test profiler instance""" + mock_shell = MagicMock() + mock_shell.configurables = [] + profiler = IOPSProfiler.__new__(IOPSProfiler) + profiler.shell = mock_shell + # Initialize the profiler attributes manually to avoid traitlets + import sys + + profiler.platform = sys.platform + # Initialize the collector with the mock shell + profiler.collector = Collector(mock_shell) + return profiler + + +class TestTimestampParsing: + """Test cases for timestamp parsing in strace and fs_usage output""" + + @pytest.fixture + def collector(self): + """Create a Collector instance with a mock shell""" + mock_shell = MagicMock() + return Collector(mock_shell) + + def test_strace_with_unix_timestamp(self, collector): + """Test parsing strace line with Unix timestamp""" + line = "1234567890.123456 3385 write(3, \"Hello\", 5) = 5" + result = collector.parse_strace_line(line, collect_ops=True) + + assert result is not None + assert result["type"] == "write" + assert result["bytes"] == 5 + assert result["timestamp"] == "1234567890.123456" + + def test_strace_without_timestamp(self, collector): + """Test parsing strace line without timestamp (backward compatibility)""" + line = "3385 write(3, \"Hello\", 5) = 5" + result = collector.parse_strace_line(line, collect_ops=True) + + assert result is not None + assert result["type"] == "write" + assert result["bytes"] == 5 + assert "timestamp" not in result + + def test_strace_read_with_timestamp(self, collector): + """Test parsing strace read operation with timestamp""" + line = "1234567890.654321 3385 read(3, \"data\", 4096) = 1024" + result = collector.parse_strace_line(line, collect_ops=True) + + assert result is not None + assert result["type"] == "read" + assert result["bytes"] == 1024 + assert result["timestamp"] == "1234567890.654321" + + def test_fs_usage_with_timestamp(self, collector): + """Test parsing fs_usage line with timestamp""" + line = "12:34:56.789012 write B=0x1000 /path/to/file Python" + result = collector.parse_fs_usage_line(line, collect_ops=True) + + assert result is not None + assert result["type"] == "write" + assert result["bytes"] == 4096 # 0x1000 in hex + assert result["timestamp"] == "12:34:56.789012" + + def test_fs_usage_read_with_timestamp(self, collector): + """Test parsing fs_usage read operation with timestamp""" + line = "01:23:45.123456 read B=0x800 /path/to/file Python" + result = collector.parse_fs_usage_line(line, collect_ops=True) + + assert result is not None + assert result["type"] == "read" + assert result["bytes"] == 2048 # 0x800 in hex + assert result["timestamp"] == "01:23:45.123456" + + def test_fs_usage_without_timestamp(self, collector): + """Test parsing fs_usage line without timestamp field""" + line = "write B=0x1000 /path/to/file Python" + result = collector.parse_fs_usage_line(line, collect_ops=True) + + assert result is not None + assert result["type"] == "write" + assert result["bytes"] == 4096 + # timestamp field may or may not be present depending on the line format + + +class TestHeatmapGeneration: + """Test cases for heatmap generation""" + + @pytest.fixture + def profiler(self): + """Create an IOPSProfiler instance with a mock shell""" + return create_test_profiler() + + @pytest.fixture(autouse=True) + def close_figures(self): + """Automatically close all matplotlib figures after each test""" + import matplotlib.pyplot as plt + + yield + plt.close("all") + + @pytest.fixture(autouse=True) + def mock_notebook_environment(self, profiler): + """Mock is_notebook_environment to return True for heatmap tests""" + with patch("iops_profiler.display.is_notebook_environment", return_value=True): + yield + + @patch("iops_profiler.display.plt.show") + def test_empty_operations_list(self, mock_show, profiler): + """Test heatmap generation with empty operations list""" + import matplotlib.pyplot as plt + + operations = [] + display.generate_heatmap(operations, 1.0) + + # plt.show should not be called since no plots were created + mock_show.assert_not_called() + + # No figures should have been created + assert len(plt.get_fignums()) == 0 + + @patch("iops_profiler.display.plt.show") + def test_operations_without_timestamps(self, mock_show, profiler): + """Test heatmap generation when operations lack timestamps""" + import matplotlib.pyplot as plt + + operations = [ + {"type": "read", "bytes": 1024}, + {"type": "write", "bytes": 2048}, + ] + display.generate_heatmap(operations, 1.0) + + # plt.show should not be called since no plots were created + mock_show.assert_not_called() + + # No figures should have been created + assert len(plt.get_fignums()) == 0 + + @patch("iops_profiler.display.plt.show") + def test_unix_timestamp_format(self, mock_show, profiler): + """Test heatmap with Unix timestamp format (strace)""" + import matplotlib.pyplot as plt + + operations = [ + {"type": "read", "bytes": 1024, "timestamp": "1234567890.100000"}, + {"type": "write", "bytes": 2048, "timestamp": "1234567890.200000"}, + {"type": "read", "bytes": 512, "timestamp": "1234567890.300000"}, + {"type": "write", "bytes": 4096, "timestamp": "1234567890.400000"}, + ] + display.generate_heatmap(operations, 1.0) + + # plt.show should be called once + mock_show.assert_called_once() + + # Should create one figure with two subplots (plus colorbars) + figs = plt.get_fignums() + assert len(figs) == 1 + + fig = plt.figure(figs[0]) + axes = fig.get_axes() + # 4 axes total: 2 main plots + 2 colorbars + assert len(axes) >= 2 + + # Check first subplot (operation count) + ax1 = axes[0] + assert "Time" in ax1.get_xlabel() + assert "Operation Size" in ax1.get_ylabel() + assert ax1.get_yscale() == "log" + + # Check second subplot (total bytes) + ax2 = axes[1] + assert "Time" in ax2.get_xlabel() + assert "Operation Size" in ax2.get_ylabel() + assert ax2.get_yscale() == "log" + + @patch("iops_profiler.display.plt.show") + def test_fs_usage_timestamp_format(self, mock_show, profiler): + """Test heatmap with HH:MM:SS.ffffff timestamp format (fs_usage)""" + import matplotlib.pyplot as plt + + operations = [ + {"type": "read", "bytes": 1024, "timestamp": "12:34:56.100000"}, + {"type": "write", "bytes": 2048, "timestamp": "12:34:56.200000"}, + {"type": "read", "bytes": 512, "timestamp": "12:34:56.300000"}, + {"type": "write", "bytes": 4096, "timestamp": "12:34:56.400000"}, + ] + display.generate_heatmap(operations, 1.0) + + # plt.show should be called once + mock_show.assert_called_once() + + # Should create one figure with two subplots (plus colorbars) + figs = plt.get_fignums() + assert len(figs) == 1 + + fig = plt.figure(figs[0]) + axes = fig.get_axes() + # 4 axes total: 2 main plots + 2 colorbars + assert len(axes) >= 2 + + @patch("iops_profiler.display.plt.show") + def test_wide_range_of_sizes_over_time(self, mock_show, profiler): + """Test heatmap with wide range of operation sizes over time""" + import matplotlib.pyplot as plt + + operations = [] + # Generate operations with varying sizes over time + for i in range(20): + timestamp = f"1234567890.{i:06d}" + byte_size = 2 ** (i % 10 + 1) # Sizes from 2 to 1024 bytes + op_type = "read" if i % 2 == 0 else "write" + operations.append({"type": op_type, "bytes": byte_size, "timestamp": timestamp}) + + display.generate_heatmap(operations, 1.0) + + # plt.show should be called once + mock_show.assert_called_once() + + # Should create one figure with two subplots (plus colorbars) + figs = plt.get_fignums() + assert len(figs) == 1 + + fig = plt.figure(figs[0]) + axes = fig.get_axes() + # 4 axes total: 2 main plots + 2 colorbars + assert len(axes) >= 2 + + @patch("iops_profiler.display.plt.show") + def test_many_operations_over_time(self, mock_show, profiler): + """Test heatmap with many operations""" + import matplotlib.pyplot as plt + + operations = [] + # Use 200 operations to validate handling of larger datasets without excessive test time + for i in range(200): + timestamp = f"1234567890.{i:06d}" + byte_size = (i % 100 + 1) * 100 + op_type = "read" if i % 2 == 0 else "write" + operations.append({"type": op_type, "bytes": byte_size, "timestamp": timestamp}) + + display.generate_heatmap(operations, 10.0) + + # plt.show should be called once + mock_show.assert_called_once() + + # Should create one figure with two subplots + figs = plt.get_fignums() + assert len(figs) == 1 + + @patch("iops_profiler.display.plt.show") + def test_operations_with_zero_bytes_ignored(self, mock_show, profiler): + """Test that operations with zero bytes are ignored in heatmap""" + operations = [ + {"type": "read", "bytes": 0, "timestamp": "1234567890.100000"}, + {"type": "write", "bytes": 2048, "timestamp": "1234567890.200000"}, + {"type": "read", "bytes": 0, "timestamp": "1234567890.300000"}, + ] + display.generate_heatmap(operations, 1.0) + + # Should still create a plot (has one non-zero operation) + mock_show.assert_called_once() + + def test_no_matplotlib_installed(self, profiler): + """Test heatmap generation when matplotlib is not available""" + from iops_profiler import display + + original_plt = display.plt + + # Set plt to None + display.plt = None + + try: + operations = [{"type": "read", "bytes": 1024, "timestamp": "1234567890.100000"}] + # Should print warning and return early + display.generate_heatmap(operations, 1.0) + finally: + # Restore original plt + display.plt = original_plt + + def test_no_numpy_installed(self, profiler): + """Test heatmap generation when numpy is not available""" + from iops_profiler import display + + original_np = display.np + + # Set np to None + display.np = None + + try: + operations = [{"type": "read", "bytes": 1024, "timestamp": "1234567890.100000"}] + # Should print warning and return early + display.generate_heatmap(operations, 1.0) + finally: + # Restore original np + display.np = original_np + + @patch("iops_profiler.display.plt") + @patch("iops_profiler.display.np") + def test_heatmap_saves_to_file_in_terminal(self, mock_np, mock_plt, profiler, capsys): + """Test heatmap saves to file in terminal mode""" + import numpy as np + + mock_np.histogram2d = np.histogram2d + mock_np.logspace = np.logspace + mock_np.log10 = np.log10 + mock_np.linspace = np.linspace + mock_np.array = np.array + mock_np.meshgrid = np.meshgrid + mock_np.max = np.max + + operations = [ + {"type": "read", "bytes": 1024, "timestamp": "1234567890.100000"}, + {"type": "write", "bytes": 2048, "timestamp": "1234567890.200000"}, + ] + + mock_fig = MagicMock() + mock_ax1 = MagicMock() + mock_ax2 = MagicMock() + mock_plt.subplots.return_value = (mock_fig, (mock_ax1, mock_ax2)) + + # Test terminal mode - saves to file + with patch("iops_profiler.display.is_notebook_environment", return_value=False): + display.generate_heatmap(operations, 1.0) + + mock_plt.savefig.assert_called_once() + mock_plt.show.assert_not_called() + mock_plt.close.assert_called_once_with(mock_fig) + captured = capsys.readouterr() + assert "iops_heatmap.png" in captured.out diff --git a/tests/test_integration.py b/tests/test_integration.py index d714d97..92d262f 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -340,9 +340,9 @@ def test_collect_ops_fs_usage_multiple_lines(self, profiler): operations.append(op) assert len(operations) == 3 - assert operations[0] == {"type": "read", "bytes": 0x100} - assert operations[1] == {"type": "write", "bytes": 0x200} - assert operations[2] == {"type": "read", "bytes": 0x300} + assert operations[0] == {"type": "read", "bytes": 0x100, "timestamp": "12:34:56"} + assert operations[1] == {"type": "write", "bytes": 0x200, "timestamp": "12:34:57"} + assert operations[2] == {"type": "read", "bytes": 0x300, "timestamp": "12:34:58"} def test_collect_ops_filters_invalid_lines(self, profiler): """Test that collect_ops filters out invalid lines"""