Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ The extension will display a table showing:

### Histogram Visualization

Use the `--histogram` flag to visualize I/O operation distributions (available for `strace` and `fs_usage` measurement modes):
Use the `--histogram` flag to visualize I/O operation distributions:

```python
%%iops --histogram
Expand All @@ -61,8 +61,14 @@ Both charts display separate lines for reads, writes, and all operations combine

## Platform Support

- **Linux/Windows**: Uses `psutil` for per-process I/O tracking
- **Linux**: Uses `strace` for detailed per-operation tracking (fallback to `psutil` if `strace` unavailable)
- With `strace`: Captures all system-level I/O operations
- With `psutil`: Provides aggregate counts only (no histogram support)
- **macOS**: Uses `fs_usage` with privilege elevation (requires password prompt)
- Captures all system-level I/O operations
- **Windows**: Uses Python-level I/O tracking for granular data
- Captures Python `open()`/`read()`/`write()` operations
- **Note**: May not capture I/O from native C extensions or libraries

## Requirements

Expand Down
212 changes: 198 additions & 14 deletions src/iops_profiler/iops_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,15 @@
f.write('Hello World')
"""

import builtins
import os
import sys
import time
import re
import subprocess
import tempfile
from pathlib import Path
from IPython.core.magic import Magics, magics_class, cell_magic
from IPython.display import display, HTML
import math

try:
import psutil
Expand Down Expand Up @@ -59,8 +58,13 @@ def __init__(self, shell):
# Set of syscall names for I/O operations (lowercase)
self._io_syscalls = set(STRACE_IO_SYSCALLS)

def _measure_linux_windows(self, code):
"""Measure IOPS on Linux/Windows using psutil"""
def _measure_linux_windows(self, code, collect_ops=False):
"""Measure IOPS on Linux/Windows using psutil

Args:
code: The code to profile
collect_ops: If True, collect individual operation sizes for histogram (Windows only)
"""
if not psutil:
raise RuntimeError("psutil not installed. Run: pip install psutil")

Expand All @@ -72,6 +76,10 @@ def _measure_linux_windows(self, code):
except AttributeError:
raise RuntimeError(f"psutil.Process.io_counters() not supported on {self.platform}")

# On Windows, use Python-level I/O tracking if granular data is requested
if collect_ops and self.platform == 'win32':
return self._measure_windows_python_io(code, io_before)

# Execute the code
start_time = time.time()
self.shell.run_cell(code)
Expand All @@ -95,6 +103,188 @@ def _measure_linux_windows(self, code):
'method': 'psutil (per-process)'
}

def _measure_windows_python_io(self, code, io_before):
"""Measure IOPS on Windows with Python-level I/O tracking for granular data

This method intercepts Python's built-in open() function to track individual
read and write operations at the Python level.

Note: Only captures Python-level I/O (open/read/write), not native C extensions.

Args:
code: The code to profile
io_before: Initial psutil I/O counters for verification
"""
operations = []

original_open = builtins.open

class IOTracker:
"""Wrapper to track I/O operations on file objects"""
def __init__(self, file_obj, mode):
self.file = file_obj
self.mode = mode

def read(self, size=-1):
result = self.file.read(size)
if result:
bytes_read = len(result) if isinstance(result, (bytes, str)) else 0
if bytes_read > 0:
operations.append({'type': 'read', 'bytes': bytes_read})
return result

def readline(self, size=-1):
result = self.file.readline(size)
if result:
bytes_read = len(result) if isinstance(result, (bytes, str)) else 0
if bytes_read > 0:
operations.append({'type': 'read', 'bytes': bytes_read})
return result

def readlines(self, hint=-1):
result = self.file.readlines(hint)
if result:
total_bytes = sum(len(line) for line in result)
if total_bytes > 0:
# Record as individual operations to better reflect actual I/O
for line in result:
if line:
operations.append({'type': 'read', 'bytes': len(line)})
return result

def write(self, data):
result = self.file.write(data)
bytes_written = len(data) if isinstance(data, (bytes, str)) else 0
if bytes_written > 0:
operations.append({'type': 'write', 'bytes': bytes_written})
return result

def writelines(self, lines):
result = self.file.writelines(lines)
# Record as individual operations
for line in lines:
if line:
bytes_written = len(line) if isinstance(line, (bytes, str)) else 0
if bytes_written > 0:
operations.append({'type': 'write', 'bytes': bytes_written})
return result

def __enter__(self):
# Return self to allow our wrapper methods to be used
# The wrapped file's __enter__ is called automatically on the underlying file
self.file.__enter__()
return self

def __exit__(self, *args):
return self.file.__exit__(*args)

def __iter__(self):
return self

def __next__(self):
# Let StopIteration propagate naturally to signal iteration end
line = self.file.__next__()
# Track the operation only if it transferred data (non-zero bytes)
# This is consistent with strace/fs_usage which report actual bytes transferred
if line:
bytes_read = len(line) if isinstance(line, (bytes, str)) else 0
if bytes_read > 0:
operations.append({'type': 'read', 'bytes': bytes_read})
return line

def close(self):
"""Close the file"""
return self.file.close()

def flush(self):
"""Flush the file buffer"""
return self.file.flush()

def seek(self, offset, whence=0):
"""Seek to a position in the file"""
return self.file.seek(offset, whence)

def tell(self):
"""Return current file position"""
return self.file.tell()

def __getattr__(self, name):
"""Fallback for any other file methods"""
return getattr(self.file, name)

def tracked_open(file, mode='r', *args, **kwargs):
"""Wrapper for open() that tracks I/O operations"""
file_obj = original_open(file, mode, *args, **kwargs)
# Only track readable/writable files
if 'r' in mode or 'w' in mode or 'a' in mode or '+' in mode:
return IOTracker(file_obj, mode)
return file_obj

try:
# Install the tracking wrapper in builtins
builtins.open = tracked_open

# Also inject into the IPython namespace to ensure it's used
# Save whether 'open' was already in the namespace
had_open_in_ns = 'open' in self.shell.user_ns
shell_open = self.shell.user_ns.get('open') if had_open_in_ns else None
self.shell.user_ns['open'] = tracked_open

# Execute the code
start_time = time.time()
self.shell.run_cell(code)
elapsed_time = time.time() - start_time

finally:
# Restore original open in both places
builtins.open = original_open
# Restore the IPython namespace
if had_open_in_ns and shell_open is not None:
self.shell.user_ns['open'] = shell_open
else:
# Remove 'open' from namespace if it wasn't there before
self.shell.user_ns.pop('open', None)

# Get final I/O counters for aggregate counts
process = psutil.Process()
io_after = process.io_counters()

# Calculate aggregate differences from psutil
read_count = io_after.read_count - io_before.read_count
write_count = io_after.write_count - io_before.write_count
read_bytes = io_after.read_bytes - io_before.read_bytes
write_bytes = io_after.write_bytes - io_before.write_bytes

# If we captured operations, use them; otherwise fall back to psutil counts
if operations:
# Count operations from our tracker
tracked_read_count = sum(1 for op in operations if op['type'] == 'read')
tracked_write_count = sum(1 for op in operations if op['type'] == 'write')
tracked_read_bytes = sum(op['bytes'] for op in operations if op['type'] == 'read')
tracked_write_bytes = sum(op['bytes'] for op in operations if op['type'] == 'write')

# Prefer tracked counts if available, fall back to psutil
result_read_count = tracked_read_count if tracked_read_count > 0 else read_count
result_write_count = tracked_write_count if tracked_write_count > 0 else write_count
result_read_bytes = tracked_read_bytes if tracked_read_bytes > 0 else read_bytes
result_write_bytes = tracked_write_bytes if tracked_write_bytes > 0 else write_bytes
else:
# No operations tracked, use psutil counts
result_read_count = read_count
result_write_count = write_count
result_read_bytes = read_bytes
result_write_bytes = write_bytes

return {
'read_count': result_read_count,
'write_count': result_write_count,
'read_bytes': result_read_bytes,
'write_bytes': result_write_bytes,
'elapsed_time': elapsed_time,
'method': 'Python I/O tracking (per-operation)',
'operations': operations
}

def _parse_fs_usage_line(self, line, collect_ops=False):
"""Parse a single fs_usage output line for I/O operations

Expand Down Expand Up @@ -163,11 +353,9 @@ def _parse_strace_line(self, line, collect_ops=False):
# (e.g., read, pread64, readv, write, pwrite64, writev)
# Note: No standard syscalls contain both 'read' and 'write' in their names
if 'read' in syscall:
is_read = True
is_write = False
op_type = 'read'
elif 'write' in syscall:
is_read = False
is_write = True
op_type = 'write'
else:
return None if collect_ops else (None, 0)

Expand All @@ -176,8 +364,6 @@ def _parse_strace_line(self, line, collect_ops=False):
if bytes_transferred < 0:
return None if collect_ops else (None, 0)

op_type = 'read' if is_read else 'write'

if collect_ops:
return {'type': op_type, 'bytes': bytes_transferred}
return op_type, bytes_transferred
Expand Down Expand Up @@ -764,14 +950,12 @@ def iops(self, line, cell):
except (RuntimeError, FileNotFoundError) as e:
print(f"⚠️ Could not use strace: {e}")
print("Falling back to psutil per-process measurement.\n")
results = self._measure_linux_windows(cell)
results = self._measure_linux_windows(cell, collect_ops=False)
if show_histogram:
print("⚠️ Histograms not available for psutil measurement mode.")

elif self.platform == 'win32':
results = self._measure_linux_windows(cell)
if show_histogram:
print("⚠️ Histograms not available for psutil measurement mode on Windows.")
results = self._measure_linux_windows(cell, collect_ops=collect_ops)

else:
print(f"⚠️ Platform '{self.platform}' not fully supported.")
Expand Down