Merge branch 'main' into pre-commit-ci-update-config
t-vi authored Oct 18, 2024
2 parents 58e1d34 + ec50c73 commit 1ba9abf
Showing 35 changed files with 873 additions and 195 deletions.
3 changes: 1 addition & 2 deletions .azure/gpu-tests.yml
@@ -84,7 +84,6 @@ jobs:
pytest thunder/tests/ \
-m "not standalone" \
-v --datefmt="%Y%m%d-%H:%M:%S.%f" \
--timeout=240 \
--random-order-seed=42 \
--durations=250 \
--timeout=240 \
@@ -97,7 +96,7 @@ jobs:
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure
condition: ne(variables['testing'], 'distributed')
timeoutInMinutes: "30"
timeoutInMinutes: "40"
displayName: "Testing: regular"
- bash: |
2 changes: 1 addition & 1 deletion notebooks/zero_to_thunder.ipynb
@@ -180,7 +180,7 @@
"source": [
"So what has changed? Quite a bit!\n",
"\n",
"When we call the Thunder module, it do the computation in a single function without control flow. And what's more, it applies optimizations, such as creating fusions for NVFuser to execute. We can see all this by showing the last computation trace:"
"When we call the Thunder module, it does the computation in a single function without control flow. And what's more, it applies optimizations, such as creating fusions for NVFuser to execute. We can see all this by showing the last computation trace:"
]
},
{
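The notebook cell above mentions inspecting the last computation trace. As a minimal sketch of that workflow (the toy module and tensor shapes here are illustrative, not taken from the notebook), assuming a recent lightning-thunder build:

import torch
import torch.nn as nn

import thunder

# Illustrative stand-in module; the notebook uses its own model.
model = nn.Sequential(nn.Linear(16, 16), nn.GELU())
jmodel = thunder.jit(model)           # wrap the module with Thunder
out = jmodel(torch.randn(4, 16))      # the first call records and compiles the trace

# The final trace is a single flat function without control flow; on CUDA,
# eligible regions are grouped into fusions for executors such as nvFuser.
print(thunder.last_traces(jmodel)[-1])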
3 changes: 1 addition & 2 deletions thunder/__init__.py
@@ -348,7 +348,6 @@ def jit(
sharp_edges=sharp_edges,
using_jit=True,
disable_torch_autograd_support=disable_torch_autograd,
use_rematerialization=False,
only_execute_prims=False,
disable_preprocessing=True,
compile_options=compile_options,
@@ -614,7 +613,7 @@ def get_computation_and_inputs(*args, **kwargs):
use_del_last_used=False,
)
prologue_trc = prologue_traces[-1]
pro = prologue_trc.python_callable()
pro = prologue_trc.python_callable(include_decorators=False)

if epilogue_trc is not None:
epilogue = epilogue_trc.python_callable()
79 changes: 48 additions & 31 deletions thunder/benchmarks/__init__.py
@@ -18,6 +18,7 @@
from torch.testing import make_tensor

import thunder
import thunder.dynamo
import thunder.core.devices as Devices
import thunder.core.dtypes as dtypes
import thunder.executors as executors
@@ -30,8 +31,7 @@
from thunder.executors.sdpaex import sdpa_ex
from thunder.executors.torch_compile import torch_compile_cat_ex, torch_compile_ex
from thunder.transforms.cudagraph import CUDAGraphTransform
from thunder.tests import nanogpt_model, hf_bart_self_attn, litgpt_model
from thunder.tests.litgpt_model import Config as LitGPTConfig
from thunder.tests import nanogpt_model, hf_bart_self_attn
from thunder.tests.make_tensor import make_tensor, make_tensor_like

# List of all benchmarks
@@ -707,6 +707,13 @@ def torch_compile_executor(fn: Callable) -> Callable:
return torch.compile(fn)


def thunderfx_executor(fn: Callable) -> Callable:
torch.backends.cuda.matmul.allow_tf32 = True
backend = thunder.dynamo.ThunderCompiler()
torch._dynamo.reset()
return torch.compile(fn, backend=backend)


def thunder_torch_executor(fn: Callable) -> Callable:
torch.backends.cuda.matmul.allow_tf32 = True
return thunder.jit(fn, executors=[thunder.pytorch_executor])
@@ -781,9 +788,6 @@ def func(fn: Callable) -> Callable:

@dataclass(frozen=True)
class get_default_torch_fsdp_executor:
from torch.distributed.fsdp import ShardingStrategy

sharding_strategy: ShardingStrategy
apply_torch_compile: bool
auto_wrap_policy: Any | None

@@ -1213,12 +1217,14 @@ def args(cls) -> tuple[BenchmarkArg, ...]:

def __init__(
self,
config: str | LitGPTConfig,
config: str,
batchdims: Sequence[int],
device: str,
dtype: dtypes.dtype,
requires_grad: bool,
) -> None:
from litgpt.config import Config as LitGPTConfig

super().__init__()

self.config = LitGPTConfig.from_name(config) if not isinstance(config, LitGPTConfig) else config
@@ -1282,13 +1288,15 @@ def args(cls) -> tuple[BenchmarkArg, ...]:

def __init__(
self,
config: str | LitGPTConfig,
config: str,
batchdims: Sequence[int],
device: str,
dtype: dtypes.dtype,
requires_grad: bool,
use_liger: bool = False,
) -> None:
from litgpt.config import Config as LitGPTConfig

super().__init__()

self.config = LitGPTConfig.from_name(config) if not isinstance(config, LitGPTConfig) else config
@@ -1941,12 +1949,14 @@ def args(cls) -> tuple[BenchmarkArg, ...]:

def __init__(
self,
config: str | LitGPTConfig = "Llama-2-7b-hf",
config: str = "Llama-2-7b-hf",
batchdims: Sequence[int] = (16,),
device: str = "cuda",
dtype: dtypes.dtype = thunder.bfloat16,
requires_grad: bool = True,
) -> None:
from litgpt.config import Config as LitGPTConfig

super().__init__()

self.config = LitGPTConfig.from_name(config) if not isinstance(config, LitGPTConfig) else config
@@ -1968,11 +1978,9 @@ def make_batch(self) -> tuple[list, dict]:
return (make(shape),), {}

def fn(self) -> Callable:
module = (
litgpt_model.LLaMAMLP(self.config)
.to(device=self.device, dtype=self.tdtype)
.requires_grad_(self.requires_grad)
)
from litgpt.model import LLaMAMLP

module = LLaMAMLP(self.config).to(device=self.device, dtype=self.tdtype).requires_grad_(self.requires_grad)
return module


@@ -2012,12 +2020,14 @@ def args(cls) -> tuple[BenchmarkArg, ...]:

def __init__(
self,
config: str | LitGPTConfig = "Llama-2-7b-hf",
config: str = "Llama-2-7b-hf",
batchdims: Sequence[int] = (16,),
device: str = "cuda",
dtype: dtypes.dtype = thunder.bfloat16,
requires_grad: bool = True,
) -> None:
from litgpt.config import Config as LitGPTConfig

super().__init__()

self.config = LitGPTConfig.from_name(config) if not isinstance(config, LitGPTConfig) else config
@@ -2038,8 +2048,10 @@ def make_batch(self) -> tuple[list, dict]:
return (x, cos, sin, mask, input_pos), {}

def fn(self) -> Callable:
from litgpt.model import CausalSelfAttention

module = (
litgpt_model.CausalSelfAttention(self.config)
CausalSelfAttention(self.config)
.to(device=self.device, dtype=self.tdtype)
.requires_grad_(self.requires_grad)
)
@@ -2119,8 +2131,10 @@ def make_batch(self) -> tuple[list, dict]:
return (make(shape),), {}

def fn(self) -> Callable:
from litgpt.model import RMSNorm

module = (
litgpt_model.RMSNorm(self.size, self.dim, self.eps)
RMSNorm(self.size, self.dim, self.eps)
.to(device=self.device, dtype=self.tdtype)
.requires_grad_(self.requires_grad)
)
@@ -2168,7 +2182,7 @@ def args(cls) -> tuple[BenchmarkArg, ...]:

def __init__(
self,
config: LitGPTConfig,
config,
batchdims: Sequence[int] = (8,),
indices_dtype: dtypes.dtype = thunder.int64,
device: str = "cuda",
@@ -2201,11 +2215,9 @@ def make_batch(self) -> tuple[list, dict]:
return (x,), {}

def fn(self) -> Callable:
gpt = (
litgpt_model.GPT(self.config)
.to(device=self.device, dtype=self.model_tdtype)
.requires_grad_(self.requires_grad)
)
from litgpt.model import GPT

gpt = GPT(self.config).to(device=self.device, dtype=self.model_tdtype).requires_grad_(self.requires_grad)
return gpt

def postprocess_for_backward(self, output: torch.Tensor) -> torch.Tensor | None:
@@ -2222,6 +2234,8 @@ def postprocess_for_backward(self, output: torch.Tensor) -> torch.Tensor | None:
# "scaled_dot_product_attention" call.
class QKVSplitRope(nn.Module):
def __init__(self, config, use_apex) -> None:
from litgpt.model import apply_rope

self.fused_apply_rotary_pos_emb_cached = None
if use_apex:
try:
@@ -2233,7 +2247,7 @@ def __init__(self, config, use_apex) -> None:

super().__init__()
self.config = config
self.apply_rope = litgpt_model.apply_rope
self.apply_rope = apply_rope
self.use_apex = use_apex

def forward(
@@ -2324,13 +2338,15 @@ def args(cls) -> tuple[BenchmarkArg, ...]:

def __init__(
self,
config: str | LitGPTConfig = "Llama-2-7b-hf",
config: str = "Llama-2-7b-hf",
batchdims: Sequence[int] = (16,),
device: str = "cuda",
dtype: dtypes.dtype = thunder.bfloat16,
requires_grad: bool = True,
use_apex: bool = False,
) -> None:
from litgpt.config import Config as LitGPTConfig

super().__init__()

self.config = LitGPTConfig.from_name(config) if not isinstance(config, LitGPTConfig) else config
@@ -2635,7 +2651,7 @@ def description(cls) -> str:

def __init__(
self,
config: str | LitGPTConfig = "Llama-2-7b-hf",
config: str = "Llama-2-7b-hf",
batchdims: Sequence[int] = (16,),
device: str = "cuda",
dtype: dtypes.dtype = thunder.bfloat16,
@@ -2824,6 +2840,9 @@ def __init__(
dtype: thunder.dtypes.dtype | torch.dtype | str = thunder.bfloat16,
requires_grad: bool = True,
) -> None:
from litgpt.model import build_rope_cache
from litgpt.config import Config as LitGPTConfig

super().__init__()

self.config = LitGPTConfig.from_name(config)
@@ -2839,9 +2858,7 @@ def __init__(
# Sets required benchmark parameters
self.devices: list[str] = [device]

self.cos, self.sin = litgpt_model.build_rope_cache(
seq_len=seq_length, n_elem=self.config.rope_n_elem, device=self.device
)
self.cos, self.sin = build_rope_cache(seq_len=seq_length, n_elem=self.config.rope_n_elem, device=self.device)

def make_batch(self) -> tuple[list, dict]:
make = partial(make_tensor, device=self.device, dtype=self.tdtype, requires_grad=self.requires_grad)
@@ -2851,9 +2868,9 @@ def make_batch(self) -> tuple[list, dict]:
return (a, self.cos, self.sin), {}

def fn(self) -> Callable:
model = (
litgpt_model.Block(self.config).to(device=self.device, dtype=self.tdtype).requires_grad_(self.requires_grad)
)
from litgpt.model import Block

model = Block(self.config).to(device=self.device, dtype=self.tdtype).requires_grad_(self.requires_grad)
return model


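For context on the new thunderfx_executor above: ThunderCompiler from thunder.dynamo is used as a torch.compile backend. A minimal sketch of that pattern with an illustrative toy function (not part of the benchmark suite):

import torch
import thunder.dynamo

def f(x):
    return torch.nn.functional.gelu(x) * 2.0

backend = thunder.dynamo.ThunderCompiler()   # Thunder as a Dynamo backend
torch._dynamo.reset()                        # drop any previously cached graphs
compiled = torch.compile(f, backend=backend)
y = compiled(torch.randn(8, 8))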
24 changes: 14 additions & 10 deletions thunder/benchmarks/targets.py
@@ -33,6 +33,7 @@
thunder_cudnn_executor,
thunder_cudnn_nvfuser_executor,
thunder_executor,
thunderfx_executor,
thunder_sdpa_torch_compile_nvfuser_executor,
torch_compile_executor,
torch_executor,
@@ -55,6 +56,7 @@
"phi-2",
]
RUN_ALL_CONFIGS = os.environ.get("THUNDER_BENCH_RUN_ALL_CONFIGS", "0") == "1"
MAX_ALLOCATED_MEMORY_KEYWORD = "max_allocated_memory_MB"


class ComputeType(Enum):
@@ -112,7 +114,7 @@ def deco(old_timer):
@functools.wraps(old_timer)
def timer():
ret = old_timer()
benchmark.extra_info["max_allocated_memory_MB"] = torch.cuda.max_memory_allocated() / (1024 * 1024.0)
benchmark.extra_info[MAX_ALLOCATED_MEMORY_KEYWORD] = torch.cuda.max_memory_allocated() / (1024 * 1024.0)
torch.cuda.reset_peak_memory_stats()
return ret

@@ -151,17 +153,19 @@ def interpreter_fwd(module: Callable):
return fn_


executors = (
torch_executor,
torch_compile_executor,
thunder_executor,
)
executors = (torch_executor, torch_compile_executor, thunder_executor)
executors_ids = (
"torch",
"torch.compile",
"thunder",
)

torchbench_executors = (*executors, thunderfx_executor)
torchbench_executors_ids = (
*executors_ids,
"thunderfx",
)

apex_executors = (thunder_apex_executor, thunder_apex_nvfuser_executor)
apex_executors_ids = ("thunder+apex-grad", "thunder+apex+nvfuser-grad")

@@ -841,8 +845,8 @@ def test_resnet50(benchmark, executor: Callable, compute_type: ComputeType):
)
@pytest.mark.parametrize(
"executor,",
executors,
ids=executors_ids,
torchbench_executors,
ids=torchbench_executors_ids,
)
@parametrize_compute_type
def test_torchbench(benchmark, module_name, executor, compute_type: ComputeType):
@@ -867,8 +871,8 @@ def test_torchbench(benchmark, module_name, executor, compute_type: ComputeType)
)
@pytest.mark.parametrize(
"executor,",
executors,
ids=executors_ids,
torchbench_executors,
ids=torchbench_executors_ids,
)
@parametrize_compute_type
def test_torchbench_canary(benchmark, module_name, executor, compute_type: ComputeType):
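The targets.py hunk above moves the "max_allocated_memory_MB" key into the MAX_ALLOCATED_MEMORY_KEYWORD constant used when recording peak GPU memory into pytest-benchmark's extra_info. A minimal sketch of that reporting pattern, assuming CUDA is available (the test body is illustrative, not from the suite):

import torch

MAX_ALLOCATED_MEMORY_KEYWORD = "max_allocated_memory_MB"

def test_matmul_peak_memory(benchmark):
    a = torch.randn(1024, 1024, device="cuda")

    torch.cuda.reset_peak_memory_stats()
    benchmark(torch.matmul, a, a)

    # Attach peak GPU memory (in MB) to the benchmark's reported metadata.
    mem_mb = torch.cuda.max_memory_allocated() / (1024 * 1024.0)
    benchmark.extra_info[MAX_ALLOCATED_MEMORY_KEYWORD] = mem_mb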