From ffa49d242ce39b962be79b186b04f447650a5bc3 Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Thu, 2 Feb 2023 15:43:36 -0800 Subject: [PATCH 1/9] remove amp, ddp, and sync batchnorm Signed-off-by: Masaki Kozuki --- README.md | 94 +-- apex/RNN/README.md | 3 - apex/RNN/RNNBackend.py | 365 -------- apex/RNN/__init__.py | 3 - apex/RNN/cells.py | 84 -- apex/RNN/models.py | 56 -- apex/__init__.py | 5 +- apex/amp/README.md | 72 -- apex/amp/__init__.py | 5 - apex/amp/__version__.py | 2 - apex/amp/_amp_state.py | 59 -- apex/amp/_initialize.py | 265 ------ apex/amp/_process_optimizer.py | 489 ----------- apex/amp/amp.py | 183 ---- apex/amp/compat.py | 46 - apex/amp/frontend.py | 446 ---------- apex/amp/handle.py | 281 ------- apex/amp/lists/__init__.py | 0 apex/amp/lists/functional_overrides.py | 80 -- apex/amp/lists/tensor_overrides.py | 63 -- apex/amp/lists/torch_overrides.py | 115 --- apex/amp/opt.py | 103 --- apex/amp/rnn_compat.py | 53 -- apex/amp/scaler.py | 217 ----- apex/amp/utils.py | 210 ----- apex/amp/wrap.py | 276 ------ apex/fp16_utils/README.md | 16 - apex/fp16_utils/__init__.py | 16 - apex/fp16_utils/fp16_optimizer.py | 557 ------------ apex/fp16_utils/fp16util.py | 189 ----- apex/fp16_utils/loss_scaler.py | 188 ----- apex/parallel/LARC.py | 6 +- apex/parallel/README.md | 66 -- apex/parallel/__init__.py | 96 --- apex/parallel/distributed.py | 643 -------------- apex/parallel/multiproc.py | 35 - apex/parallel/optimized_sync_batchnorm.py | 85 -- .../optimized_sync_batchnorm_kernel.py | 119 --- apex/parallel/sync_batchnorm.py | 136 --- apex/parallel/sync_batchnorm_kernel.py | 87 -- examples/simple/distributed/README.md | 13 - .../distributed/distributed_data_parallel.py | 65 -- examples/simple/distributed/run.sh | 2 - tests/L0/run_amp/__init__.py | 0 tests/L0/run_amp/test_add_param_group.py | 148 ---- tests/L0/run_amp/test_basic_casts.py | 143 ---- tests/L0/run_amp/test_cache.py | 137 --- tests/L0/run_amp/test_checkpointing.py | 267 ------ tests/L0/run_amp/test_fused_sgd.py | 794 ------------------ tests/L0/run_amp/test_larc.py | 53 -- tests/L0/run_amp/test_multi_tensor_axpby.py | 180 ---- tests/L0/run_amp/test_multi_tensor_l2norm.py | 87 -- tests/L0/run_amp/test_multi_tensor_scale.py | 126 --- .../test_multiple_models_optimizers_losses.py | 762 ----------------- tests/L0/run_amp/test_promotion.py | 75 -- tests/L0/run_amp/test_rnn.py | 116 --- tests/L0/run_amp/utils.py | 21 - .../run_deprecated/test_deprecated_warning.py | 56 -- tests/L0/run_fp16util/__init__.py | 0 tests/L0/run_fp16util/test_fp16util.py | 75 -- tests/L0/run_test.py | 3 - 61 files changed, 13 insertions(+), 8924 deletions(-) delete mode 100644 apex/RNN/README.md delete mode 100644 apex/RNN/RNNBackend.py delete mode 100644 apex/RNN/__init__.py delete mode 100644 apex/RNN/cells.py delete mode 100644 apex/RNN/models.py delete mode 100644 apex/amp/README.md delete mode 100644 apex/amp/__init__.py delete mode 100644 apex/amp/__version__.py delete mode 100644 apex/amp/_amp_state.py delete mode 100644 apex/amp/_initialize.py delete mode 100644 apex/amp/_process_optimizer.py delete mode 100644 apex/amp/amp.py delete mode 100644 apex/amp/compat.py delete mode 100644 apex/amp/frontend.py delete mode 100644 apex/amp/handle.py delete mode 100644 apex/amp/lists/__init__.py delete mode 100644 apex/amp/lists/functional_overrides.py delete mode 100644 apex/amp/lists/tensor_overrides.py delete mode 100644 apex/amp/lists/torch_overrides.py delete mode 100644 apex/amp/opt.py delete mode 100644 apex/amp/rnn_compat.py delete mode 100644 
apex/amp/scaler.py delete mode 100644 apex/amp/utils.py delete mode 100644 apex/amp/wrap.py delete mode 100644 apex/fp16_utils/README.md delete mode 100644 apex/fp16_utils/__init__.py delete mode 100755 apex/fp16_utils/fp16_optimizer.py delete mode 100644 apex/fp16_utils/fp16util.py delete mode 100644 apex/fp16_utils/loss_scaler.py delete mode 100644 apex/parallel/README.md delete mode 100644 apex/parallel/distributed.py delete mode 100644 apex/parallel/multiproc.py delete mode 100644 apex/parallel/optimized_sync_batchnorm.py delete mode 100644 apex/parallel/optimized_sync_batchnorm_kernel.py delete mode 100644 apex/parallel/sync_batchnorm.py delete mode 100644 apex/parallel/sync_batchnorm_kernel.py delete mode 100644 examples/simple/distributed/README.md delete mode 100644 examples/simple/distributed/distributed_data_parallel.py delete mode 100644 examples/simple/distributed/run.sh delete mode 100644 tests/L0/run_amp/__init__.py delete mode 100644 tests/L0/run_amp/test_add_param_group.py delete mode 100644 tests/L0/run_amp/test_basic_casts.py delete mode 100644 tests/L0/run_amp/test_cache.py delete mode 100644 tests/L0/run_amp/test_checkpointing.py delete mode 100644 tests/L0/run_amp/test_fused_sgd.py delete mode 100644 tests/L0/run_amp/test_larc.py delete mode 100644 tests/L0/run_amp/test_multi_tensor_axpby.py delete mode 100644 tests/L0/run_amp/test_multi_tensor_l2norm.py delete mode 100644 tests/L0/run_amp/test_multi_tensor_scale.py delete mode 100644 tests/L0/run_amp/test_multiple_models_optimizers_losses.py delete mode 100644 tests/L0/run_amp/test_promotion.py delete mode 100644 tests/L0/run_amp/test_rnn.py delete mode 100644 tests/L0/run_amp/utils.py delete mode 100644 tests/L0/run_deprecated/test_deprecated_warning.py delete mode 100644 tests/L0/run_fp16util/__init__.py delete mode 100644 tests/L0/run_fp16util/test_fp16util.py diff --git a/README.md b/README.md index 32fdd927a..eba21bffd 100644 --- a/README.md +++ b/README.md @@ -1,106 +1,30 @@ # Introduction -This repository holds NVIDIA-maintained utilities to streamline mixed precision and distributed training in Pytorch. -Some of the code here will be included in upstream Pytorch eventually. +This repository holds NVIDIA-maintained utilities to streamline mixed precision and distributed training in PyTorch. +Some of the code here will be included in upstream PyTorch eventually. The intent of Apex is to make up-to-date utilities available to users as quickly as possible. ## Full API Documentation: [https://nvidia.github.io/apex](https://nvidia.github.io/apex) -## [GTC 2019](https://github.com/mcarilli/mixed_precision_references/tree/master/GTC_2019) and [Pytorch DevCon 2019](https://github.com/mcarilli/mixed_precision_references/tree/master/Pytorch_Devcon_2019) Slides +## [GTC 2019](https://github.com/mcarilli/mixed_precision_references/tree/master/GTC_2019) and [PyTorch DevCon 2019](https://github.com/mcarilli/mixed_precision_references/tree/master/PyTorch_Devcon_2019) Slides # Contents ## 1. Amp: Automatic Mixed Precision -**Deprecated. Use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)** - -`apex.amp` is a tool to enable mixed precision training by changing only 3 lines of your script. -Users can easily experiment with different pure and mixed precision training modes by supplying -different flags to `amp.initialize`. - -[Webinar introducing Amp](https://info.nvidia.com/webinar-mixed-precision-with-pytorch-reg-page.html) -(The flag `cast_batchnorm` has been renamed to `keep_batchnorm_fp32`). 
- -[API Documentation](https://nvidia.github.io/apex/amp.html) - -[Comprehensive Imagenet example](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) - -[DCGAN example coming soon...](https://github.com/NVIDIA/apex/tree/master/examples/dcgan) - -[Moving to the new Amp API](https://nvidia.github.io/apex/amp.html#transition-guide-for-old-api-users) (for users of the deprecated "Amp" and "FP16_Optimizer" APIs) +**Removed. Use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)** ## 2. Distributed Training -**`apex.parallel.DistributedDataParallel` is deprecated. Use [`torch.nn.parallel.DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html?highlight=distributeddataparallel#torch.nn.parallel.DistributedDataParallel)** +**`apex.parallel.DistributedDataParallel` is removed. Use [`torch.nn.parallel.DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html?highlight=distributeddataparallel#torch.nn.parallel.DistributedDataParallel)** `apex.parallel.DistributedDataParallel` is a module wrapper, similar to `torch.nn.parallel.DistributedDataParallel`. It enables convenient multiprocess distributed training, optimized for NVIDIA's NCCL communication library. -[API Documentation](https://nvidia.github.io/apex/parallel.html) - -[Python Source](https://github.com/NVIDIA/apex/tree/master/apex/parallel) - -[Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/simple/distributed) - -The [Imagenet example](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) -shows use of `apex.parallel.DistributedDataParallel` along with `apex.amp`. - ### Synchronized Batch Normalization -**Deprecated. Use [`torch.nn.SyncBatchNorm`](https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html)** - -`apex.parallel.SyncBatchNorm` extends `torch.nn.modules.batchnorm._BatchNorm` to -support synchronized BN. -It allreduces stats across processes during multiprocess (DistributedDataParallel) training. -Synchronous BN has been used in cases where only a small -local minibatch can fit on each GPU. -Allreduced stats increase the effective batch size for the BN layer to the -global batch size across all processes (which, technically, is the correct -formulation). -Synchronous BN has been observed to improve converged accuracy in some of our research models. - -### Checkpointing - -To properly save and load your `amp` training, we introduce the `amp.state_dict()`, which contains all `loss_scalers` and their corresponding unskipped steps, -as well as `amp.load_state_dict()` to restore these attributes. - -In order to get bitwise accuracy, we recommend the following workflow: -```python -# Initialization -opt_level = 'O1' -model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) - -# Train your model -... -with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() -... - -# Save checkpoint -checkpoint = { - 'model': model.state_dict(), - 'optimizer': optimizer.state_dict(), - 'amp': amp.state_dict() -} -torch.save(checkpoint, 'amp_checkpoint.pt') -... - -# Restore -model = ... -optimizer = ... -checkpoint = torch.load('amp_checkpoint.pt') - -model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) -model.load_state_dict(checkpoint['model']) -optimizer.load_state_dict(checkpoint['optimizer']) -amp.load_state_dict(checkpoint['amp']) - -# Continue training -... 
-``` - -Note that we recommend restoring the model using the same `opt_level`. Also note that we recommend calling the `load_state_dict` methods after `amp.initialize`. +**Removed. Use [`torch.nn.SyncBatchNorm`](https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html)** # Installation Each [`apex.contrib`](./apex/contrib) module requires one or more install options other than `--cpp_ext` and `--cuda_ext`. @@ -117,7 +41,7 @@ See [the NGC documentation](https://docs.nvidia.com/deeplearning/frameworks/pyto ## From Source -To install Apex from source, we recommend using the nightly Pytorch obtainable from https://github.com/pytorch/pytorch. +To install Apex from source, we recommend using the nightly PyTorch obtainable from https://github.com/pytorch/pytorch. The latest stable release obtainable from https://pytorch.org should also work. @@ -143,9 +67,9 @@ A Python-only build omits: ### [Experimental] Windows -`pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .` may work if you were able to build Pytorch from source +`pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .` may work if you were able to build PyTorch from source on your system. A Python-only build via `pip install -v --no-cache-dir .` is more likely to work. -If you installed Pytorch in a Conda environment, make sure to install Apex in that same environment. +If you installed PyTorch in a Conda environment, make sure to install Apex in that same environment. ## Custom C++/CUDA Extensions and Install Options diff --git a/apex/RNN/README.md b/apex/RNN/README.md deleted file mode 100644 index 82c4eb680..000000000 --- a/apex/RNN/README.md +++ /dev/null @@ -1,3 +0,0 @@ -**This module will be removed by the end of February 2023** - -Under construction... 
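> Migration note (not part of the patch): the README changes above point users of the removed `apex.amp`, `apex.parallel.DistributedDataParallel`, and `apex.parallel.SyncBatchNorm` at their native PyTorch replacements. Below is a minimal, hedged sketch of the equivalent workflow with `torch.cuda.amp`; the model, optimizer, shapes, and checkpoint filename are illustrative placeholders, and the commented lines show the drop-in replacements for the removed distributed utilities.

```python
import torch

# Hypothetical stand-ins for a user's model and optimizer.
model = torch.nn.Linear(128, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler()  # replaces the loss scaling set up by amp.initialize(...)

# Multi-GPU pieces that replace the removed apex.parallel utilities
# (only valid after torch.distributed.init_process_group has been called):
# model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)  # replaces apex.parallel.SyncBatchNorm
# model = torch.nn.parallel.DistributedDataParallel(model)      # replaces apex.parallel.DistributedDataParallel

for step in range(10):
    data = torch.randn(32, 128, device="cuda")
    target = torch.randint(0, 10, (32,), device="cuda")

    optimizer.zero_grad(set_to_none=True)
    with torch.cuda.amp.autocast():            # replaces the removed opt_level machinery
        loss = torch.nn.functional.cross_entropy(model(data), target)
    scaler.scale(loss).backward()              # replaces `with amp.scale_loss(loss, optimizer)`
    scaler.step(optimizer)
    scaler.update()

# Checkpointing: GradScaler.state_dict() covers the role of the removed amp.state_dict()
checkpoint = {
    "model": model.state_dict(),
    "optimizer": optimizer.state_dict(),
    "scaler": scaler.state_dict(),
}
torch.save(checkpoint, "amp_checkpoint.pt")

# Restore
checkpoint = torch.load("amp_checkpoint.pt")
model.load_state_dict(checkpoint["model"])
optimizer.load_state_dict(checkpoint["optimizer"])
scaler.load_state_dict(checkpoint["scaler"])
```

Since there is no `opt_level` in native AMP, the old advice to restore with the same `opt_level` has no counterpart; saving and restoring the `GradScaler` state is sufficient to resume loss scaling.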
diff --git a/apex/RNN/RNNBackend.py b/apex/RNN/RNNBackend.py deleted file mode 100644 index a9382e601..000000000 --- a/apex/RNN/RNNBackend.py +++ /dev/null @@ -1,365 +0,0 @@ -import torch -import torch.nn as nn -from torch.autograd import Variable - -import torch.nn.functional as F - -import math - - -def is_iterable(maybe_iterable): - return isinstance(maybe_iterable, list) or isinstance(maybe_iterable, tuple) - - -def flatten_list(tens_list): - """ - flatten_list - """ - if not is_iterable(tens_list): - return tens_list - - return torch.cat(tens_list, dim=0).view(len(tens_list), *tens_list[0].size() ) - - -#These modules always assumes batch_first -class bidirectionalRNN(nn.Module): - """ - bidirectionalRNN - """ - def __init__(self, inputRNN, num_layers=1, dropout = 0): - super(bidirectionalRNN, self).__init__() - self.dropout = dropout - self.fwd = stackedRNN(inputRNN, num_layers=num_layers, dropout = dropout) - self.bckwrd = stackedRNN(inputRNN.new_like(), num_layers=num_layers, dropout = dropout) - self.rnns = nn.ModuleList([self.fwd, self.bckwrd]) - - #collect hidden option will return all hidden/cell states from entire RNN - def forward(self, input, collect_hidden=False): - """ - forward() - """ - seq_len = input.size(0) - bsz = input.size(1) - - fwd_out, fwd_hiddens = list(self.fwd(input, collect_hidden = collect_hidden)) - bckwrd_out, bckwrd_hiddens = list(self.bckwrd(input, reverse=True, collect_hidden = collect_hidden)) - - output = torch.cat( [fwd_out, bckwrd_out], -1 ) - hiddens = tuple( torch.cat(hidden, -1) for hidden in zip( fwd_hiddens, bckwrd_hiddens) ) - - return output, hiddens - - def reset_parameters(self): - """ - reset_parameters() - """ - for rnn in self.rnns: - rnn.reset_parameters() - - def init_hidden(self, bsz): - """ - init_hidden() - """ - for rnn in self.rnns: - rnn.init_hidden(bsz) - - def detach_hidden(self): - """ - detach_hidden() - """ - for rnn in self.rnns: - rnn.detachHidden() - - def reset_hidden(self, bsz): - """ - reset_hidden() - """ - for rnn in self.rnns: - rnn.reset_hidden(bsz) - - def init_inference(self, bsz): - """ - init_inference() - """ - for rnn in self.rnns: - rnn.init_inference(bsz) - - -#assumes hidden_state[0] of inputRNN is output hidden state -#constructor either takes an RNNCell or list of RNN layers -class stackedRNN(nn.Module): - """ - stackedRNN - """ - def __init__(self, inputRNN, num_layers=1, dropout=0): - super(stackedRNN, self).__init__() - - self.dropout = dropout - - if isinstance(inputRNN, RNNCell): - self.rnns = [inputRNN] - for i in range(num_layers-1): - self.rnns.append(inputRNN.new_like(inputRNN.output_size)) - elif isinstance(inputRNN, list): - assert len(inputRNN) == num_layers, "RNN list length must be equal to num_layers" - self.rnns=inputRNN - else: - raise RuntimeError() - - self.nLayers = len(self.rnns) - - self.rnns = nn.ModuleList(self.rnns) - - - ''' - Returns output as hidden_state[0] Tensor([sequence steps][batch size][features]) - If collect hidden will also return Tuple( - [n_hidden_states][sequence steps] Tensor([layer][batch size][features]) - ) - If not collect hidden will also return Tuple( - [n_hidden_states] Tensor([layer][batch size][features]) - ''' - def forward(self, input, collect_hidden=False, reverse=False): - """ - forward() - """ - seq_len = input.size(0) - bsz = input.size(1) - inp_iter = reversed(range(seq_len)) if reverse else range(seq_len) - - hidden_states = [[] for i in range(self.nLayers)] - outputs = [] - - for seq in inp_iter: - for layer in range(self.nLayers): - - if layer 
== 0: - prev_out = input[seq] - - outs = self.rnns[layer](prev_out) - - if collect_hidden: - hidden_states[layer].append(outs) - elif seq == seq_len-1: - hidden_states[layer].append(outs) - - prev_out = outs[0] - - outputs.append(prev_out) - - if reverse: - outputs = list(reversed(outputs)) - ''' - At this point outputs is in format: - list( [seq_length] x Tensor([bsz][features]) ) - need to convert it to: - list( Tensor([seq_length][bsz][features]) ) - ''' - output = flatten_list(outputs) - - ''' - hidden_states at this point is in format: - list( [layer][seq_length][hidden_states] x Tensor([bsz][features]) ) - need to convert it to: - For not collect hidden: - list( [hidden_states] x Tensor([layer][bsz][features]) ) - For collect hidden: - list( [hidden_states][seq_length] x Tensor([layer][bsz][features]) ) - ''' - if not collect_hidden: - seq_len = 1 - n_hid = self.rnns[0].n_hidden_states - new_hidden = [ [ [ None for k in range(self.nLayers)] for j in range(seq_len) ] for i in range(n_hid) ] - - - for i in range(n_hid): - for j in range(seq_len): - for k in range(self.nLayers): - new_hidden[i][j][k] = hidden_states[k][j][i] - - hidden_states = new_hidden - #Now in format list( [hidden_states][seq_length][layer] x Tensor([bsz][features]) ) - #Reverse seq_length if reverse - if reverse: - hidden_states = list( list(reversed(list(entry))) for entry in hidden_states) - - #flatten layer dimension into tensor - hiddens = list( list( - flatten_list(seq) for seq in hidden ) - for hidden in hidden_states ) - - #Now in format list( [hidden_states][seq_length] x Tensor([layer][bsz][features]) ) - #Remove seq_length dimension if not collect_hidden - if not collect_hidden: - hidden_states = list( entry[0] for entry in hidden_states) - return output, hidden_states - - def reset_parameters(self): - """ - reset_parameters() - """ - for rnn in self.rnns: - rnn.reset_parameters() - - def init_hidden(self, bsz): - """ - init_hidden() - """ - for rnn in self.rnns: - rnn.init_hidden(bsz) - - def detach_hidden(self): - """ - detach_hidden() - """ - for rnn in self.rnns: - rnn.detach_hidden() - - def reset_hidden(self, bsz): - """ - reset_hidden() - """ - for rnn in self.rnns: - rnn.reset_hidden(bsz) - - def init_inference(self, bsz): - """ - init_inference() - """ - for rnn in self.rnns: - rnn.init_inference(bsz) - -class RNNCell(nn.Module): - """ - RNNCell - gate_multiplier is related to the architecture you're working with - For LSTM-like it will be 4 and GRU-like will be 3. - Always assumes input is NOT batch_first. 
- Output size that's not hidden size will use output projection - Hidden_states is number of hidden states that are needed for cell - if one will go directly to cell as tensor, if more will go as list - """ - def __init__(self, gate_multiplier, input_size, hidden_size, cell, n_hidden_states = 2, bias = False, output_size = None): - super(RNNCell, self).__init__() - - self.gate_multiplier = gate_multiplier - self.input_size = input_size - self.hidden_size = hidden_size - self.cell = cell - self.bias = bias - self.output_size = output_size - if output_size is None: - self.output_size = hidden_size - - self.gate_size = gate_multiplier * self.hidden_size - self.n_hidden_states = n_hidden_states - - self.w_ih = nn.Parameter(torch.empty(self.gate_size, self.input_size)) - self.w_hh = nn.Parameter(torch.empty(self.gate_size, self.output_size)) - - #Check if there's recurrent projection - if(self.output_size != self.hidden_size): - self.w_ho = nn.Parameter(torch.empty(self.output_size, self.hidden_size)) - - self.b_ih = self.b_hh = None - if self.bias: - self.b_ih = nn.Parameter(torch.empty(self.gate_size)) - self.b_hh = nn.Parameter(torch.empty(self.gate_size)) - - #hidden states for forward - self.hidden = [ None for states in range(self.n_hidden_states)] - - self.reset_parameters() - - def new_like(self, new_input_size=None): - """ - new_like() - """ - if new_input_size is None: - new_input_size = self.input_size - - return type(self)(self.gate_multiplier, - new_input_size, - self.hidden_size, - self.cell, - self.n_hidden_states, - self.bias, - self.output_size) - - - #Use xavier where we can (weights), otherwise use uniform (bias) - def reset_parameters(self, gain=1): - """ - reset_parameters() - """ - stdev = 1.0 / math.sqrt(self.hidden_size) - for param in self.parameters(): - param.data.uniform_(-stdev, stdev) - ''' - Xavier reset: - def reset_parameters(self, gain=1): - stdv = 1.0 / math.sqrt(self.gate_size) - - for param in self.parameters(): - if (param.dim() > 1): - torch.nn.init.xavier_normal(param, gain) - else: - param.data.uniform_(-stdv, stdv) - ''' - def init_hidden(self, bsz): - """ - init_hidden() - """ - for param in self.parameters(): - if param is not None: - a_param = param - break - - for i, _ in enumerate(self.hidden): - if(self.hidden[i] is None or self.hidden[i].data.size()[0] != bsz): - - if i==0: - hidden_size = self.output_size - else: - hidden_size = self.hidden_size - - tens = a_param.data.new(bsz, hidden_size).zero_() - self.hidden[i] = Variable(tens, requires_grad=False) - - - def reset_hidden(self, bsz): - """ - reset_hidden() - """ - for i, _ in enumerate(self.hidden): - self.hidden[i] = None - self.init_hidden(bsz) - - def detach_hidden(self): - """ - detach_hidden() - """ - for i, _ in enumerate(self.hidden): - if self.hidden[i] is None: - raise RuntimeError("Must initialize hidden state before you can detach it") - for i, _ in enumerate(self.hidden): - self.hidden[i] = self.hidden[i].detach() - - def forward(self, input): - """ - forward() - if not inited or bsz has changed this will create hidden states - """ - self.init_hidden(input.size()[0]) - - hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden - self.hidden = self.cell(input, hidden_state, self.w_ih, self.w_hh, b_ih=self.b_ih, b_hh=self.b_hh) - if(self.n_hidden_states > 1): - self.hidden = list(self.hidden) - else: - self.hidden=[self.hidden] - - if self.output_size != self.hidden_size: - self.hidden[0] = F.linear(self.hidden[0], self.w_ho) - - return tuple(self.hidden) diff 
--git a/apex/RNN/__init__.py b/apex/RNN/__init__.py deleted file mode 100644 index d70674666..000000000 --- a/apex/RNN/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .models import LSTM, GRU, ReLU, Tanh, mLSTM - -__all__ = ['models'] diff --git a/apex/RNN/cells.py b/apex/RNN/cells.py deleted file mode 100644 index 09b08581d..000000000 --- a/apex/RNN/cells.py +++ /dev/null @@ -1,84 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -from .RNNBackend import RNNCell - -from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend - -import math - - -class mLSTMRNNCell(RNNCell): - """ - mLSTMRNNCell - """ - - def __init__(self, input_size, hidden_size, bias = False, output_size = None): - gate_multiplier = 4 - super(mLSTMRNNCell, self).__init__(gate_multiplier, input_size, hidden_size, mLSTMCell, n_hidden_states = 2, bias = bias, output_size = output_size) - - self.w_mih = nn.Parameter(torch.empty(self.output_size, self.input_size)) - self.w_mhh = nn.Parameter(torch.empty(self.output_size, self.output_size)) - - self.reset_parameters() - - def forward(self, input): - """ - mLSTMRNNCell.forward() - """ - #if not inited or bsz has changed this will create hidden states - self.init_hidden(input.size()[0]) - - hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden - - self.hidden = list( - self.cell(input, hidden_state, self.w_ih, self.w_hh, self.w_mih, self.w_mhh, - b_ih=self.b_ih, b_hh=self.b_hh) - ) - - if self.output_size != self.hidden_size: - self.hidden[0] = F.linear(self.hidden[0], self.w_ho) - return tuple(self.hidden) - - - def new_like(self, new_input_size=None): - if new_input_size is None: - new_input_size = self.input_size - - return type(self)( - new_input_size, - self.hidden_size, - self.bias, - self.output_size) - -def mLSTMCell(input, hidden, w_ih, w_hh, w_mih, w_mhh, b_ih=None, b_hh=None): - """ - mLSTMCell - """ - - if input.is_cuda: - igates = F.linear(input, w_ih) - m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh) - hgates = F.linear(m, w_hh) - - state = fusedBackend.LSTMFused.apply - return state(igates, hgates, hidden[1], b_ih, b_hh) - - hx, cx = hidden - - m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh) - gates = F.linear(input, w_ih, b_ih) + F.linear(m, w_hh, b_hh) - - ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) - - ingate = F.sigmoid(ingate) - forgetgate = F.sigmoid(forgetgate) - cellgate = F.tanh(cellgate) - outgate = F.sigmoid(outgate) - - cy = (forgetgate * cx) + (ingate * cellgate) - hy = outgate * F.tanh(cy) - - return hy, cy - diff --git a/apex/RNN/models.py b/apex/RNN/models.py deleted file mode 100644 index d661aa0de..000000000 --- a/apex/RNN/models.py +++ /dev/null @@ -1,56 +0,0 @@ -import torch - -from torch.nn._functions.rnn import LSTMCell, RNNReLUCell, RNNTanhCell, GRUCell - -from apex import deprecated_warning -from .RNNBackend import bidirectionalRNN, stackedRNN, RNNCell -from .cells import mLSTMRNNCell, mLSTMCell - -def toRNNBackend(inputRNN, num_layers, bidirectional=False, dropout = 0): - """ - :class:`toRNNBackend` - """ - - deprecated_warning("`apex.RNN` is deprecated and will be removed by the end of February 2023.") - if bidirectional: - return bidirectionalRNN(inputRNN, num_layers, dropout = dropout) - else: - return stackedRNN(inputRNN, num_layers, dropout = dropout) - - -def LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): - """ - :class:`LSTM` - """ - inputRNN = RNNCell(4, 
input_size, hidden_size, LSTMCell, 2, bias, output_size) - return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) - -def GRU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): - """ - :class:`GRU` - """ - inputRNN = RNNCell(3, input_size, hidden_size, GRUCell, 1, bias, output_size) - return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) - -def ReLU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): - """ - :class:`ReLU` - """ - inputRNN = RNNCell(1, input_size, hidden_size, RNNReLUCell, 1, bias, output_size) - return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) - -def Tanh(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): - """ - :class:`Tanh` - """ - inputRNN = RNNCell(1, input_size, hidden_size, RNNTanhCell, 1, bias, output_size) - return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) - -def mLSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): - """ - :class:`mLSTM` - """ - inputRNN = mLSTMRNNCell(input_size, hidden_size, bias=bias, output_size=output_size) - return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) - - diff --git a/apex/__init__.py b/apex/__init__.py index 74851f5b3..f9f8aa379 100644 --- a/apex/__init__.py +++ b/apex/__init__.py @@ -5,16 +5,13 @@ import torch -__all__ = ["amp", "fp16_utils", "optimizers", "normalization", "transformer"] +__all__ = ["optimizers", "normalization", "transformer"] if torch.distributed.is_available(): from . import parallel __all__.append("parallel") -from . import amp -from . import fp16_utils - # For optimizers and normalization there is no Python fallback. # Absence of cuda backend is a hard error. # I would like the errors from importing fused_adam_cuda or fused_layer_norm_cuda diff --git a/apex/amp/README.md b/apex/amp/README.md deleted file mode 100644 index a87b5010e..000000000 --- a/apex/amp/README.md +++ /dev/null @@ -1,72 +0,0 @@ -# amp: Automatic Mixed Precision - -## Annotating User Functions - -Nearly all PyTorch user code needs nothing more than the two steps -above to use amp. After all, custom layers are built out of simpler -PyTorch components, and amp already can see those. - -However, any custom C++ or CUDA code is outside of amp's (default) -view of things. For example, suppose I implemented a new recurrent -cell called a "forgetful recurrent unit" that calls directly into a -CUDA backend: - -```python -from backend import FRUBackend - -def fru(input, hidden, weight, bias): - # call to CUDA code - FRUBackend(input, hidden, weight, bias) -``` - -In this case, it is possible to get a runtime type mismatch. For -example, you might have `input` in fp16, and `weight` in fp32, and amp -doesn't have the visibility to insert an appropriate cast. - -amp exposes two ways to handle "invisible" backend code: function -annotations and explicit registration. 
- -#### Function annotation - -The first way to handle backend code is a set of function annotations: - -- `@amp.half_function` -- `@amp.float_function` -- `@amp.promote_function` - -These correspond to: - -- Cast all arguments to fp16 -- Cast all argumnets fo fp32 -- If there are any type mismatches, cast everything to the widest type - -In our example, we believe that the FRU unit is fp16-safe and will get -performance gains from casting its arguments to fp16, so we write: - -```python -@amp.half_function -def fru(input, hidden, weight, bias): - #... -``` - -#### Explicit registration - -The other way to handle backend code is with explicit function -registration: - -- `amp.register_half_function(module, function_name)` -- `amp.register_float_function(module, function_name)` -- `amp.register_promote_function(module, function_name)` - -When using this API, `module` is the containing class or module for -the function, and `function_name` is the _string_ name of the -function. Note that the function must be registered before the call to -`amp.initalize()`. - -For our FRU unit, we can register the backend function directly: - -```python -import backend - -amp.register_half_function(backend, 'FRUBackend') -``` diff --git a/apex/amp/__init__.py b/apex/amp/__init__.py deleted file mode 100644 index 34d080a69..000000000 --- a/apex/amp/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .amp import init, half_function, float_function, promote_function,\ - register_half_function, register_float_function, register_promote_function -from .handle import scale_loss, disable_casts -from .frontend import initialize, state_dict, load_state_dict -from ._amp_state import master_params, _amp_state diff --git a/apex/amp/__version__.py b/apex/amp/__version__.py deleted file mode 100644 index 3a83701b2..000000000 --- a/apex/amp/__version__.py +++ /dev/null @@ -1,2 +0,0 @@ -VERSION = (0, 1, 0) -__version__ = '.'.join(map(str, VERSION)) diff --git a/apex/amp/_amp_state.py b/apex/amp/_amp_state.py deleted file mode 100644 index 7e8a329f5..000000000 --- a/apex/amp/_amp_state.py +++ /dev/null @@ -1,59 +0,0 @@ -# This is a "header object" that allows different amp modules to communicate. -# I'm a C++ guy, not a python guy. I decided this approach because it seemed most C++-like. -# But apparently it's ok: -# http://effbot.org/pyfaq/how-do-i-share-global-variables-across-modules.htm -import torch - - -class AmpState(object): - def __init__(self): - self.hard_override=False - self.allow_incoming_model_not_fp32 = False - self.verbosity=1 - - -# Attribute stash. Could also just stash things as global module attributes. -_amp_state = AmpState() - - -def warn_or_err(msg): - if _amp_state.hard_override: - print("Warning: " + msg) - else: - raise RuntimeError(msg) - # I'm not sure if allowing hard_override is a good idea. - # + " If you're sure you know what you're doing, supply " + - # "hard_override=True to amp.initialize.") - - -def maybe_print(msg, rank0=False): - distributed = torch.distributed.is_available() and \ - torch.distributed.is_initialized() and \ - torch.distributed.get_world_size() > 1 - if _amp_state.verbosity > 0: - if rank0: - if distributed: - if torch.distributed.get_rank() == 0: - print(msg) - else: - print(msg) - else: - print(msg) - - -# def iter_params(param_groups): -# for group in param_groups: -# for p in group['params']: -# yield p - - -def master_params(optimizer): - """ - Generator expression that iterates over the params owned by ``optimizer``. 
- - Args: - optimizer: An optimizer previously returned from ``amp.initialize``. - """ - for group in optimizer.param_groups: - for p in group['params']: - yield p diff --git a/apex/amp/_initialize.py b/apex/amp/_initialize.py deleted file mode 100644 index 3ae6fded1..000000000 --- a/apex/amp/_initialize.py +++ /dev/null @@ -1,265 +0,0 @@ -import collections.abc as container_abcs -from types import MethodType -import functools -import sys -import warnings - -import numpy as np -import torch - -from ._amp_state import _amp_state, warn_or_err -from .handle import disable_casts -from .scaler import LossScaler -from ._process_optimizer import _process_optimizer -from apex.fp16_utils import convert_network -from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general -from ..contrib.optimizers import FP16_Optimizer as FP16_Optimizer_for_fused - -if torch.distributed.is_available(): - from ..parallel import DistributedDataParallel as apex_DDP - from ..parallel.LARC import LARC - - -def to_type(dtype, t): - if isinstance(t, torch.Tensor): - if not t.is_cuda: - # This should not be a hard error, since it may be legitimate. - warnings.warn("An input tensor was not cuda.") - # GANs require this. - # if t.requires_grad: - # warn_or_err("input data requires grad. Since input data is not a model parameter,\n" - # "its gradients will not be properly allreduced by DDP.") - if t.is_floating_point(): - return t.to(dtype) - return t - else: - # Trust the user's custom batch type, that's all I can do here. - return t.to(dtype) - - -# Modified from torch.optim.optimizer.py. This is a bit more general than casted_args in utils.py. -def applier(value, fn): - if isinstance(value, torch.Tensor): - return fn(value) - elif isinstance(value, str): - return value - elif isinstance(value, np.ndarray): - return value - elif hasattr(value, "to"): # Allow handling of custom batch classes - return fn(value) - elif isinstance(value, container_abcs.Mapping): - return {applier(k, fn) : applier(v, fn) for k, v in value.items()} - elif isinstance(value, container_abcs.Iterable): - return type(value)(applier(v, fn) for v in value) - else: - # Do I want this to fire off even if someone chooses to pass something ordinary like - # an int or float? May be more annoying than it's worth. - # print("Warning: unrecognized type in applier. If your input data is a custom class, " - # "provide it with a .to(dtype) method which converts its floating-point Tensors to dtype. " - # "Amp will check for your custom to() and invoke it to cast the batch's " - # "floating-point Tensors to the appropriate type. " - # "Also, if your data is a custom class, it is your responsibility to ensure that " - # "any Tensors you want to be cuda are already cuda." - return value - - -def check_models(models): - for model in models: - parallel_type = None - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - parallel_type = "torch.nn.parallel.DistributedDataParallel" - if ('apex_DDP' in sys.modules) and isinstance(model, apex_DDP): - parallel_type = "apex.parallel.DistributedDataParallel" - if isinstance(model, torch.nn.parallel.DataParallel): - parallel_type = "torch.nn.parallel.DataParallel" - if parallel_type is not None: - raise RuntimeError("Incoming model is an instance of {}. 
".format(parallel_type) + - "Parallel wrappers should only be applied to the model(s) AFTER \n" - "the model(s) have been returned from amp.initialize.") - - -def check_params_fp32(models): - for model in models: - for name, param in model.named_parameters(): - if param.is_floating_point(): - if 'Half' in param.type(): - warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n" - "When using amp.initialize, you do not need to call .half() on your model\n" - "before passing it, no matter what optimization level you choose.".format( - name, param.type())) - elif not param.is_cuda: - warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n" - "When using amp.initialize, you need to provide a model with parameters\n" - "located on a CUDA device before passing it no matter what optimization level\n" - "you chose. Use model.to('cuda') to use the default device.".format( - name, param.type())) - - # Backward compatibility for PyTorch 0.4 - if hasattr(model, 'named_buffers'): - buf_iter = model.named_buffers() - else: - buf_iter = model._buffers - for obj in buf_iter: - if type(obj)==tuple: - name, buf = obj - else: - name, buf = obj, buf_iter[obj] - if buf.is_floating_point(): - if 'Half' in buf.type(): - warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n" - "When using amp.initialize, you do not need to call .half() on your model\n" - "before passing it, no matter what optimization level you choose.".format( - name, buf.type())) - elif not buf.is_cuda: - warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n" - "When using amp.initialize, you need to provide a model with buffers\n" - "located on a CUDA device before passing it no matter what optimization level\n" - "you chose. Use model.to('cuda') to use the default device.".format( - name, buf.type())) - - -def check_optimizers(optimizers): - for optim in optimizers: - bad_optim_type = None - if isinstance(optim, FP16_Optimizer_general): - bad_optim_type = "apex.fp16_utils.FP16_Optimizer" - if isinstance(optim, FP16_Optimizer_for_fused): - bad_optim_type = "apex.optimizers.FP16_Optimizer" - if bad_optim_type is not None: - raise RuntimeError("An incoming optimizer is an instance of {}. 
".format(bad_optim_type) + - "The optimizer(s) passed to amp.initialize() must be bare \n" - "instances of either ordinary Pytorch optimizers, or Apex fused \n" - "optimizers.\n") - - -class O2StateDictHook(object): - def __init__(self, fn): - self.fn = fn - - def __call__(self, module, state_dict, prefix, local_metadata): - for key in state_dict: - param = state_dict[key] - if 'Half' in param.type(): - param = param.to(torch.float32) - state_dict[key] = param - - -def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs=None): - from .amp import init as amp_init - - optimizers_was_list = False - if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in globals() and isinstance(optimizers, LARC)): - optimizers = [optimizers] - elif optimizers is None: - optimizers = [] - elif isinstance(optimizers, list): - optimizers_was_list = True - check_optimizers(optimizers) - else: - check_optimizers([optimizers]) - raise TypeError("optimizers must be either a single optimizer or a list of optimizers.") - - if isinstance(models, torch.nn.Module): - models_was_list = False - models = [models] - elif isinstance(models, list): - models_was_list = True - else: - raise TypeError("models must be either a single model or a list of models.") - - check_models(models) - - if not _amp_state.allow_incoming_model_not_fp32: - check_params_fp32(models) - - # In the future, when FP16_Optimizer can be deprecated and master weights can - # become an attribute, remember to stash master weights before casting the model. - - if properties.cast_model_type: - if properties.keep_batchnorm_fp32: - for model in models: - convert_network(model, properties.cast_model_type) - else: - for model in models: - model.to(properties.cast_model_type) - - input_caster = functools.partial(to_type, properties.cast_model_type) - if cast_model_outputs is not None: - output_caster = functools.partial(to_type, cast_model_outputs) - else: - output_caster = functools.partial(to_type, torch.float32) - - for model in models: - # Patch the forward method to cast incoming data to the correct type, and - # outgoing data to float32, so "the user never needs to call .half()." - # I like writing things explicitly more than decorators. 
- def patch_forward(old_fwd): - def new_fwd(*args, **kwargs): - output = old_fwd(*applier(args, input_caster), - **applier(kwargs, input_caster)) - return applier(output, output_caster) - return new_fwd - - model.forward = patch_forward(model.forward) - - # State dict trick to recast any preexisting per-param state tensors - for optimizer in optimizers: - optimizer.load_state_dict(optimizer.state_dict()) - - # patch model.state_dict() to return float32 params - for model in models: - for module in model.modules(): - module._register_state_dict_hook(O2StateDictHook(functools.partial(to_type, torch.float32))) - - elif cast_model_outputs is not None: - output_caster = functools.partial(to_type, cast_model_outputs) - - for model in models: - def patch_forward(old_fwd): - def new_fwd(*args, **kwargs): - output = old_fwd(*args, **kwargs) - return applier(output, output_caster) - return new_fwd - - model.forward = patch_forward(model.forward) - - for i, optimizer in enumerate(optimizers): - optimizers[i] = _process_optimizer(optimizer, properties) - - _amp_state.loss_scalers = [] - for _ in range(num_losses): - _amp_state.loss_scalers.append(LossScaler(properties.loss_scale, - min_loss_scale=_amp_state.min_loss_scale, - max_loss_scale=_amp_state.max_loss_scale)) - - if properties.patch_torch_functions: - # handle is unused here. It's accessible later through a global value anyway. - handle = amp_init(loss_scale=properties.loss_scale, verbose=(_amp_state.verbosity == 2)) - for optimizer in optimizers: - # Disable Amp casting for the optimizer step, because it should only be - # applied to FP32 master params anyway. - def patch_step(old_step): - def new_step(self, *args, **kwargs): - with disable_casts(): - output = old_step(*args, **kwargs) - return output - return new_step - - optimizer.step = MethodType(patch_step(optimizer.step), optimizer) - - if optimizers_was_list: - if models_was_list: - return models, optimizers - else: - return models[0], optimizers - else: - if models_was_list: - if len(optimizers) == 0: - return models - else: - return models, optimizers[0] - else: - if len(optimizers) == 0: - return models[0] - else: - return models[0], optimizers[0] diff --git a/apex/amp/_process_optimizer.py b/apex/amp/_process_optimizer.py deleted file mode 100644 index 471289bba..000000000 --- a/apex/amp/_process_optimizer.py +++ /dev/null @@ -1,489 +0,0 @@ -import types -from ..fp16_utils import master_params_to_model_params -from ..multi_tensor_apply import multi_tensor_applier -from ._amp_state import maybe_print -import torch -from ..optimizers import FusedSGD - - -class AmpOptimizerState(object): - def __init__(self): - pass - - -def _master_params_to_model_params(self): - stash = self._amp_stash - if multi_tensor_applier.available: - if len(stash.all_fp16_params) > 0: - multi_tensor_applier( - stash.multi_tensor_scale, - stash.dummy_overflow_buf, - [stash.all_fp32_from_fp16_params, stash.all_fp16_params], - 1.0) - else: - for fp16_group, fp32_from_fp16_group in zip(stash.fp16_groups, stash.fp32_from_fp16_groups): - master_params_to_model_params(fp16_group, fp32_from_fp16_group) - - -def lazy_init_with_master_weights(self): - stash = self._amp_stash - stash.fp16_groups = [] - stash.fp32_from_fp16_groups = [] - stash.fp32_from_fp32_groups = [] - for i, param_group in enumerate(self.param_groups): - # maybe_print("FP16_Optimizer processing param group {}:".format(i)) - fp16_params_this_group = [] - fp32_params_this_group = [] - fp32_from_fp16_params_this_group = [] - for i, param in 
enumerate(param_group['params']): - if param.requires_grad: - if param.type() == 'torch.cuda.HalfTensor': - # maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}" - # .format(param.size())) - fp16_params_this_group.append(param) - master_param = param.detach().clone().float() - master_param.requires_grad = True - param_group['params'][i] = master_param - fp32_from_fp16_params_this_group.append(master_param) - # Reset existing state dict key to the new master param. - # We still need to recast per-param state tensors, if any, to FP32. - if param in self.state: - self.state[master_param] = self.state.pop(param) - elif param.type() == 'torch.cuda.FloatTensor': - # maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}" - # .format(param.size())) - fp32_params_this_group.append(param) - param_group['params'][i] = param - else: - raise TypeError("Optimizer's parameters must be either " - "torch.cuda.FloatTensor or torch.cuda.HalfTensor. " - "Received {}".format(param.type())) - - stash.fp16_groups.append(fp16_params_this_group) - stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) - stash.fp32_from_fp32_groups.append(fp32_params_this_group) - - stash.all_fp16_params = [] - for group in stash.fp16_groups: - stash.all_fp16_params += group - - stash.all_fp32_from_fp16_params = [] - for group in stash.fp32_from_fp16_groups: - stash.all_fp32_from_fp16_params += group - - stash.all_fp32_from_fp32_params = [] - for group in stash.fp32_from_fp32_groups: - stash.all_fp32_from_fp32_params += group - - # all_fp16_grad_stash is only needed for fused optimizers. - stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params] - # stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params] - stash.all_fp32_from_fp32_grad_stash = [None for _ in stash.all_fp32_from_fp32_params] - - for param in stash.all_fp32_from_fp16_params: - param.grad = None - - for param in stash.all_fp32_from_fp32_params: - param.grad = None - - # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors - self.load_state_dict(self.state_dict()) - - -def post_backward_models_are_masters(scaler, params, stashed_grads, scale_override=None): - grads_have_scale, stashed_have_scale, out_scale = scaler.loss_scale(), 1.0, 1.0 - - # not much to do if scale == 1.0 and static scaling - if scaler.loss_scale() == 1.0 and not scaler.dynamic: - # Clear the stash. - for i in range(len(stashed_grads)): - stashed_grads[i] = None - return - - if scale_override is not None: - grads_have_scale, stashed_have_scale, out_scale = scale_override - - # This is a lot of python overhead... - grads_needing_unscale = [] - grads_needing_unscale_with_stash = [] - stashed = [] - for param, stashed_grad in zip(params, stashed_grads): - if param.grad is None and stashed_grad is not None: - param.grad = stashed_grad - elif param.grad is not None and stashed_grad is None: - grads_needing_unscale.append(param.grad) - elif param.grad is not None and stashed_grad is not None: - grads_needing_unscale_with_stash.append(param.grad) - stashed.append(stashed_grad) - else: # param.grad is None and stashed_grad is None - continue - - # unscale() implements grads*(1/scale), so "scale" should be grads_have_scale/out_scale. 
- if len(grads_needing_unscale) > 0: - scaler.unscale( - grads_needing_unscale, - grads_needing_unscale, - None, # unused_scale, currently present to avoid API breakage elsewhere - models_are_masters=True, - scale_override=grads_have_scale/out_scale) - - if len(grads_needing_unscale_with_stash) > 0: - scaler.unscale_with_stashed( - grads_needing_unscale_with_stash, - stashed, - grads_needing_unscale_with_stash, - scale_override=(grads_have_scale, stashed_have_scale, out_scale)) - - # Clear the stash. - for i in range(len(stashed_grads)): - stashed_grads[i] = None - - -def prepare_backward_with_master_weights(self): - stash = self._amp_stash - - self._amp_lazy_init() - - for i, param in enumerate(stash.all_fp16_params): - # Set up to leverage grad copy elision. - # This may behave differently from an unpatched optimizer if zero_grad is used and the param is unused. - param.grad = None - - # for i, param in enumerate(stash.all_fp32_from_fp16_params): - # stash.all_fp32_from_fp16_grad_stash[i] = param.grad - - for i, param in enumerate(stash.all_fp32_from_fp32_params): - stash.all_fp32_from_fp32_grad_stash[i] = param.grad - # Set up to leverage grad copy elision: - param.grad = None - - -def post_backward_with_master_weights(self, scaler): - stash = self._amp_stash - - self._amp_lazy_init() - - # This is a lot of python overhead... - fp16_grads_needing_unscale = [] - new_fp32_grads = [] - fp16_grads_needing_unscale_with_stash = [] - preexisting_fp32_grads = [] - for fp16_param, fp32_param in zip(stash.all_fp16_params, - stash.all_fp32_from_fp16_params): - if fp16_param.grad is None and fp32_param.grad is not None: - continue - elif fp16_param.grad is not None and fp32_param.grad is None: - fp32_param.grad = torch.empty_like(fp32_param) - fp16_grads_needing_unscale.append(fp16_param.grad) - new_fp32_grads.append(fp32_param.grad) - elif fp16_param.grad is not None and fp32_param.grad is not None: - fp16_grads_needing_unscale_with_stash.append(fp16_param.grad) - preexisting_fp32_grads.append(fp32_param.grad) - else: # fp16_param.grad is None and fp32_param.grad is None: - continue - - if len(fp16_grads_needing_unscale) > 0: - scaler.unscale( - fp16_grads_needing_unscale, - new_fp32_grads, - scaler.loss_scale(), - models_are_masters=False) - - if len(fp16_grads_needing_unscale_with_stash) > 0: - scaler.unscale_with_stashed( - fp16_grads_needing_unscale_with_stash, - preexisting_fp32_grads, - preexisting_fp32_grads) - - # fp32 params can be treated as they would be in the "no_master_weights" case. - post_backward_models_are_masters( - scaler, - stash.all_fp32_from_fp32_params, - stash.all_fp32_from_fp32_grad_stash) - - -def lazy_init_no_master_weights(self): - stash = self._amp_stash - stash.all_fp16_params = [] - stash.all_fp32_params = [] - for i, param_group in enumerate(self.param_groups): - for i, param in enumerate(param_group['params']): - if param.type() == 'torch.cuda.HalfTensor': - stash.all_fp16_params.append(param) - elif param.type() == 'torch.cuda.FloatTensor': - stash.all_fp32_params.append(param) - else: - raise TypeError("Optimizer's parameters must be either " - "torch.cuda.FloatTensor or torch.cuda.HalfTensor. 
" - "Received {}".format(param.type())) - - stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params] - stash.all_fp32_grad_stash = [None for _ in stash.all_fp32_params] - - -def prepare_backward_no_master_weights(self): - stash = self._amp_stash - - self._amp_lazy_init() - - for i, param in enumerate(stash.all_fp16_params): - stash.all_fp16_grad_stash[i] = param.grad - # Set up to leverage grad copy elision: - param.grad = None - - for i, param in enumerate(stash.all_fp32_params): - stash.all_fp32_grad_stash[i] = param.grad - # Set up to leverage grad copy elision: - param.grad = None - - -def post_backward_no_master_weights(self, scaler): - stash = self._amp_stash - - self._amp_lazy_init() - - split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash), - (stash.all_fp32_params, stash.all_fp32_grad_stash)) - - for params, stashed_grads in split_types: - post_backward_models_are_masters(scaler, params, stashed_grads) - - -##################################################################################### -# FusedSGD versions -##################################################################################### - -# FusedSGD never explicitly materializes the fp32 gradients for "fp32 from fp16" master params -# outside the kernel, so we must accumulate directly into the model grads. -def prepare_backward_with_master_weights_FusedSGD(self): - if self.materialize_master_grads: - prepare_backward_with_master_weights(self) - else: - stash = self._amp_stash - - self._amp_lazy_init() - - for i, param in enumerate(stash.all_fp16_params): - stash.all_fp16_grad_stash[i] = param.grad - # Set up to leverage grad copy elision: - param.grad = None - - for i, param in enumerate(stash.all_fp32_from_fp32_params): - stash.all_fp32_from_fp32_grad_stash[i] = param.grad - # Set up to leverage grad copy elision: - param.grad = None - - -def post_backward_with_master_weights_FusedSGD(self, scaler): - if self.materialize_master_grads: - post_backward_with_master_weights(self, scaler) - else: - stash = self._amp_stash - - self._amp_lazy_init() - - grads_have_scale = scaler.loss_scale() - stashed_have_scale = self.most_recent_scale - out_scale = grads_have_scale - if self.scale_set_by_backward: - out_scale = min(grads_have_scale, self.most_recent_scale) - - split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash), - (stash.all_fp32_from_fp32_params, stash.all_fp32_from_fp32_grad_stash)) - - - # unscale_with_stashed() implements grads*1/scale + stashed_grads*1. - # stashed_grads are scaled by self.most_recent_scale. 
- for params, stashed_grads in split_types: - post_backward_models_are_masters(scaler, params, stashed_grads, - (grads_have_scale, stashed_have_scale, out_scale)) - - self.most_recent_scale = out_scale - self.scale_set_by_backward = True - - -def prepare_backward_no_master_weights_FusedSGD(self): - prepare_backward_no_master_weights(self) - - -def post_backward_no_master_weights_FusedSGD(self, scaler): - post_backward_no_master_weights(self, scaler) - - -def _amp_lazy_init(self): - stash = self._amp_stash - - if not stash.lazy_init_called: - self._lazy_init_maybe_master_weights() - stash.lazy_init_called = True - - -def _process_optimizer(optimizer, properties): - if hasattr(optimizer, "_amp_stash"): - raise RuntimeError("A given optimizer should only be passed through amp.initialize once.") - else: - optimizer._amp_stash = AmpOptimizerState() - - optimizer._amp_stash.lazy_init_called = False - optimizer._amp_stash.already_patched = False - optimizer._amp_stash.params_have_scaled_gradients = False - - for name in ("_lazy_init_maybe_master_weights", - "_master_params_to_model_params", - "_prepare_amp_backward", - "_post_amp_backward", - "_amp_lazy_init"): - if hasattr(optimizer, name): - raise RuntimeError("Incoming optimizer already has {} defined.".format(name)) - - # TODO: Centralize exposure and import error checking for the C backend. - if multi_tensor_applier.available: - import amp_C - optimizer._amp_stash.multi_tensor_scale = amp_C.multi_tensor_scale - optimizer._amp_stash.multi_tensor_l2norm = amp_C.multi_tensor_l2norm - optimizer._amp_stash.dummy_overflow_buf = torch.cuda.IntTensor([0]); - - if properties.master_weights: - optimizer._lazy_init_maybe_master_weights = types.MethodType( - lazy_init_with_master_weights, optimizer) - - optimizer._master_params_to_model_params = types.MethodType( - _master_params_to_model_params, optimizer) - - old_step = optimizer.step - def new_step(self, closure=None): - if closure is not None: - raise RuntimeError("Currently, Amp does not support closure use with optimizers.") - retval = old_step() - if not isinstance(self, FusedSGD): - self._master_params_to_model_params() - # Clear the master grads that wouldn't be zeroed by model.zero_grad() - for param in self._amp_stash.all_fp32_from_fp16_params: - param.grad = None - return retval - optimizer.step = types.MethodType(new_step, optimizer) - - old_zero_grad = optimizer.zero_grad - def new_zero_grad(self): - stash = self._amp_stash - self._amp_lazy_init() - # Zero the model grads. 
- for param in stash.all_fp16_params: - if param.grad is not None: - param.grad.detach_() - param.grad.zero_() - for param in stash.all_fp32_from_fp32_params: - if param.grad is not None: - param.grad.detach_() - param.grad.zero_() - # Clear the master grads that are independent of model grads - for param in self._amp_stash.all_fp32_from_fp16_params: - param.grad = None - optimizer.zero_grad = types.MethodType(new_zero_grad, optimizer) - - if isinstance(optimizer, FusedSGD): - optimizer._prepare_amp_backward = types.MethodType( - prepare_backward_with_master_weights_FusedSGD, optimizer) - optimizer._post_amp_backward = types.MethodType( - post_backward_with_master_weights_FusedSGD, optimizer) - else: - optimizer._prepare_amp_backward = types.MethodType( - prepare_backward_with_master_weights, optimizer) - optimizer._post_amp_backward = types.MethodType( - post_backward_with_master_weights, optimizer) - else: - optimizer._lazy_init_maybe_master_weights = types.MethodType( - lazy_init_no_master_weights, optimizer) - - if isinstance(optimizer, FusedSGD): - optimizer._prepare_amp_backward = types.MethodType( - prepare_backward_no_master_weights_FusedSGD, optimizer) - optimizer._post_amp_backward = types.MethodType( - post_backward_no_master_weights_FusedSGD, optimizer) - else: - optimizer._prepare_amp_backward = types.MethodType( - prepare_backward_no_master_weights, optimizer) - optimizer._post_amp_backward = types.MethodType( - post_backward_no_master_weights, optimizer) - - optimizer._amp_lazy_init = types.MethodType(_amp_lazy_init, optimizer) - - old_add_param_group = optimizer.add_param_group - - def new_add_param_group(self, new_group): - stash = self._amp_stash - - if not stash.lazy_init_called: - self._lazy_init_maybe_master_weights() - stash.lazy_init_called = True - - assert isinstance(new_group, dict), "param group must be a dict" - - new_params = new_group['params'] - if isinstance(new_params, torch.Tensor): - new_group['params'] = [new_params] - elif isinstance(new_params, set): - raise TypeError('optimizer parameters need to be organized in ordered collections, but ' - 'the ordering of tensors in sets will change between runs. Please use a list instead.') - else: - new_group['params'] = list(new_params) - - if properties.master_weights: - # Mutate new_group in-place to use FP32 master params - fp16_params_this_group = [] - fp32_params_this_group = [] - fp32_from_fp16_params_this_group = [] - for i, param in enumerate(new_group['params']): - if param.requires_grad: - if param.type() == 'torch.cuda.HalfTensor': - fp16_params_this_group.append(param) - master_param = param.detach().clone().float() - master_param.requires_grad = True - new_group['params'][i] = master_param - fp32_from_fp16_params_this_group.append(master_param) - elif param.type() == 'torch.cuda.FloatTensor': - fp32_params_this_group.append(param) - new_group['params'][i] = param - else: - raise TypeError("Optimizer's parameters must be either " - "torch.cuda.FloatTensor or torch.cuda.HalfTensor. 
" - "Received {}".format(param.type())) - - stash.fp16_groups.append(fp16_params_this_group) - stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) - stash.fp32_from_fp32_groups.append(fp32_params_this_group) - - stash.all_fp16_params += fp16_params_this_group - stash.all_fp32_from_fp16_params += fp32_from_fp16_params_this_group - stash.all_fp32_from_fp32_params += fp32_params_this_group - - # stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params] - stash.all_fp32_from_fp32_grad_stash += [None for _ in fp32_params_this_group] - - # It should be ok to let params be added with existing .grad attributes. - # for param in fp16_params_this_group: - # param.grad = None - - # for param in fp32_from_fp16_params_this_group: - # param.grad = None - - # for param in stash.fp32_params_this_group: - # param.grad = None - else: - for param in new_group['params']: - if param.type() == 'torch.cuda.HalfTensor': - stash.all_fp16_params.append(param) - stash.all_fp16_grad_stash.append(None) - elif param.type() == 'torch.cuda.FloatTensor': - stash.all_fp32_params.append(param) - stash.all_fp32_grad_stash.append(None) - else: - raise TypeError("Optimizer's parameters must be either " - "torch.cuda.FloatTensor or torch.cuda.HalfTensor. " - "Received {}".format(param.type())) - - old_add_param_group(new_group) - - optimizer.add_param_group = types.MethodType(new_add_param_group, optimizer) - - return optimizer diff --git a/apex/amp/amp.py b/apex/amp/amp.py deleted file mode 100644 index 1a6046663..000000000 --- a/apex/amp/amp.py +++ /dev/null @@ -1,183 +0,0 @@ -import functools -import itertools - -import torch - -from . import compat, rnn_compat, utils, wrap -from .handle import AmpHandle, NoOpHandle -from .lists import functional_overrides, torch_overrides, tensor_overrides -from ._amp_state import _amp_state -from .frontend import * - - -_DECORATOR_HANDLE = None -_USER_CAST_REGISTRY = set() -_USER_PROMOTE_REGISTRY = set() - - -def _decorator_helper(orig_fn, cast_fn, wrap_fn): - def wrapper(*args, **kwargs): - handle = _DECORATOR_HANDLE - if handle is None or not handle.is_active(): - return orig_fn(*args, **kwargs) - inner_cast_fn = utils.verbosify(cast_fn, orig_fn.__name__, - handle.verbose) - return wrap_fn(orig_fn, inner_cast_fn, handle)(*args, **kwargs) - return wrapper - - -# Decorator form -def half_function(fn): - from apex import deprecated_warning - deprecated_warning("apex.amp is deprecated and will be removed by the end of February 2023. Use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)") - wrap_fn = functools.partial(wrap.make_cast_wrapper, try_caching=True) - return _decorator_helper(fn, utils.maybe_half, wrap_fn) - - -def float_function(fn): - from apex import deprecated_warning - deprecated_warning("apex.amp is deprecated and will be removed by the end of February 2023. Use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)") - wrap_fn = functools.partial(wrap.make_cast_wrapper, try_caching=False) - return _decorator_helper(fn, utils.maybe_float, wrap_fn) - - -def promote_function(fn): - from apex import deprecated_warning - deprecated_warning("apex.amp is deprecated and will be removed by the end of February 2023. 
Use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)") - wrap_fn = functools.partial(wrap.make_promote_wrapper) - return _decorator_helper(fn, utils.maybe_float, wrap_fn) - - -# Registry form -def register_half_function(module, name): - if not hasattr(module, name): - raise ValueError('No function named {} in module {}.'.format( - name, module)) - _USER_CAST_REGISTRY.add((module, name, utils.maybe_half)) - - -def register_float_function(module, name): - if not hasattr(module, name): - raise ValueError('No function named {} in module {}.'.format( - name, module)) - _USER_CAST_REGISTRY.add((module, name, utils.maybe_float)) - - -def register_promote_function(module, name): - if not hasattr(module, name): - raise ValueError('No function named {} in module {}.'.format( - name, module)) - _USER_PROMOTE_REGISTRY.add((module, name)) - - -# Top-level function to insert _all_ the hooks. -def init(enabled=True, loss_scale="dynamic", enable_caching=True, verbose=False, allow_banned=False): - global _DECORATOR_HANDLE - - if not enabled: - handle = NoOpHandle() - _DECORATOR_HANDLE = handle - return handle - - handle = AmpHandle(loss_scale, enable_caching, verbose) - - # 0) Force-{fp16, fp32} for user-annotated functions - for mod, fn, cast_fn in _USER_CAST_REGISTRY: - try_caching = (cast_fn == utils.maybe_half) - wrap.cached_cast(mod, fn, cast_fn, handle, - try_caching, verbose) - _USER_CAST_REGISTRY.clear() - - # 0.5) Force-promote for user-annotated functions - for mod, fn in _USER_PROMOTE_REGISTRY: - wrap.promote(mod, fn, handle, verbose) - _USER_PROMOTE_REGISTRY.clear() - - # 1) Force-{fp16, fp32} on white- / black-list functions - override_modules = [functional_overrides, - torch_overrides, - tensor_overrides] - cast_table = [('FP16_FUNCS', utils.maybe_half), - ('FP32_FUNCS', utils.maybe_float)] - for module, (list_name, cast_fn) in itertools.product(override_modules, - cast_table): - for fn in getattr(module, list_name): - try_caching = (cast_fn == utils.maybe_half) - wrap.cached_cast(module.MODULE, fn, cast_fn, handle, - try_caching, verbose) - - # 1.5) Pre-0.4, put the blacklist methods on HalfTensor and whitelist - # methods on FloatTensor, since they're distinct types. - if compat.tensor_is_float_tensor(): - for fn in tensor_overrides.FP16_FUNCS: - wrap.cached_cast(torch.cuda.FloatTensor, fn, utils.maybe_half, - handle, try_caching=True, verbose=verbose) - for fn in tensor_overrides.FP32_FUNCS: - wrap.cached_cast(torch.cuda.HalfTensor, fn, utils.maybe_float, - handle, try_caching=False, verbose=verbose) - - # 2) Enable type-promotion on multi-arg functions and methods. - # NB: special handling for sequence fns (e.g. `torch.cat`). - promote_modules = [torch_overrides, tensor_overrides] - promote_table = [('CASTS', wrap.promote), - ('SEQUENCE_CASTS', wrap.sequence_promote)] - for promote_mod, (list_name, promote_fn) in itertools.product(promote_modules, - promote_table): - for fn in getattr(promote_mod, list_name): - promote_fn(promote_mod.MODULE, fn, handle, verbose) - - # 2.5) Pre-0.4, add blacklist methods directly to HalfTensor and FloatTensor types - if compat.tensor_is_float_tensor(): - for cls, (list_name, promote_fn) in itertools.product([torch.cuda.FloatTensor, - torch.cuda.HalfTensor], - promote_table): - for fn in getattr(tensor_overrides, list_name): - promote_fn(cls, fn, handle, verbose) - - # 3) For any in-place version of a blacklist function, error if any input is fp16. - # NB: this is overly conservative. 
- for fn in utils.as_inplace(torch_overrides.FP32_FUNCS): - wrap.err_if_any_half(torch_overrides.MODULE, fn, handle) - - # 3.5) For any in-place blacklist method, error if called on fp16 tensor - for fn in utils.as_inplace(tensor_overrides.FP32_FUNCS): - wrap.err_if_arg0_half(tensor_overrides.MODULE, fn, handle, verbose) - if compat.tensor_is_float_tensor(): - wrap.err_if_arg0_half(torch.cuda.HalfTensor, fn, handle, verbose) - - # 4) For other in-place methods, match the type of self tensor - for fn in utils.as_inplace(itertools.chain( - tensor_overrides.FP16_FUNCS, - tensor_overrides.CASTS)): - wrap.promote_match_arg0(tensor_overrides.MODULE, fn, handle, verbose) - if compat.tensor_is_float_tensor(): - wrap.promote_match_arg0(torch.cuda.HalfTensor, fn, handle, verbose) - wrap.promote_match_arg0(torch.cuda.FloatTensor, fn, handle, verbose) - - # 5) RNNs + RNN cells are whitelisted specially - if rnn_compat.has_old_rnns(): - wrap.rnn_cast(torch.nn.backends.thnn.backend, 'RNN', handle, verbose) - if not rnn_compat.has_old_rnns(): - # Patch in our own indirection of `_VF` in modules/rnn s.t. it is mutable. - torch.nn.modules.rnn._VF = rnn_compat.VariableFunctionsShim() - # Wrap all the rnns - for x in rnn_compat.RNN_NAMES: - wrap.new_rnn_cast(x.upper(), handle, verbose) - - # Wrap all the RNN cells - rnn_compat.whitelist_rnn_cells(handle, verbose) - - # 6) Place error+print message on banned functions. - # Or, if allow_banned, then cast to FP32. - for fn, err_msg in functional_overrides.BANNED_FUNCS: - if allow_banned: - wrap.cached_cast(functional_overrides.MODULE, fn, utils.maybe_float, - handle, try_caching=True, verbose=verbose) - else: - wrap.err_if_any_half(functional_overrides.MODULE, fn, handle, err_msg) - - _DECORATOR_HANDLE = handle - - _amp_state.handle = handle - - return handle diff --git a/apex/amp/compat.py b/apex/amp/compat.py deleted file mode 100644 index 22276bd47..000000000 --- a/apex/amp/compat.py +++ /dev/null @@ -1,46 +0,0 @@ -import torch - -# True for post-0.4, when Variables/Tensors merged. -def variable_is_tensor(): - v = torch.autograd.Variable() - return isinstance(v, torch.Tensor) - -def tensor_is_variable(): - x = torch.Tensor() - return type(x) == torch.autograd.Variable - -# False for post-0.4 -def tensor_is_float_tensor(): - x = torch.Tensor() - return type(x) == torch.FloatTensor - -# Akin to `torch.is_tensor`, but returns True for Variable -# objects in pre-0.4. -def is_tensor_like(x): - return torch.is_tensor(x) or isinstance(x, torch.autograd.Variable) - -# Wraps `torch.is_floating_point` if present, otherwise checks -# the suffix of `x.type()`. -def is_floating_point(x): - if hasattr(torch, 'is_floating_point'): - return torch.is_floating_point(x) - try: - torch_type = x.type() - return torch_type.endswith('FloatTensor') or \ - torch_type.endswith('HalfTensor') or \ - torch_type.endswith('DoubleTensor') - except AttributeError: - return False - -def scalar_python_val(x): - if hasattr(x, 'item'): - return x.item() - else: - if isinstance(x, torch.autograd.Variable): - return x.data[0] - else: - return x[0] - -# Accounts for the possibility that some ops may be removed from a namespace. 
-def filter_attrs(module, attrs): - return list(attrname for attrname in attrs if hasattr(module, attrname)) diff --git a/apex/amp/frontend.py b/apex/amp/frontend.py deleted file mode 100644 index 616fb113e..000000000 --- a/apex/amp/frontend.py +++ /dev/null @@ -1,446 +0,0 @@ -from collections import OrderedDict - -import torch - -from ._initialize import _initialize -from ._amp_state import _amp_state, warn_or_err, maybe_print - - -class Properties(object): - """ - This class has two purposes: to establish a set of default properties, - and to route setting of these attributes through __setattr__ so that (in theory) - they can be checked for consistency with other existing args. - """ - def __init__(self): - self.options = { - "enabled" : False, - "opt_level" : None, - "cast_model_type" : None, - "patch_torch_functions" : False, - "keep_batchnorm_fp32" : None, - "master_weights" : None, - "loss_scale" : 1.0, - # Reserved for future functionality - # "fused_optimizer" : False, - # "enable_ddp_interop" : False, - } - - """ - This function allows updating several options at a time without routing through - __setattr__ checks, to avoid "you can't get there from here" scenarios. - Currently not intended to be exposed; users are expected to select an opt_level - and apply consistent modifications. - """ - def _update_options_dict(self, new_options): - for k, v in new_options: - if k in self.options: - self.options[k] = v - else: - raise ValueError("Tried to set unexpected option {}".format(k)) - """ - The members of "options" are not direct attributes of self, so access attempts - will roll down to __getattr__. This borrows from the logic in torch.nn.Module. - """ - def __getattr__(self, name): - if "options" in self.__dict__: - options = self.__dict__["options"] - if name in options: - return options[name] - raise AttributeError("'{}' object has no attribute '{}'".format( - type(self).__name__, name)) - - def __setattr__(self, name, value): - if "options" in self.__dict__: - if name in self.options: - # print("setting {} {}".format(name, value)) - if name == "cast_model_type": - if self.opt_level == "O1" and value is not None: - if value is not False: - if value is not torch.float32: - warn_or_err("O1 inserts casts around Torch functions rather than " - "model weights, so with O1, the model weights themselves " - "should remain FP32. If you wish to cast the model to a " - "different type, use opt_level='O2' or 'O3'. " + - "cast_model_type was {}".format(value)) - self.options[name] = value - elif name == "patch_torch_functions": - if self.opt_level != "O1" and value: - warn_or_err("Currently, patch_torch_functions=True should only be set by " - "selecting opt_level='O1'.") - self.options[name] = value - elif name == "keep_batchnorm_fp32": - if self.opt_level == "O1" and value is not None: - warn_or_err("With opt_level O1, batchnorm functions are automatically patched " - "to run in FP32, so keep_batchnorm_fp32 should be None." + - " keep_batchnorm_fp32 was {}".format(value)) - if value == "False": - self.options[name] = False - elif value == "True": - self.options[name] = True - else: - assert (value is True or value is False or value is None),\ - "keep_batchnorm_fp32 must be a boolean, the string 'True' or 'False', "\ - "or None, found keep_batchnorm_fp32={}".format(value) - self.options[name] = value - elif name == "master_weights": - if self.opt_level == "O1" and value is not None: - warn_or_err("It doesn't make sense to use master_weights with O1. 
" - "With O1, your model weights themselves should be FP32.") - self.options[name] = value - elif name == "loss_scale": - if value == "dynamic": - self.options[name] = value - else: - self.options[name] = float(value) - else: - self.options[name] = value - else: - super(Properties, self).__setattr__(name, value) - - -""" O0-O3 are convenience wrappers to establish defaults for typically used mixed precision options. """ - -class O3: - brief = "O3: Pure FP16 training." - more = "Calls .half() on your model, converting the entire model to FP16.\n"\ - "A casting operation is also inserted to cast incoming Tensors to FP16,\n"\ - "so you don't need to change your data pipeline.\n"\ - "This mode is useful for establishing a performance ceiling.\n"\ - "It's also possible training may 'just work' in this mode.\n"\ - "If not, try other optimization levels." - - def __call__(self, properties): - properties.enabled = True - properties.opt_level = "O3" - properties.cast_model_type = torch.float16 - properties.patch_torch_functions = False - properties.keep_batchnorm_fp32 = False - properties.master_weights = False - properties.loss_scale = 1.0 - # properties.fused_optimizer = False - # properties.enable_ddp_interop = False - return properties # modified in place so this isn't really necessary - - -class O2: - brief = "O2: FP16 training with FP32 batchnorm and FP32 master weights.\n" - more = "Calls .half() on your model, converting the entire model (except for batchnorms)\n"\ - "to FP16. Batchnorms are retained in FP32 for additional stability.\n"\ - "The forward pass is patched to cast incoming Tensors to FP16, so you don't need to change\n"\ - "your data pipeline.\n"\ - "O2 creates FP32 master weights outside the model and patches any optimizers to update\n"\ - "these master weights, then copy the master weights into the FP16 model weights.\n"\ - "Master weights can also improve convergence and stability." - - def __call__(self, properties): - properties.enabled = True - properties.opt_level = "O2" - properties.cast_model_type = torch.float16 - properties.patch_torch_functions = False - properties.keep_batchnorm_fp32 = True - properties.master_weights = True - properties.loss_scale = "dynamic" - # properties.fused_optimizer = False - # properties.enable_ddp_interop = False - return properties # modified in place so this isn't really necessary - - -class O1: - brief = "O1: Insert automatic casts around Pytorch functions and Tensor methods.\n" - more = "The type of your model's weights is not altered. However, internally,\n"\ - "Pytorch functions are patched to cast any Tensor Core-friendly ops to FP16 for speed,\n"\ - "while operations that might benefit from the additional stability of FP32 are patched\n"\ - "to cast their inputs to fp32.\n"\ - "O1 is the safest way to try mixed precision training, and is recommended when\n"\ - "trying mixed precision training for the first time." 
- - def __call__(self, properties): - properties.enabled = True - properties.opt_level = "O1" - properties.cast_model_type = None - properties.patch_torch_functions = True - properties.keep_batchnorm_fp32 = None - properties.master_weights = None - properties.loss_scale = "dynamic" - # properties.fused_optimizer = False - # properties.enable_ddp_interop = False - return properties # modified in place so this isn't really necessary - - -class O0: - brief = "O0: Pure FP32 training.\n" - more = "Your models are checked to make sure parameters are FP32, but otherwise the\n"\ - "types of weights and internal Pytorch operations are not altered. This mode disables any\n"\ - "FP16 arithmetic, although other optimizations like DDP interop may still be requested.\n" - - def __call__(self, properties): - properties.enabled = True - properties.opt_level = "O0" - properties.cast_model_type = torch.float32 - properties.patch_torch_functions = False - properties.keep_batchnorm_fp32 = None - properties.master_weights = False - properties.loss_scale = 1.0 - # properties.fused_optimizer = False - # properties.enable_ddp_interop = False - return properties # modified in place so this isn't really necessary - - -opt_levels = {"O3": O3(), - "O2": O2(), - "O1": O1(), - "O0": O0()} - - -# allow user to directly pass Properties struct as well? -def initialize( - models, - optimizers=None, - enabled=True, - opt_level="O1", - cast_model_type=None, - patch_torch_functions=None, - keep_batchnorm_fp32=None, - master_weights=None, - loss_scale=None, - cast_model_outputs=None, - num_losses=1, - verbosity=1, - min_loss_scale=None, - max_loss_scale=2.**24 - ): - """ - Initialize your models, optimizers, and the Torch tensor and functional namespace according to the - chosen ``opt_level`` and overridden properties, if any. - - ``amp.initialize`` should be called **after** you have finished - constructing your model(s) and - optimizer(s), but **before** you send your model through any DistributedDataParallel wrapper. - See `Distributed training`_ in the Imagenet example. - - Currently, ``amp.initialize`` should only be called **once**, - although it can process an arbitrary number of - models and optimizers (see the corresponding `Advanced Amp Usage topic`_). - If you think your use case requires ``amp.initialize`` to be called more than once, - `let us know`_. - - Any property keyword argument that is not ``None`` will be interpreted as a manual override. - - To prevent having to rewrite anything else in your script, name the returned models/optimizers - to replace the passed models/optimizers, as in the code sample below. - - Args: - models (torch.nn.Module or list of torch.nn.Modules): Models to modify/cast. - optimizers (optional, torch.optim.Optimizer or list of torch.optim.Optimizers): Optimizers to modify/cast. - REQUIRED for training, optional for inference. - enabled (bool, optional, default=True): If False, renders all Amp calls no-ops, so your script - should run as if Amp were not present. - opt_level (str, optional, default="O1"): Pure or mixed precision optimization level. Accepted values are - "O0", "O1", "O2", and "O3", explained in detail above. - cast_model_type (``torch.dtype``, optional, default=None): Optional property override, see - above. - patch_torch_functions (bool, optional, default=None): Optional property override. - keep_batchnorm_fp32 (bool or str, optional, default=None): Optional property override. If - passed as a string, must be the string "True" or "False". 
- master_weights (bool, optional, default=None): Optional property override. - loss_scale (float or str, optional, default=None): Optional property override. If passed as a string, - must be a string representing a number, e.g., "128.0", or the string "dynamic". - cast_model_outputs (torch.dtype, optional, default=None): Option to ensure that the outputs - of your model(s) are always cast to a particular type regardless of ``opt_level``. - num_losses (int, optional, default=1): Option to tell Amp in advance how many losses/backward - passes you plan to use. When used in conjunction with the ``loss_id`` argument to - ``amp.scale_loss``, enables Amp to use a different loss scale per loss/backward pass, - which can improve stability. See "Multiple models/optimizers/losses" - under `Advanced Amp Usage`_ for examples. If ``num_losses`` is left to 1, Amp will still - support multiple losses/backward passes, but use a single global loss scale - for all of them. - verbosity (int, default=1): Set to 0 to suppress Amp-related output. - min_loss_scale (float, default=None): Sets a floor for the loss scale values that can be chosen by dynamic - loss scaling. The default value of None means that no floor is imposed. - If dynamic loss scaling is not used, `min_loss_scale` is ignored. - max_loss_scale (float, default=2.**24): Sets a ceiling for the loss scale values that can be chosen by - dynamic loss scaling. If dynamic loss scaling is not used, `max_loss_scale` is ignored. - - Returns: - Model(s) and optimizer(s) modified according to the ``opt_level``. - If either the ``models`` or ``optimizers`` args were lists, the corresponding return value will - also be a list. - - Permissible invocations:: - - model, optim = amp.initialize(model, optim,...) - model, [optim1, optim2] = amp.initialize(model, [optim1, optim2],...) - [model1, model2], optim = amp.initialize([model1, model2], optim,...) - [model1, model2], [optim1, optim2] = amp.initialize([model1, model2], [optim1, optim2],...) - - # This is not an exhaustive list of the cross product of options that are possible, - # just a set of examples. - model, optim = amp.initialize(model, optim, opt_level="O0") - model, optim = amp.initialize(model, optim, opt_level="O0", loss_scale="dynamic"|128.0|"128.0") - - model, optim = amp.initialize(model, optim, opt_level="O1") # uses "loss_scale="dynamic" default - model, optim = amp.initialize(model, optim, opt_level="O1", loss_scale=128.0|"128.0") - - model, optim = amp.initialize(model, optim, opt_level="O2") # uses "loss_scale="dynamic" default - model, optim = amp.initialize(model, optim, opt_level="O2", loss_scale=128.0|"128.0") - model, optim = amp.initialize(model, optim, opt_level="O2", keep_batchnorm_fp32=True|False|"True"|"False") - - model, optim = amp.initialize(model, optim, opt_level="O3") # uses loss_scale=1.0 default - model, optim = amp.initialize(model, optim, opt_level="O3", loss_scale="dynamic"|128.0|"128.0") - model, optim = amp.initialize(model, optim, opt_level="O3", keep_batchnorm_fp32=True|False|"True"|"False") - - The `Imagenet example`_ demonstrates live use of various opt_levels and overrides. - - .. _`Distributed training`: - https://github.com/NVIDIA/apex/tree/master/examples/imagenet#distributed-training - - .. _`Imagenet example`: - https://github.com/NVIDIA/apex/tree/master/examples/imagenet - - .. _`Advanced Amp Usage`: - https://nvidia.github.io/apex/advanced.html - - .. 
_`Advanced Amp Usage topic`: - https://nvidia.github.io/apex/advanced.html#multiple-models-optimizers-losses - - .. _`let us know`: - https://github.com/NVIDIA/apex/issues - """ - from apex import deprecated_warning - deprecated_warning("apex.amp is deprecated and will be removed by the end of February 2023. Use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)") - _amp_state.opt_properties = Properties() - _amp_state.verbosity = verbosity - - if not enabled: - if optimizers is None: - return models - else: - return models, optimizers - - if not torch.backends.cudnn.enabled: - raise RuntimeError( - "Amp requires torch.backends.cudnn.enabled = True") - - if opt_level not in opt_levels: - raise RuntimeError( - "Unexpected optimization level {}. ".format(opt_level) + - "Options are 'O0', 'O1', 'O2', 'O3'. Note that in `O0`, `O1`, etc., the prefix O is the letter O, " + - "not the number zero.") - else: - _amp_state.opt_properties = opt_levels[opt_level](_amp_state.opt_properties) - maybe_print("Selected optimization level {}".format(opt_levels[opt_level].brief), True) - maybe_print("Defaults for this optimization level are:", True) - for k, v in _amp_state.opt_properties.options.items(): - maybe_print("{:22} : {}".format(k, v), True) - - _amp_state.min_loss_scale = min_loss_scale - _amp_state.max_loss_scale = max_loss_scale - - maybe_print("Processing user overrides (additional kwargs that are not None)...", True) - # I chose to have the keyword arguments listed directly in the argument list, - # instead of **kwargs, so I can't use kwargs.items() here. - if enabled is not None: - _amp_state.opt_properties.enabled = enabled - if opt_level is not None: - _amp_state.opt_properties.opt_level = opt_level - if cast_model_type is not None: - _amp_state.opt_properties.cast_model_type = cast_model_type - if patch_torch_functions is not None: - _amp_state.opt_properties.patch_torch_functions = patch_torch_functions - if keep_batchnorm_fp32 is not None: - _amp_state.opt_properties.keep_batchnorm_fp32 = keep_batchnorm_fp32 - if master_weights is not None: - _amp_state.opt_properties.master_weights = master_weights - if loss_scale is not None: - _amp_state.opt_properties.loss_scale = loss_scale - - maybe_print("After processing overrides, optimization options are:", True) - for k, v in _amp_state.opt_properties.options.items(): - maybe_print("{:22} : {}".format(k, v), True) - - return _initialize(models, optimizers, _amp_state.opt_properties, num_losses, cast_model_outputs) - - -def state_dict(destination=None): - if destination is None: - destination = OrderedDict() - - for idx, loss_scaler in enumerate(_amp_state.loss_scalers): - destination['loss_scaler%d' % idx] = { - 'loss_scale': loss_scaler.loss_scale(), - 'unskipped': loss_scaler._unskipped, - } - return destination - - -def load_state_dict(state_dict): - # Check if state_dict containes the same number of loss_scalers as current setup - if len(state_dict) != len(_amp_state.loss_scalers): - print('Warning: state_dict contains {} entries, while {} loss_scalers are used'.format( - len(state_dict), len(_amp_state.loss_scalers))) - - state_dict = state_dict.copy() - - nb_loss_scalers = len(_amp_state.loss_scalers) - unexpected_keys = [] - # Initialize idx outside, since unexpected_keys will increase it if enumerate is used - idx = 0 - for key in state_dict: - if 'loss_scaler' not in key: - unexpected_keys.append(key) - else: - if idx > (nb_loss_scalers - 1): - print('Skipping loss_scaler[{}], since num_losses was set to {}'.format( - idx, 
nb_loss_scalers)) - break - _amp_state.loss_scalers[idx]._loss_scale = state_dict[key]['loss_scale'] - _amp_state.loss_scalers[idx]._unskipped = state_dict[key]['unskipped'] - idx += 1 - - if len(unexpected_keys) > 0: - raise RuntimeError( - 'Error(s) in loading state_dict. Unexpected key(s) in state_dict: {}. '.format( - ', '.join('"{}"'.format(k) for k in unexpected_keys))) - - -# TODO: is this necessary/useful? -# def check_option_consistency(enabled=True, -# opt_level=None, -# cast_model_type=None, -# patch_torch_functions=None, -# keep_batchnorm_fp32=None, -# master_weights=None, -# loss_scale=None, -# enable_ddp_interop=None, -# hard_override=False): -# """ -# Utility function that enables users to quickly check if the option combination they intend -# to use is permitted. ``check_option_consistency`` does not require models or optimizers -# to be constructed, and can be called at any point in the script. ``check_option_consistency`` -# is totally self-contained; it does not set any amp global state or affect anything outside -# of itself. -# """ -# -# if not enabled: -# return -# -# if opt_level not in opt_levels: -# raise RuntimeError("Unexpected optimization level. Options are 'O0', 'O1', 'O2', 'O3'.") -# else: -# opt_properties = opt_levels[opt_level](Properties()) -# print("Selected optimization level {}", opt_levels[opt_level].brief) -# print("Defaults for this optimization level are:") -# for k, v in opt_properties.options: -# print("{:22} : {}".format(k, v)) -# -# print("Processing user overrides (additional kwargs that are not None)...") -# for k, v in kwargs: -# if k not in _amp_state.opt_properties.options: -# raise RuntimeError("Unexpected kwarg {}".format(k)) -# if v is not None: -# setattr(opt_properties, k, v) -# -# print("After processing overrides, optimization options are:") -# for k, v in opt_properties.options: -# print("{:22} : {}".format(k, v)) diff --git a/apex/amp/handle.py b/apex/amp/handle.py deleted file mode 100644 index 0be567ca4..000000000 --- a/apex/amp/handle.py +++ /dev/null @@ -1,281 +0,0 @@ -import contextlib -import warnings -import sys -import torch - -from . import utils -from .opt import OptimWrapper -from .scaler import LossScaler -from ._amp_state import _amp_state, master_params, maybe_print - -if torch.distributed.is_available(): - from ..parallel.LARC import LARC - - -# There's no reason to expose the notion of a "handle". Everything can happen through amp.* calls. -@contextlib.contextmanager -def scale_loss(loss, - optimizers, - loss_id=0, - model=None, - delay_unscale=False, - delay_overflow_check=False): - """ - On context manager entrance, creates ``scaled_loss = (loss.float())*current loss scale``. - ``scaled_loss`` is yielded so that the user can call ``scaled_loss.backward()``:: - - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - - On context manager exit (if ``delay_unscale=False``), the gradients are checked for infs/NaNs - and unscaled, so that ``optimizer.step()`` can be called. - - .. note:: - If Amp is using explicit FP32 master params (which is the default for ``opt_level=O2``, and - can also be manually enabled by supplying ``master_weights=True`` to ``amp.initialize``) - any FP16 gradients are copied to FP32 master gradients before being unscaled. - ``optimizer.step()`` will then apply the unscaled master gradients to the master params. - - .. warning:: - If Amp is using explicit FP32 master params, only the FP32 master gradients will be - unscaled. 
The direct ``.grad`` attributes of any FP16 - model params will remain scaled after context manager exit. - This subtlety affects gradient clipping. See "Gradient clipping" under - `Advanced Amp Usage`_ for best practices. - - Args: - loss(Tensor): Typically a scalar Tensor. The ``scaled_loss`` that the context - manager yields is simply ``loss.float()*loss_scale``, so in principle - ``loss`` could have more than one element, as long as you call - ``backward()`` on ``scaled_loss`` appropriately within the context manager body. - optimizers: All optimizer(s) for which the current backward pass is creating gradients. - Must be an optimizer or list of optimizers returned from an earlier call - to ``amp.initialize``. For example use with multiple optimizers, see - "Multiple models/optimizers/losses" under `Advanced Amp Usage`_. - loss_id(int, optional, default=0): When used in conjunction with the ``num_losses`` argument - to ``amp.initialize``, enables Amp to use a different loss scale per loss. ``loss_id`` - must be an integer between 0 and ``num_losses`` that tells Amp which loss is - being used for the current backward pass. See "Multiple models/optimizers/losses" - under `Advanced Amp Usage`_ for examples. If ``loss_id`` is left unspecified, Amp - will use the default global loss scaler for this backward pass. - model(torch.nn.Module, optional, default=None): Currently unused, reserved to enable future - optimizations. - delay_unscale(bool, optional, default=False): ``delay_unscale`` is never necessary, and - the default value of ``False`` is strongly recommended. - If ``True``, Amp will not unscale the gradients or perform model->master - gradient copies on context manager exit. - ``delay_unscale=True`` is a minor ninja performance optimization and can result - in weird gotchas (especially with multiple models/optimizers/losses), - so only use it if you know what you're doing. - "Gradient accumulation across iterations" under `Advanced Amp Usage`_ - illustrates a situation where this CAN (but does not need to) be used. - - .. warning:: - If ``delay_unscale`` is ``True`` for a given backward pass, ``optimizer.step()`` cannot be - called yet after context manager exit, and must wait for another, later backward context - manager invocation with ``delay_unscale`` left to False. - - .. _`Advanced Amp Usage`: - https://nvidia.github.io/apex/advanced.html - """ - if not hasattr(_amp_state, "opt_properties"): - raise RuntimeError("Invoked 'with amp.scale_loss`, but internal Amp state has not been initialized. " - "model, optimizer = amp.initialize(model, optimizer, opt_level=...) must be called " - "before `with amp.scale_loss`.") - - if not _amp_state.opt_properties.enabled: - yield loss - return - - if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in globals() and isinstance(optimizers, LARC)): - optimizers = [optimizers] - - loss_scaler = _amp_state.loss_scalers[loss_id] - loss_scale = loss_scaler.loss_scale() - - if ((not _amp_state.opt_properties.master_weights) - and (not loss_scaler.dynamic) - and loss_scale == 1.0): - yield loss.float() - # Needing to drop the cache here as well is an ugly gotcha. - # But for now I think it's necessary to short-circuit. 
- # Probably ok to skip this if not delay_unscale - if _amp_state.opt_properties.patch_torch_functions: - _amp_state.handle._clear_cache() - return - - if not delay_unscale: - if isinstance(optimizers, list): - for optimizer in optimizers: - if not optimizer._amp_stash.params_have_scaled_gradients: - optimizer._prepare_amp_backward() - - yield (loss.float())*loss_scale - - if delay_unscale: - for optimizer in optimizers: - optimizer._amp_stash.params_have_scaled_gradients = True - else: - # FusedSGD may take care of unscaling as part of their step() methods. - # if not isinstance(optimizers, FP16_Optimizer_for_fused): - loss_scaler.clear_overflow_state() - for optimizer in optimizers: - optimizer._post_amp_backward(loss_scaler) - optimizer._amp_stash.params_have_scaled_gradients = False - # For future fused optimizers that enable sync-free dynamic loss scaling, - # should_skip will always be False. - should_skip = False if delay_overflow_check else loss_scaler.update_scale() - if should_skip: - for optimizer in optimizers: - if not optimizer._amp_stash.already_patched: - # Close on loss_scaler and loss_id as well, to be safe. Probably not - # necessary because amp.scale_loss is already creating a temporary scope. - def patch_step(opt, loss_scaler, loss_id): - opt_step = opt.step - def skip_step(closure=None): - if closure is not None: - raise RuntimeError("Currently, Amp does not support closure use with optimizers.") - maybe_print(("Gradient overflow. Skipping step, loss scaler " + - "{} reducing loss scale to {}").format(loss_id, - loss_scaler.loss_scale())) - # TODO: I don't like the special casing for different optimizer implementations. - # Maybe skip should delegate to a method owned by the optimizers themselves. - if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"): - # Clear the master grads that wouldn't be zeroed by model.zero_grad() - for param in opt._amp_stash.all_fp32_from_fp16_params: - param.grad = None - if hasattr(opt, "most_recent_scale"): - opt.most_recent_scale = 1.0 - opt.scale_set_by_backward = False - opt.step = opt_step - opt._amp_stash.already_patched = False - return skip_step - optimizer.step = patch_step(optimizer, loss_scaler, loss_id) - optimizer._amp_stash.already_patched = True - - # Probably ok to skip this if not delay_unscale - if _amp_state.opt_properties.patch_torch_functions: - _amp_state.handle._clear_cache() - - -# Free function version of AmpHandle.disable_casts, another step on the -# path to removing the concept of "AmpHandle" -@contextlib.contextmanager -def disable_casts(): - _amp_state.handle._is_active = False - yield - _amp_state.handle._is_active = True - - -class AmpHandle(object): - def __init__(self, loss_scale="dynamic", enable_caching=True, verbose=False): - self._enable_caching = enable_caching - self._verbose = verbose - self._cache = dict() - self._default_scaler = LossScaler(loss_scale) - self._is_active = True - self._all_wrappers = [] - - def is_active(self): - return self._is_active - - @contextlib.contextmanager - def _disable_casts(self): - self._is_active = False - yield - self._is_active = True - - def wrap_optimizer(self, optimizer, num_loss=1): - self._default_scaler = None - return OptimWrapper(optimizer, self, num_loss) - - @contextlib.contextmanager - def scale_loss(self, loss, optimizer): - raise RuntimeError("The old Amp API is no longer supported. Please move to the new API, " - "documented here: https://nvidia.github.io/apex/amp.html. 
Transition guide: " - "https://nvidia.github.io/apex/amp.html#transition-guide-for-old-api-users") - - if not self.is_active(): - yield loss - return - - if self._default_scaler is None: - raise RuntimeError( - 'After calling `handle.wrap_optimizer()`, you must explicitly ' + - 'use `optimizer.scale_loss(loss)`.') - - # TODO: this code block is duplicated here and `opt.py`. Unify. - loss_scale = self._default_scaler.loss_scale() - yield loss * loss_scale - - self._default_scaler.clear_overflow_state() - self._default_scaler.unscale( - master_params(optimizer), - master_params(optimizer), - loss_scale) - should_skip = self._default_scaler.update_scale() - if should_skip: - optimizer_step = optimizer.step - def skip_step(): - maybe_print('Gradient overflow, skipping update') - optimizer.step = optimizer_step - optimizer.step = skip_step - - self._clear_cache() - - def _clear_cache(self): - self._cache.clear() - - # Experimental support for saving / restoring uncasted versions of functions - def _save_func(self, mod, fn, func): - self._all_wrappers.append((mod, fn, func)) - - def _deactivate(self): - for mod, fn, func in self._all_wrappers: - utils.set_func(mod, fn, func) - self._all_wrappers = [] - - @property - def has_cache(self): - return self._enable_caching - - @property - def cache(self): - return self._cache - - def remove_cache(self, param): - if self.has_cache and param in self.cache: - del self.cache[param] - - @property - def verbose(self): - return self._verbose - -class NoOpHandle(object): - def is_active(self): - return False - - @contextlib.contextmanager - def _disable_casts(self): - yield - - def wrap_optimizer(self, optimizer, num_loss=1): - return OptimWrapper(optimizer, self, num_loss) - - @contextlib.contextmanager - def scale_loss(self, loss, optimizer): - yield loss - - @property - def has_cache(self): - return False - - @property - def verbose(self): - return False - - def _clear_cache(self): - pass - - def _deactivate(self): - pass diff --git a/apex/amp/lists/__init__.py b/apex/amp/lists/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/apex/amp/lists/functional_overrides.py b/apex/amp/lists/functional_overrides.py deleted file mode 100644 index dd009cec6..000000000 --- a/apex/amp/lists/functional_overrides.py +++ /dev/null @@ -1,80 +0,0 @@ - -# TODO: think about the following two. They do weird things. -# - torch.nn.utils.clip_grad (but it should always be fp32 anyway) -# - torch.nn.utils.weight_norm - -# Notes: -# F.instance_norm uses batch_norm internally. Which correctly handles -# fp16 in/out with fp32 weights. So we shouldn't do anything for -# either of these. -# F.normalize calls `input.norm()` internally, so it's redundant, but -# kept here in case impl. changes. -# F.cosine_similarity is same: calls `x.norm()` internally. - -import torch.nn.functional - -MODULE = torch.nn.functional - -FP16_FUNCS = [ - 'conv1d', - 'conv2d', - 'conv3d', - 'conv_transpose1d', - 'conv_transpose2d', - 'conv_transpose3d', - 'conv_tbc', # Undocumented / maybe new? - 'linear', -] - -FP32_FUNCS = [ - - # Interpolation/Upsampling TODO: Remove for 1.2 - 'interpolate', - 'grid_sample', - - # Pointwise - 'softplus', - 'softmin', - 'log_softmax', - 'softmax', - 'gelu', - - # Normalization - 'layer_norm', - 'group_norm', - 'local_response_norm', - 'normalize', - 'cosine_similarity', - - # Loss functions - # TODO: which of these can be fp16? 
- 'poisson_nll_loss', - 'cosine_embedding_loss', - 'cross_entropy', - 'hinge_embedding_loss', - 'kl_div', - 'l1_loss', - 'mse_loss', - 'margin_ranking_loss', - 'multilabel_margin_loss', - 'multilabel_soft_margin_loss', - 'multi_margin_loss', - 'nll_loss', - 'binary_cross_entropy_with_logits', - 'smooth_l1_loss', - 'soft_margin_loss', - 'triplet_margin_loss', - 'ctc_loss' -] - -BANNED_FUNCS = [ - ('binary_cross_entropy', - ("\namp does not work out-of-the-box with `F.binary_cross_entropy` or `torch.nn.BCELoss.` " - "It requires that the output of the previous function be already a FloatTensor. \n\n" - "Most models have a Sigmoid right before BCELoss. In that case, you can use\n" - " torch.nn.BCEWithLogitsLoss\nto combine Sigmoid+BCELoss into a single layer " - "that is compatible with amp.\nAnother option is to add\n" - " amp.register_float_function(torch, 'sigmoid')\nbefore calling `amp.init()`.\n" - "If you _really_ know what you are doing, you can disable this warning by passing " - "allow_banned=True to `amp.init()`.")) -] diff --git a/apex/amp/lists/tensor_overrides.py b/apex/amp/lists/tensor_overrides.py deleted file mode 100644 index 18f3e5dcf..000000000 --- a/apex/amp/lists/tensor_overrides.py +++ /dev/null @@ -1,63 +0,0 @@ -from .. import compat -from . import torch_overrides - -import importlib - -import torch - -# if compat.variable_is_tensor() and not compat.tensor_is_variable(): -MODULE = torch.Tensor -# else: -# MODULE = torch.autograd.Variable - - -FP16_FUNCS = compat.filter_attrs(MODULE, [ - '__matmul__', -]) - -FP32_FUNCS = compat.filter_attrs(MODULE, [ - '__ipow__', - '__pow__', - '__rpow__', - - # Cast to fp32 before transfer to CPU - 'cpu', -]) - -CASTS = compat.filter_attrs(MODULE, [ - '__add__', - '__div__', - '__eq__', - '__ge__', - '__gt__', - '__iadd__', - '__idiv__', - '__imul__', - '__isub__', - '__itruediv__', - '__le__', - '__lt__', - '__mul__', - '__ne__', - '__radd__', - '__rdiv__', - '__rmul__', - '__rsub__', - '__rtruediv__', - '__sub__', - '__truediv__', -]) - -# None of these, but here to make code cleaner. -SEQUENCE_CASTS = [] - -# We need to grab all the methods from torch_overrides and add them to -# the Tensor lists as well, as almost all methods are duplicated -# between `torch` and `torch.Tensor` (and check with `hasattr`, -# because a few random ones aren't defined on Tensor) -_self_mod = importlib.import_module(__name__) -for attrname in ['FP16_FUNCS', 'FP32_FUNCS', 'CASTS', 'SEQUENCE_CASTS']: - lst = getattr(_self_mod, attrname) - for fn in getattr(torch_overrides, attrname): - if hasattr(MODULE, fn): - lst.append(fn) diff --git a/apex/amp/lists/torch_overrides.py b/apex/amp/lists/torch_overrides.py deleted file mode 100644 index 7dedb05a8..000000000 --- a/apex/amp/lists/torch_overrides.py +++ /dev/null @@ -1,115 +0,0 @@ -import torch - -from .. import utils - -MODULE = torch - -FP16_FUNCS = [ - # Low level functions wrapped by torch.nn layers. - # The wrapper layers contain the weights which are then passed in as a parameter - # to these functions. 
- 'conv1d', - 'conv2d', - 'conv3d', - 'conv_transpose1d', - 'conv_transpose2d', - 'conv_transpose3d', - 'conv_tbc', - 'prelu', - - # BLAS - 'addmm', - 'addmv', - 'addr', - 'matmul', - 'mm', - 'mv', -] - -FP32_FUNCS = [ - # Pointwise - 'acos', - 'asin', - 'cosh', - 'erfinv', - 'exp', - 'expm1', - 'log', - 'log10', - 'log2', - 'reciprocal', - 'rsqrt', - 'sinh', - 'tan', - - # Other math - 'pow', - - # Reduction - 'cumprod', - 'cumsum', - 'dist', - # 'mean', - 'norm', - 'prod', - 'std', - 'sum', - 'var', - - # Misc - 'renorm' -] - -version_strings = torch.__version__.split('.') -version_major = version_strings[0] -version_minor = version_strings[1] -version_num = float(version_major + "." + version_minor) -# Before torch 1.1, mean must be blacklisted. -if version_num < 1.1: - FP32_FUNCS.append('mean') - -# Before CUDA 9.1, batched matmul was missing fast FP16 kernels. We -# check the CUDA version -- if at least 9.1, then put the bmm -# functions on the fp16 list. Otherwise, put them on the fp32 list. -_bmms = ['addbmm', - 'baddbmm', - 'bmm'] - -if utils.is_cuda_enabled(): - # workaround https://github.com/facebookresearch/maskrcnn-benchmark/issues/802 - if utils.get_cuda_version() >= (9, 1, 0): - FP16_FUNCS.extend(_bmms) - else: - FP32_FUNCS.extend(_bmms) - -# Multi-tensor fns that may need type promotion -CASTS = [ - # Multi-tensor math - 'addcdiv', - 'addcmul', - 'atan2', - 'cross', - 'bilinear', - 'dot', - - # Element-wise _or_ tensor-wise math - 'add', - 'div', - 'mul', - - # Comparison - 'eq', - 'equal', - 'ge', - 'gt', - 'le', - 'lt', - 'ne' -] - -# Functions that take sequence arguments. We need to inspect the whole -# sequence and cast to the widest type. -SEQUENCE_CASTS = [ - 'cat', - 'stack' -] diff --git a/apex/amp/opt.py b/apex/amp/opt.py deleted file mode 100644 index baf311684..000000000 --- a/apex/amp/opt.py +++ /dev/null @@ -1,103 +0,0 @@ -import contextlib -import warnings - -from .scaler import LossScaler, master_params -from ._amp_state import maybe_print - -import numpy as np - -class OptimWrapper(object): - def __init__(self, optimizer, amp_handle, num_loss): - self._optimizer = optimizer - self._amp_handle = amp_handle - self._num_loss = num_loss - self._loss_idx = 0 - self._skip_next = [False] * num_loss - self._loss_scaler = [LossScaler('dynamic') for _ in range(num_loss)] - - @contextlib.contextmanager - def scale_loss(self, loss): - if not self._amp_handle.is_active(): - yield loss - return - - # When there are multiple losses per-optimizer, we need - # to save out current grad accumulation, since we won't be - # able to unscale this particulare loss once the grads are - # all mixed together. 
- cached_grads = [] - if self._loss_idx > 0: - for p in master_params(self._optimizer): - if p.grad is not None: - cached_grads.append(p.grad.data.detach().clone()) - else: - cached_grads.append(None) - self._optimizer.zero_grad() - - loss_scale = self._cur_loss_scaler().loss_scale() - yield loss * loss_scale - - self._cur_loss_scaler().clear_overflow_state() - self._cur_loss_scaler().unscale( - master_params(self._optimizer), - master_params(self._optimizer), - loss_scale) - self._skip_next[self._loss_idx] = self._cur_loss_scaler().update_scale() - self._loss_idx += 1 - - if len(cached_grads) > 0: - for p, cached_grad in zip(master_params(self._optimizer), - cached_grads): - if cached_grad is not None: - p.grad.data.add_(cached_grad) - cached_grads = [] - - def _cur_loss_scaler(self): - assert 0 <= self._loss_idx < self._num_loss - return self._loss_scaler[self._loss_idx] - - def step(self, closure=None): - if not self._amp_handle.is_active(): - return self._optimizer.step(closure=closure) - - self._loss_idx = 0 - - for group in self._optimizer.param_groups: - for p in group['params']: - self._amp_handle.remove_cache(p) - - if closure is not None: - raise NotImplementedError( - 'The `closure` argument is unsupported by the amp ' + - 'optimizer wrapper.') - if any(self._skip_next): - maybe_print('Gradient overflow, skipping update') - self._skip_next = [False] * self._num_loss - else: - return self._optimizer.step(closure=closure) - - # Forward any attribute lookups - def __getattr__(self, attr): - return getattr(self._optimizer, attr) - - # Forward all torch.optim.Optimizer methods - def __getstate__(self): - return self._optimizer.__getstate__() - - def __setstate__(self): - return self._optimizer.__setstate__() - - def __repr__(self): - return self._optimizer.__repr__() - - def state_dict(self): - return self._optimizer.state_dict() - - def load_state_dict(self, state_dict): - return self._optimizer.load_state_dict(state_dict) - - def zero_grad(self): - return self._optimizer.zero_grad() - - def add_param_group(self, param_group): - return self._optimizer.add_param_group(param_group) diff --git a/apex/amp/rnn_compat.py b/apex/amp/rnn_compat.py deleted file mode 100644 index d062ae265..000000000 --- a/apex/amp/rnn_compat.py +++ /dev/null @@ -1,53 +0,0 @@ -from . import utils, wrap - -import torch -_VF = torch._C._VariableFunctions -RNN_NAMES = ['rnn_relu', 'rnn_tanh', 'gru', 'lstm'] - -def _gen_VF_wrapper(name): - def wrapper(*args, **kwargs): - return getattr(_VF, name)(*args, **kwargs) - return wrapper - -# Some python magic to generate an object that has the rnn cell functions -# defined on it, all of which call into corresponding _VF version. -# Intended to patch torch.nn.modules.rnn._VF (aka, the ref named "_VF" -# imported at module scope within torch.nn.modules.rnn). This should -# not affect third-party importers of _VF.py. 
-class VariableFunctionsShim(object): - def __init__(self): - for name in RNN_NAMES: - for suffix in ['', '_cell']: - fn_name = name + suffix - setattr(self, fn_name, _gen_VF_wrapper(fn_name)) - -def has_old_rnns(): - try: - torch.nn.backends.thnn.backend.LSTMCell - return True - except: - return False - -def whitelist_rnn_cells(handle, verbose): - # Different module + function names in old/new RNN cases - if has_old_rnns(): - fn_names = ['RNNReLUCell', 'RNNTanhCell', 'LSTMCell', 'GRUCell'] - mod = torch.nn.backends.thnn.backend - else: - fn_names = [x + '_cell' for x in RNN_NAMES] - mod = torch.nn.modules.rnn._VF - assert isinstance(mod, VariableFunctionsShim) - - # Insert casts on cell functions - for fn in fn_names: - wrap.cached_cast(mod, fn, utils.maybe_half, handle, - try_caching=True, verbose=verbose) - - if has_old_rnns(): - # Special handling of `backward` for fused gru / lstm: - # The `backward` method calls Tensor.sum() (blacklist) internally, - # and then the resulting grad_input has the wrong type. - # TODO: where else is this a problem? - for rnn_type in ['GRUFused', 'LSTMFused']: - mod = getattr(torch.nn._functions.thnn.rnnFusedPointwise, rnn_type) - wrap.disable_casts(mod, 'backward', handle) diff --git a/apex/amp/scaler.py b/apex/amp/scaler.py deleted file mode 100644 index 99888bc6f..000000000 --- a/apex/amp/scaler.py +++ /dev/null @@ -1,217 +0,0 @@ -import torch -from ..multi_tensor_apply import multi_tensor_applier -from ._amp_state import _amp_state, master_params, maybe_print -from itertools import product - -def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=False): - # Exception handling for 18.04 compatibility - if check_overflow: - cpu_sum = float(model_grad.float().sum()) - if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - - if master_grad is not model_grad: # copy_ probably internally short-circuits this - master_grad.copy_(model_grad) - if scale != 1.0: - master_grad.mul_(scale) - return False - -def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, a, b, check_overflow=False): - # Exception handling for 18.04 compatibility - if check_overflow: - cpu_sum = float(model_grad.float().sum()) - if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - - # if master_grad is not model_grad: # copy_ probably internally short-circuits this - # master_grad.copy_(model_grad) - assert stashed_grad.dtype == master_grad.dtype - converted_model_grad = model_grad.data.to(master_grad.dtype) - master_grad.data = a*converted_model_grad.data + b*stashed_grad.data - return False - -class LossScaler(object): - warned_no_fused_kernel = False - warned_unscaling_non_fp32_grad = False - has_fused_kernel = False - - def __init__(self, - loss_scale, - init_scale=2.**16, - scale_factor=2., - scale_window=2000, - min_loss_scale=None, - max_loss_scale=2.**24): - if loss_scale == "dynamic": - self.dynamic = True - self._loss_scale = min(max_loss_scale, init_scale) - else: - self.dynamic = False - self._loss_scale = loss_scale - self._max_loss_scale = max_loss_scale - self._min_loss_scale = min_loss_scale - self._scale_seq_len = scale_window - self._unskipped = 0 - self._has_overflow = False - self._overflow_buf = torch.cuda.IntTensor([0]) - if multi_tensor_applier.available: - import amp_C - LossScaler.has_fused_kernel = multi_tensor_applier.available - LossScaler.multi_tensor_scale_cuda = amp_C.multi_tensor_scale - LossScaler.multi_tensor_axpby_cuda = 
amp_C.multi_tensor_axpby - else: - if not LossScaler.warned_no_fused_kernel: - maybe_print( - "Warning: multi_tensor_applier fused unscale kernel is unavailable, " - "possibly because apex was installed without --cuda_ext --cpp_ext. " - "Using Python fallback. Original ImportError was: " + - repr(multi_tensor_applier.import_err), - True) - LossScaler.has_fused_kernel = False - LossScaler.warned_no_fused_kernel = True - - def loss_scale(self): - return self._loss_scale - - def unscale_python(self, model_grads, master_grads, scale): - for model, master in zip(model_grads, master_grads): - if model is not None: - if not LossScaler.warned_unscaling_non_fp32_grad: - if master.dtype != torch.float32: - maybe_print( - "Attempting to unscale a grad with type {} ".format(master.type()) + - "Unscaling non-fp32 grads may indicate an error. " - "When using Amp, you don't need to call .half() on your model.") - LossScaler.warned_unscaling_non_fp32_grad = True - self._has_overflow = scale_check_overflow_python(model, - master, - 1./scale, - self.dynamic) - if self._has_overflow and self.dynamic: - break - - # unused_scale keeps some of the old API alive for hopefully a short time. - def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False, scale_override=None): - if self._has_overflow: - return - - scale = self._loss_scale - if scale_override is not None: - scale = scale_override - - if scale == 1.0 and models_are_masters and not self.dynamic: - return - - if LossScaler.has_fused_kernel: - # if (not LossScaler.warned_unscaling_non_fp32_grad - # and master_grads[0].dtype == torch.float16): - # print("Warning: unscaling grads that are not FP32. " - # "Unscaling non-fp32 grads may indicate an error. " - # "When using Amp, you don't need to call .half() on your model.") - # # Setting this to True unconditionally allows the possibility of an escape - # # if never-before-seen non-fp32 grads are created in some later iteration. - # LossScaler.warned_unscaling_non_fp32_grad = True - multi_tensor_applier(LossScaler.multi_tensor_scale_cuda, - self._overflow_buf, - [model_grads, master_grads], - 1./scale) - else: - self.unscale_python(model_grads, master_grads, scale) - - # Defer to update_scale - # If the fused kernel is available, we only need one D2H memcopy and sync. - # if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow: - # self._has_overflow = self._overflow_buf.item() - - def unscale_with_stashed_python(self, - model_grads, - stashed_master_grads, - master_grads, - a, - b): - for model, stashed, master in zip(model_grads, stashed_master_grads, master_grads): - if model is None and stashed is None: - continue - else: - if not LossScaler.warned_unscaling_non_fp32_grad: - if master.dtype != torch.float32: - maybe_print( - "Attempting to unscale a grad with type {} ".format(master.type()) + - "Unscaling non-fp32 grads may indicate an error. 
" - "When using Amp, you don't need to call .half() on your model.") - LossScaler.warned_unscaling_non_fp32_grad = True - self._has_overflow = axpby_check_overflow_python(model, - stashed, - master, - a, - b, - self.dynamic) - if self._has_overflow and self.dynamic: - break - - def unscale_with_stashed(self, - model_grads, - stashed_master_grads, - master_grads, - scale_override=None): - if self._has_overflow: - return - - grads_have_scale, stashed_have_scale, out_scale = self._loss_scale, 1.0, 1.0 - if scale_override is not None: - grads_have_scale, stashed_have_scale, out_scale = scale_override - - if LossScaler.has_fused_kernel: - if (not LossScaler.warned_unscaling_non_fp32_grad - and master_grads[0].dtype == torch.float16): - print("Warning: unscaling grads that are not FP32. " - "Unscaling non-fp32 grads may indicate an error. " - "When using Amp, you don't need to call .half() on your model.") - # Setting this to True unconditionally allows the possibility of an escape - # if never-before-seen non-fp32 grads are created in some later iteration. - LossScaler.warned_unscaling_non_fp32_grad = True - multi_tensor_applier(LossScaler.multi_tensor_axpby_cuda, - self._overflow_buf, - [model_grads, stashed_master_grads, master_grads], - out_scale/grads_have_scale, # 1./scale, - out_scale/stashed_have_scale, # 1.0, - 0) # check only arg 0, aka the incoming model grads, for infs - else: - self.unscale_with_stashed_python(model_grads, - stashed_master_grads, - master_grads, - out_scale/grads_have_scale, - out_scale/stashed_have_scale) - - # Defer to update_scale - # If the fused kernel is available, we only need one D2H memcopy and sync. - # if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow: - # self._has_overflow = self._overflow_buf.item() - - def clear_overflow_state(self): - self._has_overflow = False - if self.has_fused_kernel: - self._overflow_buf.zero_() - - # Separate so unscale() can be called more that once before updating. - def update_scale(self): - # If the fused kernel is available, we only need one D2H memcopy and sync. - if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow: - self._has_overflow = self._overflow_buf.item() - - if self._has_overflow and self.dynamic: - should_skip = True - if(self._min_loss_scale): - self._loss_scale = max(self._min_loss_scale, self._loss_scale/2.) - else: - self._loss_scale = self._loss_scale/2. - self._unskipped = 0 - else: - should_skip = False - self._unskipped += 1 - - if self._unskipped == self._scale_seq_len and self.dynamic: - self._loss_scale = min(self._max_loss_scale, self._loss_scale*2.) - self._unskipped = 0 - - return should_skip diff --git a/apex/amp/utils.py b/apex/amp/utils.py deleted file mode 100644 index 0590cd70a..000000000 --- a/apex/amp/utils.py +++ /dev/null @@ -1,210 +0,0 @@ -from . 
import compat - -import functools -import itertools - -import torch - -def is_cuda_enabled(): - return torch.version.cuda is not None - -def get_cuda_version(): - return tuple(int(x) for x in torch.version.cuda.split('.')) - -def is_fp_tensor(x): - if is_nested(x): - # Fast-fail version of all(is_fp_tensor) - for y in x: - if not is_fp_tensor(y): - return False - return True - return compat.is_tensor_like(x) and compat.is_floating_point(x) - -def is_nested(x): - return isinstance(x, tuple) or isinstance(x, list) - -def should_cache(x): - if is_nested(x): - # Fast-fail version of all(should_cache) - for y in x: - if not should_cache(y): - return False - return True - return isinstance(x, torch.nn.parameter.Parameter) and \ - type_string(x) == 'FloatTensor' - -def collect_fp_tensor_types(args, kwargs): - def collect_types(x, types): - if is_nested(x): - for y in x: - collect_types(y, types) - else: - types.add(type_string(x)) - - all_args = itertools.chain(args, kwargs.values()) - types = set() - for x in all_args: - if is_fp_tensor(x): - collect_types(x, types) - return types - -def type_string(x): - return x.type().split('.')[-1] - -def maybe_half(x, name='', verbose=False): - if is_nested(x): - return type(x)([maybe_half(y) for y in x]) - - if not x.is_cuda or type_string(x) == 'HalfTensor': - return x - else: - if verbose: - print('Float->Half ({})'.format(name)) - return x.half() - -def maybe_float(x, name='', verbose=False): - if is_nested(x): - return type(x)([maybe_float(y) for y in x]) - - if not x.is_cuda or type_string(x) == 'FloatTensor': - return x - else: - if verbose: - print('Half->Float ({})'.format(name)) - return x.float() - -# NB: returneds casted `args`, mutates `kwargs` in-place -def casted_args(cast_fn, args, kwargs): - new_args = [] - for x in args: - if is_fp_tensor(x): - new_args.append(cast_fn(x)) - else: - new_args.append(x) - for k in kwargs: - val = kwargs[k] - if is_fp_tensor(val): - kwargs[k] = cast_fn(val) - return new_args - -def cached_cast(cast_fn, x, cache): - if is_nested(x): - return type(x)([cached_cast(y) for y in x]) - if x in cache: - cached_x = cache[x] - if x.requires_grad and cached_x.requires_grad: - # Make sure x is actually cached_x's autograd parent. - if cached_x.grad_fn.next_functions[1][0].variable is not x: - raise RuntimeError("x and cache[x] both require grad, but x is not " - "cache[x]'s parent. This is likely an error.") - # During eval, it's possible to end up caching casted weights with - # requires_grad=False. On the next training iter, if cached_x is found - # and reused from the cache, it will not actually have x as its parent. - # Therefore, we choose to invalidate the cache (and force refreshing the cast) - # if x.requires_grad and cached_x.requires_grad do not match. - # - # During eval (i.e. running under with torch.no_grad()) the invalidation - # check would cause the cached value to be dropped every time, because - # cached_x would always be created with requires_grad=False, while x would - # still have requires_grad=True. This would render the cache effectively - # useless during eval. Therefore, if we are running under the no_grad() - # context manager (torch.is_grad_enabled=False) we elide the invalidation - # check, and use the cached value even though its requires_grad flag doesn't - # match. During eval, we don't care that there's no autograd-graph - # connection between x and cached_x. 
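# (Added note, not in the original source.) The invalidation check just below
# implements this policy for cache hits:
#   grad enabled  and requires_grad flags match  -> reuse cached_x
#   grad enabled  and requires_grad flags differ -> drop cache[x] and re-cast
#   grad disabled (torch.no_grad() / eval)       -> reuse cached_x regardless of flags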
- if torch.is_grad_enabled() and x.requires_grad != cached_x.requires_grad: - del cache[x] - else: - return cached_x - - casted_x = cast_fn(x) - cache[x] = casted_x - return casted_x - -def verbosify(cast_fn, fn_name, verbose): - if verbose: - return functools.partial(cast_fn, name=fn_name, verbose=verbose) - else: - return cast_fn - -def as_inplace(fns): - for x in fns: - yield x + '_' - -def has_func(mod, fn): - if isinstance(mod, dict): - return fn in mod - else: - return hasattr(mod, fn) - -def get_func(mod, fn): - if isinstance(mod, dict): - return mod[fn] - else: - return getattr(mod, fn) - -def set_func(mod, fn, new_fn): - if isinstance(mod, dict): - mod[fn] = new_fn - else: - setattr(mod, fn, new_fn) - -def set_func_save(handle, mod, fn, new_fn): - cur_fn = get_func(mod, fn) - handle._save_func(mod, fn, cur_fn) - set_func(mod, fn, new_fn) - -# A couple problems get solved here: -# - The flat_weight buffer is disconnected from autograd graph, -# so the fp16 weights need to be derived from the input weights -# to this forward call, not the flat buffer. -# - The ordering of weights in the flat buffer is...idiosyncratic. -# First problem is solved with combination of set_ (to set up -# correct storage) and copy_ (so the fp16 weight derives from the -# fp32 one in autograd. -# Second is solved by doing ptr arithmetic on the fp32 weights -# to derive the correct offset. -# -# TODO: maybe this should actually use -# `torch._cudnn_rnn_flatten_weight`? But then I need to call -# on first iter and cache the right offsets. Ugh. -def synthesize_flattened_rnn_weights(fp32_weights, - fp16_flat_tensor, - rnn_fn='', - verbose=False): - fp16_weights = [] - fp32_base_ptr = fp32_weights[0][0].data_ptr() - for layer_weights in fp32_weights: - fp16_layer_weights = [] - for w_fp32 in layer_weights: - w_fp16 = w_fp32.new().half() - offset = (w_fp32.data_ptr() - fp32_base_ptr) // w_fp32.element_size() - w_fp16.set_(fp16_flat_tensor.storage(), - offset, - w_fp32.shape) - w_fp16.copy_(w_fp32) - if verbose: - print('Float->Half ({})'.format(rnn_fn)) - fp16_layer_weights.append(w_fp16) - fp16_weights.append(fp16_layer_weights) - return fp16_weights - -# Roughly same as above, just the `fp32_weights` aren't nested. -# Code kept separate for readability. -def new_synthesize_flattened_rnn_weights(fp32_weights, - fp16_flat_tensor, - rnn_fn='', - verbose=False): - fp16_weights = [] - fp32_base_ptr = fp32_weights[0].data_ptr() - for w_fp32 in fp32_weights: - w_fp16 = w_fp32.new().half() - offset = (w_fp32.data_ptr() - fp32_base_ptr) // w_fp32.element_size() - w_fp16.set_(fp16_flat_tensor.storage(), - offset, - w_fp32.shape) - w_fp16.copy_(w_fp32) - if verbose: - print('Float->Half ({})'.format(rnn_fn)) - fp16_weights.append(w_fp16) - return fp16_weights diff --git a/apex/amp/wrap.py b/apex/amp/wrap.py deleted file mode 100644 index 559d0558d..000000000 --- a/apex/amp/wrap.py +++ /dev/null @@ -1,276 +0,0 @@ -from . import compat -from . import utils -from ._amp_state import _amp_state -from . 
import rnn_compat - -import functools - -import torch - -def make_cast_wrapper(orig_fn, cast_fn, handle, - try_caching=False): - @functools.wraps(orig_fn) - def wrapper(*args, **kwargs): - if not handle.is_active(): - return orig_fn(*args, **kwargs) - - if try_caching and handle.has_cache: - args = list(args) - for i in range(len(args)): - if utils.should_cache(args[i]): - args[i] = utils.cached_cast(cast_fn, args[i], handle.cache) - for k in kwargs: - if utils.should_cache(kwargs[k]): - kwargs[k] = utils.cached_cast(cast_fn, kwargs[k], handle.cache) - new_args = utils.casted_args(cast_fn, - args, - kwargs) - return orig_fn(*new_args, **kwargs) - return wrapper - -def cached_cast(mod, fn, cast_fn, handle, - try_caching=False, verbose=False): - if not utils.has_func(mod, fn): - return - - orig_fn = utils.get_func(mod, fn) - cast_fn = utils.verbosify(cast_fn, fn, verbose) - wrapper = make_cast_wrapper(orig_fn, cast_fn, handle, try_caching) - utils.set_func_save(handle, mod, fn, wrapper) - -# `handle` arg is unused, but simplifies API to make `make_cast_wrapper` -# Annoyingly, make_promote_wrapper still uses the global handle. Once everyone -# is on the new API and I am free to get rid of handle, I can clean this up. -def make_promote_wrapper(orig_fn, cast_fn, handle=None): - @functools.wraps(orig_fn) - def wrapper(*args, **kwargs): - if not _amp_state.handle.is_active(): - return orig_fn(*args, **kwargs) - - types = utils.collect_fp_tensor_types(args, kwargs) - - if len(types) <= 1: - return orig_fn(*args, **kwargs) - elif len(types) == 2 and types == set(['HalfTensor', 'FloatTensor']): - new_args = utils.casted_args(cast_fn, - args, - kwargs) - return orig_fn(*new_args, **kwargs) - else: - raise NotImplementedError('Do not know how to handle ' + - 'these types to promote: {}' - .format(types)) - return wrapper - -def promote(mod, fn, handle, verbose=False): - orig_fn = utils.get_func(mod, fn) - maybe_float = utils.verbosify(utils.maybe_float, fn, verbose) - wrapper = make_promote_wrapper(orig_fn, maybe_float) - utils.set_func_save(handle, mod, fn, wrapper) - -def sequence_promote(mod, fn, handle, verbose=False): - orig_fn = utils.get_func(mod, fn) - maybe_float = utils.verbosify(utils.maybe_float, fn, verbose) - @functools.wraps(orig_fn) - def wrapper(seq, *args, **kwargs): - if not _amp_state.handle.is_active(): - return orig_fn(seq, *args, **kwargs) - - types = set([utils.type_string(x) for x in seq]) - if len(types) <= 1: - return orig_fn(seq, *args, **kwargs) - elif types == set(['HalfTensor', 'FloatTensor']): - cast_seq = utils.casted_args(maybe_float, - seq, {}) - return orig_fn(cast_seq, *args, **kwargs) - else: - # TODO: other mixed-type cases aren't due to amp. - # Just pass through? 
- return orig_fn(seq, *args, **kwargs) - utils.set_func_save(handle, mod, fn, wrapper) - -def promote_match_arg0(mod, fn, handle, verbose=False): - if not utils.has_func(mod, fn): - return - - orig_fn = utils.get_func(mod, fn) - @functools.wraps(orig_fn) - def wrapper(arg0, *args, **kwargs): - assert compat.is_tensor_like(arg0) - if not _amp_state.handle.is_active(): - return orig_fn(arg0, *args, **kwargs) - - if utils.type_string(arg0) == 'HalfTensor': - cast_fn = utils.maybe_half - elif utils.type_string(arg0) == 'FloatTensor': - cast_fn = utils.maybe_float - else: - return orig_fn(arg0, *args, **kwargs) - cast_fn = utils.verbosify(cast_fn, fn, verbose) - new_args = utils.casted_args(cast_fn, args, kwargs) - return orig_fn(arg0, *new_args, **kwargs) - utils.set_func_save(handle, mod, fn, wrapper) - -def err_if_any_half(mod, fn, handle, custom_err_msg=None): - if not utils.has_func(mod, fn): - return - - orig_fn = utils.get_func(mod, fn) - @functools.wraps(orig_fn) - def wrapper(*args, **kwargs): - types = utils.collect_fp_tensor_types(args, kwargs) - if 'HalfTensor' in types: - if custom_err_msg: - raise NotImplementedError(custom_err_msg) - else: - raise NotImplementedError('Cannot call in-place function ' + - '{} with fp16 arguments.'.format(fn)) - else: - return orig_fn(*args, **kwargs) - utils.set_func_save(handle, mod, fn, wrapper) - -def err_if_arg0_half(mod, fn, handle, verbose=False): - if not utils.has_func(mod, fn): - return - - orig_fn = utils.get_func(mod, fn) - @functools.wraps(orig_fn) - def wrapper(arg0, *args, **kwargs): - assert compat.is_tensor_like(arg0) - if utils.type_string(arg0) == 'HalfTensor': - raise NotImplementedError('Cannot call in-place method ' + - '{} on fp16 Tensors.'.format(fn)) - else: - cast_fn = utils.verbosify(utils.maybe_float, fn, verbose) - new_args = utils.casted_args(cast_fn, args, kwargs) - return orig_fn(arg0, *new_args, **kwargs) - utils.set_func_save(handle, mod, fn, wrapper) - -# Current RNN approach: -# - Wrap top-level `RNN` function in thnn backend -# - Will call into either CudnnRNN or AutogradRNN -# - Each of these are factory functions that return a per-iter -# `forward` function -# - We interpose on the factory function to: -# 1) Interpose on the actual forward function and put in casts -# 2) Insert an fp16 `flat_weight` if necessary -def rnn_cast(backend, fn, handle, verbose=False): - orig_rnn = utils.get_func(backend, fn) - @functools.wraps(orig_rnn) - def rnn_wrapper(*args, **kwargs): - flat_weight = kwargs.get('flat_weight') - if flat_weight is not None: - # We replace `flat_weight` with an uninitialized fp16 - # Tensor. The "actual" weight tensors (provided in `forward`), - # will then be set up as ptrs into the buffer and have the - # corresponding fp32 values copied in. - # We need to call `copy` on the "actual" weights so that the - # autograd graph correctly backprops from the wgrads computed - # inside cuDNN (on fp16 weights) into the fp32 weights. - assert utils.type_string(flat_weight) == 'FloatTensor' - if compat.tensor_is_float_tensor() or compat.tensor_is_variable(): - # Pre-0.4. A little slower, since it zeros out memory. 
- flat_weight_fp16 = flat_weight.new().half().resize_(flat_weight.shape) - else: - flat_weight_fp16 = torch.empty_like(flat_weight, - dtype=torch.float16) - kwargs['flat_weight'] = flat_weight_fp16 - else: - flat_weight_fp16 = None - - forward = orig_rnn(*args, **kwargs) - @functools.wraps(forward) - def fwd_wrapper(*fargs, **fkwargs): - assert len(fargs) == 3 or len(fargs) == 4 - inputs, weights, hiddens = fargs[:3] - assert utils.is_fp_tensor(inputs) - assert isinstance(weights, list) - cast_fn = utils.verbosify(utils.maybe_half, - fn, - verbose) - new_args = [] - - # 0) Inputs - new_args.append(cast_fn(inputs)) - - # 1) Weights - if flat_weight_fp16 is not None: - fp16_weights = utils.synthesize_flattened_rnn_weights( - weights, flat_weight_fp16, fn, verbose) - else: - fp16_weights = [[cast_fn(w) for w in layer] - for layer in weights] - new_args.append(fp16_weights) - - # 2) Inputs: either a tuple (for LSTM) or single tensor - if isinstance(hiddens, tuple): - new_args.append(tuple(cast_fn(x) for x in hiddens)) - elif utils.is_fp_tensor(hiddens): - new_args.append(cast_fn(hiddens)) - else: - # Hiddens can, in principle, be `None` -- pass through - new_args.append(hiddens) - - # 3) Batch sizes (0.4 or later only) - if len(fargs) == 4: - new_args.append(fargs[3]) - - return forward(*new_args, **fkwargs) - return fwd_wrapper - utils.set_func_save(handle, backend, fn, rnn_wrapper) - -def new_rnn_cast(fn, handle, verbose=False): - # Forward+backward compatibility around https://github.com/pytorch/pytorch/pull/15744 - # For rnn backend calls that route through _rnn_impls, we must patch the ref - # that _rnn_impls stashed. For rnn backend calls that directly invoke - # _VF., e.g. _VF.lstm, we can patch onto VariableFunctionsShim, - # which in turn has patched the ref named "_VF" in torch.nn.modules.rnn. 
- if utils.has_func(torch.nn.modules.rnn._rnn_impls, fn): - mod = torch.nn.modules.rnn._rnn_impls - else: - mod = torch.nn.modules.rnn._VF - assert isinstance(mod, rnn_compat.VariableFunctionsShim) - fn = fn.lower() - orig_fn = utils.get_func(mod, fn) - cast_fn = utils.verbosify(utils.maybe_half, fn, verbose) - @functools.wraps(orig_fn) - def wrapper(*args, **kwargs): - # Exact call signature from modules/rnn.py - assert len(args) == 9 - assert len(kwargs) == 0 - - if not _amp_state.handle.is_active(): - return orig_fn(*args, **kwargs) - - if isinstance(args[6], bool): - params_idx = 2 # Not PackedSequence case - else: - params_idx = 3 # PackedSequence case - - new_args = [] - for i, arg in enumerate(args): - if i == params_idx: - num_params = sum([x.numel() for x in arg]) - fp16_weight_buf = args[0].new_empty((num_params,), - dtype=torch.half) - casted_weights = utils.new_synthesize_flattened_rnn_weights( - arg, fp16_weight_buf, fn, verbose) - new_args.append(casted_weights) - elif utils.is_fp_tensor(arg): - new_args.append(cast_fn(arg)) - else: - new_args.append(arg) - - return orig_fn(*new_args) - utils.set_func_save(handle, mod, fn, wrapper) - -def disable_casts(mod, fn, handle): - if not utils.has_func(mod, fn): - return - - orig_fn = utils.get_func(mod, fn) - @functools.wraps(orig_fn) - def wrapper(*args, **kwargs): - with handle._disable_casts(): - return orig_fn(*args, **kwargs) - utils.set_func_save(handle, mod, fn, wrapper) diff --git a/apex/fp16_utils/README.md b/apex/fp16_utils/README.md deleted file mode 100644 index 941de1794..000000000 --- a/apex/fp16_utils/README.md +++ /dev/null @@ -1,16 +0,0 @@ -fp16_optimizer.py contains `FP16_Optimizer`, a Python class designed to wrap an existing Pytorch optimizer and automatically enable master parameters and loss scaling in a manner transparent to the user. To use `FP16_Optimizer`, only two lines of one's Python model need to change. - -#### [FP16_Optimizer API documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling) - -#### [Simple examples with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple) - -#### [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) - -#### [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) - - -fp16_util.py contains a number of utilities to manually manage master parameters and loss scaling, if the user chooses. - -#### [Manual management documentation](https://nvidia.github.io/apex/fp16_utils.html#manual-master-parameter-management) - -The [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) and [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) directories also contain `main.py` files that demonstrate manual management of master parameters and static loss scaling. These examples illustrate what sort of operations `FP16_Optimizer` is performing automatically. 
diff --git a/apex/fp16_utils/__init__.py b/apex/fp16_utils/__init__.py deleted file mode 100644 index c7bb1f537..000000000 --- a/apex/fp16_utils/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from .fp16util import ( - BN_convert_float, - network_to_half, - prep_param_lists, - model_grads_to_master_grads, - master_params_to_model_params, - tofp16, - to_python_float, - clip_grad_norm, - convert_module, - convert_network, - FP16Model, -) - -from .fp16_optimizer import FP16_Optimizer -from .loss_scaler import LossScaler, DynamicLossScaler diff --git a/apex/fp16_utils/fp16_optimizer.py b/apex/fp16_utils/fp16_optimizer.py deleted file mode 100755 index 15873c972..000000000 --- a/apex/fp16_utils/fp16_optimizer.py +++ /dev/null @@ -1,557 +0,0 @@ -import torch -from torch import nn -from torch.autograd import Variable -from torch.nn.parameter import Parameter -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors - -from ..amp._amp_state import _amp_state, maybe_print -from ..amp.scaler import LossScaler -from ..multi_tensor_apply import multi_tensor_applier -from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm - -# TODO: Update overflow check + downscale to use Carl's fused kernel. -class FP16_Optimizer(object): - def __init__(self, - init_optimizer, - static_loss_scale=1.0, - dynamic_loss_scale=False, - dynamic_loss_args=None, - verbose=True): - print("Warning: FP16_Optimizer is deprecated and dangerous, and will be deleted soon. " - "If it still works, you're probably getting lucky. " - "For mixed precision, use the documented API https://nvidia.github.io/apex/amp.html, with opt_level=O1.") - - from apex import deprecated_warning - deprecated_warning("apex.fp16_utils is deprecated and will be removed by the end of February 2023. Use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)") - - if not torch.cuda.is_available: - raise SystemError("Cannot use fp16 without CUDA.") - - self.verbose = verbose - - self.optimizer = init_optimizer - # init_state_dict sets up an alternative way to cast per-param state tensors. - # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary. - # init_state_dict = init_optimizer.state_dict() - - self.fp16_groups = [] - self.fp32_from_fp16_groups = [] - self.fp32_from_fp32_groups = [] - for i, param_group in enumerate(self.optimizer.param_groups): - self.maybe_print("FP16_Optimizer processing param group {}:".format(i)) - fp16_params_this_group = [] - fp32_params_this_group = [] - fp32_from_fp16_params_this_group = [] - for i, param in enumerate(param_group['params']): - if param.requires_grad: - if param.type() == 'torch.cuda.HalfTensor': - self.maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}" - .format(param.size())) - fp16_params_this_group.append(param) - master_param = param.detach().clone().float() - master_param.requires_grad = True - param_group['params'][i] = master_param - fp32_from_fp16_params_this_group.append(master_param) - # Reset existing state dict key to the new master param. - # We still need to recast per-param state tensors, if any, to FP32. 
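# (Added clarification, not in the original source.) "Per-param state" means
# optimizer buffers such as Adam's exp_avg / exp_avg_sq keyed on the old fp16
# param; re-keying the dict entry here does not change their dtype, so they are
# recast later via self.optimizer.load_state_dict(self.optimizer.state_dict()).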
- if param in self.optimizer.state: - self.optimizer.state[master_param] = self.optimizer.state.pop(param) - elif param.type() == 'torch.cuda.FloatTensor': - self.maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}" - .format(param.size())) - fp32_params_this_group.append(param) - param_group['params'][i] = param - else: - raise TypeError("Wrapped parameters must be either " - "torch.cuda.FloatTensor or torch.cuda.HalfTensor. " - "Received {}".format(param.type())) - - self.fp16_groups.append(fp16_params_this_group) - self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) - self.fp32_from_fp32_groups.append(fp32_params_this_group) - - self.all_fp16_params = [] - for group in self.fp16_groups: - self.all_fp16_params += group - - self.all_fp32_from_fp16_params = [] - for group in self.fp32_from_fp16_groups: - self.all_fp32_from_fp16_params += group - - self.all_fp32_from_fp32_params = [] - for group in self.fp32_from_fp32_groups: - self.all_fp32_from_fp32_params += group - - # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors - self.optimizer.load_state_dict(self.optimizer.state_dict()) - # alternative way to cast per-param state tensors: - # self.optimizer.load_state_dict(init_state_dict) - - if dynamic_loss_scale: - self.dynamic_loss_scale = True - if dynamic_loss_args is not None: - self.loss_scaler = LossScaler("dynamic", **dynamic_loss_args) - else: - self.loss_scaler = LossScaler("dynamic") - else: - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(static_loss_scale) - - self.overflow = False - self.first_closure_call_this_step = True - - self.clip_grad_norm = clip_grad_norm - - # TODO: Centralize exposure and import error checking for the C backend. - if multi_tensor_applier.available: - import amp_C - self.multi_tensor_scale = amp_C.multi_tensor_scale - self._dummy_overflow_buf = torch.cuda.IntTensor([0]); - - # Having self.maybe_print distinct from _amp_state.maybe_print is another artifact - # of having to support FP16_Optimizer separately, for the time being. - def maybe_print(self, msg): - if self.verbose: - print(msg) - - def __getstate__(self): - raise RuntimeError("FP16_Optimizer should be serialized using state_dict().") - - def __setstate__(self, state): - raise RuntimeError("FP16_Optimizer should be deserialized using load_state_dict().") - - def zero_grad(self, set_grads_to_None=False): - """ - Zero fp32 and fp16 parameter grads. - """ - # In principle, only the .grad attributes of the model params need to be zeroed, - # because gradients are copied into the FP32 master params. However, we zero - # all gradients owned by the optimizer, just to be safe: - for group in self.optimizer.param_groups: - for p in group['params']: - if set_grads_to_None: - p.grad = None - else: - if p.grad is not None: - p.grad.detach_() - p.grad.zero_() - - # Zero fp16 gradients owned by the model: - for fp16_group in self.fp16_groups: - for param in fp16_group: - if set_grads_to_None: - param.grad = None - else: - if param.grad is not None: - param.grad.detach_() # as in torch.optim.optimizer.zero_grad() - param.grad.zero_() - - # Should not be used anymore. 
- # def _check_overflow(self): - # params = [] - # for group in self.fp16_groups: - # for param in group: - # params.append(param) - # for group in self.fp32_from_fp32_groups: - # for param in group: - # params.append(param) - # self.overflow = self.loss_scaler.has_overflow(params) - - # def _update_scale(self, has_overflow=False): - # self.loss_scaler.update_scale(has_overflow) - - def _master_params_to_model_params(self): - if multi_tensor_applier.available: - if len(self.all_fp16_params) > 0: - multi_tensor_applier( - self.multi_tensor_scale, - self._dummy_overflow_buf, - [self.all_fp32_from_fp16_params, self.all_fp16_params], - 1.0) - else: - for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): - master_params_to_model_params(fp16_group, fp32_from_fp16_group) - - # To consider: Integrate distributed with this wrapper by registering a hook on each variable - # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream. - # def _model_grads_to_master_grads(self): - # for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): - # model_grads_to_master_grads(fp16_group, fp32_from_fp16_group) - - # def _downscale_master(self): - # if self.loss_scale != 1.0: - # for group in self.optimizer.param_groups: - # for param in group['params']: - # if param.grad is not None: - # param.grad.data.mul_(1./self.loss_scale) - - def clip_master_grads(self, max_norm, norm_type=2): - """ - Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``. - - Args: - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - - Returns: - Total norm of the current fp32 gradients (viewed as a single vector). - - .. warning:: - Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``). - """ - if not self.overflow: - fp32_params = [] - for param_group in self.optimizer.param_groups: - for param in param_group['params']: - fp32_params.append(param) - return self.clip_grad_norm(fp32_params, max_norm, norm_type) - else: - return -1 - - def state_dict(self): - """ - Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. - This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict - of the contained Pytorch optimizer. - Example:: - - checkpoint = {} - checkpoint['model'] = model.state_dict() - checkpoint['optimizer'] = optimizer.state_dict() - torch.save(checkpoint, "saved.pth") - """ - state_dict = {} - state_dict['loss_scaler'] = self.loss_scaler - state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale - state_dict['overflow'] = self.overflow - state_dict['first_closure_call_this_step'] = self.first_closure_call_this_step - state_dict['optimizer_state_dict'] = self.optimizer.state_dict() - state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups - return state_dict - - def load_state_dict(self, state_dict): - """ - Loads a state_dict created by an earlier call to state_dict(). - If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, - whose parameters in turn came from ``model``, it is expected that the user - will call ``model.load_state_dict()`` before - ``fp16_optimizer_instance.load_state_dict()`` is called. 
- - Example:: - - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... - checkpoint = torch.load("saved.pth") - model.load_state_dict(checkpoint['model']) - optimizer.load_state_dict(checkpoint['optimizer']) - """ - # I think it should actually be ok to reload the optimizer before the model. - self.loss_scaler = state_dict['loss_scaler'] - self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] - self.overflow = state_dict['overflow'] - self.first_closure_call_this_step = state_dict['first_closure_call_this_step'] - self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) - # At this point, the optimizer's references to the model's fp32 parameters are up to date. - # The optimizer's hyperparameters and internal buffers are also up to date. - # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still - # out of date. There are two options. - # 1: Refresh the master params from the model's fp16 params. - # This requires less storage but incurs precision loss. - # 2: Save and restore the fp32 master copies separately. - # We choose option 2. - # - # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device - # of their associated parameters, because it's possible those buffers might not exist yet in - # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been - # constructed in the same way as the one whose state_dict we are loading, the same master params - # are guaranteed to exist, so we can just copy_() from the saved master params. - for current_group, saved_group in zip(self.fp32_from_fp16_groups, state_dict['fp32_from_fp16']): - for current, saved in zip(current_group, saved_group): - current.data.copy_(saved.data) - - def step(self, closure=None): # could add clip option. - """ - If no closure is supplied, :attr:`step` should be called after - ``fp16_optimizer_obj.backward(loss)``. - :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to - :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params - originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run - another forward pass using their model. - - If a closure is supplied, :attr:`step` may be called without a prior call to - :attr:`backward(loss)`. - This control flow is identical to `ordinary Pytorch optimizer use`_ with closures. - However, the user should take care that any ``loss.backward()`` call within the closure - has been replaced by ``fp16_optimizer_obj.backward(loss)``. - - Args: - closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss. - - Example with closure:: - - # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an - # existing pytorch optimizer. - for input, target in dataset: - def closure(): - optimizer.zero_grad() - output = model(input) - loss = loss_fn(output, target) - # loss.backward() becomes: - optimizer.backward(loss) - return loss - optimizer.step(closure) - - .. warning:: - Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling. - - .. 
_`ordinary Pytorch optimizer use`: - http://pytorch.org/docs/master/optim.html#optimizer-step-closure - """ - - scale = self.loss_scaler.loss_scale() - # To consider: Should this be in step(), or update_master_grads? It works either way, - # but I should make it consistent with the Amp control flow, which updates the scale - # during backward context manager exit. - # self._update_scale(self.overflow) - - if self.overflow: - # Using _amp_state.maybe_print instead of self.print here is intentional. - maybe_print("Gradient overflow. Skipping step, reducing " + - "loss scale to {}".format(self.loss_scaler.loss_scale())) - return - - if closure is not None: - retval = self._step_with_closure(closure) - else: - # torch.cuda.nvtx.range_push("pytorch optimizer step") - retval = self.optimizer.step() - # torch.cuda.nvtx.range_pop() - - self._master_params_to_model_params() - - return retval - - def _step_with_closure(self, closure): - def wrapped_closure(): - # helpful for debugging - # print("Calling wrapped_closure, first_closure_call_this_step = {}" - # .format(self.first_closure_call_this_step)) - if self.first_closure_call_this_step: - # We expect that the fp16 params are initially fresh on entering self.step(), - # so _master_params_to_model_params() is unnecessary the first time wrapped_closure() - # is called within self.optimizer.step(). - self.first_closure_call_this_step = False - else: - # If self.optimizer.step() internally calls wrapped_closure more than once, - # it may update the fp32 params after each call. However, self.optimizer - # doesn't know about the fp16 params at all. If the fp32 params get updated, - # we can't rely on self.optimizer to refresh the fp16 params. We need - # to handle that manually: - self._master_params_to_model_params() - # Our API expects the user to give us ownership of the backward() call by - # replacing all calls to loss.backward() with optimizer.backward(loss). - # This requirement holds whether or not the call to backward() is made within a closure. - # If the user is properly calling optimizer.backward(loss) within "closure," - # calling closure() here will give the fp32 master params fresh gradients - # for the optimizer to play with, so all wrapped_closure needs to do is call - # closure() and return the loss. - temp_loss = closure() - while(self.overflow): - scale = self.loss_scaler.loss_scale() - # self._update_scale(self.overflow) # now done at the end of backward - print("OVERFLOW within closure! Skipping step, reducing loss scale to {}".format( - self.loss_scaler.loss_scale())) - temp_loss = closure() - return temp_loss - - retval = self.optimizer.step(wrapped_closure) - - self.first_closure_call_this_step = True - - return retval - - def backward(self, loss, update_master_grads=True, retain_graph=False): - """ - :attr:`backward` performs the following conceptual steps: - - 1. fp32_loss = loss.float() (see first Note below) - 2. scaled_loss = fp32_loss*loss_scale - 3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined). - 4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32. - 5. Finally, master grads are divided by loss_scale. - - In this way, after :attr:`backward`, the master params have fresh gradients, - and :attr:`step` may be called. - - .. 
note:: - :attr:`backward` internally converts the loss to fp32 before applying the loss scale. - This provides some additional safety against overflow if the user has supplied an - fp16 loss value. - However, for maximum overflow safety, the user should - compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to - :attr:`backward`. - - .. warning:: - The gradients found in a model's leaves after the call to - :attr:`backward` should not be regarded as valid in general, - because it's possible - they have been scaled (and in the case of dynamic loss scaling, - the scale factor may change over time). - If the user wants to inspect gradients after a call to :attr:`backward`, - only the master gradients should be regarded as valid. These can be retrieved via - :attr:`inspect_master_grad_data()`. - - Args: - loss: The loss output by the user's model. loss may be either float or half (but see first Note above). - update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`. - retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below). - - Example:: - - # Ordinary operation: - optimizer.backward(loss) - - # Naive operation with multiple losses (technically valid, but less efficient): - # fp32 grads will be correct after the second call, but - # the first call incurs an unnecessary fp16->fp32 grad copy. - optimizer.backward(loss1) - optimizer.backward(loss2) - - # More efficient way to handle multiple losses: - # The fp16->fp32 grad copy is delayed until fp16 grads from all - # losses have been accumulated. - optimizer.backward(loss1, update_master_grads=False) - optimizer.backward(loss2, update_master_grads=False) - optimizer.update_master_grads() - """ - # To consider: try multiple backward passes using retain_grad=True to find - # a loss scale that works. After you find a loss scale that works, do a final dummy - # backward pass with retain_graph=False to tear down the graph. Doing this would avoid - # discarding the iteration, but probably wouldn't improve overall efficiency. - scaled_loss = loss.float()*self.loss_scaler.loss_scale() - scaled_loss.backward(retain_graph=retain_graph) - if update_master_grads: - self.update_master_grads() - - def update_master_grads(self): - # torch.cuda.nvtx.range_push("update_master_grads") - """ - Copy the ``.grad`` attribute from stored references to fp16 parameters to - the ``.grad`` attribute of the fp32 master parameters that are directly - updated by the optimizer. :attr:`update_master_grads` only needs to be called if - ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``. 
- """ - # if self.dynamic_loss_scale: - # self._check_overflow() - # if self.overflow: return - # self._model_grads_to_master_grads() - # self._downscale_master() - # Use the one-shot multi-tensor apply kernel - self.loss_scaler.clear_overflow_state() - if len(self.all_fp16_params) > 0: - # print("Model grads before") - # print([param.grad.data for param in self.all_fp16_params]) - # I'm ONLY writing this as an incremental way to make some tests pass until - # I can refactor the tests as well. - # FP16_Optimizer should not be used by anyone. - model_grads = [] - master_grads = [] - for model_param, master_param in zip(self.all_fp16_params, - self.all_fp32_from_fp16_params): - if model_param.grad is not None: - model_grads.append(model_param.grad) - if master_param.grad is None: - master_param.grad = torch.empty_like(master_param) - master_grads.append(master_param.grad) - self.loss_scaler.unscale( - model_grads, - master_grads, - self.loss_scaler.loss_scale()) - # print("Master grads after") - # print([param.grad.data for param in self.all_fp32_from_fp16_params]) - if len(self.all_fp32_from_fp32_params) > 0: - model_grads = [] - master_grads = [] - for model_param, master_param in zip(self.all_fp32_from_fp32_params, - self.all_fp32_from_fp32_params): - if model_param.grad is not None: - model_grads.append(model_param.grad) - master_grads.append(master_param.grad) - # print("Model grads before") - # print([param.grad.data for param in self.all_fp32_from_fp32_params]) - self.loss_scaler.unscale( - model_grads, - master_grads, - self.loss_scaler.loss_scale()) - # print("Master grads after") - # print([param.grad.data for param in self.all_fp32_from_fp32_params]) - # quit() - self.overflow = self.loss_scaler.update_scale() - # torch.cuda.nvtx.range_pop() - - - def inspect_master_grad_data(self): - """ - When running with :class:`FP16_Optimizer`, - ``.grad`` attributes of a model's fp16 leaves should not be - regarded as truthful, because they might be scaled. - After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered, - the fp32 master params' ``.grad`` - attributes will contain valid gradients properly divided by the loss scale. However, - because :class:`FP16_Optimizer` flattens some parameters, accessing them may be - nonintuitive. :attr:`inspect_master_grad_data` - allows those gradients to be viewed with shapes corresponding to their associated model leaves. - - Returns: - List of lists (one list for each parameter group). The list for each parameter group - is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group. - """ - if self.overflow: - print("Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. " - "Gradients are currently invalid (may be inf, nan, or stale). Returning None.") - return None - else: - # The optimizer owns only references to master params. 
- master_grads_data = [] - for param_group in self.optimizer.param_groups: - master_grads_this_group = [] - for param in param_group['params']: - if param.grad is not None: - master_grads_this_group.append(param.grad.data) - else: - master_grads_this_group.append(None) - master_grads_data.append(master_grads_this_group) - return master_grads_data - - - # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" - def _get_loss_scale(self): - return self.loss_scaler.loss_scale() - - def _set_loss_scale(self, value): - self.loss_scaler._loss_scale = value - - loss_scale = property(_get_loss_scale, _set_loss_scale) - - # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" - def _get_state(self): - return self.optimizer.state - - def _set_state(self, value): - self.optimizer.state = value - - state = property(_get_state, _set_state) - - # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" - # (for example, to adjust the learning rate) - def _get_param_groups(self): - return self.optimizer.param_groups - - def _set_param_groups(self, value): - self.optimizer.param_groups = value - - param_groups = property(_get_param_groups, _set_param_groups) - diff --git a/apex/fp16_utils/fp16util.py b/apex/fp16_utils/fp16util.py deleted file mode 100644 index 325abd1b6..000000000 --- a/apex/fp16_utils/fp16util.py +++ /dev/null @@ -1,189 +0,0 @@ -import torch -import torch.nn as nn -from torch.autograd import Variable -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors - - -class tofp16(nn.Module): - """ - Utility module that implements:: - - def forward(self, input): - return input.half() - """ - - def __init__(self): - super(tofp16, self).__init__() - - def forward(self, input): - return input.half() - - -def BN_convert_float(module): - """ - Utility function for network_to_half(). - - Retained for legacy purposes. - """ - if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: - module.float() - for child in module.children(): - BN_convert_float(child) - return module - - -def network_to_half(network): - """ - Convert model to half precision in a batchnorm-safe way. - - Retained for legacy purposes. It is recommended to use FP16Model. - """ - return nn.Sequential(tofp16(), BN_convert_float(network.half())) - - -def convert_module(module, dtype): - """ - Converts a module's immediate parameters and buffers to dtype. - """ - for param in module.parameters(recurse=False): - if param is not None: - if param.data.dtype.is_floating_point: - param.data = param.data.to(dtype=dtype) - if param._grad is not None and param._grad.data.dtype.is_floating_point: - param._grad.data = param._grad.data.to(dtype=dtype) - - for buf in module.buffers(recurse=False): - if buf is not None and buf.data.dtype.is_floating_point: - buf.data = buf.data.to(dtype=dtype) - - -def convert_network(network, dtype): - """ - Converts a network's parameters and buffers to dtype. - """ - for module in network.modules(): - if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: - continue - convert_module(module, dtype) - if isinstance(module, torch.nn.RNNBase) or isinstance(module, torch.nn.modules.rnn.RNNBase): - module.flatten_parameters() - return network - - -class FP16Model(nn.Module): - """ - Convert model to half precision in a batchnorm-safe way. 
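    Example (illustrative sketch, not from the original docstring; ``model`` and ``input`` are placeholders)::

        fp16_model = FP16Model(model).cuda()
        output = fp16_model(input)  # inputs are cast to half inside forward()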
- """ - - def __init__(self, network): - from apex import deprecated_warning - deprecated_warning("apex.fp16_utils is deprecated and will be removed by the end of February 2023. Use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)") - super(FP16Model, self).__init__() - self.network = convert_network(network, dtype=torch.half) - - def forward(self, *inputs): - inputs = tuple(t.half() for t in inputs) - return self.network(*inputs) - - -def backwards_debug_hook(grad): - raise RuntimeError("master_params recieved a gradient in the backward pass!") - -def prep_param_lists(model, flat_master=False): - """ - Creates a list of FP32 master parameters for a given model, as in - `Training Neural Networks with Mixed Precision: Real Examples`_. - - Args: - model (torch.nn.Module): Existing Pytorch model - flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. - Returns: - A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. - - Example:: - - model_params, master_params = prep_param_lists(model) - - .. warning:: - Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. - - .. _`Training Neural Networks with Mixed Precision: Real Examples`: - http://on-demand.gputechconf.com/gtc/2018/video/S81012/ - """ - model_params = [param for param in model.parameters() if param.requires_grad] - - if flat_master: - # Give the user some more useful error messages - try: - # flatten_dense_tensors returns a contiguous flat array. - # http://pytorch.org/docs/master/_modules/torch/_utils.html - master_params = _flatten_dense_tensors([param.data for param in model_params]).float() - except: - print("Error in prep_param_lists: model may contain a mixture of parameters " - "of different types. Use flat_master=False, or use F16_Optimizer.") - raise - master_params = torch.nn.Parameter(master_params) - master_params.requires_grad = True - # master_params.register_hook(backwards_debug_hook) - if master_params.grad is None: - master_params.grad = master_params.new(*master_params.size()) - return model_params, [master_params] - else: - master_params = [param.clone().float().detach() for param in model_params] - for param in master_params: - param.requires_grad = True - return model_params, master_params - - -def model_grads_to_master_grads(model_params, master_params, flat_master=False): - """ - Copy model gradients to master gradients. - - Args: - model_params: List of model parameters created by :func:`prep_param_lists`. - master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. - """ - if flat_master: - # The flattening may incur one more deep copy than is necessary. 
- master_params[0].grad.data.copy_( - _flatten_dense_tensors([p.grad.data for p in model_params])) - else: - for model, master in zip(model_params, master_params): - if model.grad is not None: - if master.grad is None: - master.grad = Variable(master.data.new(*master.data.size())) - master.grad.data.copy_(model.grad.data) - else: - master.grad = None - - -def master_params_to_model_params(model_params, master_params, flat_master=False): - """ - Copy master parameters to model parameters. - - Args: - model_params: List of model parameters created by :func:`prep_param_lists`. - master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. - """ - if flat_master: - for model, master in zip(model_params, - _unflatten_dense_tensors(master_params[0].data, model_params)): - model.data.copy_(master) - else: - for model, master in zip(model_params, master_params): - model.data.copy_(master.data) - -# Backward compatibility fixes - -def to_python_float(t): - if hasattr(t, 'item'): - return t.item() - else: - return t[0] - -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) -if TORCH_MAJOR == 0 and TORCH_MINOR <= 4: - clip_grad_norm = torch.nn.utils.clip_grad_norm -else: - clip_grad_norm = torch.nn.utils.clip_grad_norm_ diff --git a/apex/fp16_utils/loss_scaler.py b/apex/fp16_utils/loss_scaler.py deleted file mode 100644 index 7c7ea2416..000000000 --- a/apex/fp16_utils/loss_scaler.py +++ /dev/null @@ -1,188 +0,0 @@ -import torch - -# item() is a recent addition, so this helps with backward compatibility. -def to_python_float(t): - if hasattr(t, 'item'): - return t.item() - else: - return t[0] - -class LossScaler: - """ - Class that manages a static loss scale. This class is intended to interact with - :class:`FP16_Optimizer`, and should not be directly manipulated by the user. - - Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to - :class:`FP16_Optimizer`'s constructor. - - Args: - scale (float, optional, default=1.0): The loss scale. - """ - - def __init__(self, scale=1): - from apex import deprecated_warning - deprecated_warning("apex.fp16_utils is deprecated and will be removed by the end of February 2023. Use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)") - self.cur_scale = scale - - # `params` is a list / generator of torch.Variable - def has_overflow(self, params): - return False - - # `x` is a torch.Tensor - def _has_inf_or_nan(x): - return False - - def update_scale(self, overflow): - pass - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - return tuple(self.loss_scale * g for g in grad_in) - - def backward(self, loss, retain_graph=False): - scaled_loss = loss*self.loss_scale - scaled_loss.backward(retain_graph=retain_graph) - -class DynamicLossScaler: - """ - Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` - indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of - :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` - operates, because the default options can be changed using the - the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. 
- - Loss scaling is designed to combat the problem of underflowing gradients encountered at long - times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss - scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are - encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has - occurred. - :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, - and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. - If a certain number of iterations occur without overflowing gradients detected, - :class:`DynamicLossScaler` increases the loss scale once more. - In this way :class:`DynamicLossScaler` attempts to "ride the edge" of - always using the highest loss scale possible without incurring overflow. - - Args: - init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` - scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. - scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. - """ - - def __init__(self, - init_scale=2**32, - scale_factor=2., - scale_window=1000): - self.cur_scale = init_scale - self.cur_iter = 0 - self.last_overflow_iter = -1 - self.scale_factor = scale_factor - self.scale_window = scale_window - - # `params` is a list / generator of torch.Variable - def has_overflow(self, params): - for p in params: - if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data): - return True - - return False - - # `x` is a torch.Tensor - def _has_inf_or_nan(x): - try: - # if x is half, the .float() incurs an additional deep copy, but it's necessary if - # Pytorch's .sum() creates a one-element tensor of the same type as x - # (which is true for some recent version of pytorch). - cpu_sum = float(x.float().sum()) - # More efficient version that can be used if .sum() returns a Python scalar - # cpu_sum = float(x.sum()) - except RuntimeError as instance: - # We want to check if inst is actually an overflow exception. - # RuntimeError could come from a different error. - # If so, we still want the exception to propagate. 
- if "value cannot be converted" not in instance.args[0]: - raise - return True - else: - if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - return False - - # `overflow` is boolean indicating whether the gradient overflowed - def update_scale(self, overflow): - if overflow: - # self.cur_scale /= self.scale_factor - self.cur_scale = max(self.cur_scale/self.scale_factor, 1) - self.last_overflow_iter = self.cur_iter - else: - if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0: - self.cur_scale *= self.scale_factor - self.cur_iter += 1 - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - return tuple(self.loss_scale * g for g in grad_in) - - def backward(self, loss, retain_graph=False): - scaled_loss = loss*self.loss_scale - scaled_loss.backward(retain_graph=retain_graph) - -############################################################## -# Example usage below here -- assuming it's in a separate file -############################################################## -""" -TO-DO separate out into an example. -if __name__ == "__main__": - import torch - from torch.autograd import Variable - from dynamic_loss_scaler import DynamicLossScaler - - # N is batch size; D_in is input dimension; - # H is hidden dimension; D_out is output dimension. - N, D_in, H, D_out = 64, 1000, 100, 10 - - # Create random Tensors to hold inputs and outputs, and wrap them in Variables. - x = Variable(torch.randn(N, D_in), requires_grad=False) - y = Variable(torch.randn(N, D_out), requires_grad=False) - - w1 = Variable(torch.randn(D_in, H), requires_grad=True) - w2 = Variable(torch.randn(H, D_out), requires_grad=True) - parameters = [w1, w2] - - learning_rate = 1e-6 - optimizer = torch.optim.SGD(parameters, lr=learning_rate) - loss_scaler = DynamicLossScaler() - - for t in range(500): - y_pred = x.mm(w1).clamp(min=0).mm(w2) - loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale - print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) - print('Iter {} scaled loss: {}'.format(t, loss.data[0])) - print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) - - # Run backprop - optimizer.zero_grad() - loss.backward() - - # Check for overflow - has_overflow = DynamicLossScaler.has_overflow(parameters) - - # If no overflow, unscale grad and update as usual - if not has_overflow: - for param in parameters: - param.grad.data.mul_(1. / loss_scaler.loss_scale) - optimizer.step() - # Otherwise, don't do anything -- ie, skip iteration - else: - print('OVERFLOW!') - - # Update loss scale for next iteration - loss_scaler.update_scale(has_overflow) - -""" diff --git a/apex/parallel/LARC.py b/apex/parallel/LARC.py index 4a93fcd65..23bab53fd 100644 --- a/apex/parallel/LARC.py +++ b/apex/parallel/LARC.py @@ -5,10 +5,10 @@ class LARC(object): """ :class:`LARC` is a pytorch implementation of both the scaling and clipping variants of LARC, - in which the ratio between gradient and parameter magnitudes is used to calculate an adaptive + in which the ratio between gradient and parameter magnitudes is used to calculate an adaptive local learning rate for each individual parameter. The algorithm is designed to improve convergence of large batch training. - + See https://arxiv.org/abs/1708.03888 for calculation of the local learning rate. 
In practice it modifies the gradients of parameters as a proxy for modifying the learning rate @@ -62,7 +62,7 @@ def param_groups(self): @param_groups.setter def param_groups(self, value): self.optim.param_groups = value - + def state_dict(self): return self.optim.state_dict() diff --git a/apex/parallel/README.md b/apex/parallel/README.md deleted file mode 100644 index e7910d82f..000000000 --- a/apex/parallel/README.md +++ /dev/null @@ -1,66 +0,0 @@ -## Distributed Data Parallel - -distributed.py contains the source code for `apex.parallel.DistributedDataParallel`, a module wrapper that enables multi-process multi-GPU data parallel training optimized for NVIDIA's NCCL communication library. - -`apex.parallel.DistributedDataParallel` achieves high performance by overlapping communication with -computation in the backward pass and bucketing smaller transfers to reduce the total number of -transfers required. - -multiproc.py contains the source code for `apex.parallel.multiproc`, a launch utility that places one process on each of the node's available GPUs. - -#### [API Documentation](https://nvidia.github.io/apex/parallel.html) - -#### [Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/distributed) - -#### [Imagenet example with Mixed Precision](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) - -#### [Simple example with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple/distributed_apex) - -### Synchronized Batch Normalization - -`apex.parallel.SyncBatchNorm` has similar APIs as with `torch.nn.BatchNorm*N*d`. -It reduces stats on the first (channel) dimension of the Tensor and accepts -arbitrary spatial dimensions. - -#### Installation - -Apex provides two sync BN implementation: - -1. There is the Python-only implementation, which is the default implementation -when install with `python setup.py install`. -It uses PyTorch primitive operations and distributed communication package from -`torch.distributed`. - - - _Python-only implementation requires input tensor to be of same data type as -layer_ - -2. We also provide implementation with kernels through CUDA/C++ extension with -improved performance. We are experimenting with Welford and Kahan for reduction -hoping to get better accuracy. - To use the kernel implementation, user need to install Apex with CUDA extension -enabled `python setup.py install --cuda_ext`. - - - _Custom kernel implementation supports fp16 input with fp32 layer as cudnn. -This is required to run imagenet example in fp16._ - - - _Currently kernel implementation only supports GPU._ - -#### HowTo - -1. User could use `apex.parallel.SyncBatchNorm` by building their module with -the layer explicitly. - -``` -import apex -input_t = torch.randn(3, 5, 20).cuda() -sbn = apex.parallel.SyncBatchNorm(5).cuda() -output_t = sbn(input) -``` - -2. User could also take a constructed `torch.nn.Model` and replace all its `torch.nn.BatchNorm*N*d` modules with `apex.parallel.SyncBatchNorm` through utility function `apex.parallel.convert_syncbn_model`. 
- -``` -# model is an instance of torch.nn.Module -import apex -sync_bn_model = apex.parallel.convert_syncbn_model(model) -``` diff --git a/apex/parallel/__init__.py b/apex/parallel/__init__.py index d6c8b0f06..8b1378917 100644 --- a/apex/parallel/__init__.py +++ b/apex/parallel/__init__.py @@ -1,97 +1 @@ -import torch -if hasattr(torch.distributed, 'ReduceOp'): - ReduceOp = torch.distributed.ReduceOp -elif hasattr(torch.distributed, 'reduce_op'): - ReduceOp = torch.distributed.reduce_op -else: - ReduceOp = torch.distributed.deprecated.reduce_op - -from .distributed import DistributedDataParallel, Reducer -# This is tricky because I'd like SyncBatchNorm to be exposed the same way -# for both the cuda-enabled and python-fallback versions, and I don't want -# to suppress the error information. -try: - import syncbn - from .optimized_sync_batchnorm import SyncBatchNorm -except ImportError as err: - from .sync_batchnorm import SyncBatchNorm - SyncBatchNorm.syncbn_import_error = err - -def convert_syncbn_model(module, process_group=None, channel_last=False): - ''' - Recursively traverse module and its children to replace all instances of - ``torch.nn.modules.batchnorm._BatchNorm`` with :class:`apex.parallel.SyncBatchNorm`. - - All ``torch.nn.BatchNorm*N*d`` wrap around - ``torch.nn.modules.batchnorm._BatchNorm``, so this function lets you easily switch - to use sync BN. - - Args: - module (torch.nn.Module): input module - - Example:: - - >>> # model is an instance of torch.nn.Module - >>> import apex - >>> sync_bn_model = apex.parallel.convert_syncbn_model(model) - ''' - from apex import deprecated_warning - deprecated_warning("apex.parallel.convert_syncbn_model is deprecated and will be removed by the end of February 2023. Use `torch.nn.SyncBatchNorm.convert_sync_batchnorm`.") - mod = module - if isinstance(module, torch.nn.modules.instancenorm._InstanceNorm): - return module - if isinstance(module, torch.nn.modules.batchnorm._BatchNorm): - mod = SyncBatchNorm(module.num_features, module.eps, module.momentum, module.affine, module.track_running_stats, process_group, channel_last=channel_last) - mod.running_mean = module.running_mean - mod.running_var = module.running_var - mod.num_batches_tracked = module.num_batches_tracked - if module.affine: - mod.weight.data = module.weight.data.clone().detach() - mod.bias.data = module.bias.data.clone().detach() - for name, child in module.named_children(): - mod.add_module(name, convert_syncbn_model(child, - process_group=process_group, - channel_last=channel_last)) - # TODO(jie) should I delete model explicitly? - del module - return mod - -def create_syncbn_process_group(group_size): - ''' - Creates process groups to be used for syncbn of a give ``group_size`` and returns - process group that current GPU participates in. - - ``group_size`` must divide the total number of GPUs (world_size). - - ``group_size`` of 0 would be considered as =world_size. In this case ``None`` will be returned. - - ``group_size`` of 1 would be equivalent to using non-sync bn, but will still carry the overhead. 
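The deprecation message above points at `torch.nn.SyncBatchNorm.convert_sync_batchnorm` as the upstream counterpart of `apex.parallel.convert_syncbn_model`. A minimal migration sketch (the toy `model` below is an assumption of this note; in real use the default process group would be initialized before training as usual):

```python
import torch.nn as nn

# Any module containing BatchNorm*d layers; this tiny model is just for illustration.
model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())

# Upstream equivalent of apex.parallel.convert_syncbn_model: recursively replaces
# BatchNorm*d modules with torch.nn.SyncBatchNorm. process_group=None -> default group.
sync_bn_model = nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None)
```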
- - Args: - group_size (int): number of GPU's to collaborate for sync bn - - Example:: - - >>> # model is an instance of torch.nn.Module - >>> import apex - >>> group = apex.parallel.create_syncbn_process_group(group_size) - ''' - - if group_size==0: - return None - - world_size = torch.distributed.get_world_size() - assert(world_size >= group_size) - assert(world_size % group_size == 0) - - group=None - for group_num in (range(world_size//group_size)): - group_ids = range(group_num*group_size, (group_num+1)*group_size) - cur_group = torch.distributed.new_group(ranks=group_ids) - if (torch.distributed.get_rank()//group_size == group_num): - group = cur_group - #can not drop out and return here, every process must go through creation of all subgroups - - assert(group is not None) - return group diff --git a/apex/parallel/distributed.py b/apex/parallel/distributed.py deleted file mode 100644 index 1c530d510..000000000 --- a/apex/parallel/distributed.py +++ /dev/null @@ -1,643 +0,0 @@ -from collections import OrderedDict -import copy -import importlib -from itertools import chain - -import torch -import torch.distributed as dist -from torch.nn.modules import Module -from torch.autograd import Variable - -from ..multi_tensor_apply import multi_tensor_applier - -imported_flatten_impl = False - -def import_flatten_impl(): - global flatten_impl, unflatten_impl, imported_flatten_impl - try: - import apex_C - flatten_impl = apex_C.flatten - unflatten_impl = apex_C.unflatten - except ImportError: - print("Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten.") - flatten_impl = torch._utils._flatten_dense_tensors - unflatten_impl = torch._utils._unflatten_dense_tensors - imported_flatten_impl = True - -def flatten(bucket): - if not imported_flatten_impl: - import_flatten_impl() - return flatten_impl(bucket) - -def unflatten(coalesced, bucket): - if not imported_flatten_impl: - import_flatten_impl() - return unflatten_impl(coalesced, bucket) - -# apply_dist_call requires that tensors in 'bucket' are all the same type. -def apply_flat_dist_call(bucket, call, extra_args=None): - - coalesced = flatten(bucket) - - if extra_args is not None: - call(coalesced, *extra_args) - else: - call(coalesced) - - if call is dist.all_reduce: - coalesced /= dist.get_world_size() - - for buf, synced in zip(bucket, unflatten(coalesced, bucket)): - buf.copy_(synced) - -def split_half_float_double(tensors): - dtypes = ["torch.cuda.HalfTensor", "torch.cuda.FloatTensor", "torch.cuda.DoubleTensor"] - buckets = [] - for i, dtype in enumerate(dtypes): - bucket = [t for t in tensors if t.type() == dtype] - if bucket: - buckets.append(bucket) - return buckets - -def split_by_type(tensors): - buckets = OrderedDict() - for tensor in tensors: - tp = tensor.type() - if tp not in buckets: - buckets[tp] = [] - buckets[tp].append(tensor) - return buckets - -# flat_dist_call organizes 'tensors' by type. -def flat_dist_call(tensors, call, extra_args=None): - buckets = split_by_type(tensors) - - for tp in buckets: - bucket = buckets[tp] - apply_flat_dist_call(bucket, call, extra_args) - - -def extract_tensors(maybe_tensor, tensor_list): - if torch.is_tensor(maybe_tensor): - tensor_list.append(maybe_tensor) - else: - try: - for item in maybe_tensor: - extract_tensors(item, tensor_list) - except TypeError: - return - - -class Reducer(object): - """ - :class:`apex.parallel.Reducer` is a simple class that helps allreduce a module's parameters - across processes. 
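The flat-allreduce helpers above all follow one pattern: flatten a bucket of same-dtype gradients into a single contiguous buffer, issue one `all_reduce`, average by world size, then copy the synced values back into the original tensors. A self-contained sketch of that pattern, using the `torch._utils` fallbacks named above instead of the `apex_C` extension (and assuming `torch.distributed` has already been initialized):

```python
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

def allreduce_bucket_average(bucket):
    """Allreduce and average a list of same-dtype gradient tensors in one transfer."""
    coalesced = _flatten_dense_tensors(bucket)   # one contiguous buffer for the whole bucket
    dist.all_reduce(coalesced)                   # single collective instead of one per tensor
    coalesced.div_(dist.get_world_size())        # average over participating processes
    for buf, synced in zip(bucket, _unflatten_dense_tensors(coalesced, bucket)):
        buf.copy_(synced)                        # write the synced values back in place
```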
:class:`Reducer` is intended to give the user additional control: - Unlike :class:`DistributedDataParallel`, :class:`Reducer` will not automatically allreduce - parameters during ``backward()``. - Instead, :class:`Reducer` waits for the user to call ``.reduce()`` manually. - This enables, for example, delaying the allreduce to be carried out every - several iterations instead of every single iteration. - - Like :class:`DistributedDataParallel`, :class:`Reducer` averages any tensors it allreduces - over the number of participating processes. - - :class:`Reducer` is designed to work with the upstream launch utility script - ``torch.distributed.launch`` with ``--nproc_per_node <= number of gpus per node``. - When used with this launcher, :class:`Reducer` assumes 1:1 mapping of processes to GPUs. - It also assumes that your script calls ``torch.cuda.set_device(args.rank)`` before creating the model. - - Args: - module_or_grads_list: Either a network definition (module) being run in multi-gpu/distributed mode, or an iterable of gradients to be reduced. If a module is passed in, the Reducer constructor will sync the parameters across processes (broadcasting from rank 0) to make sure they're all initialized with the same values. If a list of gradients (that came from some module) is passed in, the user is responsible for manually syncing that module's parameters at the beginning of training. - """ - - def __init__(self, module_or_grads_list): - if isinstance(module_or_grads_list, Module): - self.module = module_or_grads_list - flat_dist_call([param.data for param in self.module.parameters()], dist.broadcast, (0,) ) - - else: - self.module = None - self.grads = [] - extract_tensors(module_or_grads_list, self.grads) - - def reduce(self): - if self.module: - grads = [param.grad.data for param in self.module.parameters() if param.grad is not None] - flat_dist_call(grads, dist.all_reduce) - else: - flat_dist_call(self.grads, dist.all_reduce) - - -class DistributedDataParallel(Module): - """ - :class:`apex.parallel.DistributedDataParallel` is a module wrapper that enables - easy multiprocess distributed data parallel training, similar to ``torch.nn.parallel.DistributedDataParallel``. Parameters are broadcast across participating processes on initialization, and gradients are - allreduced and averaged over processes during ``backward()``. - - :class:`DistributedDataParallel` is optimized for use with NCCL. It achieves high performance by - overlapping communication with computation during ``backward()`` and bucketing smaller gradient - transfers to reduce the total number of transfers required. - - :class:`DistributedDataParallel` is designed to work with the upstream launch utility script - ``torch.distributed.launch`` with ``--nproc_per_node <= number of gpus per node``. - When used with this launcher, :class:`DistributedDataParallel` assumes 1:1 mapping of processes to GPUs. - It also assumes that your script calls ``torch.cuda.set_device(args.rank)`` before creating the model. - - https://github.com/NVIDIA/apex/tree/master/examples/simple/distributed shows detailed usage. - https://github.com/NVIDIA/apex/tree/master/examples/imagenet shows another example - that combines :class:`DistributedDataParallel` with mixed precision training. - - Args: - module: Network definition to be run in multi-gpu/distributed mode. - message_size (int, default=1e7): Minimum number of elements in a communication bucket. - delay_allreduce (bool, default=False): Delay all communication to the end of the backward pass. 
This disables overlapping communication with computation. - allreduce_trigger_params (list, optional, default=None): If supplied, should contain a list of parameters drawn from the model. Allreduces will be kicked off whenever one of these parameters receives its gradient (as opposed to when a bucket of size message_size is full). At the end of backward(), a cleanup allreduce to catch any remaining gradients will also be performed automatically. If allreduce_trigger_params is supplied, the message_size argument will be ignored. - allreduce_always_fp32 (bool, default=False): Convert any FP16 gradients to FP32 before allreducing. This can improve stability for widely scaled-out runs. - gradient_average (bool, default=True): Option to toggle whether or not DDP averages the allreduced gradients over processes. For proper scaling, the default value of True is recommended. - gradient_predivide_factor (float, default=1.0): Allows perfoming the average of gradients over processes partially before and partially after the allreduce. Before allreduce: ``grads.mul_(1.0/gradient_predivide_factor)``. After allreduce: ``grads.mul_(gradient_predivide_factor/world size)``. This can reduce the stress on the dynamic range of FP16 allreduces for widely scaled-out runs. - - .. warning:: - If ``gradient_average=False``, the pre-allreduce division (``grads.mul_(1.0/gradient_predivide_factor)``) will still be applied, but the post-allreduce gradient averaging (``grads.mul_(gradient_predivide_factor/world size)``) will be omitted. - - """ - - def __init__(self, - module, - message_size=10000000, - delay_allreduce=False, - shared_param=None, - allreduce_trigger_params=None, - retain_allreduce_buffers=False, - allreduce_always_fp32=False, - num_allreduce_streams=1, - allreduce_communicators=None, - gradient_average=True, - gradient_predivide_factor=1.0, - gradient_average_split_factor=None, - prof=False): - super(DistributedDataParallel, self).__init__() - from apex import deprecated_warning - deprecated_warning("apex.parallel.DistributedDataParallel is deprecated and will be removed by the end of February 2023.") - - # Backward/forward compatibility around - # https://github.com/pytorch/pytorch/commit/540ef9b1fc5506369a48491af8a285a686689b36 and - # https://github.com/pytorch/pytorch/commit/044d00516ccd6572c0d6ab6d54587155b02a3b86 - if hasattr(dist, "get_backend"): - self._backend = dist.get_backend() - if hasattr(dist, "DistBackend"): - self.backend_enum_holder = dist.DistBackend - else: - self.backend_enum_holder = dist.Backend - else: - self._backend = dist._backend - self.backend_enum_holder = dist.dist_backend - - self.warn_on_half = True if self._backend == self.backend_enum_holder.GLOO else False - - self.prof = prof - - self.allreduce_different_streams = (num_allreduce_streams > 1) - self.num_allreduce_streams = num_allreduce_streams - self.allreduce_communicators = allreduce_communicators - if self.allreduce_communicators: - assert len(allreduce_communicators[0]) == num_allreduce_streams - assert len(allreduce_communicators[0]) == len(allreduce_communicators[1]) - assert self.allreduce_different_streams - - if self.allreduce_different_streams and delay_allreduce: - raise ValueError("self.allreduce_different_streams may only be used if delay_allreduce=False.") - - if shared_param is not None: - raise ValueError("shared_param is no longer supported as an option. It was misleadingly named from the start. It turns out overlapping communication with computation should work fine with shared parameters. 
If you still wish to delay communication to the end of the backward pass, use delay_allreduce=True|False instead.") - - self.world_size = float(dist.get_world_size()) - - self.retain_allreduce_buffers = retain_allreduce_buffers - self.allreduce_always_fp32 = allreduce_always_fp32 - self.gradient_average = gradient_average - self.gradient_predivide_factor = gradient_predivide_factor - - self.custom_allreduce_triggers = False - if allreduce_trigger_params is not None: - if delay_allreduce: - raise ValueError("Setting allreduce_trigger_params is only valid if delay_allreduce=False.") - self.custom_allreduce_triggers = True - self.allreduce_trigger_params = set([id(param) for param in allreduce_trigger_params]) - - self.delay_allreduce = delay_allreduce - self.message_size = message_size - - self.main_stream = torch.cuda.current_stream() - - self.bucket_streams = [] - self.bucket_events = [] - - self.module = module - - self._disable_allreduce = False - - if self._backend == self.backend_enum_holder.NCCL: - for param in self.module.parameters(): - assert param.is_cuda, "NCCL backend only supports model parameters to be on GPU." - - self.active_params = [] - - self.param_type_to_tmp_i = {"torch.cuda.HalfTensor" : 0, - "torch.cuda.FloatTensor" : 1, - "torch.cuda.DoubleTensor" : 2} - - if multi_tensor_applier.available: - # TODO: I really need to centralize the C++ backed imports - import amp_C - self.multi_tensor_scale = amp_C.multi_tensor_scale - self._overflow_buf = torch.cuda.IntTensor([0]) - - self.create_hooks() - - flat_dist_call([param.data for param in self.module.parameters()], dist.broadcast, (0,) ) - - - def __setstate__(self, state): - super(DistributedDataParallel, self).__setstate__(state) - if self.allreduce_different_streams and delay_allreduce: - raise ValueError("self.allreduce_different_streams may only be used if delay_allreduce=False.") - - if self.delay_allreduce: - self.needs_refresh = True - - self.bucket_streams = [] - self.bucket_events = [] - - - def __getstate__(self): - attrs = copy.copy(self.__dict__) - if self._backend != self.backend_enum_holder.NCCL: - del attrs['self.bucket_streams'] - del attrs['self.bucket_events'] - return attrs - - def enable_allreduce(self): - self._disable_allreduce = False - - def disable_allreduce(self): - self._disable_allreduce = True - - # Broadcast rank 0's bucket structure across all processes, and have all processes - # regenerate their bucket structures to match. - def sync_bucket_structure(self): - # Append leftover buckets - for tmp_bucket in self.tmp_buckets: - if len(tmp_bucket) > 0: - self.active_i_buckets.append(tmp_bucket) - - self.num_buckets = len(self.active_i_buckets) - self.bucket_sizes = [len(bucket) for bucket in self.active_i_buckets] - - info_tensor = torch.cuda.IntTensor([self.num_buckets] + - self.bucket_sizes + - list(chain(*self.active_i_buckets))) - - dist.broadcast(info_tensor, 0) - - info = [int(entry) for entry in info_tensor] - - self.num_buckets = info[0] - self.bucket_sizes = info[1:self.num_buckets + 1] - self.buckets = [[None for _ in range(self.bucket_sizes[i])] - for i in range(self.num_buckets)] - # Technically, active_i_buckets' work is done. But the information is still useful to - # keep around. Therefore, refresh active_i_buckets based on rank 0 as well. 
- self.active_i_buckets = [[None for _ in range(self.bucket_sizes[i])] - for i in range(self.num_buckets)] - - flattened_buckets = info[self.num_buckets + 1:] - flat_i = 0 - for bucket_idx in range(self.num_buckets): - for bucket_loc in range(self.bucket_sizes[bucket_idx]): - param_i = flattened_buckets[flat_i] - self.active_i_buckets[bucket_idx][bucket_loc] = param_i - self.param_id_to_bucket[id(self.active_params[param_i])] = (bucket_idx, bucket_loc) - flat_i += 1 - - - def create_hooks(self): - # Fallback hook that's only called at the end of backward. - # Used if you deliberately want to delay allreduces to the end, or to refresh the - # bucket structure that will be used to overlap communication with computation in later - # iterations. - def allreduce_params(): - # Bucket record refresh - if not self.delay_allreduce: - if self.needs_refresh: - self.sync_bucket_structure() - - self.needs_refresh = False - - self.allreduce_fallback() - - - def overlapping_backward_epilogue(): - for stream, event in zip(self.bucket_streams, self.bucket_events): - stream.record_event(event) - torch.cuda.current_stream().wait_event(event) - - # Sanity checks that all the buckets were kicked off - if self.next_bucket != self.num_buckets: - raise RuntimeError("In epilogue, next_bucket ({}) != num_buckets ({}). ".format( - self.next_bucket, self.num_buckets), - "This probably indicates some buckets were not allreduced.") - - for actual, expected in zip(self.buckets_ready_size, self.bucket_sizes): - if actual != expected: - raise RuntimeError("Some param buckets were not allreduced.") - - - self.grad_accs = [] - for param in self.module.parameters(): - if param.requires_grad: - def wrapper(param): - param_tmp = param.expand_as(param) - grad_acc = param_tmp.grad_fn.next_functions[0][0] - - def allreduce_hook(*unused): - if self.prof: - torch.cuda.nvtx.range_push("allreduce_hook") - - if not self._disable_allreduce: - if self.delay_allreduce or self.needs_refresh: - # TODO: How do we want to handle multiple backward passes between - # each forward, e.g., backward passes with retain_graph=True? - # needs_refresh and callback_queued are both vulnerable states. - if not self.delay_allreduce and self.needs_refresh: - # Use the backward pass to build the bucket structure on the fly. - active_i = self.param_id_to_active_i[id(param)] - - # Float, half, and double tensors are grouped into buckets separately. - current_type = self.param_type_to_tmp_i[param.type()] - - self.tmp_buckets[current_type].append(active_i) - - ship_tmp_bucket = False - if self.custom_allreduce_triggers: - if id(param) in self.allreduce_trigger_params: - ship_tmp_bucket = True - else: - self.tmp_numels[current_type] += param.numel() - if self.tmp_numels[current_type] >= self.message_size: - ship_tmp_bucket = True - - # To consider: If custom_allreduce_triggers are in use, ship all - # tmp_buckets, not just tmp_buckets[current_type]. 
- if ship_tmp_bucket: - self.active_i_buckets.append(self.tmp_buckets[current_type]) - self.tmp_buckets[current_type] = [] - self.tmp_numels[current_type] = 0 - - if not self.callback_queued: - Variable._execution_engine.queue_callback(allreduce_params) - self.callback_queued = True - else: - if not self.callback_queued: - Variable._execution_engine.queue_callback(overlapping_backward_epilogue) - self.callback_queued = True - - self.comm_ready_buckets(param) - - if self.prof: - torch.cuda.nvtx.range_pop() - - grad_acc.register_hook(allreduce_hook) - self.grad_accs.append(grad_acc) - - wrapper(param) - - - def _stream_this_bucket(self, bucket_idx): - if self.allreduce_different_streams: - return self.bucket_streams[bucket_idx%self.num_allreduce_streams] - else: - return self.bucket_streams[0] - - - def _event_this_bucket(self, bucket_idx): - if self.allreduce_different_streams: - return self.bucket_events[bucket_idx%self.num_allreduce_streams] - else: - return self.bucket_events[0] - - - def allreduce_bucket(self, bucket, bucket_idx, force_default_stream): - tensor = flatten(bucket) - - if force_default_stream: - bucket_stream = self.main_stream - else: - bucket_stream = self._stream_this_bucket(bucket_idx) - bucket_event = self._event_this_bucket(bucket_idx) - torch.cuda.current_stream().record_event(bucket_event) - bucket_stream.wait_event(bucket_event) - - with torch.cuda.stream(bucket_stream): - # self.main_stream.wait_stream(torch.cuda.current_stream()) - # torch.cuda.synchronize() - - tensor_to_allreduce = tensor - - if self.allreduce_always_fp32: - tensor_to_allreduce = tensor.float() - - if self.gradient_predivide_factor != 1.0: - tensor_to_allreduce.mul_(1./self.gradient_predivide_factor) - - if self.allreduce_different_streams and not force_default_stream: - dist.all_reduce(tensor_to_allreduce, group=self.bucket_pgs[bucket_idx%self.num_allreduce_streams]) - else: - dist.all_reduce(tensor_to_allreduce) - - if self.gradient_average: - tensor_to_allreduce.mul_(self.gradient_predivide_factor/self.world_size) - - if self.allreduce_always_fp32 and tensor is not tensor_to_allreduce: - tensor.copy_(tensor_to_allreduce) - - if not self.retain_allreduce_buffers: - if multi_tensor_applier.available: - multi_tensor_applier( - self.multi_tensor_scale, - self._overflow_buf, - [unflatten(tensor, bucket), bucket], - 1.0) - else: - for buf, synced in zip(bucket, unflatten(tensor, bucket)): - buf.copy_(synced) - - # I think we actually do need this here. After allreduce_bucket returns, tensor will - # eventually go out of scope and die, at which point it could otherwise be freed for - # further reuse by the main stream while the allreduce/div/unflatten are underway in bucket_stream. - tensor.record_stream(bucket_stream) - - return tensor - - - def allreduce_maybe_retain(self, bucket, bucket_idx, force_default_stream=False): - allreduced = self.allreduce_bucket(bucket, bucket_idx, force_default_stream) - if self.retain_allreduce_buffers: - if self.allreduce_buffers[bucket_idx] is not None: - raise RuntimeError("The backward pass is attempting to replace an already-filled " - "allreduce buffer. 
This is almost certainly an error.") - self.allreduce_buffers[bucket_idx] = allreduced - for view, grad in zip(unflatten(allreduced, bucket), bucket): - grad.data = view - # for buf, synced in zip(bucket, unflatten(allreduced, bucket)): - # buf.copy_(synced) - - - def allreduce_fallback(self): - for stream, event in zip(self.bucket_streams, self.bucket_events): - stream.record_event(event) - torch.cuda.current_stream().wait_event(event) - - if self.retain_allreduce_buffers: - grads = [param.grad for param in self.module.parameters() if param.grad is not None] - else: - grads = [param.grad.data for param in self.module.parameters() if param.grad is not None] - - split_buckets = split_half_float_double(grads) - - # If retain_allreduce_buffers is True and delay_allreduce is False, - # this will only be done during the first backward pass, ignored by the - # training script, and overwritten in the next forward pass. So it's harmless. - if self.retain_allreduce_buffers: - self.allreduce_buffers = [None for _ in range(len(split_buckets))] - - for i, bucket in enumerate(split_buckets): - allreduced = self.allreduce_maybe_retain(bucket, i, force_default_stream=True) - - - def comm_ready_buckets(self, param): - # Need to do this in every hook for compatibility with Ruberry's streaming backward PR. - # self.reduction_stream.wait_stream(torch.cuda.current_stream()) - if self.prof: - torch.cuda.nvtx.range_push("comm_ready_buckets") - - bucket_idx, bucket_loc = self.param_id_to_bucket[id(param)] - - if self.buckets[bucket_idx][bucket_loc] is not None: - raise RuntimeError("The backward pass is attempting to replace an already-filled " - "bucket slot. This is almost certainly an error.") - - if self.retain_allreduce_buffers: - self.buckets[bucket_idx][bucket_loc] = param.grad - else: - self.buckets[bucket_idx][bucket_loc] = param.grad.data - - self.buckets_ready_size[bucket_idx] += 1 - - if self.buckets_ready_size[bucket_idx] == self.bucket_sizes[bucket_idx]: - if bucket_idx == self.next_bucket: - self.allreduce_maybe_retain(self.buckets[bucket_idx], bucket_idx) - - self.next_bucket += 1 - - # Reversing upstream's logic here, because we constructed our buckets based on - # the order things were received during backward. - if len(self.ready_buckets_not_reduced) > 0: - sorted_todo = sorted(self.ready_buckets_not_reduced) - for i in sorted_todo: - # Nothing can be reduced now - if i > self.next_bucket: - break - elif i == self.next_bucket: - self.allreduce_maybe_retain(self.buckets[i], i) - self.ready_buckets_not_reduced.remove(i) - self.next_bucket += 1 - else: - raise ValueError("i should always be >= next_bucket") - else: - self.ready_buckets_not_reduced.add(bucket_idx) - - if self.prof: - torch.cuda.nvtx.range_pop() - - - def forward(self, *inputs, **kwargs): - result = self.module(*inputs, **kwargs) - - if self.prof: - torch.cuda.nvtx.range_push("forward pass DDP logic") - - if not self._disable_allreduce: - if not self.delay_allreduce: - param_list = [param for param in self.module.parameters() if param.requires_grad] - - # Conditions under which to refresh self.record - # Forward has the authority to set needs_refresh to True, but only allreduce_params - # in backward has the authority to set needs_refresh to False. - # Parentheses are not necessary for correct order of operations, but make the intent clearer. 
- if ((not self.active_params) or - (len(param_list) != len(self.active_params)) or - any([param1 is not param2 for param1, param2 in zip(param_list, self.active_params)])): - self.needs_refresh = True - - if self.needs_refresh: - self.active_i_buckets = [] - self.buckets = [] - self.tmp_buckets = [[], [], []] # [running half, float, double buckets] - self.tmp_numels = [0, 0, 0] - self.bucket_sizes = [] - self.param_id_to_active_i = {id(param) : i for i, param in enumerate(param_list)} - self.param_id_to_bucket = {} - self.bucket_pgs = [] - self.bucket_streams = [] - self.bucket_events = [] - else: - # self.buckets = [[None for _ in range(self.bucket_sizes[i])] - # for i in range(self.num_buckets)] - if not self.buckets: - self.buckets = [[None for _ in range(self.bucket_sizes[i])] - for i in range(self.num_buckets)] - else: - assert len(self.buckets) == self.num_buckets, "len(buckets) = {}, expected {}".format( - len(self.buckets), self.num_buckets) - for b, bucket in enumerate(self.buckets): - assert len(bucket) == self.bucket_sizes[b], "len(buckets[{}]) = {}, expected {})".format( - b, len(buckets[b]), self.bucket_sizes[b]) - for i in range(len(bucket)): - bucket[i] = None - - if self.allreduce_communicators: - self.bucket_pgs = self.allreduce_communicators[0] - self.bucket_streams = self.allreduce_communicators[1] - self.bucket_events = [torch.cuda.Event(enable_timing=False, - blocking=False) for _ in range(self.num_allreduce_streams)] - else: - if self.allreduce_different_streams: - if not self.bucket_pgs: - self.bucket_pgs = [dist.new_group() for _ in range(self.num_allreduce_streams)] - for i, bg in enumerate(self.bucket_pgs): - print("rank {} created group {} with backend {}".format( - dist.get_rank(), i, dist.get_backend(bg))) - if self.allreduce_different_streams: - if not self.bucket_streams: - self.bucket_streams = [torch.cuda.Stream() for _ in range(self.num_allreduce_streams)] - self.bucket_events = [torch.cuda.Event(enable_timing=False, - blocking=False) for _ in range(self.num_allreduce_streams)] - else: - if not self.bucket_streams: - self.bucket_streams = [torch.cuda.Stream()] - self.bucket_events = [torch.cuda.Event(enable_timing=False, blocking=False)] - - self.buckets_ready_size = [0 for i in range(self.num_buckets)] - if(self.retain_allreduce_buffers): - self.allreduce_buffers = [None for _ in range(self.num_buckets)] - self.next_bucket = 0 - self.ready_buckets_not_reduced = set() - - self.active_params = param_list - - self.callback_queued = False - - if self.prof: - torch.cuda.nvtx.range_pop() - - return result diff --git a/apex/parallel/multiproc.py b/apex/parallel/multiproc.py deleted file mode 100644 index ff743df20..000000000 --- a/apex/parallel/multiproc.py +++ /dev/null @@ -1,35 +0,0 @@ -import torch -import sys -import subprocess - -def docstring_hack(): - """ - Multiproc file which will launch a set of processes locally for multi-gpu - usage: python -m apex.parallel.multiproc main.py ... 
- """ - pass - -argslist = list(sys.argv)[1:] -world_size = torch.cuda.device_count() - -if '--world-size' in argslist: - world_size = int(argslist[argslist.index('--world-size')+1]) -else: - argslist.append('--world-size') - argslist.append(str(world_size)) - -workers = [] - -for i in range(world_size): - if '--rank' in argslist: - argslist[argslist.index('--rank')+1] = str(i) - else: - argslist.append('--rank') - argslist.append(str(i)) - stdout = None if i == 0 else open("GPU_"+str(i)+".log", "w") - print(argslist) - p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout) - workers.append(p) - -for p in workers: - p.wait() diff --git a/apex/parallel/optimized_sync_batchnorm.py b/apex/parallel/optimized_sync_batchnorm.py deleted file mode 100644 index 65cf5eabf..000000000 --- a/apex/parallel/optimized_sync_batchnorm.py +++ /dev/null @@ -1,85 +0,0 @@ -import torch -from torch.nn.modules.batchnorm import _BatchNorm -from torch.nn import functional as F - -import syncbn -from .optimized_sync_batchnorm_kernel import SyncBatchnormFunction - - -class SyncBatchNorm(_BatchNorm): - """ - synchronized batch normalization module extented from `torch.nn.BatchNormNd` - with the added stats reduction across multiple processes. - :class:`apex.parallel.SyncBatchNorm` is designed to work with - `DistributedDataParallel`. - - When running in training mode, the layer reduces stats across all processes - to increase the effective batchsize for normalization layer. This is useful - in applications where batch size is small on a given process that would - diminish converged accuracy of the model. The model uses collective - communication package from `torch.distributed`. - - When running in evaluation mode, the layer falls back to - `torch.nn.functional.batch_norm` - - Args: - num_features: :math:`C` from an expected input of size - :math:`(N, C, L)` or :math:`L` from input of size :math:`(N, L)` - eps: a value added to the denominator for numerical stability. - Default: 1e-5 - momentum: the value used for the running_mean and running_var - computation. Can be set to ``None`` for cumulative moving average - (i.e. simple average). Default: 0.1 - affine: a boolean value that when set to ``True``, this module has - learnable affine parameters. Default: ``True`` - track_running_stats: a boolean value that when set to ``True``, this - module tracks the running mean and variance, and when set to ``False``, - this module does not track such statistics and always uses batch - statistics in both training and eval modes. Default: ``True`` - process_group: pass in a process group within which the stats of the - mini-batch is being synchronized. ``None`` for using default process - group - channel_last: a boolean value that when set to ``True``, this module - take the last dimension of the input tensor to be the channel - dimension. 
Default: False - - Examples:: - >>> # channel first tensor - >>> sbn = apex.parallel.SyncBatchNorm(100).cuda() - >>> inp = torch.randn(10, 100, 14, 14).cuda() - >>> out = sbn(inp) - >>> inp = torch.randn(3, 100, 20).cuda() - >>> out = sbn(inp) - >>> # channel last tensor - >>> sbn = apex.parallel.SyncBatchNorm(100, channel_last=True).cuda() - >>> inp = torch.randn(10, 14, 14, 100).cuda() - """ - - def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True, process_group=None, channel_last=False, fuse_relu=False): - super(SyncBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine, track_running_stats=track_running_stats) - self.process_group = process_group - self.channel_last = channel_last - self.fuse_relu = fuse_relu - - def _specify_process_group(self, process_group): - self.process_group = process_group - - def _specify_channel_last(self, channel_last): - self.channel_last = channel_last - - def forward(self, input, z = None): - # if input.dim() == 2, we switch to channel_last for efficient memory accessing - channel_last = self.channel_last if input.dim() != 2 else True - - if not self.training and self.track_running_stats and not channel_last and not self.fuse_relu and z == None: - # fall back to pytorch implementation for inference - return F.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias, False, 0.0, self.eps) - else: - exponential_average_factor = 0.0 - if self.training and self.track_running_stats: - self.num_batches_tracked += 1 - if self.momentum is None: - exponential_average_factor = 1.0 / float(self.num_batches_tracked) - else: - exponential_average_factor = self.momentum - return SyncBatchnormFunction.apply(input, z, self.weight, self.bias, self.running_mean, self.running_var, self.eps, self.training or not self.track_running_stats, exponential_average_factor, self.process_group, channel_last, self.fuse_relu) diff --git a/apex/parallel/optimized_sync_batchnorm_kernel.py b/apex/parallel/optimized_sync_batchnorm_kernel.py deleted file mode 100644 index 616847149..000000000 --- a/apex/parallel/optimized_sync_batchnorm_kernel.py +++ /dev/null @@ -1,119 +0,0 @@ -import torch -from torch.autograd.function import Function - -import syncbn -from apex.parallel import ReduceOp - -class SyncBatchnormFunction(Function): - - @staticmethod - def forward(ctx, input, z, weight, bias, running_mean, running_variance, eps, track_running_stats = True, momentum = 1.0, process_group = None, channel_last = False, fuse_relu = False): - input = input.contiguous() - world_size = 0 - - mean = None - var_biased = None - inv_std = None - var = None - out = None - count = None - if track_running_stats: - if channel_last: - count = int(input.numel()/input.size(-1)) - mean, var_biased = syncbn.welford_mean_var_c_last(input) - num_channels = input.size(-1) - else: - count = int(input.numel()/input.size(1)) - mean, var_biased = syncbn.welford_mean_var(input) - num_channels = input.size(1) - - if torch.distributed.is_initialized(): - if not process_group: - process_group = torch.distributed.group.WORLD - device = mean.device - world_size = torch.distributed.get_world_size(process_group) - - count_t = torch.empty(1, dtype=mean.dtype, device=mean.device).fill_(count) - combined = torch.cat([mean.view(-1), var_biased.view(-1), count_t], dim=0) - combined_list = [torch.empty_like(combined) for k in range(world_size)] - torch.distributed.all_gather(combined_list, combined, process_group) - combined = 
torch.stack(combined_list, dim=0) - mean_all, invstd_all, count_all = torch.split(combined, num_channels, dim=1) - count_all = count_all.view(-1) - mean, var, inv_std = syncbn.welford_parallel(mean_all, invstd_all, count_all.to(torch.int32), eps) - else: - device = mean.device - count_all = torch.cuda.IntTensor([count], device=device) - inv_std = 1.0 / torch.sqrt(var_biased + eps) - var = var_biased * (count) / (count-1) - - if count == 1 and world_size < 2: - raise ValueError('Expected more than 1 value per channel when training, got input size{}'.format(input.size())) - - r_m_inc = mean if running_mean.dtype != torch.float16 else mean.half() - r_v_inc = var if running_variance.dtype != torch.float16 else var.half() - running_mean.data = running_mean.data * (1-momentum) + momentum*r_m_inc - running_variance.data = running_variance.data * (1-momentum) + momentum*r_v_inc - else: - mean = running_mean.data - inv_std = 1.0 / torch.sqrt(running_variance.data + eps) - - ctx.save_for_backward(input, weight, mean, inv_std, z, bias, count_all.to(torch.int32)) - ctx.process_group = process_group - ctx.channel_last = channel_last - ctx.world_size = world_size - ctx.fuse_relu = fuse_relu - - if channel_last: - out = syncbn.batchnorm_forward_c_last(input, z, mean, inv_std, weight, bias, fuse_relu) - else: - out = syncbn.batchnorm_forward(input, mean, inv_std, weight, bias) - - return out - - @staticmethod - def backward(ctx, grad_output): - grad_output = grad_output.contiguous() - # mini batch mean & var are calculated by forward path. - # mu = 1./N*np.sum(h, axis = 0) - # var = 1./N*np.sum((h-mu)**2, axis = 0) - saved_input, weight, mean, inv_std, z, bias, count = ctx.saved_tensors - process_group = ctx.process_group - channel_last = ctx.channel_last - world_size = ctx.world_size - fuse_relu = ctx.fuse_relu - grad_input = grad_z = grad_weight = grad_bias = None - - if fuse_relu: - grad_output = syncbn.relu_bw_c_last(grad_output, saved_input, z, mean, inv_std, weight, bias) - if isinstance(z, torch.Tensor) and ctx.needs_input_grad[1]: - grad_z = grad_output.clone() - - # TODO: update kernel to not pre_divide by item_num - if channel_last: - sum_dy, sum_dy_xmu, grad_weight, grad_bias = syncbn.reduce_bn_c_last(grad_output, saved_input, mean, inv_std, weight) - else: - sum_dy, sum_dy_xmu, grad_weight, grad_bias = syncbn.reduce_bn(grad_output, saved_input, mean, inv_std, weight) - - # calculate grad_input - if ctx.needs_input_grad[0]: - - if torch.distributed.is_initialized(): - num_channels = sum_dy.shape[0] - combined = torch.cat([sum_dy, sum_dy_xmu], dim=0) - torch.distributed.all_reduce( - combined, torch.distributed.ReduceOp.SUM, process_group, async_op=False) - sum_dy, sum_dy_xmu = torch.split(combined, num_channels) - - if channel_last: - grad_input = syncbn.batchnorm_backward_c_last(grad_output, saved_input, mean, inv_std, weight, sum_dy, sum_dy_xmu, count) - else: - grad_input = syncbn.batchnorm_backward(grad_output, saved_input, mean, inv_std, weight, sum_dy, sum_dy_xmu, count) - - if weight is None or not ctx.needs_input_grad[2]: - grad_weight = None - - if weight is None or not ctx.needs_input_grad[3]: - grad_bias = None - - return grad_input, grad_z, grad_weight, grad_bias, None, None, None, None, None, None, None, None diff --git a/apex/parallel/sync_batchnorm.py b/apex/parallel/sync_batchnorm.py deleted file mode 100644 index fd1844363..000000000 --- a/apex/parallel/sync_batchnorm.py +++ /dev/null @@ -1,136 +0,0 @@ -import torch -from torch.nn.modules.batchnorm import _BatchNorm -from 
torch.nn import functional as F - -from .sync_batchnorm_kernel import SyncBatchnormFunction -from apex.parallel import ReduceOp - - -class SyncBatchNorm(_BatchNorm): - """ - synchronized batch normalization module extented from ``torch.nn.BatchNormNd`` - with the added stats reduction across multiple processes. - :class:`apex.parallel.SyncBatchNorm` is designed to work with - ``DistributedDataParallel``. - - When running in training mode, the layer reduces stats across all processes - to increase the effective batchsize for normalization layer. This is useful - in applications where batch size is small on a given process that would - diminish converged accuracy of the model. The model uses collective - communication package from ``torch.distributed``. - - When running in evaluation mode, the layer falls back to - ``torch.nn.functional.batch_norm``. - - Args: - num_features: :math:`C` from an expected input of size - :math:`(N, C, L)` or :math:`L` from input of size :math:`(N, L)` - eps: a value added to the denominator for numerical stability. - Default: 1e-5 - momentum: the value used for the running_mean and running_var - computation. Can be set to ``None`` for cumulative moving average - (i.e. simple average). Default: 0.1 - affine: a boolean value that when set to ``True``, this module has - learnable affine parameters. Default: ``True`` - track_running_stats: a boolean value that when set to ``True``, this - module tracks the running mean and variance, and when set to ``False``, - this module does not track such statistics and always uses batch - statistics in both training and eval modes. Default: ``True`` - - Example:: - - >>> sbn = apex.parallel.SyncBatchNorm(100).cuda() - >>> inp = torch.randn(10, 100, 14, 14).cuda() - >>> out = sbn(inp) - >>> inp = torch.randn(3, 100, 20).cuda() - >>> out = sbn(inp) - """ - - warned = False - - def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True, process_group=None, channel_last=False): - from apex import deprecated_warning - deprecated_warning("apex.parallel.SyncBatchNorm is deprecated and will be removed by the end of February 2023. Use `torch.nn.SyncBatchNorm`.") - if channel_last == True: - raise AttributeError("channel_last is not supported by primitive SyncBatchNorm implementation. Try install apex with `--cuda_ext` if channel_last is desired.") - - if not SyncBatchNorm.warned: - if hasattr(self, "syncbn_import_error"): - print("Warning: using Python fallback for SyncBatchNorm, possibly because apex was installed without --cuda_ext. 
The exception raised when attempting to import the cuda backend was: ", self.syncbn_import_error) - else: - print("Warning: using Python fallback for SyncBatchNorm") - SyncBatchNorm.warned = True - - super(SyncBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine, track_running_stats=track_running_stats) - self.process_group = process_group - - def _specify_process_group(self, process_group): - self.process_group = process_group - - def forward(self, input): - torch.cuda.nvtx.range_push("sync_bn_fw_with_mean_var") - mean = None - var = None - cast = None - out = None - - # casting to handle mismatch input type to layer type - if self.running_mean is not None: - if self.running_mean.dtype != input.dtype: - input = input.to(self.running_mean.dtype) - cast = input.dtype - elif self.weight is not None: - if self.weight.dtype != input.dtype: - input = input.to(self.weight.dtype) - cast = input.dtype - - if not self.training and self.track_running_stats: - # fall back to pytorch implementation for inference - torch.cuda.nvtx.range_pop() - out = F.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias, False, 0.0, self.eps) - else: - process_group = self.process_group - world_size = 1 - if not self.process_group: - process_group = torch.distributed.group.WORLD - self.num_batches_tracked += 1 - with torch.no_grad(): - channel_first_input = input.transpose(0, 1).contiguous() - squashed_input_tensor_view = channel_first_input.view( - channel_first_input.size(0), -1) - # total number of data points for each variance entry. Used to calculate unbiased variance estimate - m = None - local_m = float(squashed_input_tensor_view.size()[1]) - local_mean = torch.mean(squashed_input_tensor_view, 1) - local_sqr_mean = torch.pow( - squashed_input_tensor_view, 2).mean(1) - if torch.distributed.is_initialized(): - world_size = torch.distributed.get_world_size(process_group) - torch.distributed.all_reduce( - local_mean, ReduceOp.SUM, process_group) - mean = local_mean / world_size - torch.distributed.all_reduce( - local_sqr_mean, ReduceOp.SUM, process_group) - sqr_mean = local_sqr_mean / world_size - m = local_m * world_size - else: - m = local_m - mean = local_mean - sqr_mean = local_sqr_mean - # var(x) = E (( x - mean_x ) ** 2) - # = 1 / N * sum ( x - mean_x ) ** 2 - # = 1 / N * sum (x**2) - mean_x**2 - var = sqr_mean - mean.pow(2) - - if self.running_mean is not None: - self.running_mean = self.momentum * mean + \ - (1 - self.momentum) * self.running_mean - if self.running_var is not None: - # as noted by the paper, we used unbiased variance estimate of the mini-batch - # Var[x] = m / (m-1) * Eb (sample_variance) - self.running_var = m / \ - (m-1) * self.momentum * var + \ - (1 - self.momentum) * self.running_var - torch.cuda.nvtx.range_pop() - out = SyncBatchnormFunction.apply(input, self.weight, self.bias, mean, var, self.eps, process_group, world_size) - return out.to(cast) diff --git a/apex/parallel/sync_batchnorm_kernel.py b/apex/parallel/sync_batchnorm_kernel.py deleted file mode 100644 index e407a63da..000000000 --- a/apex/parallel/sync_batchnorm_kernel.py +++ /dev/null @@ -1,87 +0,0 @@ -import torch -from torch.autograd.function import Function - -from apex.parallel import ReduceOp - - -class SyncBatchnormFunction(Function): - - @staticmethod - def forward(ctx, input, weight, bias, running_mean, running_variance, eps, process_group, world_size): - torch.cuda.nvtx.range_push("sync_BN_fw") - # transpose it to channel last to support broadcasting for input 
with different rank - c_last_input = input.transpose(1, -1).contiguous().clone() - - ctx.save_for_backward(c_last_input, weight, bias, - running_mean, running_variance) - ctx.eps = eps - ctx.process_group = process_group - ctx.world_size = world_size - - c_last_input = (c_last_input - running_mean) / \ - torch.sqrt(running_variance + eps) - - if weight is not None: - c_last_input = c_last_input * weight - if bias is not None: - c_last_input = c_last_input + bias - - torch.cuda.nvtx.range_pop() - return c_last_input.transpose(1, -1).contiguous().clone() - - @staticmethod - def backward(ctx, grad_output): - torch.cuda.nvtx.range_push("sync_BN_bw") - # mini batch mean & var are calculated by forward path. - # mu = 1./N*np.sum(h, axis = 0) - # var = 1./N*np.sum((h-mu)**2, axis = 0) - c_last_input, weight, bias, running_mean, running_variance = ctx.saved_tensors - - eps = ctx.eps - process_group = ctx.process_group - world_size = ctx.world_size - grad_input = grad_weight = grad_bias = None - num_features = running_mean.size()[0] - - # transpose it to channel last to support broadcasting for input with different rank - torch.cuda.nvtx.range_push("carilli field") - c_last_grad = grad_output.transpose(1, -1).contiguous() - # squash non-channel dimension so we can easily calculate mean - c_grad = c_last_grad.view(-1, num_features).contiguous() - torch.cuda.nvtx.range_pop() - - # calculate grad_input - if ctx.needs_input_grad[0]: - # dh = gamma * (var + eps)**(-1. / 2.) * (dy - np.mean(dy, axis=0) - # - (h - mu) * (var + eps)**(-1.0) * np.mean(dy * (h - mu), axis=0)) - mean_dy = c_grad.mean(0) - mean_dy_xmu = (c_last_grad * (c_last_input - - running_mean)).view(-1, num_features).mean(0) - if torch.distributed.is_initialized(): - torch.distributed.all_reduce( - mean_dy, ReduceOp.SUM, process_group) - mean_dy = mean_dy / world_size - torch.distributed.all_reduce( - mean_dy_xmu, ReduceOp.SUM, process_group) - mean_dy_xmu = mean_dy_xmu / world_size - c_last_grad_input = (c_last_grad - mean_dy - (c_last_input - running_mean) / ( - running_variance + eps) * mean_dy_xmu) / torch.sqrt(running_variance + eps) - if weight is not None: - c_last_grad_input.mul_(weight) - grad_input = c_last_grad_input.transpose(1, -1).contiguous() - - # calculate grad_weight - grad_weight = None - if weight is not None and ctx.needs_input_grad[1]: - # dgamma = np.sum((h - mu) * (var + eps)**(-1. / 2.) * dy, axis=0) - grad_weight = ((c_last_input - running_mean) / torch.sqrt( - running_variance + eps) * c_last_grad).view(-1, num_features).sum(0) - - # calculate grad_bias - grad_bias = None - if bias is not None and ctx.needs_input_grad[2]: - # dbeta = np.sum(dy, axis=0) - grad_bias = c_grad.sum(0) - - torch.cuda.nvtx.range_pop() - return grad_input, grad_weight, grad_bias, None, None, None, None, None diff --git a/examples/simple/distributed/README.md b/examples/simple/distributed/README.md deleted file mode 100644 index 0d939cbbf..000000000 --- a/examples/simple/distributed/README.md +++ /dev/null @@ -1,13 +0,0 @@ -**distributed_data_parallel.py** and **run.sh** show an example using Amp with -[apex.parallel.DistributedDataParallel](https://nvidia.github.io/apex/parallel.html) or -[torch.nn.parallel.DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#distributeddataparallel) -and the Pytorch multiprocess launcher script, -[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility). 
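Since the Amp-based walkthrough below is deleted by this patch, a reader keeping the example alive would rely on upstream PyTorch only. The sketch that follows shows the same training loop with `torch.nn.parallel.DistributedDataParallel` and PyTorch's native `torch.cuda.amp`; it is an illustration added here, not part of the original example, and it assumes a `torch.distributed.launch`/`torchrun`-style launcher that sets the usual rendezvous environment variables:

```python
# Hypothetical upstream-only rewrite of the deleted example: torch DDP + native AMP.
import os
import torch

local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)
torch.distributed.init_process_group(backend="nccl", init_method="env://")

N, D_in, D_out = 64, 1024, 16
x = torch.randn(N, D_in, device="cuda")
y = torch.randn(N, D_out, device="cuda")

model = torch.nn.Linear(D_in, D_out).cuda()
model = torch.nn.parallel.DistributedDataParallel(
    model, device_ids=[local_rank], output_device=local_rank)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler()   # takes over the role of amp.scale_loss
loss_fn = torch.nn.MSELoss()

for _ in range(500):
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():    # takes over the casting that amp.initialize managed
        loss = loss_fn(model(x), y)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
```

The ordering caveat described next applies to the Amp-based version; with native autocast there is no `amp.initialize` call to sequence against.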
-The use of `Amp` with DistributedDataParallel does not need to change from ordinary -single-process use. The only gotcha is that wrapping your model with `DistributedDataParallel` must -come after the call to `amp.initialize`. Test via -```bash -bash run.sh -``` - -**This is intended purely as an instructional example, not a performance showcase.** diff --git a/examples/simple/distributed/distributed_data_parallel.py b/examples/simple/distributed/distributed_data_parallel.py deleted file mode 100644 index b364405df..000000000 --- a/examples/simple/distributed/distributed_data_parallel.py +++ /dev/null @@ -1,65 +0,0 @@ -import torch -import argparse -import os -from apex import amp -# FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead) -from apex.parallel import DistributedDataParallel - -parser = argparse.ArgumentParser() -# FOR DISTRIBUTED: Parse for the local_rank argument, which will be supplied -# automatically by torch.distributed.launch. -parser.add_argument("--local_rank", default=0, type=int) -args = parser.parse_args() - -# FOR DISTRIBUTED: If we are running under torch.distributed.launch, -# the 'WORLD_SIZE' environment variable will also be set automatically. -args.distributed = False -if 'WORLD_SIZE' in os.environ: - args.distributed = int(os.environ['WORLD_SIZE']) > 1 - -if args.distributed: - # FOR DISTRIBUTED: Set the device according to local_rank. - torch.cuda.set_device(args.local_rank) - - # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will provide - # environment variables, and requires that you use init_method=`env://`. - torch.distributed.init_process_group(backend='nccl', - init_method='env://') - -torch.backends.cudnn.benchmark = True - -N, D_in, D_out = 64, 1024, 16 - -# Each process receives its own batch of "fake input data" and "fake target data." -# The "training loop" in each process just uses this fake batch over and over. -# https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more realistic -# example of distributed data sampling for both training and validation. -x = torch.randn(N, D_in, device='cuda') -y = torch.randn(N, D_out, device='cuda') - -model = torch.nn.Linear(D_in, D_out).cuda() -optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - -model, optimizer = amp.initialize(model, optimizer, opt_level="O1") - -if args.distributed: - # FOR DISTRIBUTED: After amp.initialize, wrap the model with - # apex.parallel.DistributedDataParallel. 
- model = DistributedDataParallel(model) - # torch.nn.parallel.DistributedDataParallel is also fine, with some added args: - # model = torch.nn.parallel.DistributedDataParallel(model, - # device_ids=[args.local_rank], - # output_device=args.local_rank) - -loss_fn = torch.nn.MSELoss() - -for t in range(500): - optimizer.zero_grad() - y_pred = model(x) - loss = loss_fn(y_pred, y) - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - optimizer.step() - -if args.local_rank == 0: - print("final loss = ", loss) diff --git a/examples/simple/distributed/run.sh b/examples/simple/distributed/run.sh deleted file mode 100644 index 7a2d85f0a..000000000 --- a/examples/simple/distributed/run.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py diff --git a/tests/L0/run_amp/__init__.py b/tests/L0/run_amp/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/L0/run_amp/test_add_param_group.py b/tests/L0/run_amp/test_add_param_group.py deleted file mode 100644 index d3e90c433..000000000 --- a/tests/L0/run_amp/test_add_param_group.py +++ /dev/null @@ -1,148 +0,0 @@ -import unittest - -import functools as ft -import itertools as it - -from apex import amp -from apex.amp import _amp_state -import torch -from torch import nn -import torch.nn.functional as F -from torch.nn import Parameter - -from utils import common_init, HALF, FLOAT,\ - ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT - -class MyModel(torch.nn.Module): - def __init__(self, unique): - super(MyModel, self).__init__() - self.weight0 = Parameter(unique + - torch.arange(2, device='cuda', dtype=torch.float32)) - self.weight1 = Parameter(1. + unique + torch.arange(2, device='cuda', dtype=torch.float16)) - - @staticmethod - def ops(input, weight0, weight1): - return ((input*(weight0.float()))*(weight1.float())).sum() - - def forward(self, input): - return self.ops(input, self.weight0, self.weight1) - - -# Abandon all hope, ye who enter here. 
- - -class TestAddParamGroup(unittest.TestCase): - def setUp(self): - self.x = torch.ones((2), device='cuda', dtype=torch.float32) - common_init(self) - - def tearDown(self): - pass - - def zero_grad(self, models, optimizer, how_to_zero): - if how_to_zero == "none": - for model in models: - for param in model.parameters(): - param.grad = None - elif how_to_zero == "model": - for model in models: - model.zero_grad() - elif how_to_zero == "optimizer": - optimizer.zero_grad() - - def test_add_param_group(self): - for opt_level in ("O0", "O1", "O2", "O3"): - for zero_before_add in (True, False): - for try_accumulation in (True, False): - model0 = MyModel(1) - model1 = MyModel(2) - - optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}], - momentum=0.125) - - optimizer.zero_grad() - loss = model0(self.x) - loss.backward() - optimizer.step() - - if zero_before_add: - optimizer.zero_grad() - optimizer.add_param_group({'params' : model1.parameters(), 'lr' : 0.5}) - if not zero_before_add: - optimizer.zero_grad() - - loss = model0(self.x) + model1(self.x) - loss.backward(retain_graph=try_accumulation) - if try_accumulation: - loss.backward() - optimizer.step() - - # Once more to make sure the new params pick up momemtums properly - optimizer.zero_grad() - loss = model0(self.x) + model1(self.x) - loss.backward(retain_graph=try_accumulation) - if try_accumulation: - loss.backward() - optimizer.step() - - reference_params = [param.data.clone() for param in model0.parameters()] + \ - [param.data.clone() for param in model1.parameters()] - - for how_to_zero in "none", "model", "optimizer": - model0 = MyModel(1) - model1 = MyModel(2) - - optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}], - momentum=0.125) - - _amp_state.allow_incoming_model_not_fp32 = True - [model0, model1], optimizer = amp.initialize([model0, model1], - optimizer, - opt_level=opt_level, - verbosity=0, - cast_model_type=False) - _amp_state.allow_incoming_model_not_fp32 = False - - _amp_state.loss_scalers[0]._loss_scale = 4.0 - - self.zero_grad([model0, model1], optimizer, how_to_zero) - loss = model0(self.x) - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - optimizer.step() - - if zero_before_add: - self.zero_grad([model0, model1], optimizer, how_to_zero) - optimizer.add_param_group({'params' : model1.parameters(), 'lr' : 0.5}) - if not zero_before_add: - self.zero_grad([model0, model1], optimizer, how_to_zero) - - loss = model0(self.x) + model1(self.x) - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward(retain_graph=try_accumulation) - if try_accumulation: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - optimizer.step() - - # Once more to make sure the new params pick up momentums properly - self.zero_grad([model0, model1], optimizer, how_to_zero) - loss = model0(self.x) + model1(self.x) - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward(retain_graph=try_accumulation) - if try_accumulation: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - optimizer.step() - - final_params = [param.data.clone() for param in model0.parameters()] + \ - [param.data.clone() for param in model1.parameters()] - - for reference, final in zip(reference_params, final_params): - self.assertTrue(torch.allclose(reference.to(final.dtype), final), - "opt_level = {}, how_to_zero = {}, zero_before_add = {}".format( - opt_level, how_to_zero, zero_before_add)) - - -if 
__name__ == '__main__': - unittest.main() diff --git a/tests/L0/run_amp/test_basic_casts.py b/tests/L0/run_amp/test_basic_casts.py deleted file mode 100644 index 5d4d81d1a..000000000 --- a/tests/L0/run_amp/test_basic_casts.py +++ /dev/null @@ -1,143 +0,0 @@ -import unittest - -import functools as ft -import itertools as it - -from apex import amp -import torch -from torch import nn -import torch.nn.functional as F - -from utils import common_init, HALF, FLOAT,\ - ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT - -def run_layer_test(test_case, fns, expected, input_shape, test_backward=True): - for fn, typ in it.product(fns, expected.keys()): - x = torch.randn(input_shape, dtype=typ).requires_grad_() - y = fn(x) - test_case.assertEqual(y.type(), expected[typ]) - if test_backward: - y.float().sum().backward() - test_case.assertEqual(x.grad.type(), MATCH_INPUT[typ]) - -class TestBasicCasts(unittest.TestCase): - def setUp(self): - self.handle = amp.init(enabled=True) - common_init(self) - - def tearDown(self): - self.handle._deactivate() - - def test_linear_is_half(self): - m = nn.Linear(self.h, self.h) - f = ft.partial(F.linear, weight=m.weight, bias=m.bias) - run_layer_test(self, [m, f], ALWAYS_HALF, (self.b, self.h)) - - def test_conv2d_is_half(self): - m = nn.Conv2d(self.c, self.c, self.k) - f = ft.partial(F.conv2d, weight=m.weight, bias=m.bias) - run_layer_test(self, [m, f], ALWAYS_HALF, (self.b, self.c, self.h, self.h)) - - def test_softmax_is_float(self): - m = nn.Softmax(dim=1) - f = ft.partial(F.softmax, dim=1) - run_layer_test(self, [m, f], ALWAYS_FLOAT, (self.b, self.h)) - - def test_group_norm_is_float(self): - m = nn.GroupNorm(num_groups=4, num_channels=self.c) - run_layer_test(self, [m], ALWAYS_FLOAT, (self.b, self.c, self.h, self.h)) - - def test_mse_loss_is_float(self): - shape = (self.b, self.h) - target = torch.randn(shape) - mod = nn.MSELoss() - m = lambda x: mod(x, target) - f = ft.partial(F.mse_loss, target=target) - run_layer_test(self, [m], ALWAYS_FLOAT, shape) - - def test_relu_is_match(self): - run_layer_test(self, [nn.ReLU(), F.relu], MATCH_INPUT, (self.b, self.h)) - - def test_batch_norm_is_match(self): - m = nn.BatchNorm2d(num_features=self.c) - f = ft.partial(F.batch_norm, running_mean=m.running_mean, running_var=m.running_var, - weight=m.weight, bias=m.bias, training=True) - run_layer_test(self, [m], MATCH_INPUT, (self.b, self.c, self.h, self.h)) - - # Test forward-only for BN inference - m.eval() - f = ft.partial(F.batch_norm, running_mean=m.running_mean, running_var=m.running_var, - weight=m.weight, bias=m.bias, training=False) - run_layer_test(self, [m, f], MATCH_INPUT, (self.b, self.c, self.h, self.h), - test_backward=False) - -class TestBannedMethods(unittest.TestCase): - def setUp(self): - self.handle = amp.init(enabled=True) - common_init(self) - - def tearDown(self): - self.handle._deactivate() - - def bce_common(self, assertion): - shape = (self.b, self.h) - target = torch.rand(shape) - mod = nn.BCELoss() - m = lambda x: mod(x, target) - f = ft.partial(F.binary_cross_entropy, target=target) - for fn in [m, f]: - x = torch.rand(shape, dtype=torch.half) - assertion(fn, x) - - def test_bce_raises_by_default(self): - assertion = lambda fn, x: self.assertRaises(NotImplementedError, fn, x) - self.bce_common(assertion) - - def test_bce_is_float_with_allow_banned(self): - self.handle._deactivate() - self.handle = amp.init(enabled=True, allow_banned=True) - assertion = lambda fn, x: self.assertEqual(fn(x).type(), FLOAT) - self.bce_common(assertion) - -class 
TestTensorCasts(unittest.TestCase): - def setUp(self): - self.handle = amp.init(enabled=True) - common_init(self) - - def tearDown(self): - self.handle._deactivate() - - def test_matmul_method_is_half(self): - other = torch.randn(self.h, self.h) - lhs = lambda x: x.matmul(other) - rhs = lambda x: other.matmul(x) - run_layer_test(self, [lhs, rhs], ALWAYS_HALF, (self.h, self.h)) - - def test_matmul_op_is_half(self): - other = torch.randn(self.h, self.h) - lhs = lambda x: x @ other - rhs = lambda x: other @ x - run_layer_test(self, [lhs, rhs], ALWAYS_HALF, (self.h, self.h)) - - def test_pow_method_is_float(self): - fn = lambda x: x.pow(2.) - run_layer_test(self, [fn], ALWAYS_FLOAT, (self.b, self.h)) - - def test_pow_op_is_float(self): - fn = lambda x: x ** 2. - run_layer_test(self, [fn], ALWAYS_FLOAT, (self.b, self.h)) - - def test_cpu_is_float(self): - fn = lambda x: x.cpu() - always_cpu_float = {torch.float: 'torch.FloatTensor', - torch.half: 'torch.FloatTensor'} - run_layer_test(self, [fn], always_cpu_float, (self.b, self.h)) - - def test_sum_is_float(self): - fn = lambda x: x.sum() - run_layer_test(self, [fn], ALWAYS_FLOAT, (self.b, self.h)) - - # TODO: maybe more tests on disabled casting? - -if __name__ == '__main__': - unittest.main() diff --git a/tests/L0/run_amp/test_cache.py b/tests/L0/run_amp/test_cache.py deleted file mode 100644 index b58d2665f..000000000 --- a/tests/L0/run_amp/test_cache.py +++ /dev/null @@ -1,137 +0,0 @@ -import unittest - -import functools as ft -import itertools as it - -from apex import amp -from apex.amp import _amp_state -import torch -from torch import nn -import torch.nn.functional as F - -from utils import common_init, HALF, FLOAT,\ - ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT - -def get_reference_grad(i, w, ops): - # Creating new tensors ensures, among other things, that the new tensors are not in the cache. - # In fact, they are guaranteed not to use the cache because they are not torch.nn.Parameters. 
- fp32_i = i.detach().clone().float() - fp32_w = w.detach().clone().float().requires_grad_() - loss = ops(fp32_i, fp32_w) - loss.backward() - return fp32_w.grad - -class WhitelistModule(torch.nn.Module): - def __init__(self, dtype): - super(WhitelistModule, self).__init__() - self.weight = torch.nn.Parameter(torch.arange(8*8, device='cuda', dtype=dtype).view(8,8)) - - @staticmethod - def ops(input, weight): - return (input.mm(weight)).mm(weight).sum() - - def forward(self, input): - return self.ops(input, self.weight) - - -class BlacklistModule(torch.nn.Module): - def __init__(self, dtype): - super(BlacklistModule, self).__init__() - self.weight = torch.nn.Parameter(torch.arange(2*8, device='cuda', dtype=dtype).view(2,8)) - - @staticmethod - def ops(input, weight): - return (input + torch.pow(weight, 2) + torch.pow(weight, 2)).sum() - - def forward(self, input): - return self.ops(input, self.weight) - - -class PromoteModule(torch.nn.Module): - def __init__(self, dtype): - super(PromoteModule, self).__init__() - self.weight = torch.nn.Parameter(torch.arange(2*8, device='cuda', dtype=dtype).view(2,8)) - - @staticmethod - def ops(input, weight): - return ((input*weight)*weight).sum() - - def forward(self, input): - return self.ops(input, self.weight) - -class TestCache(unittest.TestCase): - def setUp(self): - self.x = torch.ones((2, 8), device='cuda', dtype=torch.float32) - common_init(self) - - def tearDown(self): - pass - - def train_eval_train_test(self, module, t): - model = module(t).cuda() - optimizer = torch.optim.SGD(model.parameters(), lr=1.0) - - _amp_state.allow_incoming_model_not_fp32 = True - model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) - _amp_state.allow_incoming_model_not_fp32 = False - - def training_step(): - for param in model.parameters(): - param.grad = None - - loss = model(self.x).sum() - _amp_state.loss_scalers[0]._loss_scale = 4.0 - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - - self.assertEqual(len([p.grad for p in model.parameters() if p.grad is not None]), 1) - self.assertEqual(model.weight.grad.type(), model.weight.type()) - - reference_grad = get_reference_grad(self.x, model.weight, model.ops) - - # Currently there's no difference in the allclose calls, so no need for branching, - # but I'm keeping this in case we want different tolerances for fp16 and fp32 checks. - if model.weight.grad.type() == "torch.cuda.HalfTensor": - self.assertTrue(torch.allclose(model.weight.grad.float(), reference_grad)) - elif model.weight.grad.type() == "torch.cuda.FloatTensor": - self.assertTrue(torch.allclose(model.weight.grad.float(), reference_grad)) - else: - raise RuntimeError("model.weight.grad.type = {}".format(model.weight.grad.type())) - - model.weight.data -= 1. - - # Simulates first epoch - training_step() - - # Simulates eval - with torch.no_grad(): - loss = model(self.x).sum() - - # Simulates resuming training after eval - training_step() - - _amp_state.handle._deactivate() - - # I could easily have these as a set of for loops in a single test, - # instead of going for granularity. 
- def test_whitelist_module_fp16_weight(self): - self.train_eval_train_test(WhitelistModule, torch.float16) - - def test_whitelist_module_fp32_weight(self): - self.train_eval_train_test(WhitelistModule, torch.float32) - - def test_blacklist_module_fp16_weight(self): - self.train_eval_train_test(BlacklistModule, torch.float16) - - def test_blacklist_module_fp32_weight(self): - self.train_eval_train_test(BlacklistModule, torch.float32) - - def test_promote_module_fp16_weight(self): - self.train_eval_train_test(PromoteModule, torch.float16) - - def test_promote_module_fp32_weight(self): - self.train_eval_train_test(PromoteModule, torch.float32) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/L0/run_amp/test_checkpointing.py b/tests/L0/run_amp/test_checkpointing.py deleted file mode 100644 index 921985cd7..000000000 --- a/tests/L0/run_amp/test_checkpointing.py +++ /dev/null @@ -1,267 +0,0 @@ -import unittest - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim - -from apex import amp - - -from utils import common_init, FLOAT - - -class MyModel(torch.nn.Module): - def __init__(self): - super(MyModel, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 3, 1, 1) - self.bn1 = nn.BatchNorm2d(6) - self.param = nn.Parameter(torch.randn(1)) - - def forward(self, x): - x = x * self.param - x = F.relu(self.conv1(x)) - x = self.bn1(x) - return x - - -class TestCheckpointing(unittest.TestCase): - def setUp(self): - self.initial_lr = 1e-3 - self.test_opt_levels = ("O0", "O1", "O2", "O3") - - def seed(self): - torch.manual_seed(2809) - torch.backends.cudnn.benchmark = False - torch.backends.cudnn.deterministic = True - - def check_state_dict_fp32(self, state_dict): - for key in state_dict: - if 'num_batches_tracked' in key: - continue - param = state_dict[key] - self.assertEqual(param.type(), FLOAT, - 'Parameter in state_dict not FLOAT') - - def train_step(self, model, optimizer, data, loss_ids): - optimizer.zero_grad() - - output = model(data) - - # Call backward for num_losses-1 - for idx in loss_ids: - loss = output.mean() - with amp.scale_loss(loss, optimizer, loss_id=idx) as scaled_loss: - scaled_loss.backward(retain_graph=True) - - optimizer.step() - return output - - def compare_models(self, modelA, modelB, test_setup=''): - state_dictA = modelA.state_dict() - state_dictB = modelB.state_dict() - self.assertEqual(len(state_dictA), len(state_dictB), - 'state_dicts have different lengths' + test_setup) - for key in state_dictA: - paramA = state_dictA[key] - paramB = state_dictB[key] - self.assertTrue((paramA==paramB).all(), - msg='Parameters in state_dices not equal.' 
+ - 'key: {}\nparam: {}\nrestored: {}\ndiff: {} for {}'.format( - key, paramA, paramB, paramA - paramB, test_setup)) - - def test_restoring(self): - nb_epochs = 10 - nb_epochs_restore = nb_epochs // 2 - for opt_level in self.test_opt_levels: - for res_opt_level in self.test_opt_levels: - for amp_before_load in [True, False]: - for num_losses in range(1, 3): - test_setup = ('#' * 75 + '\n' + \ - f'opt_level {opt_level}\n' + \ - f'restore_opt_level {res_opt_level}\n' + \ - f'amp_before_load {amp_before_load}\n' + \ - f'num_losses {num_losses}\n') - - self.seed() - - # Create reference model - model = MyModel().to('cuda') - - optimizer = optim.SGD(model.parameters(), - lr=self.initial_lr) - - # Initialize with num_losses*2 for the original model and the restored one - model, optimizer = amp.initialize( - model, optimizer, opt_level=opt_level, - num_losses=num_losses*2, verbosity=0) - - # Compare training behavior for same restore option - # We cannot really generalize it, since a saved model in O0 - # would introduce a skipped step in O1, which will raise an error - if opt_level == res_opt_level: - # train for nb_epochs and restore after nb_epochs_restore - for epoch in range(nb_epochs): - - x = torch.randn(16, 3, 24, 24, device='cuda') - output = self.train_step( - model, optimizer, x, range(num_losses)) - # Initialize model one step before comparing. - # Otherwise the batchnorm layers will be updated - # additionally in restore_model - if epoch == (nb_epochs_restore - 1): - # Load model and optimizer - checkpoint = { - 'model': model.state_dict(), - 'optimizer': optimizer.state_dict(), - 'amp': amp.state_dict() - } - # Check state_dict for FP32 tensors - self.check_state_dict_fp32(checkpoint['model']) - - # Restore model - restore_model = MyModel().to('cuda') - restore_optimizer = optim.SGD( - restore_model.parameters(), - lr=self.initial_lr) - - if amp_before_load: - restore_model, restore_optimizer = amp.initialize( - restore_model, - restore_optimizer, - opt_level=res_opt_level, - num_losses=num_losses*2, - verbosity=0) - - restore_model.load_state_dict(checkpoint['model']) - restore_optimizer.load_state_dict(checkpoint['optimizer']) - # FIXME: We cannot test the amp.state_dict in the same script - # amp.load_state_dict(checkpoint['amp']) - - if not amp_before_load: - restore_model, restore_optimizer = amp.initialize( - restore_model, - restore_optimizer, - opt_level=res_opt_level, - num_losses=num_losses*2, - verbosity=0) - - elif epoch >= nb_epochs_restore: - restore_output = self.train_step( - restore_model, - restore_optimizer, - x, - range(num_losses, num_losses*2)) - self.assertTrue( - torch.allclose(output.float(), restore_output.float()), - 'Output of reference and restored models differ for ' + test_setup) - self.compare_models(model, restore_model, test_setup) - # if opt_level != res_opt_level - else: - # skip tests for different opt_levels - continue - - def test_loss_scale_decrease(self): - num_losses = 3 - nb_decrease_loss_scales = [0, 1, 2] - for opt_level in self.test_opt_levels: - #print('#' * 75 + f'\n opt_level {opt_level}\n') - # Create new tmp copy for this run - nb_decrease_loss_scales_tmp = list(nb_decrease_loss_scales) - - model = MyModel().to('cuda') - - optimizer = optim.SGD(model.parameters(), - lr=self.initial_lr) - - model, optimizer = amp.initialize( - model, optimizer, opt_level=opt_level, num_losses=num_losses, - verbosity=0) - - if amp._amp_state.opt_properties.loss_scale != 'dynamic': - #print('Static loss scale set. 
Skipping opt_level.') - continue - - # force to skip some updates to decrease the loss_scale - initial_loss_scales = [] - for idx in range(num_losses): - initial_loss_scales.append( - amp._amp_state.loss_scalers[idx].loss_scale()) - - for _ in range(len(nb_decrease_loss_scales)): - x = torch.randn(16, 3, 24, 24, device='cuda') - for idx in range(num_losses): - while nb_decrease_loss_scales_tmp[idx] > 0: - optimizer.zero_grad() - output = model(x * 2**17) - loss = output.mean() - - with amp.scale_loss(loss, optimizer, loss_id=idx) as scaled_loss: - scaled_loss.backward(retain_graph=True) - optimizer.step() - nb_decrease_loss_scales_tmp[idx] -= 1 - - # Check loss scales afterwards - updated_loss_scales = [] - for idx in range(num_losses): - updated_loss_scales.append( - amp._amp_state.loss_scalers[idx].loss_scale()) - for factor, update_ls, init_ls in zip(nb_decrease_loss_scales, - updated_loss_scales, - initial_loss_scales): - self.assertEqual(update_ls, init_ls / 2**factor) - - # Check state dict - amp_state_dict = amp.state_dict() - for scaler_idx, factor, init_ls in zip(amp_state_dict, - nb_decrease_loss_scales, - initial_loss_scales): - scaler = amp_state_dict[scaler_idx] - self.assertEqual(scaler['loss_scale'], init_ls / 2**factor) - unskipped_target = 0 - self.assertEqual(scaler['unskipped'], unskipped_target) - - def test_state_dict(self): - for opt_level in self.test_opt_levels: - # Skip O3 - if opt_level == 'O3': - continue - - model = MyModel().to('cuda') - optimizer = optim.Adam(model.parameters(), lr=1e-3) - model, optimizer = amp.initialize( - model, optimizer, opt_level=opt_level, verbosity=0) - - # Export state_dict and check for Half - state_dict = model.state_dict() - for key in state_dict: - self.assertFalse('Half' in state_dict[key].type()) - - # Check, if model is still trainable - # Create dummy data - data = torch.randn(10, 3, 4, 4, device='cuda') - target = torch.randn(10, 6, 4, 4, device='cuda') - - # Get initnial loss - optimizer.zero_grad() - output = model(data) - loss = F.mse_loss(output, target) - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - optimizer.step() - last_loss = loss.item() - - # train for some epochs - for epoch in range(10): - optimizer.zero_grad() - output = model(data) - loss = F.mse_loss(output, target) - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - optimizer.step() - self.assertTrue(loss.item() < last_loss) - last_loss = loss.item() - -if __name__=='__main__': - unittest.main() - diff --git a/tests/L0/run_amp/test_fused_sgd.py b/tests/L0/run_amp/test_fused_sgd.py deleted file mode 100644 index 7f592128d..000000000 --- a/tests/L0/run_amp/test_fused_sgd.py +++ /dev/null @@ -1,794 +0,0 @@ -import unittest - -import functools as ft -import itertools as it - -from apex import amp -from apex.amp import _amp_state -import torch -from torch import nn -import torch.nn.functional as F -from torch.nn import Parameter - -from utils import common_init, HALF, FLOAT,\ - ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT - - -try: - import amp_C - disabled = False - from apex.optimizers import FusedSGD as FusedSGD -except ImportError as err: - print("amp_C fused kernels unavailable, disabling TestMultiTensorApply. ImportError was ", err) - disabled = True - - -class MyModel(torch.nn.Module): - def __init__(self, unique): - super(MyModel, self).__init__() - self.weight0 = Parameter(unique + - torch.arange(2, device='cuda', dtype=torch.float32)) - self.weight1 = Parameter(1. 
+ unique + torch.arange(2, device='cuda', dtype=torch.float16)) - - @staticmethod - def ops(input, weight0, weight1): - return ((input*(weight0.float()))*(weight1.float())).sum() - - def forward(self, input): - return self.ops(input, self.weight0, self.weight1) - -# Abandon all hope, ye who enter here. - -# This is hands down the ugliest code I have ever written, but it succeeds in testing -# multiple models/optimizers/losses fairly thoroughly. Many of the different test cases -# require slightly divergent code in a way that seems near-impossible to genericize into a simple -# cross product or nested loops. - -class TestMultipleModelsOptimizersLosses(unittest.TestCase): - def setUp(self): - self.x = torch.ones((2), device='cuda', dtype=torch.float32) - common_init(self) - - def tearDown(self): - pass - - @unittest.skipIf(disabled, "amp_C is unavailable") - def test_2models2losses1optimizer(self): - model0 = MyModel(1) - model1 = MyModel(2) - - optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, - {'params' : model1.parameters(), 'lr' : 0.5}], - momentum=0.125) - - reference_grads = [] - for i in range(2): - optimizer.zero_grad() - loss0 = model0(self.x) - loss1 = model1(self.x) - loss0.backward() - loss1.backward() - - reference_grads.append([param.grad.data.clone() for param in model0.parameters()] + - [param.grad.data.clone() for param in model1.parameters()]) - - optimizer.step() - - final_params = [param.data.clone() for param in model0.parameters()] + \ - [param.data.clone() for param in model1.parameters()] - - for materialize_master_grads in (False, True): - for opt_level in ("O0", "O1", "O2", "O3"): - for how_to_zero in ("none", "model", "optimizer"): - for use_multiple_loss_scalers in (False, True): - if opt_level == "O1" or opt_level == "O2": - inject_inf_iters = (-1, 0, 1) - else: - inject_inf_iters = (-1,) - - for inject_inf in inject_inf_iters: - if inject_inf >= 0: - inject_inf_locs = ("fp16", "fp32") - which_backwards = (0, 1) - else: - inject_inf_locs = ("fdsa",) - which_backwards = (None,) - - for inject_inf_loc in inject_inf_locs: - for which_backward in which_backwards: - if use_multiple_loss_scalers: - num_losses = 2 - loss_ids = [0, 1] - else: - num_losses = 1 - loss_ids = [0, 0] - - if inject_inf >= 0: - iters = 3 - else: - iters = 2 - - model0 = MyModel(1) - model1 = MyModel(2) - - models = [model0, model1] - - optimizer = FusedSGD([{'params' : model0.parameters(), 'lr' : 0.25}, - {'params' : model1.parameters(), 'lr' : 0.5}], - momentum=0.125, - materialize_master_grads=materialize_master_grads) - - _amp_state.allow_incoming_model_not_fp32 = True - [model0, model1], optimizer = amp.initialize( - [model0, model1], - optimizer, - opt_level=opt_level, - verbosity=0, - cast_model_type=False, - num_losses=num_losses) - _amp_state.allow_incoming_model_not_fp32 = False - - _amp_state.loss_scalers[0]._loss_scale = 4.0 - if use_multiple_loss_scalers: - _amp_state.loss_scalers[1]._loss_scale = 16.0 - - unskipped = 0 - for i in range(iters): - if how_to_zero == "none": - for model in models: - for param in model.parameters(): - param.grad = None - elif how_to_zero == "model": - for model in models: - model.zero_grad() - else: - optimizer.zero_grad() - - loss0 = model0(self.x) - loss1 = model1(self.x) - - with amp.scale_loss(loss0, optimizer, loss_id=loss_ids[0]) as scaled_loss: - scaled_loss.backward() - if i == inject_inf and which_backward == 0: - if inject_inf_loc == "fp32": - model0.weight0.grad[0] = float('inf') - elif inject_inf_loc == "fp16": - 
model0.weight1.grad[0] = float('inf') - with amp.scale_loss(loss1, optimizer, loss_id=loss_ids[1]) as scaled_loss: - scaled_loss.backward() - if i == inject_inf and which_backward == 1: - if inject_inf_loc == "fp32": - model1.weight0.grad[0] = float('inf') - elif inject_inf_loc == "fp16": - model1.weight1.grad[0] = float('inf') - - if i != inject_inf: - master_params = amp.master_params(optimizer) - for param, reference_grad in zip(master_params, reference_grads[unskipped]): - if opt_level == "O2" and not materialize_master_grads: - continue - else: - self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()), - "opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers)) - unskipped += 1 - optimizer.step() - - model_params = [p for p in model0.parameters()] + [p for p in model1.parameters()] - for model, master, reference in zip( - model_params, - amp.master_params(optimizer), - final_params): - self.assertTrue(torch.allclose(model, reference)) - self.assertTrue(torch.allclose(model, master.to(model.dtype))) - - if opt_level == "O1": - _amp_state.handle._deactivate() - - @unittest.skipIf(disabled, "amp_C is unavailable") - def test_3models2losses1optimizer(self): - - model0 = MyModel(1) - model1 = MyModel(2) - model2 = MyModel(3) - - optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, - {'params' : model1.parameters(), 'lr' : 0.5}, - {'params' : model2.parameters(), 'lr' : 0.125}], - momentum=0.125) - - reference_grads = [] - for i in range(2): - optimizer.zero_grad() - loss0 = model0(self.x) + model2(self.x) - loss1 = model1(self.x) + model2(self.x) - loss0.backward() - loss1.backward() - - reference_grads.append([param.grad.data.clone() for param in model0.parameters()] + - [param.grad.data.clone() for param in model1.parameters()] + - [param.grad.data.clone() for param in model2.parameters()]) - - optimizer.step() - - - final_params = [param.data.clone() for param in model0.parameters()] + \ - [param.data.clone() for param in model1.parameters()] + \ - [param.data.clone() for param in model2.parameters()] - - for materialize_master_grads in (False, True): - for opt_level in ("O0", "O1", "O2", "O3"): - for how_to_zero in ("none", "model", "optimizer"): - for use_multiple_loss_scalers in (False, True): - if opt_level == "O1" or opt_level == "O2": - inject_inf_iters = (-1, 0, 1) - else: - inject_inf_iters = (-1,) - - for inject_inf in inject_inf_iters: - if inject_inf >= 0: - inject_inf_locs = ("fp16", "fp32") - which_backwards = (0, 1) - else: - inject_inf_locs = ("fdsa",) - which_backwards = (None,) - - for inject_inf_loc in inject_inf_locs: - for which_backward in which_backwards: - if use_multiple_loss_scalers: - num_losses = 2 - loss_ids = [0, 1] - else: - num_losses = 1 - loss_ids = [0, 0] - - if inject_inf >= 0: - iters = 3 - if which_backward == 0: - which_models = (0, 2) - elif which_backward == 1: - which_models = (1, 2) - else: - iters = 2 - which_models = (None,) - - for which_model in which_models: - model0 = MyModel(1) - model1 = MyModel(2) - model2 = MyModel(3) - - models = [model0, model1, model2] - - optimizer = FusedSGD([{'params' : model0.parameters(), 'lr' : 0.25}, - {'params' : model1.parameters(), 'lr' : 0.5}, - {'params' : model2.parameters(), 'lr' : 0.125}], - momentum=0.125, - materialize_master_grads=materialize_master_grads) - - _amp_state.allow_incoming_model_not_fp32 = True - [model0, model1, 
model2], optimizer = amp.initialize( - [model0, model1, model2], - optimizer, - opt_level=opt_level, - verbosity=0, - cast_model_type=False, - num_losses=num_losses) - _amp_state.allow_incoming_model_not_fp32 = False - - _amp_state.loss_scalers[0]._loss_scale = 4.0 - if use_multiple_loss_scalers: - _amp_state.loss_scalers[1]._loss_scale = 16.0 - - unskipped = 0 - for i in range(iters): - if how_to_zero == "none": - for model in models: - for param in model.parameters(): - param.grad = None - elif how_to_zero == "model": - for model in models: - model.zero_grad() - else: - optimizer.zero_grad() - - loss0 = model0(self.x) + model2(self.x) - loss1 = model1(self.x) + model2(self.x) - - with amp.scale_loss(loss0, optimizer, loss_id=loss_ids[0]) as scaled_loss: - scaled_loss.backward() - if i == inject_inf and which_backward == 0: - if which_model == 0: - inj_model = model0 - elif which_model == 2: - inj_model = model2 - else: - raise RuntimeError(which_model + " invalid for loss 0") - if inject_inf_loc == "fp32": - inj_model.weight0.grad[0] = float('inf') - elif inject_inf_loc == "fp16": - inj_model.weight1.grad[0] = float('inf') - with amp.scale_loss(loss1, optimizer, loss_id=loss_ids[1]) as scaled_loss: - scaled_loss.backward() - if i == inject_inf and which_backward == 1: - if which_model == 1: - inj_model = model1 - elif which_model == 2: - inj_model = model2 - else: - raise RuntimeError(which_model + " invalid for loss 1 ") - if inject_inf_loc == "fp32": - inj_model.weight0.grad[0] = float('inf') - elif inject_inf_loc == "fp16": - inj_model.weight1.grad[0] = float('inf') - - if i != inject_inf: - master_params = amp.master_params(optimizer) - for param, reference_grad in zip(master_params, reference_grads[unskipped]): - if opt_level == "O2" and not materialize_master_grads: - continue - else: - self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()), - "opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} which_model {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, which_model, use_multiple_loss_scalers)) - unskipped += 1 - - optimizer.step() - - model_params = [p for p in model0.parameters()] + \ - [p for p in model1.parameters()] + \ - [p for p in model2.parameters()] - for model, master, reference in zip( - model_params, - amp.master_params(optimizer), - final_params): - self.assertTrue(torch.allclose(model, reference)) - self.assertTrue(torch.allclose(model, master.to(model.dtype))) - - if opt_level == "O1": - _amp_state.handle._deactivate() - - @unittest.skipIf(disabled, "amp_C is unavailable") - def test_2models2losses2optimizers(self): - model0 = MyModel(1) - model1 = MyModel(2) - - optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}], - momentum=0.125) - optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}], - momentum=0.25) - - # Don't do it like this: reference_grads = [[]]*5 - # because then it creates a list of 5 references to the same "[]" and appending - # to any of them effectively makes you append to all of them, which multiplies - # the resulting size of reference_grads by 5x and needless to say makes the test fail. 
- reference_grads = [[], [], [], [], []] - final_params = [None, None, None, None, None] - for i in range(2): - optimizer0.zero_grad() - optimizer1.zero_grad() - loss0 = model0(self.x) - loss1 = model1(self.x) - loss0.backward() - loss1.backward() - - reference_grads[0].append([param.grad.data.clone() for param in model0.parameters()] + - [param.grad.data.clone() for param in model1.parameters()]) - - optimizer0.step() - optimizer1.step() - - final_params[0] = [param.data.clone() for param in model0.parameters()] + \ - [param.data.clone() for param in model1.parameters()] - - def what_got_skipped(which_iter, which_backward): - if which_iter == 0 and which_backward == 0: - return 1 - if which_iter == 0 and which_backward == 1: - return 2 - if which_iter == 1 and which_backward == 0: - return 3 - if which_iter == 1 and which_backward == 1: - return 4 - return 0 - - for which_iter in (0,1): - for which_backward in (0,1): - model0 = MyModel(1) - model1 = MyModel(2) - - optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}], - momentum=0.125) - optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}], - momentum=0.25) - - for i in range(3): - optimizer0.zero_grad() - optimizer1.zero_grad() - loss0 = model0(self.x) - loss1 = model1(self.x) - loss0.backward() - loss1.backward() - - if i != which_iter: - reference_grads[what_got_skipped(which_iter, which_backward)].append( - [param.grad.data.clone() for param in model0.parameters()] + - [param.grad.data.clone() for param in model1.parameters()]) - - if i == which_iter: - if which_backward == 0: - optimizer1.step() - else: - optimizer0.step() - else: - optimizer0.step() - optimizer1.step() - - final_params[what_got_skipped(which_iter, which_backward)] = \ - [param.data.clone() for param in model0.parameters()] + \ - [param.data.clone() for param in model1.parameters()] - - for materialize_master_grads in (False, True): - for opt_level in ("O0", "O1", "O2", "O3"): - for how_to_zero in ("none", "model", "optimizer"): - for use_multiple_loss_scalers in (False, True): - if opt_level == "O1" or opt_level == "O2": - inject_inf_iters = (-1, 0, 1) - else: - inject_inf_iters = (-1,) - - for inject_inf in inject_inf_iters: - if inject_inf >= 0: - inject_inf_locs = ("fp16", "fp32") - which_backwards = (0, 1) - else: - inject_inf_locs = ("fdsa",) - which_backwards = (None,) - - for inject_inf_loc in inject_inf_locs: - for which_backward in which_backwards: - if use_multiple_loss_scalers: - num_losses = 2 - loss_ids = [0, 1] - else: - num_losses = 1 - loss_ids = [0, 0] - - if inject_inf >= 0: - iters = 3 - else: - iters = 2 - - model0 = MyModel(1) - model1 = MyModel(2) - - models = [model0, model1] - - optimizer0 = FusedSGD([{'params' : model0.parameters(), 'lr' : 0.25}], - momentum=0.125, materialize_master_grads=materialize_master_grads) - optimizer1 = FusedSGD([{'params' : model1.parameters(), 'lr' : 0.5}], - momentum=0.25, materialize_master_grads=materialize_master_grads) - - _amp_state.allow_incoming_model_not_fp32 = True - [model0, model1], [optimizer0, optimizer1] = amp.initialize( - [model0, model1], - [optimizer0, optimizer1], - opt_level=opt_level, - verbosity=0, - cast_model_type=False, - num_losses=num_losses) - _amp_state.allow_incoming_model_not_fp32 = False - - _amp_state.loss_scalers[0]._loss_scale = 4.0 - if use_multiple_loss_scalers: - _amp_state.loss_scalers[1]._loss_scale = 16.0 - - unskipped = 0 - for i in range(iters): - if how_to_zero == "none": - for model in models: - for param in 
model.parameters(): - param.grad = None - elif how_to_zero == "model": - for model in models: - model.zero_grad() - else: - optimizer0.zero_grad() - optimizer1.zero_grad() - - loss0 = model0(self.x) - loss1 = model1(self.x) - - with amp.scale_loss(loss0, optimizer0, loss_id=loss_ids[0]) as scaled_loss: - scaled_loss.backward() - if i == inject_inf and which_backward == 0: - if inject_inf_loc == "fp32": - model0.weight0.grad[0] = float('inf') - elif inject_inf_loc == "fp16": - model0.weight1.grad[0] = float('inf') - with amp.scale_loss(loss1, optimizer1, loss_id=loss_ids[1]) as scaled_loss: - scaled_loss.backward() - if i == inject_inf and which_backward == 1: - if inject_inf_loc == "fp32": - model1.weight0.grad[0] = float('inf') - elif inject_inf_loc == "fp16": - model1.weight1.grad[0] = float('inf') - - # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers)) - - if i != inject_inf: - master_params = list(amp.master_params(optimizer0)) + \ - list(amp.master_params(optimizer1)) - for param, reference_grad in zip(master_params, - reference_grads[what_got_skipped(inject_inf, which_backward)][unskipped]): - if opt_level == "O2" and not materialize_master_grads: - continue - else: - self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float())) - unskipped += 1 - - optimizer0.step() - optimizer1.step() - - model_params = [p for p in model0.parameters()] + [p for p in model1.parameters()] - master_params = [p for p in amp.master_params(optimizer0)] + \ - [p for p in amp.master_params(optimizer1)] - for model, master, reference in zip( - model_params, - master_params, - final_params[what_got_skipped(inject_inf, which_backward)]): - self.assertTrue(torch.allclose(model, reference)) - self.assertTrue(torch.allclose(model, master.to(model.dtype))) - - if opt_level == "O1": - _amp_state.handle._deactivate() - - @unittest.skipIf(disabled, "amp_C is unavailable") - def test_3models2losses2optimizers(self): - model0 = MyModel(1) - model1 = MyModel(2) - model2 = MyModel(3) - - optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, - {'params' : model1.parameters(), 'lr' : 1.0}], - momentum=0.5) - optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}], - momentum=0.25) - - # Again, can't do this: reference_grads = [[]]*9 - reference_grads = [[], [], [], [], [], [], [], [], []] - final_params = [None, None, None, None, None, None, None, None, None] - for i in range(2): - optimizer0.zero_grad() - optimizer1.zero_grad() - loss0 = model0(self.x) + model1(self.x) - loss1 = model2(self.x) + model1(self.x) - loss0.backward() - loss1.backward() - - reference_grads[0].append([param.grad.data.clone() for param in model0.parameters()] + - [param.grad.data.clone() for param in model1.parameters()]) - - optimizer0.step() - optimizer1.step() - - final_params[0] = \ - [param.data.clone() for param in model0.parameters()] + \ - [param.data.clone() for param in model1.parameters()] + \ - [param.data.clone() for param in model2.parameters()] - - def what_got_skipped(which_iter, which_backward, which_model): - if which_iter == 0: - if which_backward == 0: - if which_model == 0: - return 1 - if which_model == 1: - return 2 - if which_backward == 1: - if which_model == 2: - return 3 - if which_model == 1: - return 4 - if which_iter == 1: - if which_backward == 0: - if which_model == 0: - return 5 - if which_model == 1: - return 6 
- if which_backward == 1: - if which_model == 2: - return 7 - if which_model == 1: - return 8 - return 0 - - for which_iter in (0,1): - for which_backward in (0,1): - if which_backward == 0: - which_models = (0,1) - if which_backward == 1: - which_models = (2,1) - for which_model in which_models: - - model0 = MyModel(1) - model1 = MyModel(2) - model2 = MyModel(3) - - optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, - {'params' : model1.parameters(), 'lr' : 1.0}], - momentum=0.5) - optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}], - momentum=0.25) - - for i in range(3): - optimizer0.zero_grad() - optimizer1.zero_grad() - loss0 = model0(self.x) + model1(self.x) - loss1 = model2(self.x) + model1(self.x) - loss0.backward() - loss1.backward() - - if i != which_iter: - reference_grads[what_got_skipped(which_iter, - which_backward, which_model)].append( - [param.grad.data.clone() for param in model0.parameters()] + - [param.grad.data.clone() for param in model1.parameters()]) - - if i == which_iter: - if which_backward == 0: - # if which_model == 0: - optimizer1.step() - # if which_model == 1: - # optimizer1.step() - if which_backward == 1: - # if which_model == 2: - # optimizer0.step() - # if which_model == 1: - continue - else: - optimizer0.step() - optimizer1.step() - - final_params[what_got_skipped(which_iter, which_backward, which_model)] = \ - [param.data.clone() for param in model0.parameters()] + \ - [param.data.clone() for param in model1.parameters()] + \ - [param.data.clone() for param in model2.parameters()] - - for materialize_master_grads in (False, True): - for opt_level in ("O0", "O1", "O2", "O3"): - for how_to_zero in ("none", "model", "optimizer"): - for use_multiple_loss_scalers in (False, True): - if opt_level == "O1" or opt_level == "O2": - inject_inf_iters = (-1, 0, 1) - else: - inject_inf_iters = (-1,) - - for inject_inf in inject_inf_iters: - if inject_inf >= 0: - inject_inf_locs = ("fp16", "fp32") - which_backwards = (0, 1) - else: - inject_inf_locs = ("fdsa",) - which_backwards = (None,) - - for inject_inf_loc in inject_inf_locs: - for which_backward in which_backwards: - if use_multiple_loss_scalers: - num_losses = 2 - loss_ids = [0, 1] - else: - num_losses = 1 - loss_ids = [0, 0] - - if inject_inf >= 0: - iters = 3 - if which_backward == 0: - which_models = (0, 1) - elif which_backward == 1: - which_models = (2, 1) - else: - iters = 2 - which_models = (None,) - - for which_model in which_models: - model0 = MyModel(1) - model1 = MyModel(2) - model2 = MyModel(3) - - models = [model0, model1, model2] - - optimizer0 = FusedSGD([{'params' : model0.parameters(), 'lr' : 0.25}, - {'params' : model1.parameters(), 'lr' : 1.0}], - momentum=0.5, materialize_master_grads=materialize_master_grads) - optimizer1 = FusedSGD([{'params' : model2.parameters(), 'lr' : 0.5}], - momentum=0.25, materialize_master_grads=materialize_master_grads) - - _amp_state.allow_incoming_model_not_fp32 = True - [model0, model1, model2], [optimizer0, optimizer1] = amp.initialize( - [model0, model1, model2], - [optimizer0, optimizer1], - opt_level=opt_level, - verbosity=0, - cast_model_type=False, - num_losses=num_losses) - _amp_state.allow_incoming_model_not_fp32 = False - - _amp_state.loss_scalers[0]._loss_scale = 4.0 - if use_multiple_loss_scalers: - _amp_state.loss_scalers[1]._loss_scale = 16.0 - - unskipped = 0 - for i in range(iters): - if how_to_zero == "none": - for model in models: - for param in model.parameters(): - param.grad = None - elif 
how_to_zero == "model": - for model in models: - model.zero_grad() - else: - optimizer0.zero_grad() - optimizer1.zero_grad() - - loss0 = model0(self.x) + model1(self.x) - loss1 = model2(self.x) + model1(self.x) - - with amp.scale_loss(loss0, optimizer0, loss_id=loss_ids[0]) as scaled_loss: - scaled_loss.backward() - if i == inject_inf and which_backward == 0: - if which_model == 0: - inj_model = model0 - elif which_model == 1: - inj_model = model1 - else: - raise RuntimeError(which_model + " invalid for loss 0") - if inject_inf_loc == "fp32": - inj_model.weight0.grad[0] = float('inf') - elif inject_inf_loc == "fp16": - inj_model.weight1.grad[0] = float('inf') - with amp.scale_loss(loss1, [optimizer0, optimizer1], loss_id=loss_ids[1]) as scaled_loss: - scaled_loss.backward() - if i == inject_inf and which_backward == 1: - if which_model == 2: - inj_model = model2 - elif which_model == 1: - inj_model = model1 - else: - raise RuntimeError(which_model + " invalid for loss 1 ") - if inject_inf_loc == "fp32": - inj_model.weight0.grad[0] = float('inf') - elif inject_inf_loc == "fp16": - inj_model.weight1.grad[0] = float('inf') - - if i != inject_inf: - master_params = list(amp.master_params(optimizer0)) + \ - list(amp.master_params(optimizer1)) - for param, reference_grad in zip(master_params, - reference_grads[what_got_skipped(inject_inf, - which_backward, which_model)][unskipped]): - if opt_level == "O2" and not materialize_master_grads: - continue - else: - self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float())) - unskipped += 1 - - optimizer0.step() - optimizer1.step() - - model_params = [p for p in model0.parameters()] + \ - [p for p in model1.parameters()] + \ - [p for p in model2.parameters()] - master_params = [p for p in amp.master_params(optimizer0)] + \ - [p for p in amp.master_params(optimizer1)] - - # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {} which_model {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers, which_model)) - - for model, master, reference in zip( - model_params, - master_params, - final_params[what_got_skipped(inject_inf, which_backward, which_model)]): - self.assertTrue(torch.allclose(model, reference)) - self.assertTrue(torch.allclose(model, master.to(model.dtype))) - - if opt_level == "O1": - _amp_state.handle._deactivate() - -if __name__ == '__main__': - unittest.main() diff --git a/tests/L0/run_amp/test_larc.py b/tests/L0/run_amp/test_larc.py deleted file mode 100644 index f4f3e838f..000000000 --- a/tests/L0/run_amp/test_larc.py +++ /dev/null @@ -1,53 +0,0 @@ -import unittest - -import torch -from torch import nn -from torch.nn import Parameter - -from apex import amp -from apex.parallel.LARC import LARC -from utils import common_init - - -class MyModel(torch.nn.Module): - def __init__(self, unique): - super(MyModel, self).__init__() - self.weight0 = Parameter( - unique + torch.arange(2, device="cuda", dtype=torch.float32) - ) - - def forward(self, input): - return (input * self.weight0).sum() - - -class TestLARC(unittest.TestCase): - def setUp(self): - self.x = torch.ones((2), device="cuda", dtype=torch.float32) - common_init(self) - - def tearDown(self): - pass - - def test_larc_mixed_precision(self): - for opt_level in ["O0", "O1", "O2", "O3"]: - model = MyModel(1) - - optimizer = LARC( - torch.optim.SGD( - [{"params": model.parameters(), "lr": 0.25}], momentum=0.125 - ) - ) - - model, optimizer = amp.initialize( - model, optimizer, 
opt_level=opt_level, verbosity=0 - ) - - optimizer.zero_grad() - loss = model(self.x) - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - optimizer.step() - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/L0/run_amp/test_multi_tensor_axpby.py b/tests/L0/run_amp/test_multi_tensor_axpby.py deleted file mode 100644 index 0b439bb8d..000000000 --- a/tests/L0/run_amp/test_multi_tensor_axpby.py +++ /dev/null @@ -1,180 +0,0 @@ -import unittest - -import functools as ft -import itertools as it - -from apex import amp -import torch -from torch import nn -import torch.nn.functional as F -from math import floor - -from utils import common_init, HALF, FLOAT,\ - ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT - -try: - import amp_C - from amp_C import multi_tensor_axpby - from apex.multi_tensor_apply import MultiTensorApply - disabled = False -except ImportError as err: - print("amp_C fused kernels unavailable, disabling TestMultiTensorApply. ImportError was ", err) - disabled = True - -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) -try_nhwc = (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4) - - -class TestMultiTensorAxpby(unittest.TestCase): - - def setUp(self): - common_init(self) - - self.a = 2.0 - self.b = 8.0 - self.xval = 4.0 - self.yval = 16.0 - self.overflow_buf = torch.cuda.IntTensor(1).zero_() - self.ref = torch.full((1,), 136.0, device="cuda", dtype=torch.float32) - - def tearDown(self): - pass - - # The tensor creation here is written for convenience, not speed. - def axpby(self, sizea, sizeb, applier, repeat_tensors, - x_type, y_type, out_type, inplace=False, nhwc=False): - self.overflow_buf.zero_() - sizea = sizea if isinstance(sizea, tuple) else (sizea,) - sizeb = sizeb if isinstance(sizeb, tuple) else (sizeb,) - t1 = torch.full(sizea, 1.0, device="cuda", dtype=torch.float32) - t2 = torch.full(sizeb, 1.0, device="cuda", dtype=torch.float32) - - def to_fmt(t, tp): - if nhwc: - return t.clone().to(tp, memory_format=torch.channels_last) - else: - return t.clone().to(tp) - - y_list = [] - for i in range(repeat_tensors): - y_list += [to_fmt(t1, y_type)*self.yval, to_fmt(t2, y_type)*self.yval] - - x_list = [to_fmt(x, x_type)*(self.xval/self.yval) for x in y_list] - - if inplace: - out_list = y_list - else: - out_list = [to_fmt(out, out_type)*3.0 for out in y_list] - - applier(multi_tensor_axpby, self.overflow_buf, [x_list, y_list, out_list], self.a, self.b, -1) - - self.assertTrue(all([torch.allclose(out, self.ref.to(out_type)) for out in out_list]), - msg="{} {} {} {} {} {} {}".format(sizea, sizeb, repeat_tensors, - x_type, y_type, out_type, inplace)) - self.assertTrue(self.overflow_buf.item() == 0, - msg="{} {} {} {} {} {} {}".format(sizea, sizeb, repeat_tensors, - x_type, y_type, out_type, inplace)) - - # def find_inf(self, sizea, sizeb, applier, repeat_tensors, in_type, out_type, t, ind, val, inplace=False): - # self.overflow_buf.zero_() - # a = torch.cuda.FloatTensor(sizea).fill_(self.scale) - # b = torch.cuda.FloatTensor(sizeb).fill_(self.scale) - - # out_list = [] - # for i in range(repeat_tensors): - # out_list += [a.clone().to(out_type), b.clone().to(out_type)] - - # if inplace: - # in_list = out_list - # else: - # in_list = [out.clone().to(in_type) for out in out_list] - - # applier(multi_tensor_scale, self.overflow_buf, [in_list, out_list], 1./self.scale) - - # self.overflow_buf.zero_() - # in_list[t][ind] = val - # applier(multi_tensor_scale, self.overflow_buf, [in_list, 
out_list], 1./self.scale) - # self.assertTrue(self.overflow_buf.item()) - - @unittest.skipIf(disabled, "amp_C is unavailable") - def test_fuzz(self): - input_size_pairs = ( - (7777*77, 555*555), - (777, 555), - (555, 2048*32+1), - (2048*32+1, 555), - (555, 2048*32), - (2048*32, 555), - (33333, 555), - (555, 33333)) - appliers = ( - MultiTensorApply(2048*32), - MultiTensorApply(333), - MultiTensorApply(33333)) - repeat_tensors = ( - 1, - 55) - - for sizea, sizeb in input_size_pairs: - for applier in appliers: - for repeat in repeat_tensors: - for x_type in (torch.float32, torch.float16): - for y_type in (torch.float32, torch.float16): - for out_type in (torch.float32, torch.float16): - for inplace in (True, False): - if inplace is True and (y_type is not out_type): - continue - else: - self.axpby(sizea, sizeb, applier, repeat, - x_type, y_type, out_type, inplace=inplace) - # self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type, - # 0, 0, float('nan'), inplace=inplace) - # self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type, - # 2*repeat-1, sizeb-1, float('inf'), inplace=inplace) - # self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type, - # 2*(repeat//2), sizea//2, float('inf'), inplace=inplace) - - @unittest.skipIf(disabled, "amp_C is unavailable") - @unittest.skipIf(not try_nhwc, "torch version is 1.4 or earlier, may not support nhwc") - def test_fuzz_nhwc(self): - input_size_pairs = ( - ((7, 77, 7, 77), (5, 55, 5, 55)), - ((1, 1, 777, 1), (1, 1, 555, 1)), - ((5, 47, 5, 55), (1, 1, 1, 2048*32 + 1)), - ((1, 1, 1, 2048*32 + 1), (55, 47, 5, 55)), - ((555, 1, 1, 1), (32, 8, 32, 8)), - ((32, 8, 32, 8), (55, 47, 5, 55)), - ((1, 1, 33333, 1), (55, 47, 55, 5)), - ((55, 47, 55, 5), (1, 1, 33333, 1))) - appliers = ( - MultiTensorApply(2048*32), - MultiTensorApply(333), - MultiTensorApply(33333)) - repeat_tensors = ( - 1, - 55) - - for sizea, sizeb in input_size_pairs: - for applier in appliers: - for repeat in repeat_tensors: - for x_type in (torch.float32, torch.float16): - for y_type in (torch.float32, torch.float16): - for out_type in (torch.float32, torch.float16): - for inplace in (True, False): - if inplace is True and (y_type is not out_type): - continue - else: - self.axpby(sizea, sizeb, applier, repeat, - x_type, y_type, out_type, inplace=inplace, nhwc=True) - # self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type, - # 0, 0, float('nan'), inplace=inplace) - # self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type, - # 2*repeat-1, sizeb-1, float('inf'), inplace=inplace) - # self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type, - # 2*(repeat//2), sizea//2, float('inf'), inplace=inplace) - - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/L0/run_amp/test_multi_tensor_l2norm.py b/tests/L0/run_amp/test_multi_tensor_l2norm.py deleted file mode 100644 index ed3cbd195..000000000 --- a/tests/L0/run_amp/test_multi_tensor_l2norm.py +++ /dev/null @@ -1,87 +0,0 @@ -import unittest - -import functools as ft -import itertools as it - -from apex import amp -import torch -from torch import nn -import torch.nn.functional as F - -from utils import common_init, HALF, FLOAT,\ - ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT - -try: - import amp_C - from amp_C import multi_tensor_l2norm - from apex.multi_tensor_apply import MultiTensorApply - disabled = False -except ImportError as err: - print("amp_C fused kernels unavailable, disabling TestMultiTensorApply. 
ImportError was ", err) - disabled = True - - -class TestMultiTensorL2Norm(unittest.TestCase): - - def setUp(self): - common_init(self) - self.val = 4.0 - self.overflow_buf = torch.cuda.IntTensor(1).zero_() - - def tearDown(self): - pass - - # The tensor creation here is written for convenience, not speed. - def l2norm(self, sizea, sizeb, applier, repeat_tensors, in_type, per_tensor): - self.overflow_buf.zero_() - a = torch.cuda.FloatTensor(sizea).fill_(self.val) - b = torch.cuda.FloatTensor(sizeb).fill_(self.val) - - in_list = [] - for i in range(repeat_tensors): - in_list += [a.clone().to(in_type), b.clone().to(in_type)] - - if per_tensor: - norm, norm_per_tensor = applier(multi_tensor_l2norm, self.overflow_buf, [in_list], True) - normab = torch.cat((a.norm().view(1), b.norm().view(1))) - norm_per_tensor = norm_per_tensor.view(-1, 2) - else: - norm, _ = applier(multi_tensor_l2norm, self.overflow_buf, [in_list], True) - - reference = torch.cuda.FloatTensor((sizea + sizeb)*repeat_tensors).fill_(self.val).norm() - - self.assertTrue(torch.allclose(norm, reference)) - if per_tensor: - self.assertTrue(torch.allclose(norm_per_tensor, normab)) - self.assertTrue(self.overflow_buf.item() == 0) - - @unittest.skipIf(disabled, "amp_C is unavailable") - def test_fuzz(self): - input_size_pairs = ( - (7777*77, 555*555), - (777, 555), - (555, 2048*32+1), - (2048*32+1, 555), - (555, 2048*32), - (2048*32, 555), - (33333, 555), - (555, 33333)) - appliers = ( - MultiTensorApply(2048*32), - MultiTensorApply(333), - MultiTensorApply(33333)) - repeat_tensors = ( - 1, - 55) - - for sizea, sizeb in input_size_pairs: - for applier in appliers: - for repeat in repeat_tensors: - for in_type in (torch.float32, torch.float16): - for per_tensor in (False, True): - self.l2norm(sizea, sizeb, applier, repeat, in_type, per_tensor) - - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/L0/run_amp/test_multi_tensor_scale.py b/tests/L0/run_amp/test_multi_tensor_scale.py deleted file mode 100644 index 22da2490c..000000000 --- a/tests/L0/run_amp/test_multi_tensor_scale.py +++ /dev/null @@ -1,126 +0,0 @@ -import unittest - -import functools as ft -import itertools as it - -from apex import amp -import torch -from torch import nn -import torch.nn.functional as F - -from utils import common_init, HALF, FLOAT,\ - ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT - -try: - import amp_C - from amp_C import multi_tensor_scale - from apex.multi_tensor_apply import MultiTensorApply - disabled = False -except ImportError as err: - print("amp_C fused kernels unavailable, disabling TestMultiTensorApply. ImportError was ", err) - disabled = True - - -class TestMultiTensorScale(unittest.TestCase): - - def setUp(self): - common_init(self) - self.scale = 4.0 - self.overflow_buf = torch.cuda.IntTensor(1).zero_() - self.ref = torch.cuda.FloatTensor([1.0]) - - def tearDown(self): - pass - - # The tensor creation here is written for convenience, not speed. 
- def downscale(self, sizea, sizeb, applier, repeat_tensors, in_type, out_type, inplace=False): - self.overflow_buf.zero_() - a = torch.cuda.FloatTensor(sizea).fill_(self.scale) - b = torch.cuda.FloatTensor(sizeb).fill_(self.scale) - - out_list = [] - for i in range(repeat_tensors): - out_list += [a.clone().to(out_type), b.clone().to(out_type)] - - if inplace: - in_list = out_list - else: - in_list = [out.clone().to(in_type) for out in out_list] - - applier(multi_tensor_scale, self.overflow_buf, [in_list, out_list], 1./self.scale) - - self.assertTrue(all([torch.allclose(out, self.ref.to(out_type)) for out in out_list])) - self.assertTrue(self.overflow_buf.item() == 0) - - def find_inf(self, sizea, sizeb, applier, repeat_tensors, in_type, out_type, t, ind, val, inplace=False): - self.overflow_buf.zero_() - a = torch.cuda.FloatTensor(sizea).fill_(self.scale) - b = torch.cuda.FloatTensor(sizeb).fill_(self.scale) - - out_list = [] - for i in range(repeat_tensors): - out_list += [a.clone().to(out_type), b.clone().to(out_type)] - - if inplace: - in_list = out_list - else: - in_list = [out.clone().to(in_type) for out in out_list] - - applier(multi_tensor_scale, self.overflow_buf, [in_list, out_list], 1./self.scale) - - self.overflow_buf.zero_() - in_list[t][ind] = val - applier(multi_tensor_scale, self.overflow_buf, [in_list, out_list], 1./self.scale) - self.assertTrue(self.overflow_buf.item()) - - # Currently, the fused kernel gives a hard error if you attempt to downscale - # into fp16 output, which imo is the desired behavior. Maybe someday we - # will learn otherwise. - # @unittest.skipIf(disabled, "amp_C is unavailable") - # def test_fp16_to_fp16(self): - # self.downscale(self.fp16, self.fp16, self.fp16_ref) - # - # @unittest.skipIf(disabled, "amp_C is unavailable") - # def test_fp32_to_fp16(self): - # self.downscale(self.fp32, self.fp16, self.fp16_ref) - - @unittest.skipIf(disabled, "amp_C is unavailable") - def test_fuzz(self): - input_size_pairs = ( - (7777*77, 555*555), - (777, 555), - (555, 2048*32+1), - (2048*32+1, 555), - (555, 2048*32), - (2048*32, 555), - (33333, 555), - (555, 33333)) - appliers = ( - MultiTensorApply(2048*32), - MultiTensorApply(333), - MultiTensorApply(33333)) - repeat_tensors = ( - 1, - 55) - - for sizea, sizeb in input_size_pairs: - for applier in appliers: - for repeat in repeat_tensors: - for in_type in (torch.float32, torch.float16): - for out_type in (torch.float32, torch.float16): - for inplace in (True, False): - if inplace is True and (out_type is not in_type): - continue - else: - self.downscale(sizea, sizeb, applier, repeat, in_type, out_type, inplace=inplace) - self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type, - 0, 0, float('nan'), inplace=inplace) - self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type, - 2*repeat-1, sizeb-1, float('inf'), inplace=inplace) - self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type, - 2*(repeat//2), sizea//2, float('inf'), inplace=inplace) - - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/L0/run_amp/test_multiple_models_optimizers_losses.py b/tests/L0/run_amp/test_multiple_models_optimizers_losses.py deleted file mode 100644 index 068c84537..000000000 --- a/tests/L0/run_amp/test_multiple_models_optimizers_losses.py +++ /dev/null @@ -1,762 +0,0 @@ -import unittest - -import functools as ft -import itertools as it - -from apex import amp -from apex.amp import _amp_state -import torch -from torch import nn -import torch.nn.functional as F -from torch.nn import Parameter - 
-from utils import common_init, HALF, FLOAT,\ - ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT - -class MyModel(torch.nn.Module): - def __init__(self, unique): - super(MyModel, self).__init__() - self.weight0 = Parameter(unique + - torch.arange(2, device='cuda', dtype=torch.float32)) - self.weight1 = Parameter(1. + unique + torch.arange(2, device='cuda', dtype=torch.float16)) - - @staticmethod - def ops(input, weight0, weight1): - return ((input*(weight0.float()))*(weight1.float())).sum() - - def forward(self, input): - return self.ops(input, self.weight0, self.weight1) - -# Abandon all hope, ye who enter here. - -# This is hands down the ugliest code I have ever written, but it succeeds in testing -# multiple models/optimizers/losses fairly thoroughly. Many of the different test cases -# require slightly divergent code in a way that seems near-impossible to genericize into a simple -# cross product or nested loops. - -class TestMultipleModelsOptimizersLosses(unittest.TestCase): - def setUp(self): - self.x = torch.ones((2), device='cuda', dtype=torch.float32) - common_init(self) - - def tearDown(self): - pass - - def test_2models2losses1optimizer(self): - model0 = MyModel(1) - model1 = MyModel(2) - - optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, - {'params' : model1.parameters(), 'lr' : 0.5}], - momentum=0.125) - - reference_grads = [] - for i in range(2): - optimizer.zero_grad() - loss0 = model0(self.x) - loss1 = model1(self.x) - loss0.backward() - loss1.backward() - - reference_grads.append([param.grad.data.clone() for param in model0.parameters()] + - [param.grad.data.clone() for param in model1.parameters()]) - - optimizer.step() - - final_params = [param.data.clone() for param in model0.parameters()] + \ - [param.data.clone() for param in model1.parameters()] - - for opt_level in ("O0", "O1", "O2", "O3"): - for how_to_zero in ("none", "model", "optimizer"): - for use_multiple_loss_scalers in (True, False): - if opt_level == "O1" or opt_level == "O2": - inject_inf_iters = (-1, 0, 1) - else: - inject_inf_iters = (-1,) - - for inject_inf in inject_inf_iters: - if inject_inf >= 0: - inject_inf_locs = ("fp16", "fp32") - which_backwards = (0, 1) - else: - inject_inf_locs = ("fdsa",) - which_backwards = (None,) - - for inject_inf_loc in inject_inf_locs: - for which_backward in which_backwards: - if use_multiple_loss_scalers: - num_losses = 2 - loss_ids = [0, 1] - else: - num_losses = 1 - loss_ids = [0, 0] - - if inject_inf >= 0: - iters = 3 - else: - iters = 2 - - model0 = MyModel(1) - model1 = MyModel(2) - - models = [model0, model1] - - optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, - {'params' : model1.parameters(), 'lr' : 0.5}], - momentum=0.125) - - _amp_state.allow_incoming_model_not_fp32 = True - [model0, model1], optimizer = amp.initialize( - [model0, model1], - optimizer, - opt_level=opt_level, - verbosity=0, - cast_model_type=False, - num_losses=num_losses) - _amp_state.allow_incoming_model_not_fp32 = False - - _amp_state.loss_scalers[0]._loss_scale = 4.0 - if use_multiple_loss_scalers: - _amp_state.loss_scalers[1]._loss_scale = 16.0 - - unskipped = 0 - for i in range(iters): - if how_to_zero == "none": - for model in models: - for param in model.parameters(): - param.grad = None - elif how_to_zero == "model": - for model in models: - model.zero_grad() - else: - optimizer.zero_grad() - - loss0 = model0(self.x) - loss1 = model1(self.x) - - with amp.scale_loss(loss0, optimizer, loss_id=loss_ids[0]) as scaled_loss: - 
scaled_loss.backward() - if i == inject_inf and which_backward == 0: - if inject_inf_loc == "fp32": - model0.weight0.grad[0] = float('inf') - elif inject_inf_loc == "fp16": - model0.weight1.grad[0] = float('inf') - with amp.scale_loss(loss1, optimizer, loss_id=loss_ids[1]) as scaled_loss: - scaled_loss.backward() - if i == inject_inf and which_backward == 1: - if inject_inf_loc == "fp32": - model1.weight0.grad[0] = float('inf') - elif inject_inf_loc == "fp16": - model1.weight1.grad[0] = float('inf') - - if i != inject_inf: - for param, reference_grad in zip(amp.master_params(optimizer), - reference_grads[unskipped]): - self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float())) - unskipped += 1 - optimizer.step() - - model_params = [p for p in model0.parameters()] + [p for p in model1.parameters()] - for model, master, reference in zip( - model_params, - amp.master_params(optimizer), - final_params): - self.assertTrue(torch.allclose(model, reference)) - self.assertTrue(torch.allclose(model, master.to(model.dtype))) - - if opt_level == "O1": - _amp_state.handle._deactivate() - - def test_3models2losses1optimizer(self): - - model0 = MyModel(1) - model1 = MyModel(2) - model2 = MyModel(3) - - optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, - {'params' : model1.parameters(), 'lr' : 0.5}, - {'params' : model2.parameters(), 'lr' : 0.125}], - momentum=0.125) - - reference_grads = [] - for i in range(2): - optimizer.zero_grad() - loss0 = model0(self.x) + model2(self.x) - loss1 = model1(self.x) + model2(self.x) - loss0.backward() - loss1.backward() - - reference_grads.append([param.grad.data.clone() for param in model0.parameters()] + - [param.grad.data.clone() for param in model1.parameters()] + - [param.grad.data.clone() for param in model2.parameters()]) - - optimizer.step() - - - final_params = [param.data.clone() for param in model0.parameters()] + \ - [param.data.clone() for param in model1.parameters()] + \ - [param.data.clone() for param in model2.parameters()] - - for opt_level in ("O0", "O1", "O2", "O3"): - for how_to_zero in ("none", "model", "optimizer"): - for use_multiple_loss_scalers in (True, False): - if opt_level == "O1" or opt_level == "O2": - inject_inf_iters = (-1, 0, 1) - else: - inject_inf_iters = (-1,) - - for inject_inf in inject_inf_iters: - if inject_inf >= 0: - inject_inf_locs = ("fp16", "fp32") - which_backwards = (0, 1) - else: - inject_inf_locs = ("fdsa",) - which_backwards = (None,) - - for inject_inf_loc in inject_inf_locs: - for which_backward in which_backwards: - if use_multiple_loss_scalers: - num_losses = 2 - loss_ids = [0, 1] - else: - num_losses = 1 - loss_ids = [0, 0] - - if inject_inf >= 0: - iters = 3 - if which_backward == 0: - which_models = (0, 2) - elif which_backward == 1: - which_models = (1, 2) - else: - iters = 2 - which_models = (None,) - - for which_model in which_models: - model0 = MyModel(1) - model1 = MyModel(2) - model2 = MyModel(3) - - models = [model0, model1, model2] - - optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, - {'params' : model1.parameters(), 'lr' : 0.5}, - {'params' : model2.parameters(), 'lr' : 0.125}], - momentum=0.125) - - _amp_state.allow_incoming_model_not_fp32 = True - [model0, model1, model2], optimizer = amp.initialize( - [model0, model1, model2], - optimizer, - opt_level=opt_level, - verbosity=0, - cast_model_type=False, - num_losses=num_losses) - _amp_state.allow_incoming_model_not_fp32 = False - - _amp_state.loss_scalers[0]._loss_scale = 4.0 - if 
use_multiple_loss_scalers: - _amp_state.loss_scalers[1]._loss_scale = 16.0 - - unskipped = 0 - for i in range(iters): - if how_to_zero == "none": - for model in models: - for param in model.parameters(): - param.grad = None - elif how_to_zero == "model": - for model in models: - model.zero_grad() - else: - optimizer.zero_grad() - - # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} which_model {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, which_model, use_multiple_loss_scalers)) - - loss0 = model0(self.x) + model2(self.x) - loss1 = model1(self.x) + model2(self.x) - - with amp.scale_loss(loss0, optimizer, loss_id=loss_ids[0]) as scaled_loss: - scaled_loss.backward() - if i == inject_inf and which_backward == 0: - if which_model == 0: - inj_model = model0 - elif which_model == 2: - inj_model = model2 - else: - raise RuntimeError(which_model + " invalid for loss 0") - if inject_inf_loc == "fp32": - inj_model.weight0.grad[0] = float('inf') - elif inject_inf_loc == "fp16": - inj_model.weight1.grad[0] = float('inf') - with amp.scale_loss(loss1, optimizer, loss_id=loss_ids[1]) as scaled_loss: - scaled_loss.backward() - if i == inject_inf and which_backward == 1: - if which_model == 1: - inj_model = model1 - elif which_model == 2: - inj_model = model2 - else: - raise RuntimeError(which_model + " invalid for loss 1 ") - if inject_inf_loc == "fp32": - inj_model.weight0.grad[0] = float('inf') - elif inject_inf_loc == "fp16": - inj_model.weight1.grad[0] = float('inf') - - if i != inject_inf: - for param, reference_grad in zip(amp.master_params(optimizer), - reference_grads[unskipped]): - self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float())) - unskipped += 1 - - optimizer.step() - - model_params = [p for p in model0.parameters()] + \ - [p for p in model1.parameters()] + \ - [p for p in model2.parameters()] - for model, master, reference in zip( - model_params, - amp.master_params(optimizer), - final_params): - self.assertTrue(torch.allclose(model, reference)) - self.assertTrue(torch.allclose(model, master.to(model.dtype))) - - if opt_level == "O1": - _amp_state.handle._deactivate() - - def test_2models2losses2optimizers(self): - model0 = MyModel(1) - model1 = MyModel(2) - - optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}], - momentum=0.125) - optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}], - momentum=0.25) - - # Don't do it like this: reference_grads = [[]]*5 - # because then it creates a list of 5 references to the same "[]" and appending - # to any of them effectively makes you append to all of them, which multiplies - # the resulting size of reference_grads by 5x and needless to say makes the test fail. 
- reference_grads = [[], [], [], [], []] - final_params = [None, None, None, None, None] - for i in range(2): - optimizer0.zero_grad() - optimizer1.zero_grad() - loss0 = model0(self.x) - loss1 = model1(self.x) - loss0.backward() - loss1.backward() - - reference_grads[0].append([param.grad.data.clone() for param in model0.parameters()] + - [param.grad.data.clone() for param in model1.parameters()]) - - optimizer0.step() - optimizer1.step() - - final_params[0] = [param.data.clone() for param in model0.parameters()] + \ - [param.data.clone() for param in model1.parameters()] - - def what_got_skipped(which_iter, which_backward): - if which_iter == 0 and which_backward == 0: - return 1 - if which_iter == 0 and which_backward == 1: - return 2 - if which_iter == 1 and which_backward == 0: - return 3 - if which_iter == 1 and which_backward == 1: - return 4 - return 0 - - for which_iter in (0,1): - for which_backward in (0,1): - model0 = MyModel(1) - model1 = MyModel(2) - - optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}], - momentum=0.125) - optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}], - momentum=0.25) - - for i in range(3): - optimizer0.zero_grad() - optimizer1.zero_grad() - loss0 = model0(self.x) - loss1 = model1(self.x) - loss0.backward() - loss1.backward() - - if i != which_iter: - reference_grads[what_got_skipped(which_iter, which_backward)].append( - [param.grad.data.clone() for param in model0.parameters()] + - [param.grad.data.clone() for param in model1.parameters()]) - - if i == which_iter: - if which_backward == 0: - optimizer1.step() - else: - optimizer0.step() - else: - optimizer0.step() - optimizer1.step() - - final_params[what_got_skipped(which_iter, which_backward)] = \ - [param.data.clone() for param in model0.parameters()] + \ - [param.data.clone() for param in model1.parameters()] - - for opt_level in ("O0", "O1", "O2", "O3"): - for how_to_zero in ("none", "model", "optimizer"): - for use_multiple_loss_scalers in (True, False): - if opt_level == "O1" or opt_level == "O2": - inject_inf_iters = (-1, 0, 1) - else: - inject_inf_iters = (-1,) - - for inject_inf in inject_inf_iters: - if inject_inf >= 0: - inject_inf_locs = ("fp16", "fp32") - which_backwards = (0, 1) - else: - inject_inf_locs = ("fdsa",) - which_backwards = (None,) - - for inject_inf_loc in inject_inf_locs: - for which_backward in which_backwards: - if use_multiple_loss_scalers: - num_losses = 2 - loss_ids = [0, 1] - else: - num_losses = 1 - loss_ids = [0, 0] - - if inject_inf >= 0: - iters = 3 - else: - iters = 2 - - model0 = MyModel(1) - model1 = MyModel(2) - - models = [model0, model1] - - optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}], - momentum=0.125) - optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}], - momentum=0.25) - - _amp_state.allow_incoming_model_not_fp32 = True - [model0, model1], [optimizer0, optimizer1] = amp.initialize( - [model0, model1], - [optimizer0, optimizer1], - opt_level=opt_level, - verbosity=0, - cast_model_type=False, - num_losses=num_losses) - _amp_state.allow_incoming_model_not_fp32 = False - - _amp_state.loss_scalers[0]._loss_scale = 4.0 - if use_multiple_loss_scalers: - _amp_state.loss_scalers[1]._loss_scale = 16.0 - - unskipped = 0 - for i in range(iters): - if how_to_zero == "none": - for model in models: - for param in model.parameters(): - param.grad = None - elif how_to_zero == "model": - for model in models: - model.zero_grad() - else: - optimizer0.zero_grad() - 
optimizer1.zero_grad() - - loss0 = model0(self.x) - loss1 = model1(self.x) - - with amp.scale_loss(loss0, optimizer0, loss_id=loss_ids[0]) as scaled_loss: - scaled_loss.backward() - if i == inject_inf and which_backward == 0: - if inject_inf_loc == "fp32": - model0.weight0.grad[0] = float('inf') - elif inject_inf_loc == "fp16": - model0.weight1.grad[0] = float('inf') - with amp.scale_loss(loss1, optimizer1, loss_id=loss_ids[1]) as scaled_loss: - scaled_loss.backward() - if i == inject_inf and which_backward == 1: - if inject_inf_loc == "fp32": - model1.weight0.grad[0] = float('inf') - elif inject_inf_loc == "fp16": - model1.weight1.grad[0] = float('inf') - - # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers)) - - if i != inject_inf: - master_params = list(amp.master_params(optimizer0)) + \ - list(amp.master_params(optimizer1)) - for param, reference_grad in zip(master_params, - reference_grads[what_got_skipped(inject_inf, which_backward)][unskipped]): - self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float())) - unskipped += 1 - - optimizer0.step() - optimizer1.step() - - model_params = [p for p in model0.parameters()] + [p for p in model1.parameters()] - master_params = [p for p in amp.master_params(optimizer0)] + \ - [p for p in amp.master_params(optimizer1)] - for model, master, reference in zip( - model_params, - master_params, - final_params[what_got_skipped(inject_inf, which_backward)]): - self.assertTrue(torch.allclose(model, reference)) - self.assertTrue(torch.allclose(model, master.to(model.dtype))) - - if opt_level == "O1": - _amp_state.handle._deactivate() - - def test_3models2losses2optimizers(self): - model0 = MyModel(1) - model1 = MyModel(2) - model2 = MyModel(3) - - optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, - {'params' : model1.parameters(), 'lr' : 1.0}], - momentum=0.5) - optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}], - momentum=0.25) - - # Again, can't do this: reference_grads = [[]]*9 - reference_grads = [[], [], [], [], [], [], [], [], []] - final_params = [None, None, None, None, None, None, None, None, None] - for i in range(2): - optimizer0.zero_grad() - optimizer1.zero_grad() - loss0 = model0(self.x) + model1(self.x) - loss1 = model2(self.x) + model1(self.x) - loss0.backward() - loss1.backward() - - reference_grads[0].append([param.grad.data.clone() for param in model0.parameters()] + - [param.grad.data.clone() for param in model1.parameters()]) - - optimizer0.step() - optimizer1.step() - - final_params[0] = \ - [param.data.clone() for param in model0.parameters()] + \ - [param.data.clone() for param in model1.parameters()] + \ - [param.data.clone() for param in model2.parameters()] - - def what_got_skipped(which_iter, which_backward, which_model): - if which_iter == 0: - if which_backward == 0: - if which_model == 0: - return 1 - if which_model == 1: - return 2 - if which_backward == 1: - if which_model == 2: - return 3 - if which_model == 1: - return 4 - if which_iter == 1: - if which_backward == 0: - if which_model == 0: - return 5 - if which_model == 1: - return 6 - if which_backward == 1: - if which_model == 2: - return 7 - if which_model == 1: - return 8 - return 0 - - for which_iter in (0,1): - for which_backward in (0,1): - if which_backward == 0: - which_models = (0,1) - if which_backward == 1: - which_models = (2,1) - for 
which_model in which_models: - - model0 = MyModel(1) - model1 = MyModel(2) - model2 = MyModel(3) - - optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, - {'params' : model1.parameters(), 'lr' : 1.0}], - momentum=0.5) - optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}], - momentum=0.25) - - for i in range(3): - optimizer0.zero_grad() - optimizer1.zero_grad() - loss0 = model0(self.x) + model1(self.x) - loss1 = model2(self.x) + model1(self.x) - loss0.backward() - loss1.backward() - - if i != which_iter: - reference_grads[what_got_skipped(which_iter, - which_backward, which_model)].append( - [param.grad.data.clone() for param in model0.parameters()] + - [param.grad.data.clone() for param in model1.parameters()]) - - if i == which_iter: - if which_backward == 0: - # if which_model == 0: - optimizer1.step() - # if which_model == 1: - # optimizer1.step() - if which_backward == 1: - # if which_model == 2: - # optimizer0.step() - # if which_model == 1: - continue - else: - optimizer0.step() - optimizer1.step() - - final_params[what_got_skipped(which_iter, which_backward, which_model)] = \ - [param.data.clone() for param in model0.parameters()] + \ - [param.data.clone() for param in model1.parameters()] + \ - [param.data.clone() for param in model2.parameters()] - - for opt_level in ("O0", "O1", "O2", "O3"): - for how_to_zero in ("none", "model", "optimizer"): - for use_multiple_loss_scalers in (True, False): - if opt_level == "O1" or opt_level == "O2": - inject_inf_iters = (-1, 0, 1) - else: - inject_inf_iters = (-1,) - - for inject_inf in inject_inf_iters: - if inject_inf >= 0: - inject_inf_locs = ("fp16", "fp32") - which_backwards = (0, 1) - else: - inject_inf_locs = ("fdsa",) - which_backwards = (None,) - - for inject_inf_loc in inject_inf_locs: - for which_backward in which_backwards: - if use_multiple_loss_scalers: - num_losses = 2 - loss_ids = [0, 1] - else: - num_losses = 1 - loss_ids = [0, 0] - - if inject_inf >= 0: - iters = 3 - if which_backward == 0: - which_models = (0, 1) - elif which_backward == 1: - which_models = (2, 1) - else: - iters = 2 - which_models = (None,) - - for which_model in which_models: - model0 = MyModel(1) - model1 = MyModel(2) - model2 = MyModel(3) - - models = [model0, model1, model2] - - optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, - {'params' : model1.parameters(), 'lr' : 1.0}], - momentum=0.5) - optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}], - momentum=0.25) - - _amp_state.allow_incoming_model_not_fp32 = True - [model0, model1, model2], [optimizer0, optimizer1] = amp.initialize( - [model0, model1, model2], - [optimizer0, optimizer1], - opt_level=opt_level, - verbosity=0, - cast_model_type=False, - num_losses=num_losses) - _amp_state.allow_incoming_model_not_fp32 = False - - _amp_state.loss_scalers[0]._loss_scale = 4.0 - if use_multiple_loss_scalers: - _amp_state.loss_scalers[1]._loss_scale = 16.0 - - unskipped = 0 - for i in range(iters): - if how_to_zero == "none": - for model in models: - for param in model.parameters(): - param.grad = None - elif how_to_zero == "model": - for model in models: - model.zero_grad() - else: - optimizer0.zero_grad() - optimizer1.zero_grad() - - loss0 = model0(self.x) + model1(self.x) - loss1 = model2(self.x) + model1(self.x) - - with amp.scale_loss(loss0, optimizer0, loss_id=loss_ids[0]) as scaled_loss: - scaled_loss.backward() - if i == inject_inf and which_backward == 0: - if which_model == 0: - inj_model = model0 - 
elif which_model == 1: - inj_model = model1 - else: - raise RuntimeError(which_model + " invalid for loss 0") - if inject_inf_loc == "fp32": - inj_model.weight0.grad[0] = float('inf') - elif inject_inf_loc == "fp16": - inj_model.weight1.grad[0] = float('inf') - with amp.scale_loss(loss1, [optimizer0, optimizer1], loss_id=loss_ids[1]) as scaled_loss: - scaled_loss.backward() - if i == inject_inf and which_backward == 1: - if which_model == 2: - inj_model = model2 - elif which_model == 1: - inj_model = model1 - else: - raise RuntimeError(which_model + " invalid for loss 1 ") - if inject_inf_loc == "fp32": - inj_model.weight0.grad[0] = float('inf') - elif inject_inf_loc == "fp16": - inj_model.weight1.grad[0] = float('inf') - - if i != inject_inf: - master_params = list(amp.master_params(optimizer0)) + \ - list(amp.master_params(optimizer1)) - for param, reference_grad in zip(master_params, - reference_grads[what_got_skipped(inject_inf, - which_backward, which_model)][unskipped]): - self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float())) - unskipped += 1 - - optimizer0.step() - optimizer1.step() - - model_params = [p for p in model0.parameters()] + \ - [p for p in model1.parameters()] + \ - [p for p in model2.parameters()] - master_params = [p for p in amp.master_params(optimizer0)] + \ - [p for p in amp.master_params(optimizer1)] - - # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {} which_model {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers, which_model)) - - for model, master, reference in zip( - model_params, - master_params, - final_params[what_got_skipped(inject_inf, which_backward, which_model)]): - self.assertTrue(torch.allclose(model, reference)) - self.assertTrue(torch.allclose(model, master.to(model.dtype))) - - if opt_level == "O1": - _amp_state.handle._deactivate() - -if __name__ == '__main__': - unittest.main() diff --git a/tests/L0/run_amp/test_promotion.py b/tests/L0/run_amp/test_promotion.py deleted file mode 100644 index f5ef30c12..000000000 --- a/tests/L0/run_amp/test_promotion.py +++ /dev/null @@ -1,75 +0,0 @@ -import unittest - -import itertools as it - -from apex import amp -import torch -from torch import nn -import torch.nn.functional as F - -from utils import common_init, HALF, FLOAT, DTYPES - -class TestPromotion(unittest.TestCase): - def setUp(self): - self.handle = amp.init(enabled=True) - common_init(self) - - def tearDown(self): - self.handle._deactivate() - - def run_binary_promote_test(self, fns, input_shape, x_inplace=False): - type_pairs = it.product(DTYPES, DTYPES) - for fn, (xtype, ytype) in it.product(fns, type_pairs): - x = torch.randn(input_shape, dtype=xtype).requires_grad_() - x_leaf = x - if x_inplace: - # We need a non-leaf to call in place on - x = x.clone() - y = torch.randn(input_shape, dtype=ytype) - out = fn(x, y) - if x_inplace: - # In place: always match xtype - self.assertEqual(out.type(), x.type()) - else: - # Out of place: match widest type - if xtype == torch.float or ytype == torch.float: - self.assertEqual(out.type(), FLOAT) - else: - self.assertEqual(out.type(), HALF) - out.float().sum().backward() - self.assertEqual(x_leaf.grad.dtype, xtype) - - def test_atan2_matches_widest(self): - fns = [lambda x, y : torch.atan2(x, y), - lambda x, y : x.atan2(y)] - self.run_binary_promote_test(fns, (self.b,)) - - def test_mul_matches_widest(self): - fns = [lambda x, y : torch.mul(x, y), - lambda x, y: x.mul(y)] - 
self.run_binary_promote_test(fns, (self.b,)) - - def test_cat_matches_widest(self): - shape = self.b - ys = [torch.randn(shape, dtype=torch.half) for _ in range(5)] - x_float = torch.randn(shape) - out = torch.cat(ys + [x_float]) - self.assertEqual(out.type(), FLOAT) - x_half = torch.randn(shape, dtype=torch.half) - out = torch.cat(ys + [x_half]) - self.assertEqual(out.type(), HALF) - - def test_inplace_exp_is_error_for_half(self): - xs = torch.randn(self.b) - xs.exp_() - self.assertEqual(xs.type(), FLOAT) - xs = torch.randn(self.b, dtype=torch.half) - with self.assertRaises(NotImplementedError): - xs.exp_() - - def test_inplace_add_matches_self(self): - fn = lambda x, y: x.add_(y) - self.run_binary_promote_test([fn], (self.b,), x_inplace=True) - -if __name__ == '__main__': - unittest.main() diff --git a/tests/L0/run_amp/test_rnn.py b/tests/L0/run_amp/test_rnn.py deleted file mode 100644 index c49a5f003..000000000 --- a/tests/L0/run_amp/test_rnn.py +++ /dev/null @@ -1,116 +0,0 @@ -import unittest - -from apex import amp -import random -import torch -from torch import nn - -from utils import common_init, HALF - -class TestRnnCells(unittest.TestCase): - def setUp(self): - self.handle = amp.init(enabled=True) - common_init(self) - - def tearDown(self): - self.handle._deactivate() - - def run_cell_test(self, cell, state_tuple=False): - shape = (self.b, self.h) - for typ in [torch.float, torch.half]: - xs = [torch.randn(shape, dtype=typ).requires_grad_() - for _ in range(self.t)] - hidden_fn = lambda: torch.zeros(shape, dtype=typ) - if state_tuple: - hidden = (hidden_fn(), hidden_fn()) - else: - hidden = hidden_fn() - outputs = [] - for i in range(self.t): - hidden = cell(xs[i], hidden) - if state_tuple: - output = hidden[0] - else: - output = hidden - outputs.append(output) - for y in outputs: - self.assertEqual(y.type(), HALF) - outputs[-1].float().sum().backward() - for i, x in enumerate(xs): - self.assertEqual(x.grad.dtype, x.dtype) - - def test_rnn_cell_is_half(self): - cell = nn.RNNCell(self.h, self.h) - self.run_cell_test(cell) - - def test_gru_cell_is_half(self): - cell = nn.GRUCell(self.h, self.h) - self.run_cell_test(cell) - - def test_lstm_cell_is_half(self): - cell = nn.LSTMCell(self.h, self.h) - self.run_cell_test(cell, state_tuple=True) - -class TestRnns(unittest.TestCase): - def setUp(self): - self.handle = amp.init(enabled=True) - common_init(self) - - def tearDown(self): - self.handle._deactivate() - - def run_rnn_test(self, rnn, layers, bidir, state_tuple=False): - for typ in [torch.float, torch.half]: - x = torch.randn((self.t, self.b, self.h), dtype=typ).requires_grad_() - hidden_fn = lambda: torch.zeros((layers + (layers * bidir), - self.b, self.h), dtype=typ) - if state_tuple: - hidden = (hidden_fn(), hidden_fn()) - else: - hidden = hidden_fn() - output, _ = rnn(x, hidden) - self.assertEqual(output.type(), HALF) - output[-1, :, :].float().sum().backward() - self.assertEqual(x.grad.dtype, x.dtype) - - def test_rnn_is_half(self): - configs = [(1, False), (2, False), (2, True)] - for layers, bidir in configs: - rnn = nn.RNN(input_size=self.h, hidden_size=self.h, num_layers=layers, - nonlinearity='relu', bidirectional=bidir) - self.run_rnn_test(rnn, layers, bidir) - - def test_gru_is_half(self): - configs = [(1, False), (2, False), (2, True)] - for layers, bidir in configs: - rnn = nn.GRU(input_size=self.h, hidden_size=self.h, num_layers=layers, - bidirectional=bidir) - self.run_rnn_test(rnn, layers, bidir) - - def test_lstm_is_half(self): - configs = [(1, False), (2, False), 
(2, True)] - for layers, bidir in configs: - rnn = nn.LSTM(input_size=self.h, hidden_size=self.h, num_layers=layers, - bidirectional=bidir) - self.run_rnn_test(rnn, layers, bidir, state_tuple=True) - - def test_rnn_packed_sequence(self): - num_layers = 2 - rnn = nn.RNN(input_size=self.h, hidden_size=self.h, num_layers=num_layers) - for typ in [torch.float, torch.half]: - x = torch.randn((self.t, self.b, self.h), dtype=typ).requires_grad_() - lens = sorted([random.randint(self.t // 2, self.t) for _ in range(self.b)], - reverse=True) - # `pack_padded_sequence` breaks if default tensor type is non-CPU - torch.set_default_tensor_type(torch.FloatTensor) - lens = torch.tensor(lens, dtype=torch.int64, device=torch.device('cpu')) - packed_seq = nn.utils.rnn.pack_padded_sequence(x, lens) - torch.set_default_tensor_type(torch.cuda.FloatTensor) - hidden = torch.zeros((num_layers, self.b, self.h), dtype=typ) - output, _ = rnn(packed_seq, hidden) - self.assertEqual(output.data.type(), HALF) - output.data.float().sum().backward() - self.assertEqual(x.grad.dtype, x.dtype) - -if __name__ == '__main__': - unittest.main() diff --git a/tests/L0/run_amp/utils.py b/tests/L0/run_amp/utils.py deleted file mode 100644 index 7aa20c369..000000000 --- a/tests/L0/run_amp/utils.py +++ /dev/null @@ -1,21 +0,0 @@ -import torch - -HALF = 'torch.cuda.HalfTensor' -FLOAT = 'torch.cuda.FloatTensor' - -DTYPES = [torch.half, torch.float] - -ALWAYS_HALF = {torch.float: HALF, - torch.half: HALF} -ALWAYS_FLOAT = {torch.float: FLOAT, - torch.half: FLOAT} -MATCH_INPUT = {torch.float: FLOAT, - torch.half: HALF} - -def common_init(test_case): - test_case.h = 64 - test_case.b = 16 - test_case.c = 16 - test_case.k = 3 - test_case.t = 10 - torch.set_default_tensor_type(torch.cuda.FloatTensor) diff --git a/tests/L0/run_deprecated/test_deprecated_warning.py b/tests/L0/run_deprecated/test_deprecated_warning.py deleted file mode 100644 index f1f33f76f..000000000 --- a/tests/L0/run_deprecated/test_deprecated_warning.py +++ /dev/null @@ -1,56 +0,0 @@ -import unittest - -import torch - -import apex -from apex.transformer.testing.distributed_test_base import NcclDistributedTestBase - - -def init_model_and_optimizer(): - model = torch.nn.Linear(1, 1, bias=False).cuda() - optimizer = torch.optim.SGD(model.parameters(), 1.0) - return model, optimizer - - -@unittest.skipUnless(torch.cuda.is_available(), "") -class TestDeprecatedWarning(unittest.TestCase): - - def test_amp(self): - model, optimizer = init_model_and_optimizer() - with self.assertWarns(apex.DeprecatedFeatureWarning): - _ = apex.amp.initialize(model, optimizer) - - def test_fp16_model(self): - model, _ = init_model_and_optimizer() - with self.assertWarns(apex.DeprecatedFeatureWarning): - _ = apex.fp16_utils.FP16Model(model) - - def test_fp16_optimizer(self): - _, optimizer = init_model_and_optimizer() - with self.assertWarns(apex.DeprecatedFeatureWarning): - _ = apex.fp16_utils.FP16_Optimizer(optimizer) - - def test_fp16_loss_scaler(self): - with self.assertWarns(apex.DeprecatedFeatureWarning): - apex.fp16_utils.LossScaler() - - -class TestParallel(NcclDistributedTestBase): - - @property - def world_size(self): - return min(torch.cuda.device_count(), 2) - - def test_distributed_data_parallel(self): - model, _ = init_model_and_optimizer() - with self.assertWarns(apex.DeprecatedFeatureWarning): - _ = apex.parallel.DistributedDataParallel(model) - - def test_convert_syncbn_model(self): - model, _ = init_model_and_optimizer() - with self.assertWarns(apex.DeprecatedFeatureWarning): - _ = 
apex.parallel.convert_syncbn_model(model) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/L0/run_fp16util/__init__.py b/tests/L0/run_fp16util/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/L0/run_fp16util/test_fp16util.py b/tests/L0/run_fp16util/test_fp16util.py deleted file mode 100644 index eecddbc01..000000000 --- a/tests/L0/run_fp16util/test_fp16util.py +++ /dev/null @@ -1,75 +0,0 @@ -import unittest - -import torch -import torch.nn as nn - -from apex.fp16_utils import FP16Model - - -class DummyBlock(nn.Module): - def __init__(self): - super(DummyBlock, self).__init__() - - self.conv = nn.Conv2d(10, 10, 2) - self.bn = nn.BatchNorm2d(10, affine=True) - - def forward(self, x): - return self.conv(self.bn(x)) - - -class DummyNet(nn.Module): - def __init__(self): - super(DummyNet, self).__init__() - - self.conv1 = nn.Conv2d(3, 10, 2) - self.bn1 = nn.BatchNorm2d(10, affine=False) - self.db1 = DummyBlock() - self.db2 = DummyBlock() - - def forward(self, x): - out = x - out = self.conv1(out) - out = self.bn1(out) - out = self.db1(out) - out = self.db2(out) - return out - - -class DummyNetWrapper(nn.Module): - def __init__(self): - super(DummyNetWrapper, self).__init__() - - self.bn = nn.BatchNorm2d(3, affine=True) - self.dn = DummyNet() - - def forward(self, x): - return self.dn(self.bn(x)) - - -class TestFP16Model(unittest.TestCase): - def setUp(self): - self.N = 64 - self.C_in = 3 - self.H_in = 16 - self.W_in = 32 - self.in_tensor = torch.randn((self.N, self.C_in, self.H_in, self.W_in)).cuda() - self.orig_model = DummyNetWrapper().cuda() - self.fp16_model = FP16Model(self.orig_model) - - def test_params_and_buffers(self): - exempted_modules = [ - self.fp16_model.network.bn, - self.fp16_model.network.dn.db1.bn, - self.fp16_model.network.dn.db2.bn, - ] - for m in self.fp16_model.modules(): - expected_dtype = torch.float if (m in exempted_modules) else torch.half - for p in m.parameters(recurse=False): - assert p.dtype == expected_dtype - for b in m.buffers(recurse=False): - assert b.dtype in (expected_dtype, torch.int64) - - def test_output_is_half(self): - out_tensor = self.fp16_model(self.in_tensor) - assert out_tensor.dtype == torch.half - diff --git a/tests/L0/run_test.py b/tests/L0/run_test.py index 675d6bfe9..afe9eff51 100644 --- a/tests/L0/run_test.py +++ b/tests/L0/run_test.py @@ -18,9 +18,6 @@ TEST_ROOT = os.path.dirname(os.path.abspath(__file__)) TEST_DIRS = [ - "run_amp", - "run_deprecated", - "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_mlp", From 8c76a94b2b1ee6444ffb69b6904f99f4118c753c Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Thu, 9 Feb 2023 13:25:42 -0800 Subject: [PATCH 2/9] update Signed-off-by: Masaki Kozuki --- examples/imagenet/README.md | 139 +++--------- examples/imagenet/main_amp.py | 384 +++++++++++++++++----------------- 2 files changed, 222 insertions(+), 301 deletions(-) diff --git a/examples/imagenet/README.md b/examples/imagenet/README.md index 257d4a78d..870c8e8d0 100644 --- a/examples/imagenet/README.md +++ b/examples/imagenet/README.md @@ -1,20 +1,22 @@ # Mixed Precision ImageNet Training in PyTorch `main_amp.py` is based on [https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet). -It implements Automatic Mixed Precision (Amp) training of popular model architectures, such as ResNet, AlexNet, and VGG, on the ImageNet dataset. 
Command-line flags forwarded to `amp.initialize` are used to easily manipulate and switch between various pure and mixed precision "optimization levels" or `opt_level`s. For a detailed explanation of `opt_level`s, see the [updated API guide](https://nvidia.github.io/apex/amp.html).
+It implements Automatic Mixed Precision (Amp) training of popular model architectures, such as ResNet, AlexNet, and VGG, on the ImageNet dataset. The `--dtype` command-line flag, forwarded to `torch.autocast`, is used to easily switch between pure FP32 training and mixed precision training.

A few lines enable Amp:
-```
-# Added after model and optimizer construction
-model, optimizer = amp.initialize(model, optimizer, flags...)
+```python
+# Enclose `model.forward` and the loss computation with an `autocast` context
+with torch.autocast(device_type="cuda", dtype=dtype, enabled=enabled):
+    pred = model(inputs)
+    loss = loss_fn(target, pred)
+    ...

-# loss.backward() changed to:
-with amp.scale_loss(loss, optimizer) as scaled_loss:
-    scaled_loss.backward()
+# If float16 is selected for AMP, scale the loss and step through the GradScaler
+grad_scaler.scale(loss).backward()
+grad_scaler.step(optimizer)
+grad_scaler.update()
```

-With the new Amp API **you never need to explicitly convert your model, or the input data, to half().**
-
## Requirements

- Download the ImageNet dataset and move validation images to labeled subfolders
@@ -25,21 +27,21 @@ $ ln -sf /data/imagenet/val-jpeg/ val

### Summary

Amp allows easy experimentation with various pure and mixed precision options.
+```shell
+$ python main_amp.py -a resnet50 --b 128 --workers 4 --dtype float32 ./
+$ python main_amp.py -a resnet50 --b 224 --workers 4 --dtype float16 ./
+$ python main_amp.py -a resnet50 --b 224 --workers 4 --dtype bfloat16 ./
+$ python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 224 --workers 4 --dtype float16 ./
```
-$ python main_amp.py -a resnet50 --b 128 --workers 4 --opt-level O0 ./
-$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O3 ./
-$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O3 --keep-batchnorm-fp32 True ./
-$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 ./
-$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 --loss-scale 128.0 ./
-$ python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 ./
-$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 ./
-$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 --loss-scale 128.0 ./
-$ python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 ./
```
Options are explained below. Again, the [updated API guide](https://nvidia.github.io/apex/amp.html) provides more detail.
-#### `--opt-level O0` (FP32 training) and `O3` (FP16 training) -"Pure FP32" training: -``` -$ python main_amp.py -a resnet50 --b 128 --workers 4 --opt-level O0 ./ -``` -"Pure FP16" training: -``` -$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O3 ./ -``` -FP16 training with FP32 batchnorm: -``` -$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O3 --keep-batchnorm-fp32 True ./ -``` -Keeping the batchnorms in FP32 improves stability and allows Pytorch -to use cudnn batchnorms, which significantly increases speed in Resnet50. - -The `O3` options might not converge, because they are not true mixed precision. -However, they can be useful to establish "speed of light" performance for -your model, which provides a baseline for comparison with `O1` and `O2`. -For Resnet50 in particular, `--opt-level O3 --keep-batchnorm-fp32 True` establishes -the "speed of light." (Without `--keep-batchnorm-fp32`, it's slower, because it does -not use cudnn batchnorm.) - -#### `--opt-level O1` (Official Mixed Precision recipe, recommended for typical use) - -`O1` patches Torch functions to cast inputs according to a whitelist-blacklist model. -FP16-friendly (Tensor Core) ops like gemms and convolutions run in FP16, while ops -that benefit from FP32, like batchnorm and softmax, run in FP32. -Also, dynamic loss scaling is used by default. -``` -$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 ./ -``` -`O1` overridden to use static loss scaling: -``` -$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 --loss-scale 128.0 -``` Distributed training with 2 processes (1 GPU per process, see **Distributed training** below for more detail) ``` -$ python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 ./ +$ torchrun --nproc_per_node=2 main_amp.py -a resnet50 --b 224 --workers 4 ./ ``` For best performance, set `--nproc_per_node` equal to the total number of GPUs on the node to use all available resources. -#### `--opt-level O2` ("Almost FP16" mixed precision. More dangerous than O1.) - -`O2` exists mainly to support some internal use cases. Please prefer `O1`. - -`O2` casts the model to FP16, keeps batchnorms in FP32, -maintains master weights in FP32, and implements -dynamic loss scaling by default. (Unlike --opt-level O1, --opt-level O2 -does not patch Torch functions.) -``` -$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 ./ -``` -"Fast mixed precision" overridden to use static loss scaling: -``` -$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 --loss-scale 128.0 ./ -``` -Distributed training with 2 processes (1 GPU per process) -``` -$ python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 ./ -``` - ## Distributed training -`main_amp.py` optionally uses `apex.parallel.DistributedDataParallel` (DDP) for multiprocess training with one GPU per process. -``` -model = apex.parallel.DistributedDataParallel(model) +`main_amp.py` optionally uses `torch.nn.parallel.DistributedDataParallel` (DDP) for multiprocess training with one GPU per process. 
+```python +model = torch.nn.parallel.DistributedDataParallel(model) ``` is a drop-in replacement for -``` -model = torch.nn.parallel.DistributedDataParallel(model, - device_ids=[arg.local_rank], - output_device=arg.local_rank) -``` -(because Torch DDP permits multiple GPUs per process, with Torch DDP you are required to -manually specify the device to run on and the output device. -With Apex DDP, it uses only the current device by default). -The choice of DDP wrapper (Torch or Apex) is orthogonal to the use of Amp and other Apex tools. It is safe to use `apex.amp` with either `torch.nn.parallel.DistributedDataParallel` or `apex.parallel.DistributedDataParallel`. In the future, I may add some features that permit optional tighter integration between `Amp` and `apex.parallel.DistributedDataParallel` for marginal performance benefits, but currently, there's no compelling reason to use Apex DDP versus Torch DDP for most models. - -To use DDP with `apex.amp`, the only gotcha is that -``` -model, optimizer = amp.initialize(model, optimizer, flags...) -``` -must precede -``` -model = DDP(model) -``` -If DDP wrapping occurs before `amp.initialize`, `amp.initialize` will raise an error. - -With both Apex DDP and Torch DDP, you must also call `torch.cuda.set_device(args.local_rank)` within -each process prior to initializing your model or any other tensors. More information can be found in the docs for the Pytorch multiprocess launcher module [torch.distributed.launch](https://pytorch.org/docs/stable/distributed.html#launch-utility). -`main_amp.py` is written to interact with +`main_amp.py` is written to interact with [torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility), which spawns multiprocess jobs using the following syntax: +```shell +$ torchrun --nproc_per_node=NUM_GPUS main_amp.py args... ``` -python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main_amp.py args... -``` -`NUM_GPUS` should be less than or equal to the number of visible GPU devices on the node. The use of `torch.distributed.launch` is unrelated to the choice of DDP wrapper. It is safe to use either apex DDP or torch DDP with `torch.distributed.launch`. +`NUM_GPUS` should be less than or equal to the number of visible GPU devices on the node. Optionally, one can run imagenet with synchronized batch normalization across processes by adding `--sync_bn` to the `args...` @@ -180,4 +103,6 @@ cause a modest performance decrease. If you're curious how the network actually looks on the CPU and GPU timelines (for example, how good is the overall utilization? Is the prefetcher really overlapping data transfers?) try profiling `main_amp.py`. -[Detailed instructions can be found here](https://gist.github.com/mcarilli/213a4e698e4a0ae2234ddee56f4f3f95). + +See [ +Using Nsight Systems to profile GPU workload (PyTorch Dev Discussions)](https://dev-discuss.pytorch.org/t/using-nsight-systems-to-profile-gpu-workload/59) and [User Guide :: Nsight Systems Documentation](https://docs.nvidia.com/nsight-systems/UserGuide/index.html). 
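
The pieces described above (`torch.autocast`, `GradScaler`, and `torch.nn.parallel.DistributedDataParallel`) compose into a single training step. The following is a minimal sketch rather than an excerpt from `main_amp.py`; the helper name `train_step` and its arguments are illustrative:

```python
import torch


def train_step(model, criterion, optimizer, scaler, inputs, target, dtype):
    # Forward pass and loss under autocast; enabled only for the reduced-precision
    # dtypes, mirroring the `--dtype` flag described above.
    with torch.autocast(device_type="cuda", dtype=dtype,
                        enabled=dtype in (torch.float16, torch.bfloat16)):
        output = model(inputs)
        loss = criterion(output, target)

    optimizer.zero_grad()
    if scaler is not None:
        # float16: scale the loss so small gradients do not underflow in half precision,
        # then let the scaler unscale the gradients, step, and adjust the scale factor.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    else:
        # float32 or bfloat16: no loss scaling is needed.
        loss.backward()
        optimizer.step()
    return loss.detach()


# Typical setup (also a sketch): wrap the CUDA model with DDP and create a scaler only for float16.
# model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)
# scaler = torch.cuda.amp.GradScaler() if dtype == torch.float16 else None
```

`scaler.step(optimizer)` skips the parameter update whenever the unscaled gradients contain infs or NaNs, which takes over the iteration-skipping role that `apex.amp`'s dynamic loss scaling used to play.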
diff --git a/examples/imagenet/main_amp.py b/examples/imagenet/main_amp.py index c4b0fdfd5..94001acd2 100644 --- a/examples/imagenet/main_amp.py +++ b/examples/imagenet/main_amp.py @@ -1,11 +1,12 @@ import argparse +from dataclasses import dataclass import os import shutil import time +import numpy as np import torch import torch.nn as nn -import torch.nn.parallel import torch.backends.cudnn as cudnn import torch.distributed as dist import torch.optim @@ -15,91 +16,78 @@ import torchvision.datasets as datasets import torchvision.models as models -import numpy as np -try: - from apex.parallel import DistributedDataParallel as DDP - from apex.fp16_utils import * - from apex import amp, optimizers - from apex.multi_tensor_apply import multi_tensor_applier -except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") - -def fast_collate(batch, memory_format): - - imgs = [img[0] for img in batch] - targets = torch.tensor([target[1] for target in batch], dtype=torch.int64) - w = imgs[0].size[0] - h = imgs[0].size[1] - tensor = torch.zeros( (len(imgs), 3, h, w), dtype=torch.uint8).contiguous(memory_format=memory_format) - for i, img in enumerate(imgs): - nump_array = np.asarray(img, dtype=np.uint8) - if(nump_array.ndim < 3): - nump_array = np.expand_dims(nump_array, axis=-1) - nump_array = np.rollaxis(nump_array, 2) - tensor[i] += torch.from_numpy(nump_array) - return tensor, targets +@dataclass(frozen=True) +class CollateFn: + memory_format: torch.memory_format -def parse(): - model_names = sorted(name for name in models.__dict__ - if name.islower() and not name.startswith("__") - and callable(models.__dict__[name])) - - parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') - parser.add_argument('data', metavar='DIR', - help='path to dataset') - parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18', - choices=model_names, - help='model architecture: ' + - ' | '.join(model_names) + - ' (default: resnet18)') - parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', - help='number of data loading workers (default: 4)') - parser.add_argument('--epochs', default=90, type=int, metavar='N', - help='number of total epochs to run') - parser.add_argument('--start-epoch', default=0, type=int, metavar='N', - help='manual epoch number (useful on restarts)') - parser.add_argument('-b', '--batch-size', default=256, type=int, - metavar='N', help='mini-batch size per process (default: 256)') - parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, - metavar='LR', help='Initial learning rate. Will be scaled by /256: args.lr = args.lr*float(args.batch_size*args.world_size)/256. 
A warmup schedule will also be applied over the first 5 epochs.') - parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum') - parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, - metavar='W', help='weight decay (default: 1e-4)') - parser.add_argument('--print-freq', '-p', default=10, type=int, - metavar='N', help='print frequency (default: 10)') - parser.add_argument('--resume', default='', type=str, metavar='PATH', - help='path to latest checkpoint (default: none)') - parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', - help='evaluate model on validation set') - parser.add_argument('--pretrained', dest='pretrained', action='store_true', - help='use pre-trained model') - - parser.add_argument('--prof', default=-1, type=int, - help='Only run 10 iterations for profiling.') - parser.add_argument('--deterministic', action='store_true') + def fast_collate(self, batch): + imgs = [img[0] for img in batch] + targets = torch.tensor([target[1] for target in batch], dtype=torch.int64) + w = imgs[0].size[0] + h = imgs[0].size[1] + tensor = torch.zeros( (len(imgs), 3, h, w), dtype=torch.uint8).contiguous(memory_format=self.memory_format) + for i, img in enumerate(imgs): + nump_array = np.asarray(img, dtype=np.uint8) + if(nump_array.ndim < 3): + nump_array = np.expand_dims(nump_array, axis=-1) + nump_array = np.rollaxis(nump_array, 2) + tensor[i] += torch.from_numpy(nump_array) + return tensor, targets - parser.add_argument("--local_rank", default=os.getenv('LOCAL_RANK', 0), type=int) - parser.add_argument('--sync_bn', action='store_true', - help='enabling apex sync BN.') - parser.add_argument('--opt-level', type=str) - parser.add_argument('--keep-batchnorm-fp32', type=str, default=None) - parser.add_argument('--loss-scale', type=str, default=None) - parser.add_argument('--channels-last', type=bool, default=False) +def parse(): + model_names = sorted( + name for name in models.__dict__ + if name.islower() and not name.startswith("__") and callable(models.__dict__[name]) + ) + parser = argparse.ArgumentParser( + description='PyTorch ImageNet Training', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('data', metavar='DIR', help='path to dataset') + parser.add_argument( + '--arch', '-a', metavar='ARCH', default='resnet18', choices=model_names, + help='model architecture: ' + ' | '.join(model_names) + ' (default: resnet18)', + ) + parser.add_argument( + '-j', '--workers', default=4, type=int, metavar='N', help='number of data loading workers (default: 4)') + parser.add_argument('--epochs', default=90, type=int, metavar='N', help='number of total epochs to run') + parser.add_argument( + '--start-epoch', default=0, type=int, metavar='N', help='manual epoch number (useful on restarts)') + parser.add_argument( + '-b', '--batch-size', default=256, type=int, metavar='N', help='mini-batch size per process (default: 256)') + parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, metavar='LR', + help= + 'Initial learning rate. Will be scaled by /256: args.lr = ' + 'args.lr*float(args.batch_size*args.world_size)/256. 
A warmup schedule will also '
+                        'be applied over the first 5 epochs.')
+    parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum')
+    parser.add_argument(
+        '--weight-decay', '--wd', default=1e-4, type=float, metavar='W', help='weight decay (default: 1e-4)')
+    parser.add_argument(
+        '--print-freq', '-p', default=10, type=int, metavar='N', help='print frequency (default: 10)')
+    parser.add_argument(
+        '--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)')
+    parser.add_argument(
+        '-e', '--evaluate', dest='evaluate', action='store_true', help='evaluate model on validation set')
+    parser.add_argument('--pretrained', dest='pretrained', action='store_true', help='use pre-trained model')
+    parser.add_argument('--prof', default=-1, type=int, help='Only run 10 iterations for profiling.')
+    parser.add_argument('--deterministic', action='store_true')
+    # parser.add_argument("--local_rank", default=os.getenv('LOCAL_RANK', 0), type=int)
+    parser.add_argument('--sync_bn', action='store_true', help='enabling PyTorch sync BN.')
+    parser.add_argument('--dtype', type=str, default='float32', choices=('float32', 'float16', 'bfloat16'))
+    # parser.add_argument('--opt-level', type=str)
+    # parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
+    # parser.add_argument('--loss-scale', type=str, default=None)
+    parser.add_argument('--channels-last', action='store_true')
     args = parser.parse_args()
     return args
 
+
 def main():
     global best_prec1, args
 
     args = parse()
-    print("opt_level = {}".format(args.opt_level))
-    print("keep_batchnorm_fp32 = {}".format(args.keep_batchnorm_fp32), type(args.keep_batchnorm_fp32))
-    print("loss_scale = {}".format(args.loss_scale), type(args.loss_scale))
-
     print("\nCUDNN VERSION: {}\n".format(torch.backends.cudnn.version()))
 
     cudnn.benchmark = True
 
@@ -113,6 +101,8 @@ def main():
     args.distributed = False
     if 'WORLD_SIZE' in os.environ:
         args.distributed = int(os.environ['WORLD_SIZE']) > 1
+        args.world_size = int(os.environ['WORLD_SIZE'])
+        args.local_rank = int(os.environ['LOCAL_RANK'])
 
     args.gpu = 0
     args.world_size = 1
 
@@ -120,9 +110,7 @@ def main():
     if args.distributed:
         args.gpu = args.local_rank
         torch.cuda.set_device(args.gpu)
-        torch.distributed.init_process_group(backend='nccl',
-                                             init_method='env://')
-        args.world_size = torch.distributed.get_world_size()
+        torch.distributed.init_process_group(backend='nccl', init_method='env://')
 
     assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."
 
@@ -140,25 +128,20 @@ def main():
         model = models.__dict__[args.arch]()
 
     if args.sync_bn:
-        import apex
-        print("using apex synced BN")
-        model = apex.parallel.convert_syncbn_model(model)
+        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
 
     model = model.cuda().to(memory_format=memory_format)
 
     # Scale learning rate based on global batch size
     args.lr = args.lr*float(args.batch_size*args.world_size)/256.
-    optimizer = torch.optim.SGD(model.parameters(), args.lr,
-                                momentum=args.momentum,
-                                weight_decay=args.weight_decay)
-
-    # Initialize Amp.  Amp accepts either values or strings for the optional override arguments,
-    # for convenient interoperation with argparse.
- model, optimizer = amp.initialize(model, optimizer, - opt_level=args.opt_level, - keep_batchnorm_fp32=args.keep_batchnorm_fp32, - loss_scale=args.loss_scale - ) + optimizer = torch.optim.SGD( + model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) + + dtype = getattr(torch, args.dtype) + enable_amp = dtype in (torch.float16, torch.bfloat16) + grad_scaler = None + if dtype == torch.float16: + grad_scaler = torch.cuda.amp.grad_scaler.GradScaler() # For distributed training, wrap the model with apex.parallel.DistributedDataParallel. # This must be done AFTER the call to amp.initialize. If model = DDP(model) is called @@ -169,7 +152,7 @@ def main(): # computation in the backward pass. # model = DDP(model) # delay_allreduce delays all communication to the end of the backward pass. - model = DDP(model, delay_allreduce=True) + model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() @@ -180,7 +163,7 @@ def main(): def resume(): if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) - checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu)) + checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu)) args.start_epoch = checkpoint['epoch'] global best_prec1 best_prec1 = checkpoint['best_prec1'] @@ -196,7 +179,7 @@ def resume(): traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') - if(args.arch == "inception_v3"): + if args.arch == "inception_v3": raise RuntimeError("Currently, inception_v3 is not supported by this example.") # crop_size = 299 # val_size = 320 # I chose this value arbitrarily, we can adjust. 
@@ -223,18 +206,16 @@ def resume(): train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset) - collate_fn = lambda b: fast_collate(b, memory_format) - train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), - num_workers=args.workers, pin_memory=True, sampler=train_sampler, collate_fn=collate_fn) + num_workers=args.workers, pin_memory=True, sampler=train_sampler, collate_fn=CollateFn(memory_format)) val_loader = torch.utils.data.DataLoader( val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=val_sampler, - collate_fn=collate_fn) + collate_fn=CollateFn(memory_format)) if args.evaluate: validate(val_loader, model, criterion) @@ -245,10 +226,10 @@ def resume(): train_sampler.set_epoch(epoch) # train for one epoch - train(train_loader, model, criterion, optimizer, epoch) + train(train_loader, model, criterion, optimizer, epoch, grad_scaler, dtype, enable_amp) # evaluate on validation set - prec1 = validate(val_loader, model, criterion) + prec1 = validate(val_loader, model, criterion, dtype, enable_amp) # remember best prec@1 and save checkpoint if args.local_rank == 0: @@ -259,15 +240,16 @@ def resume(): 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, - 'optimizer' : optimizer.state_dict(), + 'optimizer': optimizer.state_dict(), }, is_best) -class data_prefetcher(): + +class data_prefetcher: def __init__(self, loader): self.loader = iter(loader) self.stream = torch.cuda.Stream() - self.mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).cuda().view(1,3,1,1) - self.std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).cuda().view(1,3,1,1) + self.mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).cuda().view(1, 3, 1, 1) + self.std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).cuda().view(1, 3, 1, 1) # With Amp, it isn't necessary to manually convert data to half. 
# if args.fp16: # self.mean = self.mean.half() @@ -317,7 +299,7 @@ def next(self): return input, target -def train(train_loader, model, criterion, optimizer, epoch): +def train(train_loader, model, criterion, optimizer, epoch, grad_scaler, dtype, enable_amp): batch_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() @@ -328,7 +310,7 @@ def train(train_loader, model, criterion, optimizer, epoch): end = time.time() prefetcher = data_prefetcher(train_loader) - input, target = prefetcher.next() + input, target = next(prefetcher) i = 0 while input is not None: i += 1 @@ -336,74 +318,89 @@ def train(train_loader, model, criterion, optimizer, epoch): print("Profiling begun at iteration {}".format(i)) torch.cuda.cudart().cudaProfilerStart() - if args.prof >= 0: torch.cuda.nvtx.range_push("Body of iteration {}".format(i)) + if args.prof >= 0: + torch.cuda.nvtx.range_push("Body of iteration {}".format(i)) adjust_learning_rate(optimizer, epoch, i, len(train_loader)) # compute output - if args.prof >= 0: torch.cuda.nvtx.range_push("forward") - output = model(input) - if args.prof >= 0: torch.cuda.nvtx.range_pop() - loss = criterion(output, target) + with torch.autocast(device_type='cuda', dtype=dtype, enabled=enable_amp): + if args.prof >= 0: + torch.cuda.nvtx.range_push("forward") + output = model(input) + if args.prof >= 0: + torch.cuda.nvtx.range_pop() + loss = criterion(output, target) # compute gradient and do SGD step optimizer.zero_grad() - if args.prof >= 0: torch.cuda.nvtx.range_push("backward") - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - if args.prof >= 0: torch.cuda.nvtx.range_pop() - - # for param in model.parameters(): - # print(param.data.double().sum().item(), param.grad.data.double().sum().item()) - - if args.prof >= 0: torch.cuda.nvtx.range_push("optimizer.step()") - optimizer.step() - if args.prof >= 0: torch.cuda.nvtx.range_pop() - - if i%args.print_freq == 0: - # Every print_freq iterations, check the loss, accuracy, and speed. - # For best performance, it doesn't make sense to print these metrics every - # iteration, since they incur an allreduce and some host<->device syncs. 
- - # Measure accuracy - prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) - - # Average loss and accuracy across processes for logging - if args.distributed: - reduced_loss = reduce_tensor(loss.data) - prec1 = reduce_tensor(prec1) - prec5 = reduce_tensor(prec5) - else: - reduced_loss = loss.data - - # to_python_float incurs a host<->device sync - losses.update(to_python_float(reduced_loss), input.size(0)) - top1.update(to_python_float(prec1), input.size(0)) - top5.update(to_python_float(prec5), input.size(0)) - - torch.cuda.synchronize() - batch_time.update((time.time() - end)/args.print_freq) - end = time.time() - - if args.local_rank == 0: - print('Epoch: [{0}][{1}/{2}]\t' - 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' - 'Speed {3:.3f} ({4:.3f})\t' - 'Loss {loss.val:.10f} ({loss.avg:.4f})\t' - 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' - 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( - epoch, i, len(train_loader), - args.world_size*args.batch_size/batch_time.val, - args.world_size*args.batch_size/batch_time.avg, - batch_time=batch_time, - loss=losses, top1=top1, top5=top5)) - if args.prof >= 0: torch.cuda.nvtx.range_push("prefetcher.next()") - input, target = prefetcher.next() - if args.prof >= 0: torch.cuda.nvtx.range_pop() + if args.prof >= 0: + torch.cuda.nvtx.range_push("backward") + if grad_scaler is None: + loss.backward() + else: + grad_scaler.scale(loss).backward() + if args.prof >= 0: + torch.cuda.nvtx.range_pop() + + if args.prof >= 0: + torch.cuda.nvtx.range_push("optimizer.step()") + if grad_scaler is None: + optimizer.step() + else: + grad_scaler.step(optimizer) + if args.prof >= 0: + torch.cuda.nvtx.range_pop() + + if i % args.print_freq == 0: + with torch.no_grad(): + # Every print_freq iterations, check the loss, accuracy, and speed. + # For best performance, it doesn't make sense to print these metrics every + # iteration, since they incur an allreduce and some host<->device syncs. 
+ + # Measure accuracy + prec1, prec5 = accuracy(output, target, topk=(1, 5)) + + # Average loss and accuracy across processes for logging + if args.distributed: + reduced_loss = reduce_tensor(loss) + prec1 = reduce_tensor(prec1) + prec5 = reduce_tensor(prec5) + else: + reduced_loss = loss.data + + losses.update(reduced_loss.item(), input.size(0)) + top1.update(prec1.item(), input.size(0)) + top5.update(prec5.item(), input.size(0)) + + torch.cuda.synchronize() + batch_time.update((time.time() - end) / args.print_freq) + end = time.time() + + if args.local_rank == 0: + print( + 'Epoch: [{0}][{1}/{2}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Speed {3:.3f} ({4:.3f})\t' + 'Loss {loss.val:.10f} ({loss.avg:.4f})\t' + 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' + 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( + epoch, i, len(train_loader), + args.world_size*args.batch_size/batch_time.val, + args.world_size*args.batch_size/batch_time.avg, + batch_time=batch_time, + loss=losses, top1=top1, top5=top5) + ) + if args.prof >= 0: + torch.cuda.nvtx.range_push("prefetcher.next()") + input, target = next(prefetcher) + if args.prof >= 0: + torch.cuda.nvtx.range_pop() # Pop range "Body of iteration {}".format(i) - if args.prof >= 0: torch.cuda.nvtx.range_pop() + if args.prof >= 0: + torch.cuda.nvtx.range_pop() if args.prof >= 0 and i == args.prof + 10: print("Profiling ended at iteration {}".format(i)) @@ -411,7 +408,8 @@ def train(train_loader, model, criterion, optimizer, epoch): quit() -def validate(val_loader, model, criterion): +@torch.inference_mode() +def validate(val_loader, model, criterion, dtype, enable_amp): batch_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() @@ -423,29 +421,29 @@ def validate(val_loader, model, criterion): end = time.time() prefetcher = data_prefetcher(val_loader) - input, target = prefetcher.next() + input, target = next(prefetcher) i = 0 while input is not None: i += 1 # compute output - with torch.no_grad(): + with torch.no_grad(), torch.autocast(device_type='cuda', dtype=dtype, enabled=enable_amp): output = model(input) loss = criterion(output, target) # measure accuracy and record loss - prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) + prec1, prec5 = accuracy(output, target, topk=(1, 5)) if args.distributed: - reduced_loss = reduce_tensor(loss.data) + reduced_loss = reduce_tensor(loss) prec1 = reduce_tensor(prec1) prec5 = reduce_tensor(prec5) else: reduced_loss = loss.data - losses.update(to_python_float(reduced_loss), input.size(0)) - top1.update(to_python_float(prec1), input.size(0)) - top5.update(to_python_float(prec5), input.size(0)) + losses.update(reduced_loss.item(), input.size(0)) + top1.update(prec1.item(), input.size(0)) + top5.update(prec5.item(), input.size(0)) # measure elapsed time batch_time.update(time.time() - end) @@ -453,22 +451,23 @@ def validate(val_loader, model, criterion): # TODO: Change timings to mirror train(). 
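Note: with `validate` now decorated by `@torch.inference_mode()`, the inner `torch.no_grad()` context is redundant (gradient tracking is already disabled for the whole function); keeping only autocast would be equivalent. A sketch of the leaner form, reusing the names from the patch:

```
@torch.inference_mode()
def validate(val_loader, model, criterion, dtype, enable_amp):
    ...
    # inference_mode already disables autograd for the whole function,
    # so autocast alone is enough around the forward pass.
    with torch.autocast(device_type='cuda', dtype=dtype, enabled=enable_amp):
        output = model(input)
        loss = criterion(output, target)
```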
if args.local_rank == 0 and i % args.print_freq == 0: - print('Test: [{0}/{1}]\t' - 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' - 'Speed {2:.3f} ({3:.3f})\t' - 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' - 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' - 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( - i, len(val_loader), - args.world_size * args.batch_size / batch_time.val, - args.world_size * args.batch_size / batch_time.avg, - batch_time=batch_time, loss=losses, - top1=top1, top5=top5)) - - input, target = prefetcher.next() - - print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}' - .format(top1=top1, top5=top5)) + print( + 'Test: [{0}/{1}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Speed {2:.3f} ({3:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' + 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( + i, len(val_loader), + args.world_size * args.batch_size / batch_time.val, + args.world_size * args.batch_size / batch_time.avg, + batch_time=batch_time, loss=losses, + top1=top1, top5=top5) + ) + + input, target = next(prefetcher) + + print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'.format(top1=top1, top5=top5)) return top1.avg @@ -479,7 +478,7 @@ def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): shutil.copyfile(filename, 'model_best.pth.tar') -class AverageMeter(object): +class AverageMeter: """Computes and stores the average and current value""" def __init__(self): self.reset() @@ -510,9 +509,6 @@ def adjust_learning_rate(optimizer, epoch, step, len_epoch): if epoch < 5: lr = lr*float(1 + step + epoch*len_epoch)/(5.*len_epoch) - # if(args.local_rank == 0): - # print("epoch = {}, step = {}, lr = {}".format(epoch, step, lr)) - for param_group in optimizer.param_groups: param_group['lr'] = lr @@ -535,9 +531,9 @@ def accuracy(output, target, topk=(1,)): def reduce_tensor(tensor): rt = tensor.clone() - dist.all_reduce(rt, op=dist.reduce_op.SUM) - rt /= args.world_size + dist.all_reduce(rt, op=dist._make_nccl_premul_sum(1 / args.world_size)) return rt + if __name__ == '__main__': main() From 8e3a2d5d27a5aac631b3390248ed0f9dcd307251 Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Tue, 14 Feb 2023 17:45:30 -0800 Subject: [PATCH 3/9] remove apex.amp from dcgan example Signed-off-by: Masaki Kozuki --- examples/dcgan/README.md | 40 +---------- examples/dcgan/main_amp.py | 142 +++++++++++++++++++------------------ 2 files changed, 74 insertions(+), 108 deletions(-) diff --git a/examples/dcgan/README.md b/examples/dcgan/README.md index 9fc896cb5..fb1829366 100644 --- a/examples/dcgan/README.md +++ b/examples/dcgan/README.md @@ -1,41 +1,3 @@ # Mixed Precision DCGAN Training in PyTorch -`main_amp.py` is based on [https://github.com/pytorch/examples/tree/master/dcgan](https://github.com/pytorch/examples/tree/master/dcgan). -It implements Automatic Mixed Precision (Amp) training of the DCGAN example for different datasets. Command-line flags forwarded to `amp.initialize` are used to easily manipulate and switch between various pure and mixed precision "optimization levels" or `opt_level`s. For a detailed explanation of `opt_level`s, see the [updated API guide](https://nvidia.github.io/apex/amp.html). 
- -We introduce these changes to the PyTorch DCGAN example as described in the [Multiple models/optimizers/losses](https://nvidia.github.io/apex/advanced.html#multiple-models-optimizers-losses) section of the documentation:: -``` -# Added after models and optimizers construction -[netD, netG], [optimizerD, optimizerG] = amp.initialize( - [netD, netG], [optimizerD, optimizerG], opt_level=opt.opt_level, num_losses=3) -... -# loss.backward() changed to: -with amp.scale_loss(errD_real, optimizerD, loss_id=0) as errD_real_scaled: - errD_real_scaled.backward() -... -with amp.scale_loss(errD_fake, optimizerD, loss_id=1) as errD_fake_scaled: - errD_fake_scaled.backward() -... -with amp.scale_loss(errG, optimizerG, loss_id=2) as errG_scaled: - errG_scaled.backward() -``` - -Note that we use different `loss_scalers` for each computed loss. -Using a separate loss scaler per loss is [optional, not required](https://nvidia.github.io/apex/advanced.html#optionally-have-amp-use-a-different-loss-scaler-per-loss). - -To improve the numerical stability, we swapped `nn.Sigmoid() + nn.BCELoss()` to `nn.BCEWithLogitsLoss()`. - -With the new Amp API **you never need to explicitly convert your model, or the input data, to half().** - -"Pure FP32" training: -``` -$ python main_amp.py --opt_level O0 -``` -Recommended mixed precision training: -``` -$ python main_amp.py --opt_level O1 -``` - -Have a look at the original [DCGAN example](https://github.com/pytorch/examples/tree/master/dcgan) for more information about the used arguments. - -To enable mixed precision training, we introduce the `--opt_level` argument. +__UNDER CONSTRUCTION__ diff --git a/examples/dcgan/main_amp.py b/examples/dcgan/main_amp.py index be1a2894f..812c74699 100644 --- a/examples/dcgan/main_amp.py +++ b/examples/dcgan/main_amp.py @@ -1,7 +1,7 @@ -from __future__ import print_function import argparse import os import random + import torch import torch.nn as nn import torch.nn.parallel @@ -12,11 +12,6 @@ import torchvision.transforms as transforms import torchvision.utils as vutils -try: - from apex import amp -except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") - parser = argparse.ArgumentParser() parser.add_argument('--dataset', default='cifar10', help='cifar10 | lsun | mnist |imagenet | folder | lfw | fake') @@ -36,7 +31,6 @@ parser.add_argument('--outf', default='.', help='folder to output images and model checkpoints') parser.add_argument('--manualSeed', type=int, help='manual seed') parser.add_argument('--classes', default='bedroom', help='comma separated list of classes for the lsun data set') -parser.add_argument('--opt_level', default='O1', help='amp opt_level, default="O1"') opt = parser.parse_args() print(opt) @@ -58,56 +52,69 @@ if opt.dataset in ['imagenet', 'folder', 'lfw']: # folder dataset - dataset = dset.ImageFolder(root=opt.dataroot, - transform=transforms.Compose([ - transforms.Resize(opt.imageSize), - transforms.CenterCrop(opt.imageSize), - transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), - ])) - nc=3 + dataset = dset.ImageFolder( + root=opt.dataroot, + transform=transforms.Compose([ + transforms.Resize(opt.imageSize), + transforms.CenterCrop(opt.imageSize), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ]), + ) + nc = 3 elif opt.dataset == 'lsun': - classes = [ c + '_train' for c in opt.classes.split(',')] - dataset = dset.LSUN(root=opt.dataroot, classes=classes, - 
transform=transforms.Compose([ - transforms.Resize(opt.imageSize), - transforms.CenterCrop(opt.imageSize), - transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), - ])) - nc=3 + classes = [c + '_train' for c in opt.classes.split(',')] + dataset = dset.LSUN( + root=opt.dataroot, + classes=classes, + transform=transforms.Compose([ + transforms.Resize(opt.imageSize), + transforms.CenterCrop(opt.imageSize), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ]), + ) + nc = 3 elif opt.dataset == 'cifar10': - dataset = dset.CIFAR10(root=opt.dataroot, download=True, - transform=transforms.Compose([ - transforms.Resize(opt.imageSize), - transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), - ])) - nc=3 + dataset = dset.CIFAR10( + root=opt.dataroot, + download=True, + transform=transforms.Compose([ + transforms.Resize(opt.imageSize), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ]), + ) + nc = 3 elif opt.dataset == 'mnist': - dataset = dset.MNIST(root=opt.dataroot, download=True, - transform=transforms.Compose([ - transforms.Resize(opt.imageSize), - transforms.ToTensor(), - transforms.Normalize((0.5,), (0.5,)), - ])) - nc=1 + dataset = dset.MNIST( + root=opt.dataroot, + download=True, + transform=transforms.Compose([ + transforms.Resize(opt.imageSize), + transforms.ToTensor(), + transforms.Normalize((0.5,), (0.5,)), + ]), + ) + nc = 1 elif opt.dataset == 'fake': - dataset = dset.FakeData(image_size=(3, opt.imageSize, opt.imageSize), - transform=transforms.ToTensor()) - nc=3 + dataset = dset.FakeData( + image_size=(3, opt.imageSize, opt.imageSize), + transform=transforms.ToTensor(), + ) + nc = 3 assert dataset -dataloader = torch.utils.data.DataLoader(dataset, batch_size=opt.batchSize, - shuffle=True, num_workers=int(opt.workers)) +dataloader = torch.utils.data.DataLoader( + dataset, batch_size=opt.batchSize, shuffle=True, num_workers=opt.workers) device = torch.device("cuda:0") -ngpu = int(opt.ngpu) -nz = int(opt.nz) -ngf = int(opt.ngf) -ndf = int(opt.ndf) +ngpu = opt.ngpu +nz = opt.nz +ngf = opt.ngf +ndf = opt.ndf # custom weights initialization called on netG and netD @@ -122,7 +129,7 @@ def weights_init(m): class Generator(nn.Module): def __init__(self, ngpu): - super(Generator, self).__init__() + super().__init__() self.ngpu = ngpu self.main = nn.Sequential( # input is Z, going into a convolution @@ -164,7 +171,7 @@ def forward(self, input): class Discriminator(nn.Module): def __init__(self, ngpu): - super(Discriminator, self).__init__() + super().__init__() self.ngpu = ngpu self.main = nn.Sequential( # input is (nc) x 64 x 64 @@ -211,9 +218,6 @@ def forward(self, input): optimizerD = optim.Adam(netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) optimizerG = optim.Adam(netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) -[netD, netG], [optimizerD, optimizerG] = amp.initialize( - [netD, netG], [optimizerD, optimizerG], opt_level=opt.opt_level, num_losses=3) - for epoch in range(opt.niter): for i, data in enumerate(dataloader, 0): ############################ @@ -227,8 +231,7 @@ def forward(self, input): output = netD(real_cpu) errD_real = criterion(output, label) - with amp.scale_loss(errD_real, optimizerD, loss_id=0) as errD_real_scaled: - errD_real_scaled.backward() + errD_real.backward() D_x = output.mean().item() # train with fake @@ -237,8 +240,7 @@ def forward(self, input): label.fill_(fake_label) output = netD(fake.detach()) errD_fake = criterion(output, 
label) - with amp.scale_loss(errD_fake, optimizerD, loss_id=1) as errD_fake_scaled: - errD_fake_scaled.backward() + errD_fake.backward() D_G_z1 = output.mean().item() errD = errD_real + errD_fake optimizerD.step() @@ -250,25 +252,27 @@ def forward(self, input): label.fill_(real_label) # fake labels are real for generator cost output = netD(fake) errG = criterion(output, label) - with amp.scale_loss(errG, optimizerG, loss_id=2) as errG_scaled: - errG_scaled.backward() + errG.backward() D_G_z2 = output.mean().item() optimizerG.step() - print('[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f' - % (epoch, opt.niter, i, len(dataloader), - errD.item(), errG.item(), D_x, D_G_z1, D_G_z2)) + print( + '[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f' + % (epoch, opt.niter, i, len(dataloader), errD.item(), errG.item(), D_x, D_G_z1, D_G_z2) + ) if i % 100 == 0: - vutils.save_image(real_cpu, - '%s/real_samples.png' % opt.outf, - normalize=True) + vutils.save_image( + real_cpu, + '%s/real_samples.png' % opt.outf, + normalize=True, + ) fake = netG(fixed_noise) - vutils.save_image(fake.detach(), - '%s/amp_fake_samples_epoch_%03d.png' % (opt.outf, epoch), - normalize=True) + vutils.save_image( + fake.detach(), + '%s/amp_fake_samples_epoch_%03d.png' % (opt.outf, epoch), + normalize=True, + ) # do checkpointing torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % (opt.outf, epoch)) torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % (opt.outf, epoch)) - - From 4951c3e4c5a4100531faf6b69932a2871b67b0a8 Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Tue, 14 Feb 2023 20:38:12 -0800 Subject: [PATCH 4/9] add dtype parser Signed-off-by: Masaki Kozuki --- examples/dcgan/main_amp.py | 57 ++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/examples/dcgan/main_amp.py b/examples/dcgan/main_amp.py index 812c74699..9359147fa 100644 --- a/examples/dcgan/main_amp.py +++ b/examples/dcgan/main_amp.py @@ -31,6 +31,7 @@ parser.add_argument('--outf', default='.', help='folder to output images and model checkpoints') parser.add_argument('--manualSeed', type=int, help='manual seed') parser.add_argument('--classes', default='bedroom', help='comma separated list of classes for the lsun data set') +parser.add_argument('--dtype', type=str, default='float32', choices=('float32', 'float16', 'bfloat16')) opt = parser.parse_args() print(opt) @@ -116,6 +117,13 @@ ngf = opt.ngf ndf = opt.ndf +dtype = getattr(torch, opt.dtype) +enable_autocast, grad_scaler = False, None +if dtype != torch.float32: + enable_autocast = True + if dtype == torch.float16: + grad_scaler = torch.cuda.amp.GradScaler() + # custom weights initialization called on netG and netD def weights_init(m): @@ -227,34 +235,55 @@ def forward(self, input): netD.zero_grad() real_cpu = data[0].to(device) batch_size = real_cpu.size(0) - label = torch.full((batch_size,), real_label, device=device) + label = torch.full((batch_size,), real_label, device=device, dtype=torch.float32) - output = netD(real_cpu) - errD_real = criterion(output, label) - errD_real.backward() + with torch.autocast(device_type='cuda', dtype=dtype, enabled=enable_autocast): + output = netD(real_cpu) + errD_real = criterion(output, label) + if grad_scaler is None: + errD_real.backward() + else: + grad_scaler.scale(errD_real).backward() D_x = output.mean().item() # train with fake noise = torch.randn(batch_size, nz, 1, 1, device=device) - fake = netG(noise) - label.fill_(fake_label) - output = 
netD(fake.detach()) - errD_fake = criterion(output, label) - errD_fake.backward() + with torch.autocast(device_type='cuda', dtype=dtype, enabled=enable_autocast): + fake = netG(noise) + label.fill_(fake_label) + output = netD(fake.detach()) + errD_fake = criterion(output, label) + if grad_scaler is None: + errD_fake.backward() + else: + grad_scaler.scale(errD_fake).backward() D_G_z1 = output.mean().item() errD = errD_real + errD_fake - optimizerD.step() + + if grad_scaler is None: + optimizerD.step() + else: + grad_scaler.step(optimizerD) + grad_scaler.update() ############################ # (2) Update G network: maximize log(D(G(z))) ########################### netG.zero_grad() label.fill_(real_label) # fake labels are real for generator cost - output = netD(fake) - errG = criterion(output, label) - errG.backward() + with torch.autocast(device_type='cuda', dtype=dtype, enabled=enable_autocast): + output = netD(fake) + errG = criterion(output, label) + if grad_scaler is None: + errG.backward() + else: + grad_scaler.scale(errG).backward() D_G_z2 = output.mean().item() - optimizerG.step() + if grad_scaler is None: + optimizerG.step() + else: + grad_scaler.step(optimizerG) + grad_scaler.update() print( '[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f' From 38aa010ac741ad5bd3c17d9bf5702c2e39218878 Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Tue, 14 Feb 2023 21:56:06 -0800 Subject: [PATCH 5/9] display default values Signed-off-by: Masaki Kozuki --- examples/dcgan/main_amp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/dcgan/main_amp.py b/examples/dcgan/main_amp.py index 9359147fa..3d8abdf67 100644 --- a/examples/dcgan/main_amp.py +++ b/examples/dcgan/main_amp.py @@ -13,7 +13,7 @@ import torchvision.utils as vutils -parser = argparse.ArgumentParser() +parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--dataset', default='cifar10', help='cifar10 | lsun | mnist |imagenet | folder | lfw | fake') parser.add_argument('--dataroot', default='./', help='path to dataset') parser.add_argument('--workers', type=int, help='number of data loading workers', default=2) From 374d16d35193ef00478ee225e4dfab38b41d425d Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Wed, 15 Feb 2023 15:23:36 -0800 Subject: [PATCH 6/9] update README Signed-off-by: Masaki Kozuki --- examples/dcgan/README.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/examples/dcgan/README.md b/examples/dcgan/README.md index fb1829366..dabcd758d 100644 --- a/examples/dcgan/README.md +++ b/examples/dcgan/README.md @@ -1,3 +1,15 @@ # Mixed Precision DCGAN Training in PyTorch -__UNDER CONSTRUCTION__ +This example is based off of pytorch/examples' dcgan example script and the commit referenced is https://github.com/pytorch/examples/blob/79d71b87d5bb46dc58da2dac5bf8289a7a2c3295/dcgan/main.py or older. + +The differences are (a) this script can take a command line argument of `--dtype` to specify the dtype used during training and inference, and (b) this script uses CUDA by default. + +Speaking about `--dtype` option, `torch.float32` is the default value. + +`float16` has the script enable `torch.autocast`[^1] with device_type of CUDA and use `torch.cuda.amp.GradScaler`[^2]. +`bfloat16` has the script enable `torch.autocast` with `torch.bfloat16`. 
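Note on the float16 path described above: both optimizers share one `GradScaler`, and the multiple-optimizer recipe in PyTorch's AMP examples calls `scaler.update()` a single time per iteration, after every optimizer used that iteration has been stepped. A sketch of that shape for this script's losses and optimizers (forward passes under autocast elided; names mirror the script, illustration only):

```
grad_scaler.scale(errD_real).backward()
grad_scaler.scale(errD_fake).backward()
grad_scaler.step(optimizerD)

grad_scaler.scale(errG).backward()
grad_scaler.step(optimizerG)

grad_scaler.update()  # one update per iteration, after all optimizers have stepped
```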
+ +[^1]: https://pytorch.org/docs/stable/amp.html#torch.autocast +[^2]: https://pytorch.org/docs/stable/amp.html#gradient-scaling + +Use `-h` or `--help` to see all the available command line options. From a919edde87e32e092c85a5a76ce41ce7398f2225 Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Sun, 19 Feb 2023 18:11:20 -0800 Subject: [PATCH 7/9] copy `flat_dist_call` Signed-off-by: Masaki Kozuki --- .../optimizers/test_distributed_fused_lamb.py | 70 ++++++++++++++++--- 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/apex/contrib/test/optimizers/test_distributed_fused_lamb.py b/apex/contrib/test/optimizers/test_distributed_fused_lamb.py index f38f371b7..8f9cad419 100644 --- a/apex/contrib/test/optimizers/test_distributed_fused_lamb.py +++ b/apex/contrib/test/optimizers/test_distributed_fused_lamb.py @@ -1,12 +1,47 @@ -import os +from collections import OrderedDict import inspect + import torch +import torch.distributed as dist from torch.cuda.amp import GradScaler from torch.testing._internal import common_utils -from apex.parallel.distributed import flat_dist_call + from apex.contrib.optimizers.distributed_fused_lamb import DistributedFusedLAMB from apex.transformer.testing.distributed_test_base import NcclDistributedTestBase + +# apply_dist_call requires that tensors in 'bucket' are all the same type. +def apply_flat_dist_call(bucket, call, extra_args=None): + coalesced = torch._utils._flatten_dense_tensors(bucket) + if extra_args is not None: + call(coalesced, *extra_args) + else: + call(coalesced) + + if call is dist.all_reduce: + coalesced /= dist.get_world_size() + + for buf, synced in zip(bucket, torch._utils._unflatten_dense_tensors(coalesced, bucket)): + buf.copy_(synced) + + +def split_by_type(tensors): + buckets = OrderedDict() + for tensor in tensors: + tp = tensor.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(tensor) + return buckets + + +def flat_dist_call(tensors, call, extra_args=None): + buckets = split_by_type(tensors) + for tp in buckets: + bucket = buckets[tp] + apply_flat_dist_call(bucket, call, extra_args) + + def get_init_weights_func(): @torch.no_grad() def init_weights(m): @@ -14,10 +49,11 @@ def init_weights(m): m.weight.fill_(1.0) return init_weights + class ModelFoo(torch.nn.Module): def __init__(self): super(ModelFoo, self).__init__() - self.linear = torch.nn.Linear(128, 128, bias = False) + self.linear = torch.nn.Linear(128, 128, bias=False) self.loss = torch.nn.MSELoss() def forward(self, input_tensor, gt): @@ -25,8 +61,10 @@ def forward(self, input_tensor, gt): loss = self.loss(y, gt) return loss + # A test for distributed fused Lamb optimizer: run several iterations and see if loss decreases -# There are two instances of the same test because based on `world_size` the optimizer decides what collectives operation to use. +# There are two instances of the same test because based on `world_size` the optimizer decides +# what collectives operation to use. # If torch.distributed.get_world_size() == torch.cuda.device_count() it uses only `all_gather`. # If torch.distributed.get_world_size() < torch.cuda.device_count() it uses both `all_gather` and `reduce_scatter`. 
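The `flat_dist_call` helpers above coalesce same-dtype tensors into one contiguous buffer so a single collective call covers the whole bucket. A small self-contained illustration of the flatten/unflatten round trip they build on (using the same private `torch._utils` functions the helpers themselves call):

```
import torch

bucket = [torch.randn(3), torch.randn(5)]
flat = torch._utils._flatten_dense_tensors(bucket)  # one contiguous 1-D buffer of numel 8
# a single dist.broadcast / dist.all_reduce would operate on `flat` here
for buf, synced in zip(bucket, torch._utils._unflatten_dense_tensors(flat, bucket)):
    buf.copy_(synced)  # write the synchronized values back into the original tensors
```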
class NcclDistributedFusedLAMB(NcclDistributedTestBase): @@ -62,17 +100,23 @@ def test_distributed_fused_lamb(self, no_copy, opt_kwargs): param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta', 'LayerNorm'] optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + { + 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], + 'weight_decay': 0.01, + }, + { + 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], + 'weight_decay': 0.0, + }, ] if 'full_ar' not in opt_kwargs: opt_kwargs['full_ar'] = gpu_count == torch.cuda.device_count() - # Aidyn-A: not sure what parameters are the best for testing purposes, - # setting up whatever I think appropriate. + # Aidyn-A: not sure what parameters are the best for testing purposes, + # setting up whatever I think appropriate. optimizer = DistributedFusedLAMB( - optimizer_grouped_parameters, + optimizer_grouped_parameters, lr=0.1, betas=(0.9, 0.9), eps=1e-6, @@ -91,7 +135,9 @@ def test_distributed_fused_lamb(self, no_copy, opt_kwargs): optimizer._reduce_scatter_no_copy = no_copy optimizer._all_gather_no_copy = no_copy - flat_dist_call([param.data for param in model.parameters()], torch.distributed.broadcast, (0,) ) + with torch.no_grad(): + flat_dist_call( + [param for param in model.parameters()], torch.distributed.broadcast, (0,)) x = torch.randn(4096, 128, dtype=torch.float16).cuda() y = torch.randn(4096, 128, dtype=torch.float16).cuda() @@ -113,13 +159,15 @@ def test_distributed_fused_lamb(self, no_copy, opt_kwargs): self.assertTrue(losses == sorted(losses, reverse=True)) + common_utils.instantiate_parametrized_tests(NcclDistributedFusedLAMB) + class NcclDistributedFusedLAMB_partial_ar(NcclDistributedFusedLAMB): @property def world_size(self) -> int: return max(torch.cuda.device_count()-1, 1) + if __name__ == "__main__": common_utils.run_tests() - From 4a283659bf6d92c82b8c24b5f85efc468a765160 Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Thu, 23 Feb 2023 14:08:18 -0800 Subject: [PATCH 8/9] update readme Signed-off-by: Masaki Kozuki --- README.md | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index eba21bffd..fb4d273a9 100644 --- a/README.md +++ b/README.md @@ -6,25 +6,7 @@ The intent of Apex is to make up-to-date utilities available to users as quickly ## Full API Documentation: [https://nvidia.github.io/apex](https://nvidia.github.io/apex) -## [GTC 2019](https://github.com/mcarilli/mixed_precision_references/tree/master/GTC_2019) and [PyTorch DevCon 2019](https://github.com/mcarilli/mixed_precision_references/tree/master/PyTorch_Devcon_2019) Slides - -# Contents - -## 1. Amp: Automatic Mixed Precision - -**Removed. Use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)** - -## 2. Distributed Training - -**`apex.parallel.DistributedDataParallel` is removed. Use [`torch.nn.parallel.DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html?highlight=distributeddataparallel#torch.nn.parallel.DistributedDataParallel)** - -`apex.parallel.DistributedDataParallel` is a module wrapper, similar to -`torch.nn.parallel.DistributedDataParallel`. It enables convenient multiprocess distributed training, -optimized for NVIDIA's NCCL communication library. 
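The README text above directs users to `torch.nn.parallel.DistributedDataParallel`; a minimal migration sketch under assumed setup (`build_model` and `local_rank` are placeholders, and `torch.distributed` is assumed to be initialized already):

```
import torch

model = build_model().cuda()                                  # placeholder model constructor
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)  # native counterpart of apex.parallel.convert_syncbn_model
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])
```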
- -### Synchronized Batch Normalization - -**Removed. Use [`torch.nn.SyncBatchNorm`](https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html)** +We are going to update the documentation. # Installation Each [`apex.contrib`](./apex/contrib) module requires one or more install options other than `--cpp_ext` and `--cuda_ext`. @@ -41,7 +23,7 @@ See [the NGC documentation](https://docs.nvidia.com/deeplearning/frameworks/pyto ## From Source -To install Apex from source, we recommend using the nightly PyTorch obtainable from https://github.com/pytorch/pytorch. +To install apex from source, we recommend using the nightly PyTorch obtainable from https://github.com/pytorch/pytorch. The latest stable release obtainable from https://pytorch.org should also work. From 3865e2198a218a9e56742e83cce82513218c2f7b Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Fri, 24 Feb 2023 00:22:21 -0800 Subject: [PATCH 9/9] update docs Signed-off-by: Masaki Kozuki --- docs/source/advanced.rst | 219 -------------------------- docs/source/amp.rst | 288 ---------------------------------- docs/source/conf.py | 4 +- docs/source/contrib.rst | 166 ++++++++++++++++++++ docs/source/fp16_utils.rst | 59 ------- docs/source/index.rst | 32 +--- docs/source/layernorm.rst | 17 -- docs/source/normalization.rst | 11 ++ docs/source/optimizers.rst | 29 ++-- docs/source/parallel.rst | 25 --- docs/source/transormer.rst | 14 ++ 11 files changed, 208 insertions(+), 656 deletions(-) delete mode 100644 docs/source/advanced.rst delete mode 100644 docs/source/amp.rst create mode 100644 docs/source/contrib.rst delete mode 100644 docs/source/fp16_utils.rst delete mode 100644 docs/source/layernorm.rst create mode 100644 docs/source/normalization.rst delete mode 100644 docs/source/parallel.rst create mode 100644 docs/source/transormer.rst diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst deleted file mode 100644 index d6623e626..000000000 --- a/docs/source/advanced.rst +++ /dev/null @@ -1,219 +0,0 @@ -.. role:: hidden - :class: hidden-section - -Advanced Amp Usage -=================================== - -GANs ----- - -GANs are an interesting synthesis of several topics below. A `comprehensive example`_ -is under construction. - -.. _`comprehensive example`: - https://github.com/NVIDIA/apex/tree/master/examples/dcgan - -Gradient clipping ------------------ -Amp calls the params owned directly by the optimizer's ``param_groups`` the "master params." - -These master params may be fully or partially distinct from ``model.parameters()``. -For example, with `opt_level="O2"`_, ``amp.initialize`` casts most model params to FP16, -creates an FP32 master param outside the model for each newly-FP16 model param, -and updates the optimizer's ``param_groups`` to point to these FP32 params. - -The master params owned by the optimizer's ``param_groups`` may also fully coincide with the -model params, which is typically true for ``opt_level``\s ``O0``, ``O1``, and ``O3``. - -In all cases, correct practice is to clip the gradients of the params that are guaranteed to be -owned **by the optimizer's** ``param_groups``, instead of those retrieved via ``model.parameters()``. - -Also, if Amp uses loss scaling, gradients must be clipped after they have been unscaled -(which occurs during exit from the ``amp.scale_loss`` context manager). - -The following pattern should be correct for any ``opt_level``:: - - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - # Gradients are unscaled during context manager exit. 
- # Now it's safe to clip. Replace - # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) - # with - torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm) - # or - torch.nn.utils.clip_grad_value_(amp.master_params(optimizer), max_) - -Note the use of the utility function ``amp.master_params(optimizer)``, -which returns a generator-expression that iterates over the -params in the optimizer's ``param_groups``. - -Also note that ``clip_grad_norm_(amp.master_params(optimizer), max_norm)`` is invoked -*instead of*, not *in addition to*, ``clip_grad_norm_(model.parameters(), max_norm)``. - -.. _`opt_level="O2"`: - https://nvidia.github.io/apex/amp.html#o2-fast-mixed-precision - -Custom/user-defined autograd functions --------------------------------------- - -The old Amp API for `registering user functions`_ is still considered correct. Functions must -be registered before calling ``amp.initialize``. - -.. _`registering user functions`: - https://github.com/NVIDIA/apex/tree/master/apex/amp#annotating-user-functions - -Forcing particular layers/functions to a desired type ------------------------------------------------------ - -I'm still working on a generalizable exposure for this that won't require user-side code divergence -across different ``opt-level``\ s. - -Multiple models/optimizers/losses ---------------------------------- - -Initialization with multiple models/optimizers -********************************************** - -``amp.initialize``'s optimizer argument may be a single optimizer or a list of optimizers, -as long as the output you accept has the same type. -Similarly, the ``model`` argument may be a single model or a list of models, as long as the accepted -output matches. The following calls are all legal:: - - model, optim = amp.initialize(model, optim,...) - model, [optim0, optim1] = amp.initialize(model, [optim0, optim1],...) - [model0, model1], optim = amp.initialize([model0, model1], optim,...) - [model0, model1], [optim0, optim1] = amp.initialize([model0, model1], [optim0, optim1],...) - -Backward passes with multiple optimizers -**************************************** - -Whenever you invoke a backward pass, the ``amp.scale_loss`` context manager must receive -**all the optimizers that own any params for which the current backward pass is creating gradients.** -This is true even if each optimizer owns only some, but not all, of the params that are about to -receive gradients. - -If, for a given backward pass, there's only one optimizer whose params are about to receive gradients, -you may pass that optimizer directly to ``amp.scale_loss``. Otherwise, you must pass the -list of optimizers whose params are about to receive gradients. Example with 3 losses and 2 optimizers:: - - # loss0 accumulates gradients only into params owned by optim0: - with amp.scale_loss(loss0, optim0) as scaled_loss: - scaled_loss.backward() - - # loss1 accumulates gradients only into params owned by optim1: - with amp.scale_loss(loss1, optim1) as scaled_loss: - scaled_loss.backward() - - # loss2 accumulates gradients into some params owned by optim0 - # and some params owned by optim1 - with amp.scale_loss(loss2, [optim0, optim1]) as scaled_loss: - scaled_loss.backward() - -Optionally have Amp use a different loss scaler per-loss -******************************************************** - -By default, Amp maintains a single global loss scaler that will be used for all backward passes -(all invocations of ``with amp.scale_loss(...)``). 
No additional arguments to ``amp.initialize`` -or ``amp.scale_loss`` are required to use the global loss scaler. The code snippets above with -multiple optimizers/backward passes use the single global loss scaler under the hood, -and they should "just work." - -However, you can optionally tell Amp to maintain a loss scaler per-loss, which gives Amp increased -numerical flexibility. This is accomplished by supplying the ``num_losses`` argument to -``amp.initialize`` (which tells Amp how many backward passes you plan to invoke, and therefore -how many loss scalers Amp should create), then supplying the ``loss_id`` argument to each of your -backward passes (which tells Amp the loss scaler to use for this particular backward pass):: - - model, [optim0, optim1] = amp.initialize(model, [optim0, optim1], ..., num_losses=3) - - with amp.scale_loss(loss0, optim0, loss_id=0) as scaled_loss: - scaled_loss.backward() - - with amp.scale_loss(loss1, optim1, loss_id=1) as scaled_loss: - scaled_loss.backward() - - with amp.scale_loss(loss2, [optim0, optim1], loss_id=2) as scaled_loss: - scaled_loss.backward() - -``num_losses`` and ``loss_id``\ s should be specified purely based on the set of -losses/backward passes. The use of multiple optimizers, or association of single or -multiple optimizers with each backward pass, is unrelated. - -Gradient accumulation across iterations ---------------------------------------- - -The following should "just work," and properly accommodate multiple models/optimizers/losses, as well as -gradient clipping via the `instructions above`_:: - - # If your intent is to simulate a larger batch size using gradient accumulation, - # you can divide the loss by the number of accumulation iterations (so that gradients - # will be averaged over that many iterations): - loss = loss/iters_to_accumulate - - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - - # Every iters_to_accumulate iterations, call step() and reset gradients: - if iter%iters_to_accumulate == 0: - # Gradient clipping if desired: - # torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm) - optimizer.step() - optimizer.zero_grad() - -As a minor performance optimization, you can pass ``delay_unscale=True`` -to ``amp.scale_loss`` until you're ready to ``step()``. You should only attempt ``delay_unscale=True`` -if you're sure you know what you're doing, because the interaction with gradient clipping and -multiple models/optimizers/losses can become tricky.:: - - if iter%iters_to_accumulate == 0: - # Every iters_to_accumulate iterations, unscale and step - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - optimizer.step() - optimizer.zero_grad() - else: - # Otherwise, accumulate gradients, don't unscale or step. - with amp.scale_loss(loss, optimizer, delay_unscale=True) as scaled_loss: - scaled_loss.backward() - -.. _`instructions above`: - https://nvidia.github.io/apex/advanced.html#gradient-clipping - -Custom data batch types ------------------------ - -The intention of Amp is that you never need to cast your input data manually, regardless of -``opt_level``. Amp accomplishes this by patching any models' ``forward`` methods to cast -incoming data appropriately for the ``opt_level``. But to cast incoming data, -Amp needs to know how. The patched ``forward`` will recognize and cast floating-point Tensors -(non-floating-point Tensors like IntTensors are not touched) and -Python containers of floating-point Tensors. 
However, if you wrap your Tensors in a custom class, -the casting logic doesn't know how to drill -through the tough custom shell to access and cast the juicy Tensor meat within. You need to tell -Amp how to cast your custom batch class, by assigning it a ``to`` method that accepts a ``torch.dtype`` -(e.g., ``torch.float16`` or ``torch.float32``) and returns an instance of the custom batch cast to -``dtype``. The patched ``forward`` checks for the presence of your ``to`` method, and will -invoke it with the correct type for the ``opt_level``. - -Example:: - - class CustomData(object): - def __init__(self): - self.tensor = torch.cuda.FloatTensor([1,2,3]) - - def to(self, dtype): - self.tensor = self.tensor.to(dtype) - return self - -.. warning:: - - Amp also forwards numpy ndarrays without casting them. If you send input data as a raw, unwrapped - ndarray, then later use it to create a Tensor within your ``model.forward``, this Tensor's type will - not depend on the ``opt_level``, and may or may not be correct. Users are encouraged to pass - castable data inputs (Tensors, collections of Tensors, or custom classes with a ``to`` method) - wherever possible. - -.. note:: - - Amp does not call ``.cuda()`` on any Tensors for you. Amp assumes that your original script - is already set up to move Tensors from the host to the device as needed. diff --git a/docs/source/amp.rst b/docs/source/amp.rst deleted file mode 100644 index 4bc140518..000000000 --- a/docs/source/amp.rst +++ /dev/null @@ -1,288 +0,0 @@ -.. role:: hidden - :class: hidden-section - -apex.amp -=================================== - -This page documents the updated API for Amp (Automatic Mixed Precision), -a tool to enable Tensor Core-accelerated training in only 3 lines of Python. - -A `runnable, comprehensive Imagenet example`_ demonstrating good practices can be found -on the Github page. - -GANs are a tricky case that many people have requested. A `comprehensive DCGAN example`_ -is under construction. - -If you already implemented Amp based on the instructions below, but it isn't behaving as expected, -please review `Advanced Amp Usage`_ to see if any topics match your use case. If that doesn't help, -`file an issue`_. - -.. _`file an issue`: - https://github.com/NVIDIA/apex/issues - -``opt_level``\ s and Properties -------------------------------- - -Amp allows users to easily experiment with different pure and mixed precision modes. -Commonly-used default modes are chosen by -selecting an "optimization level" or ``opt_level``; each ``opt_level`` establishes a set of -properties that govern Amp's implementation of pure or mixed precision training. -Finer-grained control of how a given ``opt_level`` behaves can be achieved by passing values for -particular properties directly to ``amp.initialize``. These manually specified values -override the defaults established by the ``opt_level``. - -Example:: - - # Declare model and optimizer as usual, with default (FP32) precision - model = torch.nn.Linear(D_in, D_out).cuda() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - - # Allow Amp to perform casts as required by the opt_level - model, optimizer = amp.initialize(model, optimizer, opt_level="O1") - ... - # loss.backward() becomes: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - ... - -Users **should not** manually cast their model or data to ``.half()``, regardless of what ``opt_level`` -or properties are chosen. 
Amp intends that users start with an existing default (FP32) script, -add the three lines corresponding to the Amp API, and begin training with mixed precision. -Amp can also be disabled, in which case the original script will behave exactly as it used to. -In this way, there's no risk adhering to the Amp API, and a lot of potential performance benefit. - -.. note:: - Because it's never necessary to manually cast your model (aside from the call ``amp.initialize``) - or input data, a script that adheres to the new API - can switch between different ``opt-level``\ s without having to make any other changes. - -.. _`runnable, comprehensive Imagenet example`: - https://github.com/NVIDIA/apex/tree/master/examples/imagenet - -.. _`comprehensive DCGAN example`: - https://github.com/NVIDIA/apex/tree/master/examples/dcgan - -.. _`Advanced Amp Usage`: - https://nvidia.github.io/apex/advanced.html - -Properties -********** - -Currently, the under-the-hood properties that govern pure or mixed precision training are the following: - -- ``cast_model_type``: Casts your model's parameters and buffers to the desired type. -- ``patch_torch_functions``: Patch all Torch functions and Tensor methods to perform Tensor Core-friendly ops like GEMMs and convolutions in FP16, and any ops that benefit from FP32 precision in FP32. -- ``keep_batchnorm_fp32``: To enhance precision and enable cudnn batchnorm (which improves performance), it's often beneficial to keep batchnorm weights in FP32 even if the rest of the model is FP16. -- ``master_weights``: Maintain FP32 master weights to accompany any FP16 model weights. FP32 master weights are stepped by the optimizer to enhance precision and capture small gradients. -- ``loss_scale``: If ``loss_scale`` is a float value, use this value as the static (fixed) loss scale. If ``loss_scale`` is the string ``"dynamic"``, adaptively adjust the loss scale over time. Dynamic loss scale adjustments are performed by Amp automatically. - -Again, you often don't need to specify these properties by hand. Instead, select an ``opt_level``, -which will set them up for you. After selecting an ``opt_level``, you can optionally pass property -kwargs as manual overrides. - -If you attempt to override a property that does not make sense for the selected ``opt_level``, -Amp will raise an error with an explanation. For example, selecting ``opt_level="O1"`` combined with -the override ``master_weights=True`` does not make sense. ``O1`` inserts casts -around Torch functions rather than model weights. Data, activations, and weights are recast -out-of-place on the fly as they flow through patched functions. Therefore, the model weights themselves -can (and should) remain FP32, and there is no need to maintain separate FP32 master weights. - -``opt_level``\ s -**************** - -Recognized ``opt_level``\ s are ``"O0"``, ``"O1"``, ``"O2"``, and ``"O3"``. - -``O0`` and ``O3`` are not true mixed precision, but they are useful for establishing accuracy and -speed baselines, respectively. - -``O1`` and ``O2`` are different implementations of mixed precision. Try both, and see -what gives the best speedup and accuracy for your model. - -``O0``: FP32 training -^^^^^^^^^^^^^^^^^^^^^^ -Your incoming model should be FP32 already, so this is likely a no-op. -``O0`` can be useful to establish an accuracy baseline. 
- -| Default properties set by ``O0``: -| ``cast_model_type=torch.float32`` -| ``patch_torch_functions=False`` -| ``keep_batchnorm_fp32=None`` (effectively, "not applicable," everything is FP32) -| ``master_weights=False`` -| ``loss_scale=1.0`` -| -| - -``O1``: Mixed Precision (recommended for typical use) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Patch all Torch functions and Tensor methods to cast their inputs according to a whitelist-blacklist -model. Whitelist ops (for example, Tensor Core-friendly ops like GEMMs and convolutions) are performed -in FP16. Blacklist ops that benefit from FP32 precision (for example, softmax) -are performed in FP32. ``O1`` also uses dynamic loss scaling, unless overridden. - -| Default properties set by ``O1``: -| ``cast_model_type=None`` (not applicable) -| ``patch_torch_functions=True`` -| ``keep_batchnorm_fp32=None`` (again, not applicable, all model weights remain FP32) -| ``master_weights=None`` (not applicable, model weights remain FP32) -| ``loss_scale="dynamic"`` -| -| - -``O2``: "Almost FP16" Mixed Precision -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``O2`` casts the model weights to FP16, -patches the model's ``forward`` method to cast input -data to FP16, keeps batchnorms in FP32, maintains FP32 master weights, -updates the optimizer's ``param_groups`` so that the ``optimizer.step()`` -acts directly on the FP32 weights (followed by FP32 master weight->FP16 model weight -copies if necessary), -and implements dynamic loss scaling (unless overridden). -Unlike ``O1``, ``O2`` does not patch Torch functions or Tensor methods. - -| Default properties set by ``O2``: -| ``cast_model_type=torch.float16`` -| ``patch_torch_functions=False`` -| ``keep_batchnorm_fp32=True`` -| ``master_weights=True`` -| ``loss_scale="dynamic"`` -| -| - -``O3``: FP16 training -^^^^^^^^^^^^^^^^^^^^^^ -``O3`` may not achieve the stability of the true mixed precision options ``O1`` and ``O2``. -However, it can be useful to establish a speed baseline for your model, against which -the performance of ``O1`` and ``O2`` can be compared. If your model uses batch normalization, -to establish "speed of light" you can try ``O3`` with the additional property override -``keep_batchnorm_fp32=True`` (which enables cudnn batchnorm, as stated earlier). - -| Default properties set by ``O3``: -| ``cast_model_type=torch.float16`` -| ``patch_torch_functions=False`` -| ``keep_batchnorm_fp32=False`` -| ``master_weights=False`` -| ``loss_scale=1.0`` -| -| - -Unified API ------------ - -.. automodule:: apex.amp -.. currentmodule:: apex.amp - -.. autofunction:: initialize - -.. autofunction:: scale_loss - -.. autofunction:: master_params - -Checkpointing -------------- - -To properly save and load your amp training, we introduce the ``amp.state_dict()``, which contains all ``loss_scaler``\ s and their corresponding unskipped steps, as well as ``amp.load_state_dict()`` to restore these attributes. - -In order to get bitwise accuracy, we recommend the following workflow:: - - # Initialization - opt_level = 'O1' - model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) - - # Train your model - ... - - # Save checkpoint - checkpoint = { - 'model': model.state_dict(), - 'optimizer': optimizer.state_dict(), - 'amp': amp.state_dict() - } - torch.save(checkpoint, 'amp_checkpoint.pt') - ... - - # Restore - model = ... - optimizer = ... 
- checkpoint = torch.load('amp_checkpoint.pt') - - model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) - model.load_state_dict(checkpoint['model']) - optimizer.load_state_dict(checkpoint['optimizer']) - amp.load_state_dict(checkpoint['amp']) - - # Continue training - ... - -Note that we recommend restoring the model using the same ``opt_level``. Also note that we recommend calling the ``load_state_dict`` methods after ``amp.initialize``. - -Advanced use cases ------------------- - -The unified Amp API supports gradient accumulation across iterations, -multiple backward passes per iteration, multiple models/optimizers, -custom/user-defined autograd functions, and custom data batch classes. Gradient clipping and GANs also -require special treatment, but this treatment does not need to change -for different ``opt_level``\ s. Further details can be found here: - -.. toctree:: - :maxdepth: 1 - - advanced - -Transition guide for old API users ----------------------------------- - -We strongly encourage moving to the new Amp API, because it's more versatile, easier to use, and future proof. The original :class:`FP16_Optimizer` and the old "Amp" API are deprecated, and subject to removal at at any time. - -For users of the old "Amp" API -****************************** - -In the new API, ``opt-level O1`` performs the same patching of the Torch namespace as the old thing -called "Amp." -However, the new API allows static or dynamic loss scaling, while the old API only allowed dynamic loss scaling. - -In the new API, the old call to ``amp_handle = amp.init()``, and the returned ``amp_handle``, are no -longer exposed or necessary. The new ``amp.initialize()`` does the duty of ``amp.init()`` (and more). -Therefore, any existing calls to ``amp_handle = amp.init()`` should be deleted. - -The functions formerly exposed through ``amp_handle`` are now free -functions accessible through the ``amp`` module. - -The backward context manager must be changed accordingly:: - - # old API - with amp_handle.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - -> - # new API - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - -For now, the deprecated "Amp" API documentation can still be found on the Github README: https://github.com/NVIDIA/apex/tree/master/apex/amp. The old API calls that `annotate user functions`_ to run -with a particular precision are still honored by the new API. - -.. _`annotate user functions`: - https://github.com/NVIDIA/apex/tree/master/apex/amp#annotating-user-functions - - -For users of the old FP16_Optimizer -*********************************** - -``opt-level O2`` is equivalent to :class:`FP16_Optimizer` with ``dynamic_loss_scale=True``. -Once again, the backward pass must be changed to the unified version:: - - optimizer.backward(loss) - -> - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - -One annoying aspect of FP16_Optimizer was that the user had to manually convert their model to half -(either by calling ``.half()`` on it, or using a function or module wrapper from -``apex.fp16_utils``), and also manually call ``.half()`` on input data. **Neither of these are -necessary in the new API. 
No matter what --opt-level -you choose, you can and should simply build your model and pass input data in the default FP32 format.** -The new Amp API will perform the right conversions during -``model, optimizer = amp.initialize(model, optimizer, opt_level=....)`` based on the ``--opt-level`` -and any overridden flags. Floating point input data may be FP32 or FP16, but you may as well just -let it be FP16, because the ``model`` returned by ``amp.initialize`` will have its ``forward`` -method patched to cast the input data appropriately. diff --git a/docs/source/conf.py b/docs/source/conf.py index 4477a28e1..f943ef325 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -64,7 +64,7 @@ # General information about the project. project = 'Apex' -copyright = '2018' +copyright = '2018-' author = 'Christian Sarofeen, Natalia Gimelshein, Michael Carilli, Raul Puri' # The version info for the project you're documenting, acts as replacement for @@ -84,7 +84,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +# language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. diff --git a/docs/source/contrib.rst b/docs/source/contrib.rst new file mode 100644 index 000000000..984f71241 --- /dev/null +++ b/docs/source/contrib.rst @@ -0,0 +1,166 @@ +.. module:: apex.contrib + +apex.contrib +============ + +Bottleneck +---------- + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + apex.contrib.bottleneck.Bottleneck + apex.contrib.bottleneck.SpatialBottleneck + apex.contrib.bottleneck.HaloExchangerNoComm + apex.contrib.bottleneck.HaloExchangerAllGather + apex.contrib.bottleneck.HaloExchangerSendRecv + apex.contrib.bottleneck.HaloExchangerPeer + + +Clip Grad +--------- + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + apex.contrib.clip_grad.clip_grad_norm_ + + +cuDNN frontend based Conv-Bias-ReLU +----------------------------------- + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + apex.contrib.conv_bias_relu.ConvBiasReLU + apex.contrib.conv_bias_relu.ConvBias + apex.contrib.conv_bias_relu.ConvBiasMaskReLU + + +cuDNN based 2D Group Batch Normalization +---------------------------------------- + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + apex.contrib.cudnn_gbn.GroupBatchNorm2d + + +Fused MultiHead Attention +------------------------- + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + apex.contrib.fmha.fmha.FMHA + + +Focal Loss +---------- + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + apex.contrib.focal_loss.focal_loss.focal_loss + + +Group Batch Normalization +------------------------- + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + apex.contrib.groupbn.BatchNorm2d_NHWC + + +2D Index Multiply +----------------- + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + apex.contrib.index_mul_2d.index_mul_2d + + +Layer Normalization +------------------- + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + apex.contrib.layer_norm.FastLayerNorm + + +MultiHead Attention +------------------- + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + apex.contrib.multihead_attn.SelfMultiheadAttn + apex.contrib.multihead_attn.EncdecMultiheadAttn + apex.contrib.multihead_attn.fast_mask_softmax_dropout_func + + +Optimizers +---------- + +.. 
autosummary:: + :toctree: generated/ + :nosignatures: + + apex.contrib.optimizers.distributed_fused_adam.DistributedFusedAdam + apex.contrib.optimizers.distributed_fused_lamb.DistributedFusedLAMB + + +Peer Memory +----------- + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + apex.contrib.peer_memory.PeerMemoryPool + apex.contrib.peer_memory.PeerHaloExchanger1d + + +Sparsity +-------- + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + apex.contrib.sparsity.create_mask + apex.contrib.sparsity.ASP + + +Transducer +---------- + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + apex.contrib.transducer.TransducerJoint + apex.contrib.transducer.TransducerLoss + + +Cross Entropy +------------- + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + apex.contrib.xentropy.SoftmaxCrossEntropyLoss diff --git a/docs/source/fp16_utils.rst b/docs/source/fp16_utils.rst deleted file mode 100644 index b6b3da5f8..000000000 --- a/docs/source/fp16_utils.rst +++ /dev/null @@ -1,59 +0,0 @@ -.. role:: hidden - :class: hidden-section - -apex.fp16_utils -=================================== - -This submodule contains utilities designed to streamline the mixed precision training recipe -presented by NVIDIA `on Parallel Forall`_ and in GTC 2018 Sessions -`Training Neural Networks with Mixed Precision: Theory and Practice`_ and -`Training Neural Networks with Mixed Precision: Real Examples`_. -For Pytorch users, Real Examples in particular is recommended. - -Full runnable Python scripts demonstrating ``apex.fp16_utils`` -can be found on the Github page: - -| `Simple FP16_Optimizer demos`_ -| -| `Distributed Mixed Precision Training with imagenet`_ -| -| `Mixed Precision Training with word_language_model`_ -| -| - -.. _`on Parallel Forall`: - https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/ -.. _`Training Neural Networks with Mixed Precision: Theory and Practice`: - http://on-demand.gputechconf.com/gtc/2018/video/S8923/ -.. _`Training Neural Networks with Mixed Precision: Real Examples`: - http://on-demand.gputechconf.com/gtc/2018/video/S81012/ -.. _`Simple FP16_Optimizer demos`: - https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple -.. _`Distributed Mixed Precision Training with imagenet`: - https://github.com/NVIDIA/apex/tree/master/examples/imagenet -.. _`Mixed Precision Training with word_language_model`: - https://github.com/NVIDIA/apex/tree/master/examples/word_language_model - -.. automodule:: apex.fp16_utils -.. currentmodule:: apex.fp16_utils - -Automatic management of master params + loss scaling ----------------------------------------------------- - -.. autoclass:: FP16_Optimizer - :members: - -.. autoclass:: LossScaler - :members: - -.. autoclass:: DynamicLossScaler - :members: - -Manual master parameter management ----------------------------------- - -.. autofunction:: prep_param_lists - -.. autofunction:: master_params_to_model_params - -.. autofunction:: model_grads_to_master_grads diff --git a/docs/source/index.rst b/docs/source/index.rst index c7efc1681..7cec15341 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -5,46 +5,24 @@ :github_url: https://github.com/nvidia/apex -Apex (A PyTorch Extension) +APEX (A PyTorch Extension) =================================== -This site contains the API documentation for Apex (https://github.com/nvidia/apex), -a Pytorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training. 
+This site contains the API documentation for APEX (https://github.com/nvidia/apex),
+a PyTorch extension with NVIDIA-maintained utilities. Some of the code here will be included in upstream PyTorch eventually. The intention of APEX is to make up-to-date utilities available to users as quickly as possible.
 Installation instructions can be found here: https://github.com/NVIDIA/apex#quick-start.
 Some other useful material, including GTC 2019 and Pytorch DevCon 2019 Slides, can be found here: https://github.com/mcarilli/mixed_precision_references.
-.. toctree::
-   :maxdepth: 1
-   :caption: AMP: Automatic Mixed Precision
-
-   amp
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Distributed Training
-
-   parallel
-
 .. toctree::
    :maxdepth: 1
    :caption: Fused Optimizers
   
    optimizers
+   normalization
+   contrib
-.. toctree::
-   :maxdepth: 1
-   :caption: Fused Layer Norm
-
-   layernorm
-
-.. .. toctree::
-   :maxdepth: 1
-   :caption: Deprecated mixed precision API
-   fp16_util
-
-.. RNN
 Indices and tables
 ==================
diff --git a/docs/source/layernorm.rst b/docs/source/layernorm.rst
deleted file mode 100644
index 6eedb4ed2..000000000
--- a/docs/source/layernorm.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-.. role:: hidden
-    :class: hidden-section
-
-apex.normalization.fused_layer_norm
-===================================
-
-.. automodule:: apex.normalization
-.. currentmodule:: apex.normalization
-
-.. FusedAdam
-    ----------
-
-.. autoclass:: FusedLayerNorm
-    :members:
-
-.. autoclass:: FusedRMSNorm
-    :members:
diff --git a/docs/source/normalization.rst b/docs/source/normalization.rst
new file mode 100644
index 000000000..b5d5e3b8f
--- /dev/null
+++ b/docs/source/normalization.rst
@@ -0,0 +1,11 @@
+.. module:: apex.normalization
+
+apex.normalization
+==================
+
+.. autosummary::
+    :toctree:
+    :nosignatures:
+
+    apex.normalization.FusedLayerNorm
+    apex.normalization.FusedRMSNorm
diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst
index 407f07705..936eccec8 100644
--- a/docs/source/optimizers.rst
+++ b/docs/source/optimizers.rst
@@ -1,23 +1,14 @@
-.. role:: hidden
-    :class: hidden-section
+.. module:: apex.optimizers
 
 apex.optimizers
-===================================
+===============
 
-.. automodule:: apex.optimizers
-.. currentmodule:: apex.optimizers
+.. autosummary::
+    :toctree:
+    :nosignatures:
 
-.. FusedAdam
-    ----------
-
-.. autoclass:: FusedAdam
-    :members:
-
-.. autoclass:: FusedLAMB
-    :members:
-
-.. autoclass:: FusedNovoGrad
-    :members:
-
-.. autoclass:: FusedSGD
-    :members:
+    apex.optimizers.FusedSGD
+    apex.optimizers.FusedAdam
+    apex.optimizers.FusedLAMB
+    apex.optimizers.FusedMixedPrecisionLamb
+    apex.optimizers.FusedNovoGrad
diff --git a/docs/source/parallel.rst b/docs/source/parallel.rst
deleted file mode 100644
index 73759eeb9..000000000
--- a/docs/source/parallel.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-.. role:: hidden
-    :class: hidden-section
-
-apex.parallel
-===================================
-
-.. automodule:: apex.parallel
-.. currentmodule:: apex.parallel
-
-.. DistributedDataParallel
-    ----------
-
-.. autoclass:: DistributedDataParallel
-    :members:
-
-.. autoclass:: Reducer
-    :members:
-
-.. autoclass:: SyncBatchNorm
-    :members:
-
-Utility functions
-----------------------------------
-
-.. autofunction:: convert_syncbn_model
diff --git a/docs/source/transformer.rst b/docs/source/transformer.rst
new file mode 100644
index 000000000..b1fb7c72a
--- /dev/null
+++ b/docs/source/transformer.rst
@@ -0,0 +1,14 @@
+.. module:: apex.transformer
+
+apex.transformer
+================
+
+.. autosummary::
+    :toctree:
+    :nosignatures:
+
+    apex.transformer.functional
+    apex.transformer.parallel_state
+    apex.transformer.pipeline_parallel
+    apex.transformer.tensor_parallel
+    apex.transformer.utils
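
The README passage removed at the top of this patch describes the deprecated ``amp.initialize`` entry point. For reference only (not part of the patch), here is a minimal sketch of the same training-step pattern expressed with upstream PyTorch's native AMP (``torch.cuda.amp``); the model, optimizer, loss, and data below are placeholders.

```python
# Minimal sketch of a mixed precision training step with torch.cuda.amp.
# Requires a CUDA device; model/optimizer/data are illustrative placeholders.
import torch

model = torch.nn.Linear(512, 512).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
criterion = torch.nn.MSELoss()
scaler = torch.cuda.amp.GradScaler()

for _ in range(10):
    inputs = torch.randn(64, 512, device="cuda")   # FP32 inputs are fine;
    targets = torch.randn(64, 512, device="cuda")  # autocast handles casting
    optimizer.zero_grad(set_to_none=True)
    with torch.cuda.amp.autocast():
        loss = criterion(model(inputs), targets)
    scaler.scale(loss).backward()  # scaled backward, analogous to amp.scale_loss
    scaler.step(optimizer)
    scaler.update()
```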
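The ``apex.optimizers`` classes indexed in the new ``optimizers.rst`` are intended as drop-in replacements for their ``torch.optim`` counterparts. A minimal usage sketch, assuming an apex build with CUDA extensions and placeholder model/hyperparameters:

```python
# Hypothetical drop-in usage of a fused optimizer; requires apex built with
# CUDA extensions, and the model and learning rate are placeholders.
import torch
from apex.optimizers import FusedAdam  # FusedSGD / FusedLAMB / FusedNovoGrad are analogous

model = torch.nn.Linear(1024, 1024).cuda()  # fused optimizers operate on CUDA parameters
optimizer = FusedAdam(model.parameters(), lr=1e-3, weight_decay=0.01)

loss = model(torch.randn(32, 1024, device="cuda")).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```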
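The fused normalization layers indexed in the new ``normalization.rst`` mirror the ``torch.nn.LayerNorm`` constructor convention. A small sketch, with purely illustrative shapes and device placement:

```python
# Sketch of the fused normalization layers; hidden size and batch/sequence
# dimensions are placeholders.
import torch
from apex.normalization import FusedLayerNorm, FusedRMSNorm

hidden_size = 768
layer_norm = FusedLayerNorm(hidden_size).cuda()  # mirrors torch.nn.LayerNorm
rms_norm = FusedRMSNorm(hidden_size).cuda()      # LayerNorm variant without mean centering

x = torch.randn(8, 128, hidden_size, device="cuda")
y = layer_norm(x)
z = rms_norm(x)
```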