diff --git a/vyper/ir/compile_ir.py b/vyper/ir/compile_ir.py
index 4c68aa2c8f..ddd1940365 100644
--- a/vyper/ir/compile_ir.py
+++ b/vyper/ir/compile_ir.py
@@ -1033,6 +1033,9 @@ def _stack_peephole_opts(assembly):
         if assembly[i] == "SWAP1" and assembly[i + 1].lower() in COMMUTATIVE_OPS:
             changed = True
             del assembly[i]
+        if assembly[i] == "DUP1" and assembly[i + 1] == "SWAP1":
+            changed = True
+            del assembly[i + 1]
         i += 1
 
     return changed
diff --git a/vyper/venom/__init__.py b/vyper/venom/__init__.py
index afd79fc44f..3c4ffb26a7 100644
--- a/vyper/venom/__init__.py
+++ b/vyper/venom/__init__.py
@@ -10,6 +10,7 @@
 from vyper.venom.function import IRFunction
 from vyper.venom.ir_node_to_venom import ir_node_to_venom
 from vyper.venom.passes.algebraic_optimization import AlgebraicOptimizationPass
+from vyper.venom.passes.normalization import NormalizationPass
 from vyper.venom.passes.branch_optimization import BranchOptimizationPass
 from vyper.venom.passes.dft import DFTPass
 from vyper.venom.passes.extract_literals import ExtractLiteralsPass
@@ -19,6 +20,7 @@
 from vyper.venom.passes.sccp import SCCP
 from vyper.venom.passes.simplify_cfg import SimplifyCFGPass
 from vyper.venom.passes.store_elimination import StoreElimination
+from vyper.venom.passes.store_expansion import StoreExpansionPass
 from vyper.venom.venom_to_assembly import VenomCompiler
 
 DEFAULT_OPT_LEVEL = OptimizationLevel.default()
@@ -53,9 +55,15 @@ def _run_passes(fn: IRFunction, optimize: OptimizationLevel) -> None:
     StoreElimination(ac, fn).run_pass()
     SimplifyCFGPass(ac, fn).run_pass()
     AlgebraicOptimizationPass(ac, fn).run_pass()
+
+    NormalizationPass(ac, fn).run_pass()
     BranchOptimizationPass(ac, fn).run_pass()
-    ExtractLiteralsPass(ac, fn).run_pass()
     RemoveUnusedVariablesPass(ac, fn).run_pass()
+
+    # reorder and prepare for stack scheduling
+    DFTPass(ac, fn).run_pass()
+    StoreExpansionPass(ac, fn).run_pass()
+    ExtractLiteralsPass(ac, fn).run_pass()
     DFTPass(ac, fn).run_pass()
 
 
diff --git a/vyper/venom/analysis/liveness.py b/vyper/venom/analysis/liveness.py
index 2a471bc8be..f8a6fbd437 100644
--- a/vyper/venom/analysis/liveness.py
+++ b/vyper/venom/analysis/liveness.py
@@ -65,7 +65,8 @@ def _calculate_out_vars(self, bb: IRBasicBlock) -> bool:
         bb.out_vars = OrderedSet()
         for out_bb in bb.cfg_out:
             target_vars = self.input_vars_from(bb, out_bb)
-            bb.out_vars = bb.out_vars.union(target_vars)
+            bb.out_vars |= target_vars
+
         return out_vars != bb.out_vars
 
     # calculate the input variables into self from source
diff --git a/vyper/venom/basicblock.py b/vyper/venom/basicblock.py
index 1199579b3f..5e715746c9 100644
--- a/vyper/venom/basicblock.py
+++ b/vyper/venom/basicblock.py
@@ -210,7 +210,6 @@ class IRInstruction:
     # set of live variables at this instruction
     liveness: OrderedSet[IRVariable]
     parent: "IRBasicBlock"
-    fence_id: int
     annotation: Optional[str]
     ast_source: Optional[IRnode]
     error_msg: Optional[str]
@@ -227,7 +226,6 @@ def __init__(
         self.operands = list(operands)  # in case we get an iterator
         self.output = output
         self.liveness = OrderedSet()
-        self.fence_id = -1
         self.annotation = None
         self.ast_source = None
         self.error_msg = None
diff --git a/vyper/venom/passes/dft.py b/vyper/venom/passes/dft.py
index f45a60079c..42bc7df07e 100644
--- a/vyper/venom/passes/dft.py
+++ b/vyper/venom/passes/dft.py
@@ -1,81 +1,224 @@
+from collections import defaultdict
+from dataclasses import asdict, dataclass
+
 from vyper.utils import OrderedSet
 from vyper.venom.analysis.dfg import DFGAnalysis
+from vyper.venom.analysis.liveness import LivenessAnalysis
 from vyper.venom.basicblock import IRBasicBlock, IRInstruction, IRVariable
 from vyper.venom.function import IRFunction
 from vyper.venom.passes.base_pass import IRPass
 
+_ALL = ("storage", "transient", "memory", "immutables", "balance", "returndata")
+
+writes = {  # opcode -> effect, or tuple of effects, the opcode may modify
+    "sstore": "storage",
+    "tstore": "transient",
+    "mstore": "memory",
+    "istore": "immutables",
+    "call": _ALL,
+    "delegatecall": _ALL,
+    "staticcall": ("memory", "returndata"),  # fills the output buffer and replaces returndata
+    "create": _ALL,
+    "create2": _ALL,
+    "invoke": _ALL,  # could be smarter, look up the effects of the invoked function
+    "dloadbytes": "memory",
+    "returndatacopy": "memory",
+    "calldatacopy": "memory",
+    "codecopy": "memory",
+    "extcodecopy": "memory",
+    "mcopy": "memory",
+}
+reads = {  # opcode -> effect, or tuple of effects, the opcode may observe
+    "sload": "storage",
+    "tload": "transient",
+    "iload": "immutables",
+    "mload": "memory",
+    "mcopy": "memory",
+    "call": _ALL,
+    "delegatecall": _ALL,
+    "staticcall": _ALL,
+    "returndatasize": "returndata",
+    "returndatacopy": "returndata",
+    "balance": "balance",
+    "selfbalance": "balance",
+    "log": "memory",
+    "revert": "memory",
+    "return": "memory",
+    "sha3": "memory",
+}
+
+
+@dataclass
+class Fence:  # per-effect counters; an effect's counter is bumped by each write to it
+    storage: int = 0
+    memory: int = 0
+    transient: int = 0
+    immutables: int = 0
+    balance: int = 0
+    returndata: int = 0
+
+
+# effects graph: ordering constraints between effectful instructions of one basic block
+class EffectsG:
+    def __init__(self):
+        self._graph = defaultdict(list)  # inst -> instructions returned by required_by(inst)
+
+        # inverse of _graph; not sure if this will be useful
+        self._outputs = defaultdict(list)
+
+    def analyze(self, bb):
+        fence = Fence()
+
+        read_groups = {}  # (effect, fence id) -> instructions reading the effect at that fence
+        terms = {}  # (effect, fence id) -> the single instruction writing the effect at that fence
+
+        for inst in bb.instructions:
+            reads = _get_reads(inst.opcode)
+            writes = _get_writes(inst.opcode)
+            for eff in reads:
+                fence_id = getattr(fence, eff)
+                group = read_groups.setdefault((eff, fence_id), [])
+                group.append(inst)
+
+            # collect writes in a separate dict
+            for eff in writes:
+                fence_id = getattr(fence, eff)
+                assert (eff, fence_id) not in terms
+                terms[(eff, fence_id)] = inst
+
+            fence = _compute_fence(inst.opcode, fence)
+
+        for (effect, fence_id), write_inst in terms.items():
+            reads = read_groups.get((effect, fence_id), [])
+            for read in reads:
+                if read == write_inst:
+                    continue
+                self._graph[write_inst].append(read)  # the write depends on same-fence reads
+
+            next_id = fence_id + 1
+
+            next_write = terms.get((effect, next_id))
+            if next_write is not None:
+                self._graph[next_write].append(write_inst)  # the next write depends on this one
+
+            next_reads = read_groups.get((effect, next_id), [])
+            for inst in next_reads:
+                self._graph[inst].append(write_inst)  # next-fence reads depend on this write
+
+        # invert the graph, go the other way
+        for inst, dependencies in self._graph.items():
+            # sanity check the graph
+            assert inst not in dependencies, inst
+            for target in dependencies:
+                self._outputs[target].append(inst)
+
+    def required_by(self, inst):
+        return self._graph.get(inst, [])
+
+    def downstream_of(self, inst):
+        return self._outputs.get(inst, [])
+
+
+def _get_reads(opcode):
+    """Effects read by `opcode` as a tuple (empty if it is not in the `reads` table)."""
+    entry = reads.get(opcode, ())
+    # table values are either a single effect name or a tuple of names
+    return entry if isinstance(entry, tuple) else (entry,)
+
+
+def _get_writes(opcode):
+    """Effects written by `opcode` as a tuple (empty if it is not in the `writes` table)."""
+    entry = writes.get(opcode, ())
+    # table values are either a single effect name or a tuple of names
+    return entry if isinstance(entry, tuple) else (entry,)
+
+
+def _compute_fence(opcode: str, fence: Fence) -> Fence:
+    """
+    Return the fence state after `opcode`: every effect it writes is bumped by one.
+    """
+    if opcode not in writes:
+        return fence  # not a writer: the very same object is handed back
+
+    counters = asdict(fence)
+    for effect in _get_writes(opcode):
+        counters[effect] += 1
+    return Fence(**counters)
+
 
 class DFTPass(IRPass):
     function: IRFunction
-    inst_order: dict[IRInstruction, int]
-    inst_order_num: int
 
-    def _process_instruction_r(self, bb: IRBasicBlock, inst: IRInstruction, offset: int = 0):
+    def _process_instruction_r(self, bb: IRBasicBlock, inst: IRInstruction):
+        if inst.parent != bb:
+            return
+        if inst in self.done:
+            return
+
         for op in inst.get_outputs():
             assert isinstance(op, IRVariable), f"expected variable, got {op}"
             uses = self.dfg.get_uses(op)
 
-            for uses_this in uses:
-                if uses_this.parent != inst.parent or uses_this.fence_id != inst.fence_id:
-                    # don't reorder across basic block or fence boundaries
-                    continue
-
-                # if the instruction is a terminator, we need to place
-                # it at the end of the basic block
-                # along with all the instructions that "lead" to it
-                self._process_instruction_r(bb, uses_this, offset)
+            for use in reversed(uses):
+                self._process_instruction_r(bb, use)
 
-        if inst in self.visited_instructions:
+        if inst in self.started:
             return
-        self.visited_instructions.add(inst)
-        self.inst_order_num += 1
-
-        if inst.is_bb_terminator:
-            offset = len(bb.instructions)
+        self.started.add(inst)
 
-        if inst.opcode == "phi":
-            # phi instructions stay at the beginning of the basic block
-            # and no input processing is needed
-            # bb.instructions.append(inst)
-            self.inst_order[inst] = 0
+        if inst.opcode in ("phi", "param"):
             return
 
         for op in inst.get_input_variables():
             target = self.dfg.get_producing_instruction(op)
             assert target is not None, f"no producing instruction for {op}"
-            if target.parent != inst.parent or target.fence_id != inst.fence_id:
-                # don't reorder across basic block or fence boundaries
-                continue
-            self._process_instruction_r(bb, target, offset)
+            self._process_instruction_r(bb, target)
 
-        self.inst_order[inst] = self.inst_order_num + offset
+        for target in self._effects_g.required_by(inst):
+            self._process_instruction_r(bb, target)
 
-    def _process_basic_block(self, bb: IRBasicBlock) -> None:
-        self.function.append_basic_block(bb)
+        bb.instructions.append(inst)
+        self.done.add(inst)
 
-        for inst in bb.instructions:
-            inst.fence_id = self.fence_id
-            if inst.is_volatile:
-                self.fence_id += 1
-
-        # We go throught the instructions and calculate the order in which they should be executed
-        # based on the data flow graph. This order is stored in the inst_order dictionary.
-        # We then sort the instructions based on this order.
-        self.inst_order = {}
-        self.inst_order_num = 0
-        for inst in bb.instructions:
+    def _process_basic_block(self, bb: IRBasicBlock) -> None:
+        self._effects_g = EffectsG()
+        self._effects_g.analyze(bb)
+
+        instructions = bb.instructions.copy()
+        bb.instructions = [inst for inst in bb.instructions if inst.opcode in ("phi", "param")]
+
+        # start with out liveness
+        #if len(bb.cfg_out) > 0:
+        if False:
+            next_bb = bb.cfg_out.first()
+            target_stack = self.liveness.input_vars_from(bb, next_bb)
+            for var in reversed(list(target_stack)):
+                inst = self.dfg.get_producing_instruction(var)
+                self._process_instruction_r(bb, inst)
+
+        for inst in instructions:
             self._process_instruction_r(bb, inst)
 
-        bb.instructions.sort(key=lambda x: self.inst_order[x])
+        def key(inst):
+            if inst.is_bb_terminator:
+                return 2
+            return 1
+
+        bb.instructions.sort(key=key)
+
+        # sanity check: the instructions we started with are the same
+        # as we have now
+        assert set(bb.instructions) == set(instructions), (instructions, bb)
 
     def run_pass(self) -> None:
         self.dfg = self.analyses_cache.request_analysis(DFGAnalysis)
+        self.liveness = self.analyses_cache.request_analysis(LivenessAnalysis)  # use out_vars
 
-        self.fence_id = 0
-        self.visited_instructions: OrderedSet[IRInstruction] = OrderedSet()
+        self.started: OrderedSet[IRInstruction] = OrderedSet()
+        self.done: OrderedSet[IRInstruction] = OrderedSet()
 
-        basic_blocks = list(self.function.get_basic_blocks())
-
-        self.function.clear_basic_blocks()
-        for bb in basic_blocks:
+        for bb in self.function.get_basic_blocks():
             self._process_basic_block(bb)
+
+        # for repr
+        self.analyses_cache.force_analysis(LivenessAnalysis)
diff --git a/vyper/venom/passes/store_expansion.py b/vyper/venom/passes/store_expansion.py
new file mode 100644
index 0000000000..a8285fa09d
--- /dev/null
+++ b/vyper/venom/passes/store_expansion.py
@@ -0,0 +1,57 @@
+from vyper.venom.analysis.cfg import CFGAnalysis
+from vyper.venom.analysis.dfg import DFGAnalysis
+from vyper.venom.analysis.liveness import LivenessAnalysis
+from vyper.venom.basicblock import IRInstruction
+from vyper.venom.passes.base_pass import IRPass
+
+
+class StoreExpansionPass(IRPass):
+    """
+    This pass expands variables to their uses through `store` instructions,
+    reducing pressure on the stack scheduler
+    """
+
+    def run_pass(self):
+        dfg = self.analyses_cache.request_analysis(DFGAnalysis)
+        self.analyses_cache.request_analysis(CFGAnalysis)
+        # result unused; presumably forced so the `inst.liveness` read below is fresh
+        self.analyses_cache.force_analysis(LivenessAnalysis)
+
+        for bb in self.function.get_basic_blocks():
+            if not bb.instructions:
+                continue
+
+            # variables live at the first instruction: their copies go at the top
+            for var in bb.instructions[0].liveness:
+                self._process_var(dfg, bb, var, 0)
+
+            # NOTE: `_process_var` inserts into the very list being enumerated
+            for idx, inst in enumerate(bb.instructions):
+                if inst.output is not None:
+                    # copies go directly after the defining instruction
+                    self._process_var(dfg, bb, inst.output, idx + 1)
+
+            # stable sort: moves phi/param back above any copies inserted before them
+            bb.instructions.sort(key=lambda inst: inst.opcode not in ("phi", "param"))
+
+        # instructions and variables were added, so drop the cached analyses
+        self.analyses_cache.invalidate_analysis(LivenessAnalysis)
+        self.analyses_cache.invalidate_analysis(DFGAnalysis)
+
+    def _process_var(self, dfg, bb, var, idx):
+        """
+        Process a variable, allocating a new variable for each use inside `bb`
+        and copying `var` into it with a `store` inserted at position `idx`.
+        Uses by `phi` instructions and uses in other basic blocks are left alone.
+        """
+        for use_inst in dfg.get_uses(var):
+            if use_inst.opcode == "phi" or use_inst.parent != bb:
+                continue
+
+            # one fresh copy per operand slot, even if `var` appears several times
+            for i, operand in enumerate(use_inst.operands):
+                if operand == var:
+                    new_var = self.function.get_next_variable()
+                    new_inst = IRInstruction("store", [var], new_var)
+                    bb.insert_instruction(new_inst, idx)
+                    use_inst.operands[i] = new_var
diff --git a/vyper/venom/venom_to_assembly.py b/vyper/venom/venom_to_assembly.py
index 390fab8e7c..41f67a7045 100644
--- a/vyper/venom/venom_to_assembly.py
+++ b/vyper/venom/venom_to_assembly.py
@@ -25,6 +25,10 @@
 from vyper.venom.passes.normalization import NormalizationPass
 from vyper.venom.stack_model import StackModel
 
+DEBUG_SHOW_COST = False  # set True to dump per-block asm and stack-reorder costs to stderr
+if DEBUG_SHOW_COST:
+    import sys
+
 # instructions which map one-to-one from venom to EVM
 _ONE_TO_ONE_INSTRUCTIONS = frozenset(
     [
@@ -150,7 +154,6 @@ def generate_evm(self, no_optimize: bool = False) -> list[str]:
             for fn in ctx.functions.values():
                 ac = IRAnalysesCache(fn)
 
-                NormalizationPass(ac, fn).run_pass()
                 self.liveness_analysis = ac.request_analysis(LivenessAnalysis)
 
                 assert fn.normalized, "Non-normalized CFG!"
@@ -282,6 +285,12 @@ def _generate_evm_for_basicblock_r(
             return
         self.visited_basicblocks.add(basicblock)
 
+        if DEBUG_SHOW_COST:
+            print(basicblock, file=sys.stderr)
+
+        ref = asm
+        asm = []
+
         # assembly entry point into the block
         asm.append(f"_sym_{basicblock.label}")
         asm.append("JUMPDEST")
@@ -297,8 +306,14 @@ def _generate_evm_for_basicblock_r(
 
             asm.extend(self._generate_evm_for_instruction(inst, stack, next_liveness))
 
+        if DEBUG_SHOW_COST:
+            print(" ".join(map(str, asm)), file=sys.stderr)
+            print("\n", file=sys.stderr)
+
+        ref.extend(asm)
+
         for bb in basicblock.reachable:
-            self._generate_evm_for_basicblock_r(asm, bb, stack.copy())
+            self._generate_evm_for_basicblock_r(ref, bb, stack.copy())
 
     # pop values from stack at entry to bb
     # note this produces the same result(!) no matter which basic block
@@ -421,6 +436,13 @@ def _generate_evm_for_instruction(
             if cost_with_swap > cost_no_swap:
                 operands[-1], operands[-2] = operands[-2], operands[-1]
 
+        cost = self._stack_reorder([], stack, operands, dry_run=True)
+        if DEBUG_SHOW_COST and cost:
+            print("ENTER", inst, file=sys.stderr)
+            print("  HAVE", stack, file=sys.stderr)
+            print("  WANT", operands, file=sys.stderr)
+            print("  COST", cost, file=sys.stderr)
+
         # final step to get the inputs to this instruction ordered
         # correctly on the stack
         self._stack_reorder(assembly, stack, operands)
@@ -455,7 +477,7 @@ def _generate_evm_for_instruction(
             assembly.append("JUMPI")
 
             # make sure the if_zero_label will be optimized out
-            # assert if_zero_label == next(iter(inst.parent.cfg_out)).label
+            # assert if_zero_label == inst.parent.cfg_out.first().label
 
             assembly.append(f"_sym_{if_zero_label.value}")
             assembly.append("JUMP")
@@ -562,6 +584,7 @@ def dup(self, assembly, stack, depth):
         assembly.append(_evm_dup_for(depth))
 
     def swap_op(self, assembly, stack, op):
+        assert stack.get_depth(op) is not StackModel.NOT_IN_STACK, op
         self.swap(assembly, stack, stack.get_depth(op))
 
     def dup_op(self, assembly, stack, op):