# venom: effects-graph DFT pass + store expansion (reviewed copy of the patch)
#
# Everything above the first "diff --git" header is commentary; patch tools
# skip leading text that is not part of a diff.
#
# What the patch does, file by file:
#   vyper/ir/compile_ir.py
#       New peephole rule: in a "DUP1 SWAP1" pair the SWAP1 is deleted.
#   vyper/venom/__init__.py
#       _run_passes now runs NormalizationPass before BranchOptimizationPass,
#       and runs DFTPass + StoreExpansionPass before ExtractLiteralsPass.
#   vyper/venom/analysis/liveness.py
#       out_vars is accumulated in place with "|=" instead of union().
#   vyper/venom/basicblock.py
#       IRInstruction.fence_id is removed (annotation and initializer).
#   vyper/venom/passes/dft.py
#       Adds per-opcode "reads"/"writes" effect tables, a Fence counter per
#       effect, and EffectsG, which links each write to the reads of the same
#       (effect, fence) group and to the next write/reads of that effect.
#       DFTPass rebuilds bb.instructions by depth-first traversal over data
#       dependencies and EffectsG.required_by(); phi/param stay first and
#       terminators are sorted last.
#   vyper/venom/passes/store_expansion.py (new file)
#       StoreExpansionPass inserts a "store" copy for every in-block, non-phi
#       use of a variable and rewrites the use to read the copy.
#   vyper/venom/venom_to_assembly.py
#       Optional cost/assembly tracing to stderr behind DEBUG_SHOW_COST,
#       NormalizationPass no longer run from generate_evm, and swap_op asserts
#       that the operand is on the stack.
#
# Reviewer changes relative to the submitted text:
#   1. Line structure restored (the submitted text had every newline collapsed,
#      so no hunk line started at column 0). Indentation and blank context
#      lines are reconstructed from Python syntax and the @@ line counts;
#      please confirm with "git apply --check" against the base tree.
#   2. DEBUG_SHOW_COST defaults to False so a normal compile prints nothing.
#   3. dft.py: removed the "if False:" block and its commented-out condition
#      (never executed); hunk new-side count 224 -> 215.
#   4. store_expansion.py: removed the unused "_cache" dict and the unused
#      "liveness" binding (the force_analysis call is kept); count 57 -> 55.
#   5. dft.py: locals in EffectsG.analyze no longer shadow the module-level
#      "reads"/"writes" tables.
#   6. store_expansion.py docstring: "though" -> "through".
#
# Open items, not changed here:
#   - NOTE(review): the NormalizationPass import in vyper/venom/__init__.py is
#     not in alphabetical position; moving it needs context outside the hunks.
#   - NOTE(review): venom_to_assembly.py still imports NormalizationPass after
#     its only visible call was removed; drop the import if nothing else in
#     that file uses it.
#   - NOTE(review): DFTPass.run_pass still stores self.liveness although no
#     visible code reads it after change 3; the request is kept so the
#     analysis is still computed at that point.
#   - The post-image blob ids on the "index" lines of dft.py,
#     store_expansion.py and venom_to_assembly.py predate changes 2-6.
#
diff --git a/vyper/ir/compile_ir.py b/vyper/ir/compile_ir.py
index 4c68aa2c8f..ddd1940365 100644
--- a/vyper/ir/compile_ir.py
+++ b/vyper/ir/compile_ir.py
@@ -1033,6 +1033,9 @@ def _stack_peephole_opts(assembly):
         if assembly[i] == "SWAP1" and assembly[i + 1].lower() in COMMUTATIVE_OPS:
             changed = True
             del assembly[i]
+        if assembly[i] == "DUP1" and assembly[i + 1] == "SWAP1":
+            changed = True
+            del assembly[i + 1]
         i += 1
 
     return changed
diff --git a/vyper/venom/__init__.py b/vyper/venom/__init__.py
index afd79fc44f..3c4ffb26a7 100644
--- a/vyper/venom/__init__.py
+++ b/vyper/venom/__init__.py
@@ -10,6 +10,7 @@
 from vyper.venom.function import IRFunction
 from vyper.venom.ir_node_to_venom import ir_node_to_venom
 from vyper.venom.passes.algebraic_optimization import AlgebraicOptimizationPass
+from vyper.venom.passes.normalization import NormalizationPass
 from vyper.venom.passes.branch_optimization import BranchOptimizationPass
 from vyper.venom.passes.dft import DFTPass
 from vyper.venom.passes.extract_literals import ExtractLiteralsPass
@@ -19,6 +20,7 @@
 from vyper.venom.passes.sccp import SCCP
 from vyper.venom.passes.simplify_cfg import SimplifyCFGPass
 from vyper.venom.passes.store_elimination import StoreElimination
+from vyper.venom.passes.store_expansion import StoreExpansionPass
 from vyper.venom.venom_to_assembly import VenomCompiler
 
 DEFAULT_OPT_LEVEL = OptimizationLevel.default()
@@ -53,9 +55,15 @@ def _run_passes(fn: IRFunction, optimize: OptimizationLevel) -> None:
     StoreElimination(ac, fn).run_pass()
     SimplifyCFGPass(ac, fn).run_pass()
     AlgebraicOptimizationPass(ac, fn).run_pass()
+
+    NormalizationPass(ac, fn).run_pass()
     BranchOptimizationPass(ac, fn).run_pass()
-    ExtractLiteralsPass(ac, fn).run_pass()
     RemoveUnusedVariablesPass(ac, fn).run_pass()
+
+    # reorder and prepare for stack scheduling
+    DFTPass(ac, fn).run_pass()
+    StoreExpansionPass(ac, fn).run_pass()
+    ExtractLiteralsPass(ac, fn).run_pass()
     DFTPass(ac, fn).run_pass()
 
 
diff --git a/vyper/venom/analysis/liveness.py b/vyper/venom/analysis/liveness.py
index 2a471bc8be..f8a6fbd437 100644
--- a/vyper/venom/analysis/liveness.py
+++ b/vyper/venom/analysis/liveness.py
@@ -65,7 +65,8 @@ def _calculate_out_vars(self, bb: IRBasicBlock) -> bool:
         bb.out_vars = OrderedSet()
         for out_bb in bb.cfg_out:
             target_vars = self.input_vars_from(bb, out_bb)
-            bb.out_vars = bb.out_vars.union(target_vars)
+            bb.out_vars |= target_vars
+
         return out_vars != bb.out_vars
 
     # calculate the input variables into self from source
diff --git a/vyper/venom/basicblock.py b/vyper/venom/basicblock.py
index 1199579b3f..5e715746c9 100644
--- a/vyper/venom/basicblock.py
+++ b/vyper/venom/basicblock.py
@@ -210,7 +210,6 @@ class IRInstruction:
     # set of live variables at this instruction
     liveness: OrderedSet[IRVariable]
     parent: "IRBasicBlock"
-    fence_id: int
     annotation: Optional[str]
     ast_source: Optional[IRnode]
     error_msg: Optional[str]
@@ -227,7 +226,6 @@ def __init__(
         self.operands = list(operands)  # in case we get an iterator
         self.output = output
         self.liveness = OrderedSet()
-        self.fence_id = -1
         self.annotation = None
         self.ast_source = None
         self.error_msg = None
diff --git a/vyper/venom/passes/dft.py b/vyper/venom/passes/dft.py
index f45a60079c..42bc7df07e 100644
--- a/vyper/venom/passes/dft.py
+++ b/vyper/venom/passes/dft.py
@@ -1,81 +1,215 @@
+from collections import defaultdict
+from dataclasses import asdict, dataclass
+
 from vyper.utils import OrderedSet
 from vyper.venom.analysis.dfg import DFGAnalysis
+from vyper.venom.analysis.liveness import LivenessAnalysis
 from vyper.venom.basicblock import IRBasicBlock, IRInstruction, IRVariable
 from vyper.venom.function import IRFunction
 from vyper.venom.passes.base_pass import IRPass
 
+_ALL = ("storage", "transient", "memory", "immutables", "balance", "returndata")
+
+writes = {
+    "sstore": "storage",
+    "tstore": "transient",
+    "mstore": "memory",
+    "istore": "immutables",
+    "call": _ALL,
+    "delegatecall": _ALL,
+    "staticcall": "memory",
+    "create": _ALL,
+    "create2": _ALL,
+    "invoke": _ALL,  # could be smarter, look up the effects of the invoked function
+    "dloadbytes": "memory",
+    "returndatacopy": "memory",
+    "calldatacopy": "memory",
+    "codecopy": "memory",
+    "extcodecopy": "memory",
+    "mcopy": "memory",
+}
+reads = {
+    "sload": "storage",
+    "tload": "transient",
+    "iload": "immutables",
+    "mload": "memory",
+    "mcopy": "memory",
+    "call": _ALL,
+    "delegatecall": _ALL,
+    "staticcall": _ALL,
+    "returndatasize": "returndata",
+    "returndatacopy": "returndata",
+    "balance": "balance",
+    "selfbalance": "balance",
+    "log": "memory",
+    "revert": "memory",
+    "return": "memory",
+    "sha3": "memory",
+}
+
+
+@dataclass
+class Fence:
+    storage: int = 0
+    memory: int = 0
+    transient: int = 0
+    immutables: int = 0
+    balance: int = 0
+    returndata: int = 0
+
+
+# effects graph
+class EffectsG:
+    def __init__(self):
+        self._graph = defaultdict(list)
+
+        # not sure if this will be useful
+        self._outputs = defaultdict(list)
+
+    def analyze(self, bb):
+        fence = Fence()
+
+        read_groups = {}
+        terms = {}
+
+        for inst in bb.instructions:
+            inst_reads = _get_reads(inst.opcode)
+            inst_writes = _get_writes(inst.opcode)
+            for eff in inst_reads:
+                fence_id = getattr(fence, eff)
+                group = read_groups.setdefault((eff, fence_id), [])
+                group.append(inst)
+
+            # collect writes in a separate dict
+            for eff in inst_writes:
+                fence_id = getattr(fence, eff)
+                assert (eff, fence_id) not in terms
+                terms[(eff, fence_id)] = inst
+
+            fence = _compute_fence(inst.opcode, fence)
+
+        for (effect, fence_id), write_inst in terms.items():
+            group_reads = read_groups.get((effect, fence_id), [])
+            for read in group_reads:
+                if read == write_inst:
+                    continue
+                self._graph[write_inst].append(read)
+
+            next_id = fence_id + 1
+
+            next_write = terms.get((effect, next_id))
+            if next_write is not None:
+                self._graph[next_write].append(write_inst)
+
+            next_reads = read_groups.get((effect, next_id), [])
+            for inst in next_reads:
+                self._graph[inst].append(write_inst)
+
+        # invert the graph, go the other way
+        for inst, dependencies in self._graph.items():
+            # sanity check the graph
+            assert inst not in dependencies, inst
+            for target in dependencies:
+                self._outputs[target].append(inst)
+
+    def required_by(self, inst):
+        return self._graph.get(inst, [])
+
+    def downstream_of(self, inst):
+        return self._outputs.get(inst, [])
+
+
+def _get_reads(opcode):
+    ret = reads.get(opcode, ())
+    if not isinstance(ret, tuple):
+        ret = (ret,)
+    return ret
+
+
+def _get_writes(opcode):
+    ret = writes.get(opcode, ())
+    if not isinstance(ret, tuple):
+        ret = (ret,)
+    return ret
+
+
+def _compute_fence(opcode: str, fence: Fence) -> Fence:
+    if opcode not in writes:
+        return fence
+
+    effects = _get_writes(opcode)
+
+    tmp = asdict(fence)
+    for eff in effects:
+        tmp[eff] += 1
+
+    return Fence(**tmp)
+
 
 class DFTPass(IRPass):
     function: IRFunction
-    inst_order: dict[IRInstruction, int]
-    inst_order_num: int
 
-    def _process_instruction_r(self, bb: IRBasicBlock, inst: IRInstruction, offset: int = 0):
+    def _process_instruction_r(self, bb: IRBasicBlock, inst: IRInstruction):
+        if inst.parent != bb:
+            return
+        if inst in self.done:
+            return
+
         for op in inst.get_outputs():
             assert isinstance(op, IRVariable), f"expected variable, got {op}"
             uses = self.dfg.get_uses(op)
 
-            for uses_this in uses:
-                if uses_this.parent != inst.parent or uses_this.fence_id != inst.fence_id:
-                    # don't reorder across basic block or fence boundaries
-                    continue
-
-                # if the instruction is a terminator, we need to place
-                # it at the end of the basic block
-                # along with all the instructions that "lead" to it
-                self._process_instruction_r(bb, uses_this, offset)
+            for use in reversed(uses):
+                self._process_instruction_r(bb, use)
 
-        if inst in self.visited_instructions:
+        if inst in self.started:
             return
-        self.visited_instructions.add(inst)
-        self.inst_order_num += 1
-
-        if inst.is_bb_terminator:
-            offset = len(bb.instructions)
+        self.started.add(inst)
 
-        if inst.opcode == "phi":
-            # phi instructions stay at the beginning of the basic block
-            # and no input processing is needed
-            # bb.instructions.append(inst)
-            self.inst_order[inst] = 0
+        if inst.opcode in ("phi", "param"):
             return
 
         for op in inst.get_input_variables():
             target = self.dfg.get_producing_instruction(op)
             assert target is not None, f"no producing instruction for {op}"
-            if target.parent != inst.parent or target.fence_id != inst.fence_id:
-                # don't reorder across basic block or fence boundaries
-                continue
-            self._process_instruction_r(bb, target, offset)
+            self._process_instruction_r(bb, target)
 
-        self.inst_order[inst] = self.inst_order_num + offset
+        for target in self._effects_g.required_by(inst):
+            self._process_instruction_r(bb, target)
 
-    def _process_basic_block(self, bb: IRBasicBlock) -> None:
-        self.function.append_basic_block(bb)
+        bb.instructions.append(inst)
+        self.done.add(inst)
 
-        for inst in bb.instructions:
-            inst.fence_id = self.fence_id
-            if inst.is_volatile:
-                self.fence_id += 1
-
-        # We go throught the instructions and calculate the order in which they should be executed
-        # based on the data flow graph. This order is stored in the inst_order dictionary.
-        # We then sort the instructions based on this order.
-        self.inst_order = {}
-        self.inst_order_num = 0
-        for inst in bb.instructions:
+    def _process_basic_block(self, bb: IRBasicBlock) -> None:
+        self._effects_g = EffectsG()
+        self._effects_g.analyze(bb)
+
+        instructions = bb.instructions.copy()
+        bb.instructions = [inst for inst in bb.instructions if inst.opcode in ("phi", "param")]
+
+        for inst in instructions:
             self._process_instruction_r(bb, inst)
 
-        bb.instructions.sort(key=lambda x: self.inst_order[x])
+        def key(inst):
+            if inst.is_bb_terminator:
+                return 2
+            return 1
+
+        bb.instructions.sort(key=key)
+
+        # sanity check: the instructions we started with are the same
+        # as we have now
+        assert set(bb.instructions) == set(instructions), (instructions, bb)
 
     def run_pass(self) -> None:
         self.dfg = self.analyses_cache.request_analysis(DFGAnalysis)
+        self.liveness = self.analyses_cache.request_analysis(LivenessAnalysis)  # use out_vars
 
-        self.fence_id = 0
-        self.visited_instructions: OrderedSet[IRInstruction] = OrderedSet()
+        self.started: OrderedSet[IRInstruction] = OrderedSet()
+        self.done: OrderedSet[IRInstruction] = OrderedSet()
 
-        basic_blocks = list(self.function.get_basic_blocks())
-
-        self.function.clear_basic_blocks()
-        for bb in basic_blocks:
+        for bb in self.function.get_basic_blocks():
             self._process_basic_block(bb)
+
+        # for repr
+        self.analyses_cache.force_analysis(LivenessAnalysis)
diff --git a/vyper/venom/passes/store_expansion.py b/vyper/venom/passes/store_expansion.py
new file mode 100644
index 0000000000..a8285fa09d
--- /dev/null
+++ b/vyper/venom/passes/store_expansion.py
@@ -0,0 +1,55 @@
+from vyper.venom.analysis.cfg import CFGAnalysis
+from vyper.venom.analysis.dfg import DFGAnalysis
+from vyper.venom.analysis.liveness import LivenessAnalysis
+from vyper.venom.basicblock import IRInstruction
+from vyper.venom.passes.base_pass import IRPass
+
+
+class StoreExpansionPass(IRPass):
+    """
+    This pass expands variables to their uses through `store` instructions,
+    reducing pressure on the stack scheduler
+    """
+
+    def run_pass(self):
+        dfg = self.analyses_cache.request_analysis(DFGAnalysis)
+        self.analyses_cache.request_analysis(CFGAnalysis)
+        self.analyses_cache.force_analysis(LivenessAnalysis)
+
+        for bb in self.function.get_basic_blocks():
+            if len(bb.instructions) == 0:
+                continue
+
+            for var in bb.instructions[0].liveness:
+                self._process_var(dfg, bb, var, 0)
+
+            for idx, inst in enumerate(bb.instructions):
+                if inst.output is None:
+                    continue
+
+                self._process_var(dfg, bb, inst.output, idx + 1)
+
+            bb.instructions.sort(key=lambda inst: inst.opcode not in ("phi", "param"))
+
+        self.analyses_cache.invalidate_analysis(LivenessAnalysis)
+        self.analyses_cache.invalidate_analysis(DFGAnalysis)
+
+    def _process_var(self, dfg, bb, var, idx):
+        """
+        Process a variable, allocating a new variable for each use
+        and copying it to the new instruction
+        """
+        uses = dfg.get_uses(var)
+
+        for use_inst in uses:
+            if use_inst.opcode == "phi":
+                continue
+            if use_inst.parent != bb:
+                continue
+
+            for i, operand in enumerate(use_inst.operands):
+                if operand == var:
+                    new_var = self.function.get_next_variable()
+                    new_inst = IRInstruction("store", [var], new_var)
+                    bb.insert_instruction(new_inst, idx)
+                    use_inst.operands[i] = new_var
diff --git a/vyper/venom/venom_to_assembly.py b/vyper/venom/venom_to_assembly.py
index 390fab8e7c..41f67a7045 100644
--- a/vyper/venom/venom_to_assembly.py
+++ b/vyper/venom/venom_to_assembly.py
@@ -25,6 +25,10 @@
 from vyper.venom.passes.normalization import NormalizationPass
 from vyper.venom.stack_model import StackModel
 
+DEBUG_SHOW_COST = False
+if DEBUG_SHOW_COST:
+    import sys
+
 # instructions which map one-to-one from venom to EVM
 _ONE_TO_ONE_INSTRUCTIONS = frozenset(
     [
@@ -150,7 +154,6 @@ def generate_evm(self, no_optimize: bool = False) -> list[str]:
             for fn in ctx.functions.values():
                 ac = IRAnalysesCache(fn)
 
-                NormalizationPass(ac, fn).run_pass()
                 self.liveness_analysis = ac.request_analysis(LivenessAnalysis)
 
                 assert fn.normalized, "Non-normalized CFG!"
@@ -282,6 +285,12 @@ def _generate_evm_for_basicblock_r(
             return
         self.visited_basicblocks.add(basicblock)
 
+        if DEBUG_SHOW_COST:
+            print(basicblock, file=sys.stderr)
+
+        ref = asm
+        asm = []
+
         # assembly entry point into the block
         asm.append(f"_sym_{basicblock.label}")
         asm.append("JUMPDEST")
@@ -297,8 +306,14 @@ def _generate_evm_for_basicblock_r(
 
             asm.extend(self._generate_evm_for_instruction(inst, stack, next_liveness))
 
+        if DEBUG_SHOW_COST:
+            print(" ".join(map(str, asm)), file=sys.stderr)
+            print("\n", file=sys.stderr)
+
+        ref.extend(asm)
+
         for bb in basicblock.reachable:
-            self._generate_evm_for_basicblock_r(asm, bb, stack.copy())
+            self._generate_evm_for_basicblock_r(ref, bb, stack.copy())
 
     # pop values from stack at entry to bb
     # note this produces the same result(!) no matter which basic block
@@ -421,6 +436,13 @@ def _generate_evm_for_instruction(
             if cost_with_swap > cost_no_swap:
                 operands[-1], operands[-2] = operands[-2], operands[-1]
 
+        cost = self._stack_reorder([], stack, operands, dry_run=True)
+        if DEBUG_SHOW_COST and cost:
+            print("ENTER", inst, file=sys.stderr)
+            print(" HAVE", stack, file=sys.stderr)
+            print(" WANT", operands, file=sys.stderr)
+            print(" COST", cost, file=sys.stderr)
+
         # final step to get the inputs to this instruction ordered
         # correctly on the stack
         self._stack_reorder(assembly, stack, operands)
@@ -455,7 +477,7 @@ def _generate_evm_for_instruction(
             assembly.append("JUMPI")
 
             # make sure the if_zero_label will be optimized out
-            # assert if_zero_label == next(iter(inst.parent.cfg_out)).label
+            # assert if_zero_label == inst.parent.cfg_out.first().label
 
             assembly.append(f"_sym_{if_zero_label.value}")
             assembly.append("JUMP")
@@ -562,6 +584,7 @@ def dup(self, assembly, stack, depth):
         assembly.append(_evm_dup_for(depth))
 
     def swap_op(self, assembly, stack, op):
+        assert stack.get_depth(op) is not StackModel.NOT_IN_STACK, op
         self.swap(assembly, stack, stack.get_depth(op))
 
     def dup_op(self, assembly, stack, op):