
Commit bdd2873

Address review comments.
1 parent 687aca0 commit bdd2873

7 files changed: +313 -89 lines changed


cranelift/codegen/src/isa/aarch64/inst/emit.rs

Lines changed: 14 additions & 0 deletions
@@ -352,6 +352,13 @@ impl MachInstEmit for Inst {
     type State = EmitState;
 
     fn emit(&self, sink: &mut MachBuffer<Inst>, flags: &settings::Flags, state: &mut EmitState) {
+        // N.B.: we *must* not exceed the "worst-case size" used to compute
+        // where to insert islands, except when islands are explicitly triggered
+        // (with an `EmitIsland`). We check this in debug builds. This is `mut`
+        // to allow disabling the check for `JTSequence`, which is always
+        // emitted following an `EmitIsland`.
+        let mut start_off = sink.cur_offset();
+
         match self {
             &Inst::AluRRR { alu_op, rd, rn, rm } => {
                 let top11 = match alu_op {
@@ -1307,6 +1314,10 @@ impl MachInstEmit for Inst {
                         LabelUse::PCRel32,
                     );
                 }
+
+                // Lowering produces an EmitIsland before using a JTSequence, so we can safely
+                // disable the worst-case-size check in this case.
+                start_off = sink.cur_offset();
             }
             &Inst::LoadConst64 { rd, const_data } => {
                 let inst = Inst::ULoad64 {
@@ -1418,5 +1429,8 @@ impl MachInstEmit for Inst {
             }
         }
+
+        let end_off = sink.cur_offset();
+        debug_assert!((end_off - start_off) <= Inst::worst_case_size());
     }
 }
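The assertion's role is easiest to see in isolation: island-placement deadlines are computed from each instruction's *declared* worst-case size, so actual emission must stay within that bound unless an island was explicitly requested. Here is a minimal self-contained sketch of the same check; `Sink` and the 44-byte bound are illustrative stand-ins for `MachBuffer` and `Inst::worst_case_size()`, not the real API:

```rust
/// Illustrative stand-in for `Inst::worst_case_size()` (not the real value).
const WORST_CASE_SIZE: u32 = 44;

/// Stand-in for `MachBuffer`: just a growable byte buffer with an offset.
struct Sink {
    buf: Vec<u8>,
}

impl Sink {
    fn cur_offset(&self) -> u32 {
        self.buf.len() as u32
    }
}

/// Emit one lowered instruction's encoding, checking the same invariant as
/// the new `debug_assert!` in `emit.rs`.
fn emit_checked(sink: &mut Sink, encoding: &[u8]) {
    let start_off = sink.cur_offset();
    sink.buf.extend_from_slice(encoding);
    let end_off = sink.cur_offset();
    // If this fires, a lowered sequence grew past the bound used when island
    // deadlines were computed, so a needed island may already have been missed.
    debug_assert!(end_off - start_off <= WORST_CASE_SIZE);
}

fn main() {
    let mut sink = Sink { buf: Vec::new() };
    emit_checked(&mut sink, &[0u8; 8]); // two 4-byte AArch64 instructions: fine
}
```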

cranelift/codegen/src/isa/aarch64/inst/mod.rs

Lines changed: 44 additions & 5 deletions
@@ -657,6 +657,15 @@ pub enum Inst {
 
     /// A one-way conditional branch, invisible to the CFG processing; used *only* as part of
     /// straight-line sequences in code to be emitted.
+    ///
+    /// In more detail:
+    /// - This branch is lowered to a branch at the machine-code level, but does not end a basic
+    ///   block, and does not create edges in the CFG seen by regalloc.
+    /// - Thus, it is *only* valid to use as part of a single-in, single-out sequence that is
+    ///   lowered from a single CLIF instruction. For example, certain arithmetic operations may
+    ///   use these branches to handle certain conditions, such as overflows, traps, etc.
+    ///
+    /// See, e.g., the lowering of `trapif` (conditional trap) for an example.
     OneWayCondBr {
         target: BranchTarget,
         kind: CondBrKind,
@@ -678,7 +687,7 @@ pub enum Inst {
         trap_info: (SourceLoc, TrapCode),
     },
 
-    /// Load the address (using a PC-relative offset) of a memory location, using the `ADR`
+    /// Compute the address (using a PC-relative offset) of a memory location, using the `ADR`
     /// instruction. Note that we take a simple offset, not a `MemLabel`, here, because `Adr` is
     /// only used for now in fixed lowering sequences with hardcoded offsets. In the future we may
     /// need full `MemLabel` support.
@@ -734,9 +743,26 @@ pub enum Inst {
         offset: i64,
     },
 
-    /// Meta-insn, no-op in generated code: emit constant/branch veneer island at this point (with
-    /// a guard jump around it) if less than the needed space is available before the next branch
-    /// deadline.
+    /// Meta-insn, no-op in generated code: emit constant/branch veneer island
+    /// at this point (with a guard jump around it) if less than the needed
+    /// space is available before the next branch deadline. See the `MachBuffer`
+    /// implementation in `machinst/buffer.rs` for the overall algorithm. In
+    /// brief, we retain a set of "pending/unresolved label references" from
+    /// branches as we scan forward through instructions to emit machine code;
+    /// if we notice we're about to go out of range on an unresolved reference,
+    /// we stop, emit a bunch of "veneers" (branches in a form that has a longer
+    /// range, e.g. a 26-bit-offset unconditional jump), and point the original
+    /// label references to those. This is an "island" because it comes in the
+    /// middle of the code.
+    ///
+    /// This meta-instruction is a necessary part of the logic that determines
+    /// where to place islands. Ordinarily, we want to place them between basic
+    /// blocks, so we compute the worst-case size of each block, and emit the
+    /// island before starting a block if we would exceed a deadline before the
+    /// end of the block. However, some sequences (such as an inline jumptable)
+    /// are variable-length and not accounted for by this logic; so these
+    /// lowered sequences include an `EmitIsland` to trigger island generation
+    /// where necessary.
     EmitIsland {
         /// The needed space before the next deadline.
         needed_space: CodeOffset,
@@ -1770,6 +1796,18 @@ impl MachInst for Inst {
             ));
             ret
         } else {
+            // Must be an integer type.
+            debug_assert!(
+                ty == B1
+                    || ty == I8
+                    || ty == B8
+                    || ty == I16
+                    || ty == B16
+                    || ty == I32
+                    || ty == B32
+                    || ty == I64
+                    || ty == B64
+            );
             Inst::load_constant(to_reg, value)
         }
     }
@@ -2601,7 +2639,8 @@ pub enum LabelUse {
     /// 21-bit offset for ADR (get address of label). PC-rel, offset is not shifted. Immediate is
     /// 21 signed bits, with high 19 bits in bits 23:5 and low 2 bits in bits 30:29.
     Adr21,
-    /// 32-bit PC relative constant offset (from address of constant itself). Used in jump tables.
+    /// 32-bit PC relative constant offset (from address of constant itself),
+    /// signed. Used in jump tables.
     PCRel32,
 }
 
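To make the island/veneer bookkeeping in the `EmitIsland` doc comment concrete, here is a toy sketch of the deadline decision it describes. `PendingRef`, `island_needed`, and the 21-bit range are assumed names and values for illustration, not the `MachBuffer` API (see `machinst/buffer.rs` for the real logic):

```rust
// Assumed reach of a short-range branch form: +/-1 MiB, e.g. a 21-bit offset.
const COND_BRANCH_RANGE: u32 = 1 << 20;

/// An unresolved label reference from an already-emitted short-range branch.
struct PendingRef {
    branch_off: u32,
}

impl PendingRef {
    /// The code offset past which this branch can no longer reach its label
    /// directly and must be redirected through a longer-range veneer.
    fn deadline(&self) -> u32 {
        self.branch_off + COND_BRANCH_RANGE
    }
}

/// Returns true if emitting `worst_case` more bytes at `cur_off` could cross
/// the earliest pending deadline, i.e. an island must be emitted first.
fn island_needed(cur_off: u32, worst_case: u32, pending: &[PendingRef]) -> bool {
    pending
        .iter()
        .map(PendingRef::deadline)
        .min()
        .map_or(false, |deadline| cur_off + worst_case > deadline)
}

fn main() {
    let pending = [PendingRef { branch_off: 100 }];
    assert!(!island_needed(200, 44, &pending)); // far from the deadline
    assert!(island_needed((1 << 20) + 90, 44, &pending)); // would cross it
}
```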

cranelift/codegen/src/isa/aarch64/lower.rs

Lines changed: 7 additions & 7 deletions
@@ -188,7 +188,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
     let inputs = ctx.get_input(input.insn, input.input);
     let in_reg = if let Some(c) = inputs.constant {
         // Generate constants fresh at each use to minimize long-range register pressure.
-        let to_reg = ctx.tmp(Inst::rc_for_type(ty).unwrap(), ty);
+        let to_reg = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
         for inst in Inst::gen_constant(to_reg, c, ty).into_iter() {
             ctx.emit(inst);
         }
@@ -201,7 +201,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
     match (narrow_mode, from_bits) {
         (NarrowValueMode::None, _) => in_reg,
         (NarrowValueMode::ZeroExtend32, n) if n < 32 => {
-            let tmp = ctx.tmp(RegClass::I64, I32);
+            let tmp = ctx.alloc_tmp(RegClass::I64, I32);
             ctx.emit(Inst::Extend {
                 rd: tmp,
                 rn: in_reg,
@@ -212,7 +212,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
             tmp.to_reg()
         }
         (NarrowValueMode::SignExtend32, n) if n < 32 => {
-            let tmp = ctx.tmp(RegClass::I64, I32);
+            let tmp = ctx.alloc_tmp(RegClass::I64, I32);
             ctx.emit(Inst::Extend {
                 rd: tmp,
                 rn: in_reg,
@@ -229,7 +229,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
                 // Constants are zero-extended to full 64-bit width on load already.
                 in_reg
             } else {
-                let tmp = ctx.tmp(RegClass::I64, I32);
+                let tmp = ctx.alloc_tmp(RegClass::I64, I32);
                 ctx.emit(Inst::Extend {
                     rd: tmp,
                     rn: in_reg,
@@ -241,7 +241,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
             }
         }
         (NarrowValueMode::SignExtend64, n) if n < 64 => {
-            let tmp = ctx.tmp(RegClass::I64, I32);
+            let tmp = ctx.alloc_tmp(RegClass::I64, I32);
             ctx.emit(Inst::Extend {
                 rd: tmp,
                 rn: in_reg,
@@ -529,7 +529,7 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
     }
 
     // Otherwise, generate add instructions.
-    let addr = ctx.tmp(RegClass::I64, I64);
+    let addr = ctx.alloc_tmp(RegClass::I64, I64);
 
     // Get the const into a reg.
     lower_constant_u64(ctx, addr.clone(), offset as u64);
@@ -541,7 +541,7 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
     // In an addition, the stack register is the zero register, so divert it to another
     // register just before doing the actual add.
     let reg = if reg == stack_reg() {
-        let tmp = ctx.tmp(RegClass::I64, I64);
+        let tmp = ctx.alloc_tmp(RegClass::I64, I64);
         ctx.emit(Inst::Mov {
             rd: tmp,
             rm: stack_reg(),
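The rename from `tmp` to `alloc_tmp` is mechanical, but it makes the semantics explicit: every call allocates a *new* virtual register, which is what the rematerialization comment in the first hunk ("generate constants fresh at each use") relies on. A toy illustration, with assumed types rather than the real `LowerCtx` API:

```rust
/// Stand-in for a virtual-register index.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct VReg(u32);

/// Stand-in for the lowering context's temp allocator.
struct Ctx {
    next_vreg: u32,
}

impl Ctx {
    /// Each call hands out a fresh vreg; rematerializing a constant into a
    /// fresh temp at each use keeps every constant's live range short instead
    /// of pinning one register across long stretches of code.
    fn alloc_tmp(&mut self) -> VReg {
        let r = VReg(self.next_vreg);
        self.next_vreg += 1;
        r
    }
}

fn main() {
    let mut ctx = Ctx { next_vreg: 0 };
    // Two uses of the same constant get two distinct, short-lived vregs.
    let use1 = ctx.alloc_tmp();
    let use2 = ctx.alloc_tmp();
    assert_ne!(use1, use2);
}
```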

cranelift/codegen/src/isa/aarch64/lower_inst.rs

Lines changed: 22 additions & 20 deletions
@@ -84,8 +84,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             } else {
                 VecALUOp::UQAddScalar
             };
-            let va = ctx.tmp(RegClass::V128, I128);
-            let vb = ctx.tmp(RegClass::V128, I128);
+            let va = ctx.alloc_tmp(RegClass::V128, I128);
+            let vb = ctx.alloc_tmp(RegClass::V128, I128);
             let ra = input_to_reg(ctx, inputs[0], narrow_mode);
             let rb = input_to_reg(ctx, inputs[1], narrow_mode);
             let rd = output_to_reg(ctx, outputs[0]);
@@ -115,8 +115,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             } else {
                 VecALUOp::UQSubScalar
             };
-            let va = ctx.tmp(RegClass::V128, I128);
-            let vb = ctx.tmp(RegClass::V128, I128);
+            let va = ctx.alloc_tmp(RegClass::V128, I128);
+            let vb = ctx.alloc_tmp(RegClass::V128, I128);
             let ra = input_to_reg(ctx, inputs[0], narrow_mode);
             let rb = input_to_reg(ctx, inputs[1], narrow_mode);
             let rd = output_to_reg(ctx, outputs[0]);
@@ -498,7 +498,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             // ignored (because of the implicit masking done by the instruction),
             // so this is equivalent to negating the input.
             let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
-            let tmp = ctx.tmp(RegClass::I64, ty);
+            let tmp = ctx.alloc_tmp(RegClass::I64, ty);
             ctx.emit(Inst::AluRRR {
                 alu_op,
                 rd: tmp,
@@ -521,7 +521,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             // Really ty_bits_size - rn, but the upper bits of the result are
             // ignored (because of the implicit masking done by the instruction),
             // so this is equivalent to negating the input.
-            let tmp = ctx.tmp(RegClass::I64, I32);
+            let tmp = ctx.alloc_tmp(RegClass::I64, I32);
             ctx.emit(Inst::AluRRR {
                 alu_op: ALUOp::Sub32,
                 rd: tmp,
@@ -534,7 +534,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             };
 
             // Explicitly mask the rotation count.
-            let tmp_masked_rm = ctx.tmp(RegClass::I64, I32);
+            let tmp_masked_rm = ctx.alloc_tmp(RegClass::I64, I32);
             ctx.emit(Inst::AluRRImmLogic {
                 alu_op: ALUOp::And32,
                 rd: tmp_masked_rm,
@@ -543,8 +543,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             });
             let tmp_masked_rm = tmp_masked_rm.to_reg();
 
-            let tmp1 = ctx.tmp(RegClass::I64, I32);
-            let tmp2 = ctx.tmp(RegClass::I64, I32);
+            let tmp1 = ctx.alloc_tmp(RegClass::I64, I32);
+            let tmp2 = ctx.alloc_tmp(RegClass::I64, I32);
             ctx.emit(Inst::AluRRImm12 {
                 alu_op: ALUOp::Sub32,
                 rd: tmp1,
@@ -583,7 +583,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
             immshift.imm &= ty_bits_size - 1;
 
-            let tmp1 = ctx.tmp(RegClass::I64, I32);
+            let tmp1 = ctx.alloc_tmp(RegClass::I64, I32);
             ctx.emit(Inst::AluRRImmShift {
                 alu_op: ALUOp::Lsr32,
                 rd: tmp1,
@@ -688,7 +688,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             // and fix the sequence below to work properly for this.
             let narrow_mode = NarrowValueMode::ZeroExtend64;
             let rn = input_to_reg(ctx, inputs[0], narrow_mode);
-            let tmp = ctx.tmp(RegClass::I64, I64);
+            let tmp = ctx.alloc_tmp(RegClass::I64, I64);
 
             // If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then
             // the rest of the code is identical to the 64-bit version.
@@ -997,7 +997,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }
 
         Opcode::Bitselect => {
-            let tmp = ctx.tmp(RegClass::I64, I64);
+            let tmp = ctx.alloc_tmp(RegClass::I64, I64);
             let rd = output_to_reg(ctx, outputs[0]);
             let rcond = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
             let rn = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
@@ -1475,8 +1475,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
             let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
             let rd = output_to_reg(ctx, outputs[0]);
-            let tmp1 = ctx.tmp(RegClass::I64, I64);
-            let tmp2 = ctx.tmp(RegClass::I64, I64);
+            let tmp1 = ctx.alloc_tmp(RegClass::I64, I64);
+            let tmp2 = ctx.alloc_tmp(RegClass::I64, I64);
             ctx.emit(Inst::MovFromVec64 { rd: tmp1, rn: rn });
             ctx.emit(Inst::MovFromVec64 { rd: tmp2, rn: rm });
             let imml = if bits == 32 {
@@ -1546,7 +1546,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let trap_info = (ctx.srcloc(insn), TrapCode::BadConversionToInteger);
             ctx.emit(Inst::Udf { trap_info });
 
-            let tmp = ctx.tmp(RegClass::V128, I128);
+            let tmp = ctx.alloc_tmp(RegClass::V128, I128);
 
             // Check that the input is in range, with "truncate towards zero" semantics. This means
             // we allow values that are slightly out of range:
@@ -1712,8 +1712,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 _ => unreachable!(),
             };
 
-            let rtmp1 = ctx.tmp(RegClass::V128, in_ty);
-            let rtmp2 = ctx.tmp(RegClass::V128, in_ty);
+            let rtmp1 = ctx.alloc_tmp(RegClass::V128, in_ty);
+            let rtmp2 = ctx.alloc_tmp(RegClass::V128, in_ty);
 
             if in_bits == 32 {
                 ctx.emit(Inst::LoadFpuConst32 {
@@ -2072,7 +2072,9 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
         Opcode::BrTable => {
             // Expand `br_table index, default, JT` to:
             //
-            //   (emit island with guard jump if needed)
+            //   emit_island  // this forces an island at this point
+            //                // if the jumptable would push us past
+            //                // the deadline
             //   subs idx, #jt_size
             //   b.hs default
             //   adr vTmp1, PC+16
@@ -2096,8 +2098,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 NarrowValueMode::ZeroExtend32,
             );
 
-            let rtmp1 = ctx.tmp(RegClass::I64, I32);
-            let rtmp2 = ctx.tmp(RegClass::I64, I32);
+            let rtmp1 = ctx.alloc_tmp(RegClass::I64, I32);
+            let rtmp2 = ctx.alloc_tmp(RegClass::I64, I32);
 
             // Bounds-check and branch to default.
             if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) {
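A rough sketch of why `br_table` requests its own island: the emitted sequence's size grows with the number of jump-table entries, so no fixed per-instruction worst case can cover it, and the block-level sizing described in the `EmitIsland` doc comment never accounts for the table. The instruction count and entry size below are illustrative assumptions, not the exact encoding:

```rust
/// Estimate the space the jump-table lowering needs before the next deadline.
/// Assumes a six-instruction fixed prologue (subs/b.hs/adr/load/add/br, 4
/// bytes each) and one 32-bit PC-relative offset per target; both numbers are
/// for illustration only.
fn jt_needed_space(num_targets: u32) -> u32 {
    let fixed = 6 * 4;
    let table = num_targets * 4;
    fixed + table
}

fn main() {
    // A 1000-entry table needs ~4 KiB, far beyond any fixed worst case; the
    // lowering therefore emits an `EmitIsland` before the `JTSequence`.
    assert_eq!(jt_needed_space(1000), 4024);
}
```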

cranelift/codegen/src/machinst/blockorder.rs

Lines changed: 51 additions & 6 deletions
@@ -3,12 +3,54 @@
 //! This module handles the translation from CLIF BBs to VCode BBs.
 //!
 //! The basic idea is that we compute a sequence of "lowered blocks" that
-//! correspond to subgraphs of the CLIF CFG plus an implicit block on *every*
-//! edge (not just critical edges). Conceptually, the lowering pipeline wants to
-//! insert moves for phi-nodes on every block-to-block transfer; these blocks
-//! always conceptually exist, but may be merged with an "original" CLIF block
-//! (and hence not actually exist; this is equivalent to inserting the blocks
-//! only on critical edges).
+//! correspond to one or more blocks in the graph: (CLIF CFG) `union` (implicit
+//! block on *every* edge). Conceptually, the lowering pipeline wants to insert
+//! moves for phi-nodes on every block-to-block transfer; these blocks always
+//! conceptually exist, but may be merged with an "original" CLIF block (and
+//! hence not actually exist; this is equivalent to inserting the blocks only on
+//! critical edges).
+//!
+//! In other words, starting from a CFG like this (where each "CLIF block" and
+//! "(edge N->M)" is a separate basic block):
+//!
+//! ```plain
+//!
+//!              CLIF block 0
+//!               /         \
+//!       (edge 0->1)     (edge 0->2)
+//!            |               |
+//!      CLIF block 1    CLIF block 2
+//!            \               /
+//!        (edge 1->3)   (edge 2->3)
+//!               \           /
+//!               CLIF block 3
+//! ```
+//!
+//! We can produce a CFG of lowered blocks like so:
+//!
+//! ```plain
+//!            +--------------+
+//!            | CLIF block 0 |
+//!            +--------------+
+//!              /           \
+//!   +--------------+  +--------------+
+//!   | (edge 0->1)  |  | (edge 0->2)  |
+//!   | CLIF block 1 |  | CLIF block 2 |
+//!   +--------------+  +--------------+
+//!              \           /
+//!      +-----------+  +-----------+
+//!      |(edge 1->3)|  |(edge 2->3)|
+//!      +-----------+  +-----------+
+//!              \           /
+//!             +------------+
+//!             |CLIF block 3|
+//!             +------------+
+//! ```
+//!
+//! (Note that the edges into CLIF blocks 1 and 2 could be merged with those
+//! blocks' original bodies, but the out-edges could not, because for simplicity
+//! in the successor-function definition, we only ever merge an edge onto one
+//! side of an original CLIF block.)
 //!
 //! Each `LoweredBlock` names just an original CLIF block, an original CLIF
 //! block prepended or appended with an edge block (never both, though), or just
@@ -23,6 +65,9 @@
 //! have content, because this computation happens as part of lowering *before*
 //! regalloc, and regalloc may or may not insert moves/spills/reloads on any
 //! particular edge. But it works relatively well and is conceptually simple.
+//! Furthermore, the [MachBuffer] machine-code sink performs final peephole-like
+//! branch editing that in practice elides empty blocks and simplifies some of
+//! the other redundancies that this scheme produces.
 
 use crate::entity::SecondaryMap;
 use crate::fx::{FxHashMap, FxHashSet};
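For reference, the `LoweredBlock` shape the module comment describes can be sketched as the following enum. Variant and field names here are illustrative, simplified from the real definition; the invariant is that an edge merges onto at most one side of an original block:

```rust
/// Stand-in for the CLIF block index type.
type Block = u32;

/// Sketch of a lowered block: an original CLIF block alone, an original block
/// with an edge block merged onto exactly one side, or a standalone edge block.
#[allow(dead_code)]
#[derive(Clone, Copy, Debug)]
enum LoweredBlock {
    /// An original CLIF block with no merged edge.
    Orig { block: Block },
    /// The edge block for `pred -> block`, merged onto the start of `block`.
    EdgeAndOrig { pred: Block, block: Block },
    /// The edge block for `block -> succ`, merged onto the end of `block`.
    OrigAndEdge { block: Block, succ: Block },
    /// A standalone edge block for `from -> to` (e.g. a critical edge).
    Edge { from: Block, to: Block },
}

fn main() {
    // The example CFG from the module comment: the edges into blocks 1 and 2
    // merge with those blocks' bodies; the edges into block 3 stay standalone.
    let lowered = [
        LoweredBlock::Orig { block: 0 },
        LoweredBlock::EdgeAndOrig { pred: 0, block: 1 },
        LoweredBlock::EdgeAndOrig { pred: 0, block: 2 },
        LoweredBlock::Edge { from: 1, to: 3 },
        LoweredBlock::Edge { from: 2, to: 3 },
        LoweredBlock::Orig { block: 3 },
    ];
    println!("{:?}", lowered);
}
```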
