
Commit bdd2873

Address review comments.
1 parent 687aca0 commit bdd2873

7 files changed: +313 -89 lines changed


cranelift/codegen/src/isa/aarch64/inst/emit.rs

Lines changed: 14 additions & 0 deletions
@@ -352,6 +352,13 @@ impl MachInstEmit for Inst {
     type State = EmitState;
 
     fn emit(&self, sink: &mut MachBuffer<Inst>, flags: &settings::Flags, state: &mut EmitState) {
+        // N.B.: we *must* not exceed the "worst-case size" used to compute
+        // where to insert islands, except when islands are explicitly triggered
+        // (with an `EmitIsland`). We check this in debug builds. This is `mut`
+        // to allow disabling the check for `JTSequence`, which is always
+        // emitted following an `EmitIsland`.
+        let mut start_off = sink.cur_offset();
+
         match self {
             &Inst::AluRRR { alu_op, rd, rn, rm } => {
                 let top11 = match alu_op {
@@ -1307,6 +1314,10 @@ impl MachInstEmit for Inst {
                         LabelUse::PCRel32,
                     );
                 }
+
+                // Lowering produces an EmitIsland before using a JTSequence, so we can safely
+                // disable the worst-case-size check in this case.
+                start_off = sink.cur_offset();
             }
             &Inst::LoadConst64 { rd, const_data } => {
                 let inst = Inst::ULoad64 {
@@ -1418,5 +1429,8 @@ impl MachInstEmit for Inst {
             }
         }
+
+        let end_off = sink.cur_offset();
+        debug_assert!((end_off - start_off) <= Inst::worst_case_size());
     }
 }
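The assertion's role is easiest to see in isolation: island-placement deadlines are computed from each instruction's *declared* worst-case size, so actual emission must stay within that bound unless an island was explicitly requested. Here is a minimal self-contained sketch of the same check; `Sink` and the 44-byte bound are illustrative stand-ins for `MachBuffer` and `Inst::worst_case_size()`, not the real API:

```rust
/// Illustrative stand-in for `Inst::worst_case_size()` (not the real value).
const WORST_CASE_SIZE: u32 = 44;

/// Stand-in for `MachBuffer`: just a growable byte buffer with an offset.
struct Sink {
    buf: Vec<u8>,
}

impl Sink {
    fn cur_offset(&self) -> u32 {
        self.buf.len() as u32
    }
}

/// Emit one lowered instruction's encoding, checking the same invariant as
/// the new `debug_assert!` in `emit.rs`.
fn emit_checked(sink: &mut Sink, encoding: &[u8]) {
    let start_off = sink.cur_offset();
    sink.buf.extend_from_slice(encoding);
    let end_off = sink.cur_offset();
    // If this fires, a lowered sequence grew past the bound used when island
    // deadlines were computed, so a needed island may already have been missed.
    debug_assert!(end_off - start_off <= WORST_CASE_SIZE);
}

fn main() {
    let mut sink = Sink { buf: Vec::new() };
    emit_checked(&mut sink, &[0u8; 8]); // two 4-byte AArch64 instructions: fine
}
```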

cranelift/codegen/src/isa/aarch64/inst/mod.rs

Lines changed: 44 additions & 5 deletions
@@ -657,6 +657,15 @@ pub enum Inst {
 
     /// A one-way conditional branch, invisible to the CFG processing; used *only* as part of
     /// straight-line sequences in code to be emitted.
+    ///
+    /// In more detail:
+    /// - This branch is lowered to a branch at the machine-code level, but does not end a basic
+    ///   block, and does not create edges in the CFG seen by regalloc.
+    /// - Thus, it is *only* valid to use as part of a single-in, single-out sequence that is
+    ///   lowered from a single CLIF instruction. For example, certain arithmetic operations may
+    ///   use these branches to handle certain conditions, such as overflows, traps, etc.
+    ///
+    /// See, e.g., the lowering of `trapif` (conditional trap) for an example.
     OneWayCondBr {
         target: BranchTarget,
         kind: CondBrKind,
@@ -678,7 +687,7 @@ pub enum Inst {
         trap_info: (SourceLoc, TrapCode),
     },
 
-    /// Load the address (using a PC-relative offset) of a memory location, using the `ADR`
+    /// Compute the address (using a PC-relative offset) of a memory location, using the `ADR`
     /// instruction. Note that we take a simple offset, not a `MemLabel`, here, because `Adr` is
     /// only used for now in fixed lowering sequences with hardcoded offsets. In the future we may
     /// need full `MemLabel` support.
@@ -734,9 +743,26 @@ pub enum Inst {
         offset: i64,
     },
 
-    /// Meta-insn, no-op in generated code: emit constant/branch veneer island at this point (with
-    /// a guard jump around it) if less than the needed space is available before the next branch
-    /// deadline.
+    /// Meta-insn, no-op in generated code: emit constant/branch veneer island
+    /// at this point (with a guard jump around it) if less than the needed
+    /// space is available before the next branch deadline. See the `MachBuffer`
+    /// implementation in `machinst/buffer.rs` for the overall algorithm. In
+    /// brief, we retain a set of "pending/unresolved label references" from
+    /// branches as we scan forward through instructions to emit machine code;
+    /// if we notice we're about to go out of range on an unresolved reference,
+    /// we stop, emit a bunch of "veneers" (branches in a form that has a longer
+    /// range, e.g. a 26-bit-offset unconditional jump), and point the original
+    /// label references to those. This is an "island" because it comes in the
+    /// middle of the code.
+    ///
+    /// This meta-instruction is a necessary part of the logic that determines
+    /// where to place islands. Ordinarily, we want to place them between basic
+    /// blocks, so we compute the worst-case size of each block, and emit the
+    /// island before starting a block if we would exceed a deadline before the
+    /// end of the block. However, some sequences (such as an inline jumptable)
+    /// are variable-length and not accounted for by this logic; so these
+    /// lowered sequences include an `EmitIsland` to trigger island generation
+    /// where necessary.
     EmitIsland {
         /// The needed space before the next deadline.
         needed_space: CodeOffset,
@@ -1770,6 +1796,18 @@ impl MachInst for Inst {
             ));
             ret
         } else {
+            // Must be an integer type.
+            debug_assert!(
+                ty == B1
+                    || ty == I8
+                    || ty == B8
+                    || ty == I16
+                    || ty == B16
+                    || ty == I32
+                    || ty == B32
+                    || ty == I64
+                    || ty == B64
+            );
             Inst::load_constant(to_reg, value)
         }
     }
@@ -2601,7 +2639,8 @@ pub enum LabelUse {
     /// 21-bit offset for ADR (get address of label). PC-rel, offset is not shifted. Immediate is
     /// 21 signed bits, with high 19 bits in bits 23:5 and low 2 bits in bits 30:29.
     Adr21,
-    /// 32-bit PC relative constant offset (from address of constant itself). Used in jump tables.
+    /// 32-bit PC relative constant offset (from address of constant itself),
+    /// signed. Used in jump tables.
     PCRel32,
 }
 
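To make the island/veneer bookkeeping in the `EmitIsland` doc comment concrete, here is a toy sketch of the deadline decision it describes. `PendingRef`, `island_needed`, and the 21-bit range are assumed names and values for illustration, not the `MachBuffer` API (see `machinst/buffer.rs` for the real logic):

```rust
// Assumed reach of a short-range branch form: +/-1 MiB, e.g. a 21-bit offset.
const COND_BRANCH_RANGE: u32 = 1 << 20;

/// An unresolved label reference from an already-emitted short-range branch.
struct PendingRef {
    branch_off: u32,
}

impl PendingRef {
    /// The code offset past which this branch can no longer reach its label
    /// directly and must be redirected through a longer-range veneer.
    fn deadline(&self) -> u32 {
        self.branch_off + COND_BRANCH_RANGE
    }
}

/// Returns true if emitting `worst_case` more bytes at `cur_off` could cross
/// the earliest pending deadline, i.e. an island must be emitted first.
fn island_needed(cur_off: u32, worst_case: u32, pending: &[PendingRef]) -> bool {
    pending
        .iter()
        .map(PendingRef::deadline)
        .min()
        .map_or(false, |deadline| cur_off + worst_case > deadline)
}

fn main() {
    let pending = [PendingRef { branch_off: 100 }];
    assert!(!island_needed(200, 44, &pending)); // far from the deadline
    assert!(island_needed((1 << 20) + 90, 44, &pending)); // would cross it
}
```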

cranelift/codegen/src/isa/aarch64/lower.rs

Lines changed: 7 additions & 7 deletions
@@ -188,7 +188,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
     let inputs = ctx.get_input(input.insn, input.input);
     let in_reg = if let Some(c) = inputs.constant {
         // Generate constants fresh at each use to minimize long-range register pressure.
-        let to_reg = ctx.tmp(Inst::rc_for_type(ty).unwrap(), ty);
+        let to_reg = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
         for inst in Inst::gen_constant(to_reg, c, ty).into_iter() {
             ctx.emit(inst);
         }
@@ -201,7 +201,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
     match (narrow_mode, from_bits) {
         (NarrowValueMode::None, _) => in_reg,
         (NarrowValueMode::ZeroExtend32, n) if n < 32 => {
-            let tmp = ctx.tmp(RegClass::I64, I32);
+            let tmp = ctx.alloc_tmp(RegClass::I64, I32);
             ctx.emit(Inst::Extend {
                 rd: tmp,
                 rn: in_reg,
@@ -212,7 +212,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
             tmp.to_reg()
         }
         (NarrowValueMode::SignExtend32, n) if n < 32 => {
-            let tmp = ctx.tmp(RegClass::I64, I32);
+            let tmp = ctx.alloc_tmp(RegClass::I64, I32);
             ctx.emit(Inst::Extend {
                 rd: tmp,
                 rn: in_reg,
@@ -229,7 +229,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
                 // Constants are zero-extended to full 64-bit width on load already.
                 in_reg
             } else {
-                let tmp = ctx.tmp(RegClass::I64, I32);
+                let tmp = ctx.alloc_tmp(RegClass::I64, I32);
                 ctx.emit(Inst::Extend {
                     rd: tmp,
                     rn: in_reg,
@@ -241,7 +241,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
             }
         }
         (NarrowValueMode::SignExtend64, n) if n < 64 => {
-            let tmp = ctx.tmp(RegClass::I64, I32);
+            let tmp = ctx.alloc_tmp(RegClass::I64, I32);
             ctx.emit(Inst::Extend {
                 rd: tmp,
                 rn: in_reg,
@@ -529,7 +529,7 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
     }
 
     // Otherwise, generate add instructions.
-    let addr = ctx.tmp(RegClass::I64, I64);
+    let addr = ctx.alloc_tmp(RegClass::I64, I64);
 
     // Get the const into a reg.
     lower_constant_u64(ctx, addr.clone(), offset as u64);
@@ -541,7 +541,7 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
     // In an addition, the stack register is the zero register, so divert it to another
     // register just before doing the actual add.
     let reg = if reg == stack_reg() {
-        let tmp = ctx.tmp(RegClass::I64, I64);
+        let tmp = ctx.alloc_tmp(RegClass::I64, I64);
         ctx.emit(Inst::Mov {
             rd: tmp,
             rm: stack_reg(),
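The rename from `tmp` to `alloc_tmp` is mechanical, but it makes the semantics explicit: every call allocates a *new* virtual register, which is what the rematerialization comment in the first hunk ("generate constants fresh at each use") relies on. A toy illustration, with assumed types rather than the real `LowerCtx` API:

```rust
/// Stand-in for a virtual-register index.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct VReg(u32);

/// Stand-in for the lowering context's temp allocator.
struct Ctx {
    next_vreg: u32,
}

impl Ctx {
    /// Each call hands out a fresh vreg; rematerializing a constant into a
    /// fresh temp at each use keeps every constant's live range short instead
    /// of pinning one register across long stretches of code.
    fn alloc_tmp(&mut self) -> VReg {
        let r = VReg(self.next_vreg);
        self.next_vreg += 1;
        r
    }
}

fn main() {
    let mut ctx = Ctx { next_vreg: 0 };
    // Two uses of the same constant get two distinct, short-lived vregs.
    let use1 = ctx.alloc_tmp();
    let use2 = ctx.alloc_tmp();
    assert_ne!(use1, use2);
}
```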

cranelift/codegen/src/isa/aarch64/lower_inst.rs

Lines changed: 22 additions & 20 deletions
@@ -84,8 +84,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             } else {
                 VecALUOp::UQAddScalar
             };
-            let va = ctx.tmp(RegClass::V128, I128);
-            let vb = ctx.tmp(RegClass::V128, I128);
+            let va = ctx.alloc_tmp(RegClass::V128, I128);
+            let vb = ctx.alloc_tmp(RegClass::V128, I128);
             let ra = input_to_reg(ctx, inputs[0], narrow_mode);
             let rb = input_to_reg(ctx, inputs[1], narrow_mode);
             let rd = output_to_reg(ctx, outputs[0]);
@@ -115,8 +115,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             } else {
                 VecALUOp::UQSubScalar
             };
-            let va = ctx.tmp(RegClass::V128, I128);
-            let vb = ctx.tmp(RegClass::V128, I128);
+            let va = ctx.alloc_tmp(RegClass::V128, I128);
+            let vb = ctx.alloc_tmp(RegClass::V128, I128);
             let ra = input_to_reg(ctx, inputs[0], narrow_mode);
             let rb = input_to_reg(ctx, inputs[1], narrow_mode);
             let rd = output_to_reg(ctx, outputs[0]);
@@ -498,7 +498,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             // ignored (because of the implicit masking done by the instruction),
             // so this is equivalent to negating the input.
             let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
-            let tmp = ctx.tmp(RegClass::I64, ty);
+            let tmp = ctx.alloc_tmp(RegClass::I64, ty);
             ctx.emit(Inst::AluRRR {
                 alu_op,
                 rd: tmp,
@@ -521,7 +521,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             // Really ty_bits_size - rn, but the upper bits of the result are
             // ignored (because of the implicit masking done by the instruction),
             // so this is equivalent to negating the input.
-            let tmp = ctx.tmp(RegClass::I64, I32);
+            let tmp = ctx.alloc_tmp(RegClass::I64, I32);
             ctx.emit(Inst::AluRRR {
                 alu_op: ALUOp::Sub32,
                 rd: tmp,
@@ -534,7 +534,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             };
 
             // Explicitly mask the rotation count.
-            let tmp_masked_rm = ctx.tmp(RegClass::I64, I32);
+            let tmp_masked_rm = ctx.alloc_tmp(RegClass::I64, I32);
             ctx.emit(Inst::AluRRImmLogic {
                 alu_op: ALUOp::And32,
                 rd: tmp_masked_rm,
@@ -543,8 +543,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             });
             let tmp_masked_rm = tmp_masked_rm.to_reg();
 
-            let tmp1 = ctx.tmp(RegClass::I64, I32);
-            let tmp2 = ctx.tmp(RegClass::I64, I32);
+            let tmp1 = ctx.alloc_tmp(RegClass::I64, I32);
+            let tmp2 = ctx.alloc_tmp(RegClass::I64, I32);
             ctx.emit(Inst::AluRRImm12 {
                 alu_op: ALUOp::Sub32,
                 rd: tmp1,
@@ -583,7 +583,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
             immshift.imm &= ty_bits_size - 1;
 
-            let tmp1 = ctx.tmp(RegClass::I64, I32);
+            let tmp1 = ctx.alloc_tmp(RegClass::I64, I32);
             ctx.emit(Inst::AluRRImmShift {
                 alu_op: ALUOp::Lsr32,
                 rd: tmp1,
@@ -688,7 +688,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             // and fix the sequence below to work properly for this.
             let narrow_mode = NarrowValueMode::ZeroExtend64;
             let rn = input_to_reg(ctx, inputs[0], narrow_mode);
-            let tmp = ctx.tmp(RegClass::I64, I64);
+            let tmp = ctx.alloc_tmp(RegClass::I64, I64);
 
             // If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then
             // the rest of the code is identical to the 64-bit version.
@@ -997,7 +997,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }
 
         Opcode::Bitselect => {
-            let tmp = ctx.tmp(RegClass::I64, I64);
+            let tmp = ctx.alloc_tmp(RegClass::I64, I64);
             let rd = output_to_reg(ctx, outputs[0]);
             let rcond = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
             let rn = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
@@ -1475,8 +1475,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
             let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
             let rd = output_to_reg(ctx, outputs[0]);
-            let tmp1 = ctx.tmp(RegClass::I64, I64);
-            let tmp2 = ctx.tmp(RegClass::I64, I64);
+            let tmp1 = ctx.alloc_tmp(RegClass::I64, I64);
+            let tmp2 = ctx.alloc_tmp(RegClass::I64, I64);
             ctx.emit(Inst::MovFromVec64 { rd: tmp1, rn: rn });
             ctx.emit(Inst::MovFromVec64 { rd: tmp2, rn: rm });
             let imml = if bits == 32 {
@@ -1546,7 +1546,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let trap_info = (ctx.srcloc(insn), TrapCode::BadConversionToInteger);
             ctx.emit(Inst::Udf { trap_info });
 
-            let tmp = ctx.tmp(RegClass::V128, I128);
+            let tmp = ctx.alloc_tmp(RegClass::V128, I128);
 
             // Check that the input is in range, with "truncate towards zero" semantics. This means
             // we allow values that are slightly out of range:
@@ -1712,8 +1712,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 _ => unreachable!(),
             };
 
-            let rtmp1 = ctx.tmp(RegClass::V128, in_ty);
-            let rtmp2 = ctx.tmp(RegClass::V128, in_ty);
+            let rtmp1 = ctx.alloc_tmp(RegClass::V128, in_ty);
+            let rtmp2 = ctx.alloc_tmp(RegClass::V128, in_ty);
 
             if in_bits == 32 {
                 ctx.emit(Inst::LoadFpuConst32 {
@@ -2072,7 +2072,9 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
         Opcode::BrTable => {
             // Expand `br_table index, default, JT` to:
             //
-            //   (emit island with guard jump if needed)
+            //   emit_island  // this forces an island at this point
+            //                // if the jumptable would push us past
+            //                // the deadline
             //   subs idx, #jt_size
             //   b.hs default
             //   adr vTmp1, PC+16
@@ -2096,8 +2098,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 NarrowValueMode::ZeroExtend32,
             );
 
-            let rtmp1 = ctx.tmp(RegClass::I64, I32);
-            let rtmp2 = ctx.tmp(RegClass::I64, I32);
+            let rtmp1 = ctx.alloc_tmp(RegClass::I64, I32);
+            let rtmp2 = ctx.alloc_tmp(RegClass::I64, I32);
 
             // Bounds-check and branch to default.
             if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) {
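A rough sketch of why `br_table` requests its own island: the emitted sequence's size grows with the number of jump-table entries, so no fixed per-instruction worst case can cover it, and the block-level sizing described in the `EmitIsland` doc comment never accounts for the table. The instruction count and entry size below are illustrative assumptions, not the exact encoding:

```rust
/// Estimate the space the jump-table lowering needs before the next deadline.
/// Assumes a six-instruction fixed prologue (subs/b.hs/adr/load/add/br, 4
/// bytes each) and one 32-bit PC-relative offset per target; both numbers are
/// for illustration only.
fn jt_needed_space(num_targets: u32) -> u32 {
    let fixed = 6 * 4;
    let table = num_targets * 4;
    fixed + table
}

fn main() {
    // A 1000-entry table needs ~4 KiB, far beyond any fixed worst case; the
    // lowering therefore emits an `EmitIsland` before the `JTSequence`.
    assert_eq!(jt_needed_space(1000), 4024);
}
```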

cranelift/codegen/src/machinst/blockorder.rs

Lines changed: 51 additions & 6 deletions
@@ -3,12 +3,54 @@
 //! This module handles the translation from CLIF BBs to VCode BBs.
 //!
 //! The basic idea is that we compute a sequence of "lowered blocks" that
-//! correspond to subgraphs of the CLIF CFG plus an implicit block on *every*
-//! edge (not just critical edges). Conceptually, the lowering pipeline wants to
-//! insert moves for phi-nodes on every block-to-block transfer; these blocks
-//! always conceptually exist, but may be merged with an "original" CLIF block
-//! (and hence not actually exist; this is equivalent to inserting the blocks
-//! only on critical edges).
+//! correspond to one or more blocks in the graph: (CLIF CFG) `union` (implicit
+//! block on *every* edge). Conceptually, the lowering pipeline wants to insert
+//! moves for phi-nodes on every block-to-block transfer; these blocks always
+//! conceptually exist, but may be merged with an "original" CLIF block (and
+//! hence not actually exist; this is equivalent to inserting the blocks only on
+//! critical edges).
+//!
+//! In other words, starting from a CFG like this (where each "CLIF block" and
+//! "(edge N->M)" is a separate basic block):
+//!
+//! ```plain
+//!
+//!              CLIF block 0
+//!               /         \
+//!       (edge 0->1)     (edge 0->2)
+//!            |               |
+//!      CLIF block 1    CLIF block 2
+//!            \               /
+//!        (edge 1->3)   (edge 2->3)
+//!               \           /
+//!               CLIF block 3
+//! ```
+//!
+//! We can produce a CFG of lowered blocks like so:
+//!
+//! ```plain
+//!            +--------------+
+//!            | CLIF block 0 |
+//!            +--------------+
+//!              /           \
+//!   +--------------+  +--------------+
+//!   | (edge 0->1)  |  | (edge 0->2)  |
+//!   | CLIF block 1 |  | CLIF block 2 |
+//!   +--------------+  +--------------+
+//!              \           /
+//!      +-----------+  +-----------+
+//!      |(edge 1->3)|  |(edge 2->3)|
+//!      +-----------+  +-----------+
+//!              \           /
+//!             +------------+
+//!             |CLIF block 3|
+//!             +------------+
+//! ```
+//!
+//! (Note that the edges into CLIF blocks 1 and 2 could be merged with those
+//! blocks' original bodies, but the out-edges could not, because for simplicity
+//! in the successor-function definition, we only ever merge an edge onto one
+//! side of an original CLIF block.)
 //!
 //! Each `LoweredBlock` names just an original CLIF block, an original CLIF
 //! block prepended or appended with an edge block (never both, though), or just
@@ -23,6 +65,9 @@
 //! have content, because this computation happens as part of lowering *before*
 //! regalloc, and regalloc may or may not insert moves/spills/reloads on any
 //! particular edge. But it works relatively well and is conceptually simple.
+//! Furthermore, the [MachBuffer] machine-code sink performs final peephole-like
+//! branch editing that in practice elides empty blocks and simplifies some of
+//! the other redundancies that this scheme produces.
 
 use crate::entity::SecondaryMap;
 use crate::fx::{FxHashMap, FxHashSet};
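For reference, the `LoweredBlock` shape the module comment describes can be sketched as the following enum. Variant and field names here are illustrative, simplified from the real definition; the invariant is that an edge merges onto at most one side of an original block:

```rust
/// Stand-in for the CLIF block index type.
type Block = u32;

/// Sketch of a lowered block: an original CLIF block alone, an original block
/// with an edge block merged onto exactly one side, or a standalone edge block.
#[allow(dead_code)]
#[derive(Clone, Copy, Debug)]
enum LoweredBlock {
    /// An original CLIF block with no merged edge.
    Orig { block: Block },
    /// The edge block for `pred -> block`, merged onto the start of `block`.
    EdgeAndOrig { pred: Block, block: Block },
    /// The edge block for `block -> succ`, merged onto the end of `block`.
    OrigAndEdge { block: Block, succ: Block },
    /// A standalone edge block for `from -> to` (e.g. a critical edge).
    Edge { from: Block, to: Block },
}

fn main() {
    // The example CFG from the module comment: the edges into blocks 1 and 2
    // merge with those blocks' bodies; the edges into block 3 stay standalone.
    let lowered = [
        LoweredBlock::Orig { block: 0 },
        LoweredBlock::EdgeAndOrig { pred: 0, block: 1 },
        LoweredBlock::EdgeAndOrig { pred: 0, block: 2 },
        LoweredBlock::Edge { from: 1, to: 3 },
        LoweredBlock::Edge { from: 2, to: 3 },
        LoweredBlock::Orig { block: 3 },
    ];
    println!("{:?}", lowered);
}
```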
