Skip to content

Commit 48e28d6

Browse files
ViacheslavRbigcbot
authored and committed
Optimization of VLA memory allocation.
Optimization of VLA memory allocation if allocated memory is uniform.
1 parent 066028c commit 48e28d6

File tree

2 files changed

+113
-8
lines changed

2 files changed

+113
-8
lines changed

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 13 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -10874,20 +10874,25 @@ void EmitPass::emitLLVMStackRestore(llvm::IntrinsicInst *inst) {
1087410874

1087510875
void EmitPass::emitVLAStackAlloca(llvm::GenIntrinsicInst *intrinsic) {
1087610876
CVariable *pSP = m_currShader->GetSP();
10877-
CVariable *lane_off = m_currShader->GetSymbol(intrinsic->getOperand(0));
10877+
Value *lane_offset = intrinsic->getOperand(0);
10878+
CVariable *lane_off = m_currShader->GetSymbol(lane_offset);
1087810879
// m_destination = curr_SP + lane_offset
1087910880
emitAddPointer(m_destination, pSP, lane_off);
1088010881
m_encoder->Push();
1088110882

1088210883
if (m_currShader->m_numberInstance == 1 || m_encoder->IsSecondHalf()) {
10883-
// SP = SP + vla_size * simdWidth
10884+
// If lane_offset==0 then the allocation is uniform and
10885+
// SP = SP + vla_size
10886+
// else
10887+
// SP = SP + vla_size * simdWidth
1088410888
CVariable *vla_size = m_currShader->GetSymbol(intrinsic->getOperand(1));
10885-
// vla_size must be uniform, if it's not uniform, set region to take only
10886-
// <0;1,0>
10887-
m_encoder->SetSrcRegion(0, 0, 1, 0);
10888-
m_encoder->Mul(vla_size, vla_size, m_currShader->ImmToVariable(numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UW));
10889-
m_encoder->Push();
10890-
10889+
if (!(isa<ConstantInt>(lane_offset) && cast<ConstantInt>(lane_offset)->getZExtValue() == 0)) {
10890+
// vla_size must be uniform, if it's not uniform, set region to take only
10891+
// <0;1,0>
10892+
m_encoder->SetSrcRegion(0, 0, 1, 0);
10893+
m_encoder->Mul(vla_size, vla_size, m_currShader->ImmToVariable(numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UW));
10894+
m_encoder->Push();
10895+
}
1089110896
m_encoder->SetSrcRegion(1, 0, 1, 0);
1089210897
emitAddPointer(pSP, pSP, vla_size);
1089310898
m_encoder->Push();
Lines changed: 100 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,100 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2025 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
; REQUIRES: llvm-14-plus, regkeys
9+
;
10+
; RUN: igc_opt --opaque-pointers -platformbmg -igc-emit-visa %s -regkey DumpVISAASMToConsole | FileCheck %s
11+
; ------------------------------------------------
12+
; EmitVISAPass
13+
; ------------------------------------------------
14+
15+
; This test checks stack memory allocation for uniform and non-uniform
16+
; vlaStackAlloca calls
17+
18+
; CHECK-LABEL: .kernel "test_uniform"
19+
; CHECK: add (M1_NM, 1) SP(0,0)<1> privateBase(0,0)<0;1,0> {{.*}}(0,0)<0;1,0>
20+
; CHECK: mov (M1_NM, 1) FP(0,0)<1> SP(0,0)<0;1,0>
21+
; CHECK: mul (M1_NM, 1) vlaSize(0,0)<1> array_size(0,0)<0;1,0> 0x4:w
22+
; CHECK: add (M1, 32) vlaStackAlloca(0,0)<1> SP(0,0)<0;1,0> 0x0:d
23+
; CHECK: add (M1_NM, 1) SP(0,0)<1> SP(0,0)<0;1,0> vlaSize(0,0)<0;1,0>
24+
25+
define spir_kernel void @test_uniform(i32 %array_size, <8 x i32> %r0, <8 x i32> %payloadHeader, ptr %privateBase, i32 %bufferOffset, i16 %localIdX, i16 %localIdY, i16 %localIdZ) #0 {
26+
entry:
27+
%vlaSize = mul i32 %array_size, 4
28+
%vlaStackAlloca = call ptr @llvm.genx.GenISA.VLAStackAlloca(i32 0, i32 %vlaSize)
29+
store i32 %array_size, ptr %vlaStackAlloca, align 4
30+
ret void
31+
}
32+
33+
; CHECK-LABEL: .kernel "test_non_uniform"
34+
; CHECK: add (M1_NM, 1) SP(0,0)<1> privateBase(0,0)<0;1,0> {{.*}}(0,0)<0;1,0>
35+
; CHECK: mov (M1_NM, 1) FP(0,0)<1> SP(0,0)<0;1,0>
36+
; CHECK: mov (M1_NM, 8) simdLaneId(0,0)<1> 0x76543210:v
37+
; CHECK: add (M1_NM, 8) simdLaneId(0,8)<1> simdLaneId(0,0)<1;1,0> 0x8:w
38+
; CHECK: add (M1_NM, 16) simdLaneId(0,16)<1> simdLaneId(0,0)<1;1,0> 0x10:w
39+
; CHECK: mov (M1, 32) simdLaneIdExt(0,0)<1> simdLaneId_0v(0,0)<1;1,0>
40+
; CHECK: mul (M1_NM, 1) vlaSize(0,0)<1> array_size(0,0)<0;1,0> 0x4:w
41+
; CHECK: mul (M1, 32) vlaOffset(0,0)<1> vlaSize(0,0)<0;1,0> simdLaneIdExt(0,0)<1;1,0>
42+
; CHECK: add (M1, 32) vlaStackAlloca(0,0)<1> SP(0,0)<0;1,0> vlaOffset(0,0)<1;1,0>
43+
; CHECK: mul (M1_NM, 1) vlaSize(0,0)<1> vlaSize(0,0)<0;1,0> 0x20:uw
44+
; CHECK: add (M1_NM, 1) SP(0,0)<1> SP(0,0)<0;1,0> vlaSize(0,0)<0;1,0>
45+
46+
define spir_kernel void @test_non_uniform(i32 %array_size, <8 x i32> %r0, <8 x i32> %payloadHeader, ptr %privateBase, i32 %bufferOffset, i16 %localIdX, i16 %localIdY, i16 %localIdZ) #0 {
47+
entry:
48+
%simdLaneId = call i16 @llvm.genx.GenISA.simdLaneId()
49+
%simdLaneIdExt = zext i16 %simdLaneId to i32
50+
%vlaSize = mul i32 %array_size, 4
51+
%vlaOffset = mul i32 %vlaSize, %simdLaneIdExt
52+
%vlaStackAlloca = call ptr @llvm.genx.GenISA.VLAStackAlloca(i32 %vlaOffset, i32 %vlaSize)
53+
store i32 %array_size, ptr %vlaStackAlloca, align 4
54+
ret void
55+
}
56+
57+
declare i16 @llvm.genx.GenISA.simdLaneId()
58+
59+
declare ptr @llvm.genx.GenISA.VLAStackAlloca(i32, i32)
60+
61+
attributes #0 = { "hasVLA" }
62+
63+
!IGCMetadata = !{!0}
64+
!igc.functions = !{!26, !34}
65+
66+
!0 = !{!"ModuleMD", !1, !25}
67+
!1 = !{!"FuncMD", !2, !3, !23, !24}
68+
!2 = !{!"FuncMDMap[0]", ptr @test_uniform}
69+
!3 = !{!"FuncMDValue[0]", !4, !19}
70+
!4 = !{!"resAllocMD", !5}
71+
!5 = !{!"argAllocMDList", !6, !10, !11, !14, !15, !16, !17, !18}
72+
!6 = !{!"argAllocMDListVec[0]", !7, !8, !9}
73+
!7 = !{!"type", i32 0}
74+
!8 = !{!"extensionType", i32 -1}
75+
!9 = !{!"indexType", i32 -1}
76+
!10 = !{!"argAllocMDListVec[1]", !7, !8, !9}
77+
!11 = !{!"argAllocMDListVec[2]", !12, !8, !13}
78+
!12 = !{!"type", i32 1}
79+
!13 = !{!"indexType", i32 0}
80+
!14 = !{!"argAllocMDListVec[3]", !7, !8, !9}
81+
!15 = !{!"argAllocMDListVec[4]", !7, !8, !9}
82+
!16 = !{!"argAllocMDListVec[5]", !7, !8, !9}
83+
!17 = !{!"argAllocMDListVec[6]", !7, !8, !9}
84+
!18 = !{!"argAllocMDListVec[7]", !7, !8, !9}
85+
!19 = !{!"m_OpenCLArgTypeQualifiers", !20, !21, !22}
86+
!20 = !{!"m_OpenCLArgTypeQualifiersVec[0]", !""}
87+
!21 = !{!"m_OpenCLArgTypeQualifiersVec[1]", !""}
88+
!22 = !{!"m_OpenCLArgTypeQualifiersVec[2]", !""}
89+
!23 = !{!"FuncMDMap[1]", ptr @test_non_uniform}
90+
!24 = !{!"FuncMDValue[1]", !4, !19}
91+
!25 = !{!"isHDCFastClearShader", i1 false}
92+
!26 = !{ptr @test_uniform, !27}
93+
!27 = !{!28, !29}
94+
!28 = !{!"function_type", i32 0}
95+
!29 = !{!"implicit_arg_desc", !30, !31, !32, !33}
96+
!30 = !{i32 0}
97+
!31 = !{i32 1}
98+
!32 = !{i32 13}
99+
!33 = distinct !{i32 15, !33}
100+
!34 = !{ptr @test_non_uniform, !27}

0 commit comments

Comments
 (0)