Skip to content

Commit f949804

Browse files
authored
AMDGPU/GlobalISel: Fix using wrong regbank for smfmac (#162762)
Make sure to apply the option + number-of-registers logic from the selection pattern.
1 parent a16477a commit f949804

File tree

4 files changed

+65
-49
lines changed

4 files changed

+65
-49
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5053,16 +5053,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
50535053
//
50545054
// vdst, srcA, srcB, srcC
50555055
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5056+
5057+
bool UseAGPRForm = !Subtarget.hasGFX90AInsts() ||
5058+
Info->selectAGPRFormMFMA(MinNumRegsRequired);
5059+
50565060
OpdsMapping[0] =
5057-
Info->getMinNumAGPRs() >= MinNumRegsRequired
5058-
? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
5059-
: getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5061+
UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
5062+
: getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
50605063
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
50615064
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
50625065
OpdsMapping[4] =
5063-
Info->getMinNumAGPRs() >= MinNumRegsRequired
5064-
? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
5065-
: getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5066+
UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
5067+
: getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
50665068
break;
50675069
}
50685070
case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
@@ -5115,11 +5117,21 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
51155117
case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
51165118
case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
51175119
case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: {
5120+
Register DstReg = MI.getOperand(0).getReg();
5121+
unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
5122+
unsigned MinNumRegsRequired = DstSize / 32;
5123+
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5124+
bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired);
5125+
51185126
// vdst, srcA, srcB, srcC, idx
5119-
OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5127+
OpdsMapping[0] = UseAGPRForm ? getAGPROpMapping(DstReg, MRI, *TRI)
5128+
: getVGPROpMapping(DstReg, MRI, *TRI);
5129+
51205130
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
51215131
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5122-
OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5132+
OpdsMapping[4] =
5133+
UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
5134+
: getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
51235135
OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
51245136
break;
51255137
}

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1202,6 +1202,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
12021202

12031203
unsigned getMinNumAGPRs() const { return MinNumAGPRs; }
12041204

1205+
/// Return true if an MFMA that requires at least \p NumRegs should select to
1206+
/// the AGPR form, instead of the VGPR form.
1207+
bool selectAGPRFormMFMA(unsigned NumRegs) const {
1208+
return !MFMAVGPRForm && getMinNumAGPRs() >= NumRegs;
1209+
}
1210+
12051211
// \returns true if a function has a use of AGPRs via inline asm or
12061212
// has a call which may use it.
12071213
bool mayUseAGPRs(const Function &F) const;

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -964,14 +964,12 @@ class MAIFrag<SDPatternOperator Op, bit HasAbid = true, bit Scaled = false> : Pa
964964
class CanUseAGPR_MAI<ValueType vt> {
965965
code PredicateCode = [{
966966
return !Subtarget->hasGFX90AInsts() ||
967-
(!SIMachineFunctionInfo::MFMAVGPRForm &&
968-
MF->getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >=
969-
}] # !srl(vt.Size, 5) # ");";
967+
MF->getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA(
968+
}] # !srl(vt.Size, 5) # ");";
970969

971970
code GISelPredicateCode = [{
972971
return !Subtarget->hasGFX90AInsts() ||
973-
(!SIMachineFunctionInfo::MFMAVGPRForm &&
974-
MF.getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >=
972+
MF.getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA(
975973
}] # !srl(vt.Size, 5) # ");";
976974
}
977975

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -44,23 +44,23 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
4444
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
4545
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
4646
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
47-
; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
47+
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
4848
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
4949
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
50-
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
51-
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
50+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
51+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
5252
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
5353
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
5454
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
5555
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
5656
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
57-
; GISEL-NEXT: v_mov_b32_e32 v12, s16
57+
; GISEL-NEXT: v_mov_b32_e32 v16, s16
5858
; GISEL-NEXT: s_waitcnt vmcnt(0)
5959
; GISEL-NEXT: s_nop 0
60-
; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
60+
; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
6161
; GISEL-NEXT: v_mov_b32_e32 v0, 0
6262
; GISEL-NEXT: s_nop 6
63-
; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7]
63+
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7]
6464
; GISEL-NEXT: s_endpgm
6565
bb:
6666
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -834,24 +834,24 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
834834
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
835835
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
836836
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
837-
; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
837+
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
838838
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
839839
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
840840
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
841841
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
842-
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
843-
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
842+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
843+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
844844
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
845845
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
846846
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
847847
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
848-
; GISEL-NEXT: v_mov_b32_e32 v12, s2
848+
; GISEL-NEXT: v_mov_b32_e32 v16, s2
849849
; GISEL-NEXT: s_waitcnt vmcnt(0)
850850
; GISEL-NEXT: s_nop 0
851-
; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
851+
; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
852852
; GISEL-NEXT: v_mov_b32_e32 v0, 0
853853
; GISEL-NEXT: s_nop 6
854-
; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
854+
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
855855
; GISEL-NEXT: s_endpgm
856856
bb:
857857
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1349,24 +1349,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
13491349
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
13501350
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
13511351
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1352-
; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
1352+
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
13531353
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
13541354
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
13551355
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
13561356
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1357-
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
1358-
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
1357+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
1358+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
13591359
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
13601360
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
13611361
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
13621362
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
1363-
; GISEL-NEXT: v_mov_b32_e32 v12, s2
1363+
; GISEL-NEXT: v_mov_b32_e32 v16, s2
13641364
; GISEL-NEXT: s_waitcnt vmcnt(0)
13651365
; GISEL-NEXT: s_nop 0
1366-
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
1366+
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
13671367
; GISEL-NEXT: v_mov_b32_e32 v0, 0
13681368
; GISEL-NEXT: s_nop 6
1369-
; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
1369+
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
13701370
; GISEL-NEXT: s_endpgm
13711371
bb:
13721372
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1513,24 +1513,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
15131513
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
15141514
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
15151515
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1516-
; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
1516+
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
15171517
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
15181518
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
15191519
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
15201520
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1521-
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
1522-
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
1521+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
1522+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
15231523
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
15241524
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
15251525
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
15261526
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
1527-
; GISEL-NEXT: v_mov_b32_e32 v12, s2
1527+
; GISEL-NEXT: v_mov_b32_e32 v16, s2
15281528
; GISEL-NEXT: s_waitcnt vmcnt(0)
15291529
; GISEL-NEXT: s_nop 0
1530-
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
1530+
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
15311531
; GISEL-NEXT: v_mov_b32_e32 v0, 0
15321532
; GISEL-NEXT: s_nop 6
1533-
; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
1533+
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
15341534
; GISEL-NEXT: s_endpgm
15351535
bb:
15361536
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1677,24 +1677,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
16771677
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
16781678
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
16791679
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1680-
; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
1680+
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
16811681
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
16821682
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
16831683
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
16841684
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1685-
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
1686-
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
1685+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
1686+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
16871687
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
16881688
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
16891689
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
16901690
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
1691-
; GISEL-NEXT: v_mov_b32_e32 v12, s2
1691+
; GISEL-NEXT: v_mov_b32_e32 v16, s2
16921692
; GISEL-NEXT: s_waitcnt vmcnt(0)
16931693
; GISEL-NEXT: s_nop 0
1694-
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
1694+
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
16951695
; GISEL-NEXT: v_mov_b32_e32 v0, 0
16961696
; GISEL-NEXT: s_nop 6
1697-
; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
1697+
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
16981698
; GISEL-NEXT: s_endpgm
16991699
bb:
17001700
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1841,24 +1841,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
18411841
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
18421842
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
18431843
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1844-
; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
1844+
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
18451845
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
18461846
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
18471847
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
18481848
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1849-
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
1850-
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
1849+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
1850+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
18511851
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
18521852
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
18531853
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
18541854
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
1855-
; GISEL-NEXT: v_mov_b32_e32 v12, s2
1855+
; GISEL-NEXT: v_mov_b32_e32 v16, s2
18561856
; GISEL-NEXT: s_waitcnt vmcnt(0)
18571857
; GISEL-NEXT: s_nop 0
1858-
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
1858+
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
18591859
; GISEL-NEXT: v_mov_b32_e32 v0, 0
18601860
; GISEL-NEXT: s_nop 6
1861-
; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
1861+
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
18621862
; GISEL-NEXT: s_endpgm
18631863
bb:
18641864
%id = call i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)