@@ -1392,27 +1392,29 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
13921392; SI-NEXT: s_waitcnt lgkmcnt(0)
13931393; SI-NEXT: s_mov_b64 s[12:13], s[4:5]
13941394; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[12:15], 0 addr64 offset:3
1395- ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2
1396- ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
13971395; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[12:15], 0 addr64 offset:2
1396+ ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
1397+ ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2
13981398; SI-NEXT: s_mov_b32 s10, -1
13991399; SI-NEXT: s_mov_b32 s8, s2
14001400; SI-NEXT: s_mov_b32 s9, s3
14011401; SI-NEXT: s_mov_b32 s2, s10
14021402; SI-NEXT: s_mov_b32 s3, s11
14031403; SI-NEXT: s_waitcnt vmcnt(2)
1404- ; SI-NEXT: v_lshlrev_b32_e32 v5, 24 , v2
1404+ ; SI-NEXT: v_lshlrev_b32_e32 v5, 8 , v2
14051405; SI-NEXT: s_waitcnt vmcnt(1)
1406- ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v3
1407- ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3
1406+ ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v4
1407+ ; SI-NEXT: v_or_b32_e32 v5, v5, v4
1408+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
14081409; SI-NEXT: s_waitcnt vmcnt(0)
1409- ; SI-NEXT: v_or_b32_e32 v6, v4, v6
1410+ ; SI-NEXT: v_or_b32_e32 v6, v3, v6
1411+ ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
14101412; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
1411- ; SI-NEXT: v_alignbit_b32 v5, v3, v5, 24
1412- ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
1413+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
14131414; SI-NEXT: v_mov_b32_e32 v3, v1
1414- ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6
1415- ; SI-NEXT: v_or_b32_e32 v4, v5, v4
1415+ ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
1416+ ; SI-NEXT: v_alignbit_b32 v4, v4, v5, 24
1417+ ; SI-NEXT: v_or_b32_e32 v4, v4, v6
14161418; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
14171419; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0
14181420; SI-NEXT: s_endpgm
@@ -1572,23 +1574,23 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
15721574; SI-NEXT: s_mov_b32 s7, s3
15731575; SI-NEXT: s_waitcnt vmcnt(0)
15741576; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
1577+ ; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4
15751578; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
15761579; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
15771580; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
15781581; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
1579- ; SI-NEXT: v_add_i32_e32 v7, vcc, 9, v4
1580- ; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4
1582+ ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
15811583; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
15821584; SI-NEXT: s_waitcnt expcnt(0)
1583- ; SI-NEXT: v_and_b32_e32 v0, 0xff, v7
1584- ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v5
1585+ ; SI-NEXT: v_and_b32_e32 v0, 0xff, v4
1586+ ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5
1587+ ; SI-NEXT: v_and_b32_e32 v1, 0xff00, v5
15851588; SI-NEXT: v_or_b32_e32 v0, v6, v0
1586- ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
1587- ; SI-NEXT: v_and_b32_e32 v4, 0xff000000, v4
1589+ ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
15881590; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
1589- ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1591+ ; SI-NEXT: v_or_b32_e32 v1, v1, v2
15901592; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1591- ; SI-NEXT: v_or_b32_e32 v1, v4 , v1
1593+ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16 , v1
15921594; SI-NEXT: v_or_b32_e32 v0, v1, v0
15931595; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
15941596; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -1600,7 +1602,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
16001602; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
16011603; VI-NEXT: s_mov_b32 s7, 0xf000
16021604; VI-NEXT: s_mov_b32 s6, -1
1603- ; VI-NEXT: v_mov_b32_e32 v5, 9
16041605; VI-NEXT: s_waitcnt lgkmcnt(0)
16051606; VI-NEXT: v_mov_b32_e32 v1, s3
16061607; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -1613,19 +1614,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
16131614; VI-NEXT: s_mov_b32 s2, s6
16141615; VI-NEXT: s_mov_b32 s3, s7
16151616; VI-NEXT: s_waitcnt vmcnt(0)
1616- ; VI-NEXT: v_lshrrev_b32_e32 v6, 24 , v4
1617+ ; VI-NEXT: v_lshrrev_b32_e32 v5, 16 , v4
16171618; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
16181619; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
16191620; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
16201621; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
1621- ; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v4
1622- ; VI-NEXT: v_add_u16_e32 v8, 9, v4
1623- ; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1622+ ; VI-NEXT: v_and_b32_e32 v6, 0xffffff00, v4
1623+ ; VI-NEXT: v_add_u16_e32 v4, 9, v4
16241624; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
16251625; VI-NEXT: s_nop 0
1626- ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6
1627- ; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1628- ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1626+ ; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v5
1627+ ; VI-NEXT: v_add_u16_e32 v2, 9, v5
1628+ ; VI-NEXT: v_or_b32_sdwa v0, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1629+ ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
16291630; VI-NEXT: v_mov_b32_e32 v2, 0x900
16301631; VI-NEXT: v_add_u16_e32 v0, 0x900, v0
16311632; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -1637,18 +1638,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
16371638; GFX10: ; %bb.0:
16381639; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
16391640; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1640- ; GFX10-NEXT: v_mov_b32_e32 v1, 24
16411641; GFX10-NEXT: s_waitcnt lgkmcnt(0)
16421642; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
16431643; GFX10-NEXT: s_waitcnt_depctr 0xffe3
16441644; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
16451645; GFX10-NEXT: s_waitcnt vmcnt(0)
1646- ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1647- ; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1646+ ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
16481647; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0
16491648; GFX10-NEXT: v_add_nc_u16 v4, v0, 9
1650- ; GFX10-NEXT: v_add_nc_u16 v2, v2, 9
1651- ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1649+ ; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff00, v1
1650+ ; GFX10-NEXT: v_add_nc_u16 v1, v1, 9
1651+ ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
16521652; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
16531653; GFX10-NEXT: v_mov_b32_e32 v4, 0
16541654; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
@@ -1669,26 +1669,25 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
16691669; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
16701670; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
16711671; GFX9-NEXT: v_mov_b32_e32 v5, 0
1672- ; GFX9-NEXT: v_mov_b32_e32 v6, 9
16731672; GFX9-NEXT: s_waitcnt lgkmcnt(0)
16741673; GFX9-NEXT: global_load_dword v4, v0, s[0:1]
16751674; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
16761675; GFX9-NEXT: s_movk_i32 s4, 0x900
16771676; GFX9-NEXT: s_waitcnt vmcnt(0)
1678- ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24 , v4
1677+ ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16 , v4
16791678; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
16801679; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
16811680; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
16821681; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
1683- ; GFX9-NEXT: v_and_b32_e32 v8, 0xffffff00, v4
1684- ; GFX9-NEXT: v_add_u16_e32 v9, 9, v4
1685- ; GFX9-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1682+ ; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff00, v4
1683+ ; GFX9-NEXT: v_add_u16_e32 v4, 9, v4
16861684; GFX9-NEXT: s_waitcnt lgkmcnt(0)
16871685; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
16881686; GFX9-NEXT: s_nop 0
1689- ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v7
1690- ; GFX9-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1691- ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1687+ ; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff00, v6
1688+ ; GFX9-NEXT: v_add_u16_e32 v2, 9, v6
1689+ ; GFX9-NEXT: v_or_b32_sdwa v0, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1690+ ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
16921691; GFX9-NEXT: v_add_u16_e32 v0, 0x900, v0
16931692; GFX9-NEXT: v_add_u16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
16941693; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1705,29 +1704,27 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
17051704; GFX11-NEXT: s_waitcnt vmcnt(0)
17061705; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
17071706; GFX11-NEXT: v_add_nc_u16 v2, v0, 9
1708- ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0
17091707; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff00, v0
1710- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4 ) | instskip(NEXT) | instid1(VALU_DEP_4 )
1711- ; GFX11-NEXT: v_add_nc_u16 v1 , v1, 9
1708+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3 ) | instskip(NEXT) | instid1(VALU_DEP_3 )
1709+ ; GFX11-NEXT: v_add_nc_u16 v3 , v1, 9
17121710; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
1713- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
1714- ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3
1715- ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
1716- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1711+ ; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff00, v1
1712+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1713+ ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
17171714; GFX11-NEXT: v_or_b32_e32 v2, v4, v2
17181715; GFX11-NEXT: v_mov_b32_e32 v4, 0
1719- ; GFX11-NEXT: v_or_b32_e32 v1, v3, v1
1720- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1716+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1717+ ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
17211718; GFX11-NEXT: v_add_nc_u16 v2, v2, 0x900
17221719; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1720+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
17231721; GFX11-NEXT: v_add_nc_u16 v1, v1, 0x900
1724- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
17251722; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v2
17261723; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1724+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
17271725; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1
17281726; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
17291727; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1730- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
17311728; GFX11-NEXT: v_or_b32_e32 v5, v5, v6
17321729; GFX11-NEXT: s_waitcnt lgkmcnt(0)
17331730; GFX11-NEXT: s_clause 0x1
0 commit comments